blob: 52354fad9a3c4dc38509d8ba643a66be194bf6c6 [file] [log] [blame]
diff --git a/source/i18n/collationdatabuilder.cpp b/source/i18n/collationdatabuilder.cpp
index b10de993..d6ef5171 100644
--- a/source/i18n/collationdatabuilder.cpp
+++ b/source/i18n/collationdatabuilder.cpp
@@ -86,13 +86,25 @@ struct ConditionalCE32 : public UMemory {
* When fetching CEs from the builder, the contexts are built into their runtime form
* so that the normal collation implementation can process them.
* The result is cached in the list head. It is reset when the contexts are modified.
+ * All of these builtCE32 are invalidated by clearContexts(),
+ * via incrementing the contextsEra.
*/
uint32_t builtCE32;
+ /**
+ * The "era" of building intermediate contexts when the above builtCE32 was set.
+ * When the array of cached, temporary contexts overflows, then clearContexts()
+ * removes them all and invalidates the builtCE32 that used to point to built tries.
+ */
+ int32_t era = -1;
/**
* Index of the next ConditionalCE32.
* Negative for the end of the list.
*/
int32_t next;
+ // Note: We could create a separate class for all of the contextual mappings for
+ // a code point, with the builtCE32, the era, and a list of the actual mappings.
+ // The class that represents one mapping would then not need to
+ // store those fields in each element.
};
U_CDECL_BEGIN
@@ -267,7 +279,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
// TODO: ICU-21531 figure out why this happens.
return 0;
}
- if(cond->builtCE32 == Collation::NO_CE32) {
+ if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
// Build the context-sensitive mappings into their runtime form and cache the result.
cond->builtCE32 = builder.buildContext(cond, errorCode);
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
@@ -275,6 +287,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
builder.clearContexts();
cond->builtCE32 = builder.buildContext(cond, errorCode);
}
+ cond->era = builder.contextsEra;
builderData.contexts = builder.contexts.getBuffer();
}
return cond->builtCE32;
@@ -1322,13 +1335,10 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
void
CollationDataBuilder::clearContexts() {
contexts.remove();
- UnicodeSetIterator iter(contextChars);
- while(iter.next()) {
- U_ASSERT(!iter.isString());
- uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint());
- U_ASSERT(isBuilderContextCE32(ce32));
- getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32;
- }
+ // Incrementing the contexts build "era" invalidates all of the builtCE32
+ // from before this clearContexts() call.
+ // Simpler than finding and resetting all of those fields.
+ ++contextsEra;
}
void
@@ -1336,7 +1346,7 @@ CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
// Ignore abandoned lists and the cached builtCE32,
// and build all contexts from scratch.
- contexts.remove();
+ clearContexts();
UnicodeSetIterator iter(contextChars);
while(U_SUCCESS(errorCode) && iter.next()) {
U_ASSERT(!iter.isString());
@@ -1362,18 +1372,34 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
U_ASSERT(head->next >= 0);
UCharsTrieBuilder prefixBuilder(errorCode);
UCharsTrieBuilder contractionBuilder(errorCode);
+ // This outer loop goes from each prefix to the next.
+ // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
+ // If there are multiple suffixes for the same prefix,
+ // then an inner loop builds a contraction trie for them.
for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
+ if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors
// After the list head, the prefix or suffix can be empty, but not both.
U_ASSERT(cond == head || cond->hasContext());
int32_t prefixLength = cond->prefixLength();
UnicodeString prefix(cond->context, 0, prefixLength + 1);
// Collect all contraction suffixes for one prefix.
ConditionalCE32 *firstCond = cond;
- ConditionalCE32 *lastCond = cond;
- while(cond->next >= 0 &&
- (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
+ ConditionalCE32 *lastCond;
+ do {
lastCond = cond;
- }
+ // Clear the defaultCE32 fields as we go.
+ // They are left over from building a previous version of this list of contexts.
+ //
+ // One of the code paths below may copy a preceding defaultCE32
+ // into its emptySuffixCE32.
+ // If a new suffix has been inserted before what used to be
+ // the firstCond for its prefix, then that previous firstCond could still
+ // contain an outdated defaultCE32 from an earlier buildContext() and
+ // result in an incorrect emptySuffixCE32.
+ // So we reset all defaultCE32 before reading and setting new values.
+ cond->defaultCE32 = Collation::NO_CE32;
+ } while(cond->next >= 0 &&
+ (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
uint32_t ce32;
int32_t suffixStart = prefixLength + 1; // == prefix.length()
if(lastCond->context.length() == suffixStart) {
diff --git a/source/i18n/collationdatabuilder.h b/source/i18n/collationdatabuilder.h
index 6ae77772..4b981118 100644
--- a/source/i18n/collationdatabuilder.h
+++ b/source/i18n/collationdatabuilder.h
@@ -244,6 +244,15 @@ protected:
UnicodeSet contextChars;
// Serialized UCharsTrie structures for finalized contexts.
UnicodeString contexts;
+private:
+ /**
+ * The "era" of building intermediate contexts.
+ * When the array of cached, temporary contexts overflows, then clearContexts()
+ * removes them all and invalidates the builtCE32 that used to point to built tries.
+ * See ConditionalCE32::era.
+ */
+ int32_t contextsEra = 0;
+protected:
UnicodeSet unsafeBackwardSet;
UBool modified;
diff --git a/source/test/intltest/collationtest.cpp b/source/test/intltest/collationtest.cpp
index 4ce9ada5..5cc45a54 100644
--- a/source/test/intltest/collationtest.cpp
+++ b/source/test/intltest/collationtest.cpp
@@ -79,6 +79,7 @@ public:
void TestTailoredElements();
void TestDataDriven();
void TestLongLocale();
+ void TestBuilderContextsOverflow();
private:
void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
@@ -150,6 +151,7 @@ void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name,
TESTCASE_AUTO(TestTailoredElements);
TESTCASE_AUTO(TestDataDriven);
TESTCASE_AUTO(TestLongLocale);
+ TESTCASE_AUTO(TestBuilderContextsOverflow);
TESTCASE_AUTO_END;
}
@@ -1862,4 +1864,32 @@ void CollationTest::TestLongLocale() {
LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
}
+void CollationTest::TestBuilderContextsOverflow() {
+ IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
+ // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
+ // intermediate contextual-mappings data overflowed.
+ // Caused by the CollationDataBuilder using some outdated values when building
+ // contextual mappings with both prefix and contraction matching.
+ // Fixed by resetting those outdated values before code looks at them.
+ char16_t rules[] = {
+ u'&', 0x10, 0x2ff, 0x503c, 0x4617,
+ u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
+ u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
+ u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
+ 0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
+ 0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
+ 0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
+ 0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
+ 0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
+ 0xff,
+ u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
+ u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
+ };
+ UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
+ LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
+ if(errorCode.isSuccess()) {
+ logln("successfully built the Collator");
+ }
+}
+
#endif // !UCONFIG_NO_COLLATION