blob: 52354fad9a3c4dc38509d8ba643a66be194bf6c6 [file] [log] [blame]
Frank Tanga47bd432022-04-27 21:18:38 -07001diff --git a/source/i18n/collationdatabuilder.cpp b/source/i18n/collationdatabuilder.cpp
2index b10de993..d6ef5171 100644
3--- a/source/i18n/collationdatabuilder.cpp
4+++ b/source/i18n/collationdatabuilder.cpp
5@@ -86,13 +86,25 @@ struct ConditionalCE32 : public UMemory {
6 * When fetching CEs from the builder, the contexts are built into their runtime form
7 * so that the normal collation implementation can process them.
8 * The result is cached in the list head. It is reset when the contexts are modified.
9+ * All of these builtCE32 are invalidated by clearContexts(),
10+ * via incrementing the contextsEra.
11 */
12 uint32_t builtCE32;
13+ /**
14+ * The "era" of building intermediate contexts when the above builtCE32 was set.
15+ * When the array of cached, temporary contexts overflows, then clearContexts()
16+ * removes them all and invalidates the builtCE32 that used to point to built tries.
17+ */
18+ int32_t era = -1;
19 /**
20 * Index of the next ConditionalCE32.
21 * Negative for the end of the list.
22 */
23 int32_t next;
24+ // Note: We could create a separate class for all of the contextual mappings for
25+ // a code point, with the builtCE32, the era, and a list of the actual mappings.
26+ // The class that represents one mapping would then not need to
27+ // store those fields in each element.
28 };
29
30 U_CDECL_BEGIN
31@@ -267,7 +279,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
32 // TODO: ICU-21531 figure out why this happens.
33 return 0;
34 }
35- if(cond->builtCE32 == Collation::NO_CE32) {
36+ if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) {
37 // Build the context-sensitive mappings into their runtime form and cache the result.
38 cond->builtCE32 = builder.buildContext(cond, errorCode);
39 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
40@@ -275,6 +287,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &
41 builder.clearContexts();
42 cond->builtCE32 = builder.buildContext(cond, errorCode);
43 }
44+ cond->era = builder.contextsEra;
45 builderData.contexts = builder.contexts.getBuffer();
46 }
47 return cond->builtCE32;
48@@ -1322,13 +1335,10 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode)
49 void
50 CollationDataBuilder::clearContexts() {
51 contexts.remove();
52- UnicodeSetIterator iter(contextChars);
53- while(iter.next()) {
54- U_ASSERT(!iter.isString());
55- uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint());
56- U_ASSERT(isBuilderContextCE32(ce32));
57- getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32;
58- }
59+ // Incrementing the contexts build "era" invalidates all of the builtCE32
60+ // from before this clearContexts() call.
61+ // Simpler than finding and resetting all of those fields.
62+ ++contextsEra;
63 }
64
65 void
66@@ -1336,7 +1346,7 @@ CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
67 if(U_FAILURE(errorCode)) { return; }
68 // Ignore abandoned lists and the cached builtCE32,
69 // and build all contexts from scratch.
70- contexts.remove();
71+ clearContexts();
72 UnicodeSetIterator iter(contextChars);
73 while(U_SUCCESS(errorCode) && iter.next()) {
74 U_ASSERT(!iter.isString());
75@@ -1362,18 +1372,34 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode)
76 U_ASSERT(head->next >= 0);
77 UCharsTrieBuilder prefixBuilder(errorCode);
78 UCharsTrieBuilder contractionBuilder(errorCode);
79+ // This outer loop goes from each prefix to the next.
80+ // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond).
81+ // If there are multiple suffixes for the same prefix,
82+ // then an inner loop builds a contraction trie for them.
83 for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
84+ if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors
85 // After the list head, the prefix or suffix can be empty, but not both.
86 U_ASSERT(cond == head || cond->hasContext());
87 int32_t prefixLength = cond->prefixLength();
88 UnicodeString prefix(cond->context, 0, prefixLength + 1);
89 // Collect all contraction suffixes for one prefix.
90 ConditionalCE32 *firstCond = cond;
91- ConditionalCE32 *lastCond = cond;
92- while(cond->next >= 0 &&
93- (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
94+ ConditionalCE32 *lastCond;
95+ do {
96 lastCond = cond;
97- }
98+ // Clear the defaultCE32 fields as we go.
99+ // They are left over from building a previous version of this list of contexts.
100+ //
101+ // One of the code paths below may copy a preceding defaultCE32
102+ // into its emptySuffixCE32.
103+ // If a new suffix has been inserted before what used to be
104+ // the firstCond for its prefix, then that previous firstCond could still
105+ // contain an outdated defaultCE32 from an earlier buildContext() and
106+ // result in an incorrect emptySuffixCE32.
107+ // So we reset all defaultCE32 before reading and setting new values.
108+ cond->defaultCE32 = Collation::NO_CE32;
109+ } while(cond->next >= 0 &&
110+ (cond = getConditionalCE32(cond->next))->context.startsWith(prefix));
111 uint32_t ce32;
112 int32_t suffixStart = prefixLength + 1; // == prefix.length()
113 if(lastCond->context.length() == suffixStart) {
114diff --git a/source/i18n/collationdatabuilder.h b/source/i18n/collationdatabuilder.h
115index 6ae77772..4b981118 100644
116--- a/source/i18n/collationdatabuilder.h
117+++ b/source/i18n/collationdatabuilder.h
118@@ -244,6 +244,15 @@ protected:
119 UnicodeSet contextChars;
120 // Serialized UCharsTrie structures for finalized contexts.
121 UnicodeString contexts;
122+private:
123+ /**
124+ * The "era" of building intermediate contexts.
125+ * When the array of cached, temporary contexts overflows, then clearContexts()
126+ * removes them all and invalidates the builtCE32 that used to point to built tries.
127+ * See ConditionalCE32::era.
128+ */
129+ int32_t contextsEra = 0;
130+protected:
131 UnicodeSet unsafeBackwardSet;
132 UBool modified;
133
134diff --git a/source/test/intltest/collationtest.cpp b/source/test/intltest/collationtest.cpp
135index 4ce9ada5..5cc45a54 100644
136--- a/source/test/intltest/collationtest.cpp
137+++ b/source/test/intltest/collationtest.cpp
138@@ -79,6 +79,7 @@ public:
139 void TestTailoredElements();
140 void TestDataDriven();
141 void TestLongLocale();
142+ void TestBuilderContextsOverflow();
143
144 private:
145 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
146@@ -150,6 +151,7 @@ void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name,
147 TESTCASE_AUTO(TestTailoredElements);
148 TESTCASE_AUTO(TestDataDriven);
149 TESTCASE_AUTO(TestLongLocale);
150+ TESTCASE_AUTO(TestBuilderContextsOverflow);
151 TESTCASE_AUTO_END;
152 }
153
154@@ -1862,4 +1864,32 @@ void CollationTest::TestLongLocale() {
155 LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode));
156 }
157
158+void CollationTest::TestBuilderContextsOverflow() {
159+ IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow");
160+ // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after
161+ // intermediate contextual-mappings data overflowed.
162+ // Caused by the CollationDataBuilder using some outdated values when building
163+ // contextual mappings with both prefix and contraction matching.
164+ // Fixed by resetting those outdated values before code looks at them.
165+ char16_t rules[] = {
166+ u'&', 0x10, 0x2ff, 0x503c, 0x4617,
167+ u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c,
168+ u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff,
169+ u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30,
170+ 0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00,
171+ 0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30,
172+ 0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0,
173+ 0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c,
174+ 0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f,
175+ 0xff,
176+ u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617,
177+ u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0
178+ };
179+ UnicodeString s(false, rules, UPRV_LENGTHOF(rules));
180+ LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode);
181+ if(errorCode.isSuccess()) {
182+ logln("successfully built the Collator");
183+ }
184+}
185+
186 #endif // !UCONFIG_NO_COLLATION