Frank Tang | a47bd43 | 2022-04-27 21:18:38 -0700 | [diff] [blame] | 1 | diff --git a/source/i18n/collationdatabuilder.cpp b/source/i18n/collationdatabuilder.cpp |
| 2 | index b10de993..d6ef5171 100644 |
| 3 | --- a/source/i18n/collationdatabuilder.cpp |
| 4 | +++ b/source/i18n/collationdatabuilder.cpp |
| 5 | @@ -86,13 +86,25 @@ struct ConditionalCE32 : public UMemory { |
| 6 | * When fetching CEs from the builder, the contexts are built into their runtime form |
| 7 | * so that the normal collation implementation can process them. |
| 8 | * The result is cached in the list head. It is reset when the contexts are modified. |
| 9 | + * All of these builtCE32 are invalidated by clearContexts(), |
| 10 | + * via incrementing the contextsEra. |
| 11 | */ |
| 12 | uint32_t builtCE32; |
| 13 | + /** |
| 14 | + * The "era" of building intermediate contexts when the above builtCE32 was set. |
| 15 | + * When the array of cached, temporary contexts overflows, then clearContexts() |
| 16 | + * removes them all and invalidates the builtCE32 that used to point to built tries. |
| 17 | + */ |
| 18 | + int32_t era = -1; |
| 19 | /** |
| 20 | * Index of the next ConditionalCE32. |
| 21 | * Negative for the end of the list. |
| 22 | */ |
| 23 | int32_t next; |
| 24 | + // Note: We could create a separate class for all of the contextual mappings for |
| 25 | + // a code point, with the builtCE32, the era, and a list of the actual mappings. |
| 26 | + // The class that represents one mapping would then not need to |
| 27 | + // store those fields in each element. |
| 28 | }; |
| 29 | |
| 30 | U_CDECL_BEGIN |
| 31 | @@ -267,7 +279,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode & |
| 32 | // TODO: ICU-21531 figure out why this happens. |
| 33 | return 0; |
| 34 | } |
| 35 | - if(cond->builtCE32 == Collation::NO_CE32) { |
| 36 | + if(cond->builtCE32 == Collation::NO_CE32 || cond->era != builder.contextsEra) { |
| 37 | // Build the context-sensitive mappings into their runtime form and cache the result. |
| 38 | cond->builtCE32 = builder.buildContext(cond, errorCode); |
| 39 | if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
| 40 | @@ -275,6 +287,7 @@ DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode & |
| 41 | builder.clearContexts(); |
| 42 | cond->builtCE32 = builder.buildContext(cond, errorCode); |
| 43 | } |
| 44 | + cond->era = builder.contextsEra; |
| 45 | builderData.contexts = builder.contexts.getBuffer(); |
| 46 | } |
| 47 | return cond->builtCE32; |
| 48 | @@ -1322,13 +1335,10 @@ CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) |
| 49 | void |
| 50 | CollationDataBuilder::clearContexts() { |
| 51 | contexts.remove(); |
| 52 | - UnicodeSetIterator iter(contextChars); |
| 53 | - while(iter.next()) { |
| 54 | - U_ASSERT(!iter.isString()); |
| 55 | - uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); |
| 56 | - U_ASSERT(isBuilderContextCE32(ce32)); |
| 57 | - getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; |
| 58 | - } |
| 59 | + // Incrementing the contexts build "era" invalidates all of the builtCE32 |
| 60 | + // from before this clearContexts() call. |
| 61 | + // Simpler than finding and resetting all of those fields. |
| 62 | + ++contextsEra; |
| 63 | } |
| 64 | |
| 65 | void |
| 66 | @@ -1336,7 +1346,7 @@ CollationDataBuilder::buildContexts(UErrorCode &errorCode) { |
| 67 | if(U_FAILURE(errorCode)) { return; } |
| 68 | // Ignore abandoned lists and the cached builtCE32, |
| 69 | // and build all contexts from scratch. |
| 70 | - contexts.remove(); |
| 71 | + clearContexts(); |
| 72 | UnicodeSetIterator iter(contextChars); |
| 73 | while(U_SUCCESS(errorCode) && iter.next()) { |
| 74 | U_ASSERT(!iter.isString()); |
| 75 | @@ -1362,18 +1372,34 @@ CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) |
| 76 | U_ASSERT(head->next >= 0); |
| 77 | UCharsTrieBuilder prefixBuilder(errorCode); |
| 78 | UCharsTrieBuilder contractionBuilder(errorCode); |
| 79 | + // This outer loop goes from each prefix to the next. |
| 80 | + // For each prefix it finds the one or more same-prefix entries (firstCond..lastCond). |
| 81 | + // If there are multiple suffixes for the same prefix, |
| 82 | + // then an inner loop builds a contraction trie for them. |
| 83 | for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { |
| 84 | + if(U_FAILURE(errorCode)) { return 0; } // early out for memory allocation errors |
| 85 | // After the list head, the prefix or suffix can be empty, but not both. |
| 86 | U_ASSERT(cond == head || cond->hasContext()); |
| 87 | int32_t prefixLength = cond->prefixLength(); |
| 88 | UnicodeString prefix(cond->context, 0, prefixLength + 1); |
| 89 | // Collect all contraction suffixes for one prefix. |
| 90 | ConditionalCE32 *firstCond = cond; |
| 91 | - ConditionalCE32 *lastCond = cond; |
| 92 | - while(cond->next >= 0 && |
| 93 | - (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) { |
| 94 | + ConditionalCE32 *lastCond; |
| 95 | + do { |
| 96 | lastCond = cond; |
| 97 | - } |
| 98 | + // Clear the defaultCE32 fields as we go. |
| 99 | + // They are left over from building a previous version of this list of contexts. |
| 100 | + // |
| 101 | + // One of the code paths below may copy a preceding defaultCE32 |
| 102 | + // into its emptySuffixCE32. |
| 103 | + // If a new suffix has been inserted before what used to be |
| 104 | + // the firstCond for its prefix, then that previous firstCond could still |
| 105 | + // contain an outdated defaultCE32 from an earlier buildContext() and |
| 106 | + // result in an incorrect emptySuffixCE32. |
| 107 | + // So we reset all defaultCE32 before reading and setting new values. |
| 108 | + cond->defaultCE32 = Collation::NO_CE32; |
| 109 | + } while(cond->next >= 0 && |
| 110 | + (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)); |
| 111 | uint32_t ce32; |
| 112 | int32_t suffixStart = prefixLength + 1; // == prefix.length() |
| 113 | if(lastCond->context.length() == suffixStart) { |
| 114 | diff --git a/source/i18n/collationdatabuilder.h b/source/i18n/collationdatabuilder.h |
| 115 | index 6ae77772..4b981118 100644 |
| 116 | --- a/source/i18n/collationdatabuilder.h |
| 117 | +++ b/source/i18n/collationdatabuilder.h |
| 118 | @@ -244,6 +244,15 @@ protected: |
| 119 | UnicodeSet contextChars; |
| 120 | // Serialized UCharsTrie structures for finalized contexts. |
| 121 | UnicodeString contexts; |
| 122 | +private: |
| 123 | + /** |
| 124 | + * The "era" of building intermediate contexts. |
| 125 | + * When the array of cached, temporary contexts overflows, then clearContexts() |
| 126 | + * removes them all and invalidates the builtCE32 that used to point to built tries. |
| 127 | + * See ConditionalCE32::era. |
| 128 | + */ |
| 129 | + int32_t contextsEra = 0; |
| 130 | +protected: |
| 131 | UnicodeSet unsafeBackwardSet; |
| 132 | UBool modified; |
| 133 | |
| 134 | diff --git a/source/test/intltest/collationtest.cpp b/source/test/intltest/collationtest.cpp |
| 135 | index 4ce9ada5..5cc45a54 100644 |
| 136 | --- a/source/test/intltest/collationtest.cpp |
| 137 | +++ b/source/test/intltest/collationtest.cpp |
| 138 | @@ -79,6 +79,7 @@ public: |
| 139 | void TestTailoredElements(); |
| 140 | void TestDataDriven(); |
| 141 | void TestLongLocale(); |
| 142 | + void TestBuilderContextsOverflow(); |
| 143 | |
| 144 | private: |
| 145 | void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi); |
| 146 | @@ -150,6 +151,7 @@ void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, |
| 147 | TESTCASE_AUTO(TestTailoredElements); |
| 148 | TESTCASE_AUTO(TestDataDriven); |
| 149 | TESTCASE_AUTO(TestLongLocale); |
| 150 | + TESTCASE_AUTO(TestBuilderContextsOverflow); |
| 151 | TESTCASE_AUTO_END; |
| 152 | } |
| 153 | |
| 154 | @@ -1862,4 +1864,32 @@ void CollationTest::TestLongLocale() { |
| 155 | LocalPointer<Collator> coll(Collator::createInstance(longLocale, errorCode)); |
| 156 | } |
| 157 | |
| 158 | +void CollationTest::TestBuilderContextsOverflow() { |
| 159 | + IcuTestErrorCode errorCode(*this, "TestBuilderContextsOverflow"); |
| 160 | + // ICU-20715: Bad memory access in what looks like a bogus CharsTrie after |
| 161 | + // intermediate contextual-mappings data overflowed. |
| 162 | + // Caused by the CollationDataBuilder using some outdated values when building |
| 163 | + // contextual mappings with both prefix and contraction matching. |
| 164 | + // Fixed by resetting those outdated values before code looks at them. |
| 165 | + char16_t rules[] = { |
| 166 | + u'&', 0x10, 0x2ff, 0x503c, 0x4617, |
| 167 | + u'=', 0x80, 0x4f7f, 0xff, 0x3c3d, 0x1c4f, 0x3c3c, |
| 168 | + u'<', 0, 0, 0, 0, u'|', 0, 0, 0, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, 0xff, |
| 169 | + u'=', 0, u'|', 0, 0, 0, 0, 0, 0, 0x1f00, 0xe30, |
| 170 | + 0x3035, 0, 0, 0xd200, 0, 0x7f00, 0xff4f, 0x3d00, 0, 0x7c00, |
| 171 | + 0, 0, 0, 0, 0, 0, 0, 0x301f, 0x350e, 0x30, |
| 172 | + 0, 0, 0xd2, 0x7c00, 0, 0, 0, 0, 0, 0, |
| 173 | + 0, 0x301f, 0x350e, 0x30, 0, 0, 0x52d2, 0x2f3c, 0x5552, 0x493c, |
| 174 | + 0x1f10, 0x1f50, 0x300, 0, 0, 0xf400, 0x30ff, 0, 0, 0x4f7f, |
| 175 | + 0xff, |
| 176 | + u'=', 0, u'|', 0, 0, 0, 0, 0x5000, 0x4617, |
| 177 | + u'=', 0x80, 0x4f7f, 0, 0, 0xd200, 0 |
| 178 | + }; |
| 179 | + UnicodeString s(false, rules, UPRV_LENGTHOF(rules)); |
| 180 | + LocalPointer<Collator> coll(new RuleBasedCollator(s, errorCode), errorCode); |
| 181 | + if(errorCode.isSuccess()) { |
| 182 | + logln("successfully built the Collator"); |
| 183 | + } |
| 184 | +} |
| 185 | + |
| 186 | #endif // !UCONFIG_NO_COLLATION |