Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ************************************************************************* |
| 5 | * COPYRIGHT: |
| 6 | * Copyright (c) 1996-2012, International Business Machines Corporation and |
| 7 | * others. All Rights Reserved. |
| 8 | ************************************************************************* |
| 9 | */ |
| 10 | |
| 11 | #include "unicode/utypes.h" |
| 12 | |
| 13 | #if !UCONFIG_NO_NORMALIZATION |
| 14 | |
| 15 | #include "unicode/uniset.h" |
| 16 | #include "unicode/unistr.h" |
| 17 | #include "unicode/chariter.h" |
| 18 | #include "unicode/schriter.h" |
| 19 | #include "unicode/uchriter.h" |
| 20 | #include "unicode/normlzr.h" |
| 21 | #include "unicode/utf16.h" |
| 22 | #include "cmemory.h" |
| 23 | #include "normalizer2impl.h" |
| 24 | #include "uprops.h" // for uniset_getUnicode32Instance() |
| 25 | |
Frank Tang | 69c72a6 | 2019-04-03 21:41:21 -0700 | [diff] [blame] | 26 | #if defined(move32) |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 27 | // System can define move32 intrinsics, but the char iters define move32 method |
| 28 | // using same undef trick in headers, so undef here to re-enable the method. |
| 29 | #undef move32 |
| 30 | #endif |
| 31 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 32 | U_NAMESPACE_BEGIN |
| 33 | |
| 34 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer) |
| 35 | |
| 36 | //------------------------------------------------------------------------- |
| 37 | // Constructors and other boilerplate |
| 38 | //------------------------------------------------------------------------- |
| 39 | |
| 40 | Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) : |
| 41 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 42 | text(new StringCharacterIterator(str)), |
| 43 | currentIndex(0), nextIndex(0), |
| 44 | buffer(), bufferPos(0) |
| 45 | { |
| 46 | init(); |
| 47 | } |
| 48 | |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 49 | Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) : |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 50 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 51 | text(new UCharCharacterIterator(str, length)), |
| 52 | currentIndex(0), nextIndex(0), |
| 53 | buffer(), bufferPos(0) |
| 54 | { |
| 55 | init(); |
| 56 | } |
| 57 | |
| 58 | Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) : |
| 59 | UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0), |
| 60 | text(iter.clone()), |
| 61 | currentIndex(0), nextIndex(0), |
| 62 | buffer(), bufferPos(0) |
| 63 | { |
| 64 | init(); |
| 65 | } |
| 66 | |
| 67 | Normalizer::Normalizer(const Normalizer ©) : |
| 68 | UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions), |
| 69 | text(copy.text->clone()), |
| 70 | currentIndex(copy.currentIndex), nextIndex(copy.nextIndex), |
| 71 | buffer(copy.buffer), bufferPos(copy.bufferPos) |
| 72 | { |
| 73 | init(); |
| 74 | } |
| 75 | |
| 76 | void |
| 77 | Normalizer::init() { |
| 78 | UErrorCode errorCode=U_ZERO_ERROR; |
| 79 | fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode); |
| 80 | if(fOptions&UNORM_UNICODE_3_2) { |
| 81 | delete fFilteredNorm2; |
| 82 | fNorm2=fFilteredNorm2= |
| 83 | new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode)); |
| 84 | } |
| 85 | if(U_FAILURE(errorCode)) { |
| 86 | errorCode=U_ZERO_ERROR; |
| 87 | fNorm2=Normalizer2Factory::getNoopInstance(errorCode); |
| 88 | } |
| 89 | } |
| 90 | |
| 91 | Normalizer::~Normalizer() |
| 92 | { |
| 93 | delete fFilteredNorm2; |
| 94 | delete text; |
| 95 | } |
| 96 | |
| 97 | Normalizer* |
| 98 | Normalizer::clone() const |
| 99 | { |
| 100 | return new Normalizer(*this); |
| 101 | } |
| 102 | |
| 103 | /** |
| 104 | * Generates a hash code for this iterator. |
| 105 | */ |
| 106 | int32_t Normalizer::hashCode() const |
| 107 | { |
| 108 | return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex; |
| 109 | } |
| 110 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 111 | bool Normalizer::operator==(const Normalizer& that) const |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 112 | { |
| 113 | return |
| 114 | this==&that || |
| 115 | (fUMode==that.fUMode && |
| 116 | fOptions==that.fOptions && |
| 117 | *text==*that.text && |
| 118 | buffer==that.buffer && |
| 119 | bufferPos==that.bufferPos && |
| 120 | nextIndex==that.nextIndex); |
| 121 | } |
| 122 | |
| 123 | //------------------------------------------------------------------------- |
| 124 | // Static utility methods |
| 125 | //------------------------------------------------------------------------- |
| 126 | |
| 127 | void U_EXPORT2 |
| 128 | Normalizer::normalize(const UnicodeString& source, |
| 129 | UNormalizationMode mode, int32_t options, |
| 130 | UnicodeString& result, |
| 131 | UErrorCode &status) { |
| 132 | if(source.isBogus() || U_FAILURE(status)) { |
| 133 | result.setToBogus(); |
| 134 | if(U_SUCCESS(status)) { |
| 135 | status=U_ILLEGAL_ARGUMENT_ERROR; |
| 136 | } |
| 137 | } else { |
| 138 | UnicodeString localDest; |
| 139 | UnicodeString *dest; |
| 140 | |
| 141 | if(&source!=&result) { |
| 142 | dest=&result; |
| 143 | } else { |
| 144 | // the source and result strings are the same object, use a temporary one |
| 145 | dest=&localDest; |
| 146 | } |
| 147 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 148 | if(U_SUCCESS(status)) { |
| 149 | if(options&UNORM_UNICODE_3_2) { |
| 150 | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| 151 | normalize(source, *dest, status); |
| 152 | } else { |
| 153 | n2->normalize(source, *dest, status); |
| 154 | } |
| 155 | } |
| 156 | if(dest==&localDest && U_SUCCESS(status)) { |
| 157 | result=*dest; |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | void U_EXPORT2 |
| 163 | Normalizer::compose(const UnicodeString& source, |
| 164 | UBool compat, int32_t options, |
| 165 | UnicodeString& result, |
| 166 | UErrorCode &status) { |
| 167 | normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status); |
| 168 | } |
| 169 | |
| 170 | void U_EXPORT2 |
| 171 | Normalizer::decompose(const UnicodeString& source, |
| 172 | UBool compat, int32_t options, |
| 173 | UnicodeString& result, |
| 174 | UErrorCode &status) { |
| 175 | normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status); |
| 176 | } |
| 177 | |
| 178 | UNormalizationCheckResult |
| 179 | Normalizer::quickCheck(const UnicodeString& source, |
| 180 | UNormalizationMode mode, int32_t options, |
| 181 | UErrorCode &status) { |
| 182 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 183 | if(U_SUCCESS(status)) { |
| 184 | if(options&UNORM_UNICODE_3_2) { |
| 185 | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| 186 | quickCheck(source, status); |
| 187 | } else { |
| 188 | return n2->quickCheck(source, status); |
| 189 | } |
| 190 | } else { |
| 191 | return UNORM_MAYBE; |
| 192 | } |
| 193 | } |
| 194 | |
| 195 | UBool |
| 196 | Normalizer::isNormalized(const UnicodeString& source, |
| 197 | UNormalizationMode mode, int32_t options, |
| 198 | UErrorCode &status) { |
| 199 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status); |
| 200 | if(U_SUCCESS(status)) { |
| 201 | if(options&UNORM_UNICODE_3_2) { |
| 202 | return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)). |
| 203 | isNormalized(source, status); |
| 204 | } else { |
| 205 | return n2->isNormalized(source, status); |
| 206 | } |
| 207 | } else { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 208 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 209 | } |
| 210 | } |
| 211 | |
| 212 | UnicodeString & U_EXPORT2 |
| 213 | Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right, |
| 214 | UnicodeString &result, |
| 215 | UNormalizationMode mode, int32_t options, |
| 216 | UErrorCode &errorCode) { |
| 217 | if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) { |
| 218 | result.setToBogus(); |
| 219 | if(U_SUCCESS(errorCode)) { |
| 220 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 221 | } |
| 222 | } else { |
| 223 | UnicodeString localDest; |
| 224 | UnicodeString *dest; |
| 225 | |
| 226 | if(&right!=&result) { |
| 227 | dest=&result; |
| 228 | } else { |
| 229 | // the right and result strings are the same object, use a temporary one |
| 230 | dest=&localDest; |
| 231 | } |
| 232 | *dest=left; |
| 233 | const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode); |
| 234 | if(U_SUCCESS(errorCode)) { |
| 235 | if(options&UNORM_UNICODE_3_2) { |
| 236 | FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)). |
| 237 | append(*dest, right, errorCode); |
| 238 | } else { |
| 239 | n2->append(*dest, right, errorCode); |
| 240 | } |
| 241 | } |
| 242 | if(dest==&localDest && U_SUCCESS(errorCode)) { |
| 243 | result=*dest; |
| 244 | } |
| 245 | } |
| 246 | return result; |
| 247 | } |
| 248 | |
| 249 | //------------------------------------------------------------------------- |
| 250 | // Iteration API |
| 251 | //------------------------------------------------------------------------- |
| 252 | |
| 253 | /** |
| 254 | * Return the current character in the normalized text. |
| 255 | */ |
| 256 | UChar32 Normalizer::current() { |
| 257 | if(bufferPos<buffer.length() || nextNormalize()) { |
| 258 | return buffer.char32At(bufferPos); |
| 259 | } else { |
| 260 | return DONE; |
| 261 | } |
| 262 | } |
| 263 | |
| 264 | /** |
| 265 | * Return the next character in the normalized text and advance |
| 266 | * the iteration position by one. If the end |
| 267 | * of the text has already been reached, {@link #DONE} is returned. |
| 268 | */ |
| 269 | UChar32 Normalizer::next() { |
| 270 | if(bufferPos<buffer.length() || nextNormalize()) { |
| 271 | UChar32 c=buffer.char32At(bufferPos); |
| 272 | bufferPos+=U16_LENGTH(c); |
| 273 | return c; |
| 274 | } else { |
| 275 | return DONE; |
| 276 | } |
| 277 | } |
| 278 | |
| 279 | /** |
| 280 | * Return the previous character in the normalized text and decrement |
| 281 | * the iteration position by one. If the beginning |
| 282 | * of the text has already been reached, {@link #DONE} is returned. |
| 283 | */ |
| 284 | UChar32 Normalizer::previous() { |
| 285 | if(bufferPos>0 || previousNormalize()) { |
| 286 | UChar32 c=buffer.char32At(bufferPos-1); |
| 287 | bufferPos-=U16_LENGTH(c); |
| 288 | return c; |
| 289 | } else { |
| 290 | return DONE; |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | void Normalizer::reset() { |
| 295 | currentIndex=nextIndex=text->setToStart(); |
| 296 | clearBuffer(); |
| 297 | } |
| 298 | |
| 299 | void |
| 300 | Normalizer::setIndexOnly(int32_t index) { |
| 301 | text->setIndex(index); // pins index |
| 302 | currentIndex=nextIndex=text->getIndex(); |
| 303 | clearBuffer(); |
| 304 | } |
| 305 | |
| 306 | /** |
| 307 | * Return the first character in the normalized text. This resets |
| 308 | * the <tt>Normalizer's</tt> position to the beginning of the text. |
| 309 | */ |
| 310 | UChar32 Normalizer::first() { |
| 311 | reset(); |
| 312 | return next(); |
| 313 | } |
| 314 | |
| 315 | /** |
| 316 | * Return the last character in the normalized text. This resets |
| 317 | * the <tt>Normalizer's</tt> position to be just before the |
| 318 | * the input text corresponding to that normalized character. |
| 319 | */ |
| 320 | UChar32 Normalizer::last() { |
| 321 | currentIndex=nextIndex=text->setToEnd(); |
| 322 | clearBuffer(); |
| 323 | return previous(); |
| 324 | } |
| 325 | |
| 326 | /** |
| 327 | * Retrieve the current iteration position in the input text that is |
| 328 | * being normalized. This method is useful in applications such as |
| 329 | * searching, where you need to be able to determine the position in |
| 330 | * the input text that corresponds to a given normalized output character. |
| 331 | * <p> |
| 332 | * <b>Note:</b> This method sets the position in the <em>input</em>, while |
| 333 | * {@link #next} and {@link #previous} iterate through characters in the |
| 334 | * <em>output</em>. This means that there is not necessarily a one-to-one |
| 335 | * correspondence between characters returned by <tt>next</tt> and |
| 336 | * <tt>previous</tt> and the indices passed to and returned from |
| 337 | * <tt>setIndex</tt> and {@link #getIndex}. |
| 338 | * |
| 339 | */ |
| 340 | int32_t Normalizer::getIndex() const { |
| 341 | if(bufferPos<buffer.length()) { |
| 342 | return currentIndex; |
| 343 | } else { |
| 344 | return nextIndex; |
| 345 | } |
| 346 | } |
| 347 | |
| 348 | /** |
| 349 | * Retrieve the index of the start of the input text. This is the begin index |
| 350 | * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt> |
| 351 | * over which this <tt>Normalizer</tt> is iterating |
| 352 | */ |
| 353 | int32_t Normalizer::startIndex() const { |
| 354 | return text->startIndex(); |
| 355 | } |
| 356 | |
| 357 | /** |
| 358 | * Retrieve the index of the end of the input text. This is the end index |
| 359 | * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> |
| 360 | * over which this <tt>Normalizer</tt> is iterating |
| 361 | */ |
| 362 | int32_t Normalizer::endIndex() const { |
| 363 | return text->endIndex(); |
| 364 | } |
| 365 | |
| 366 | //------------------------------------------------------------------------- |
| 367 | // Property access methods |
| 368 | //------------------------------------------------------------------------- |
| 369 | |
| 370 | void |
| 371 | Normalizer::setMode(UNormalizationMode newMode) |
| 372 | { |
| 373 | fUMode = newMode; |
| 374 | init(); |
| 375 | } |
| 376 | |
| 377 | UNormalizationMode |
| 378 | Normalizer::getUMode() const |
| 379 | { |
| 380 | return fUMode; |
| 381 | } |
| 382 | |
| 383 | void |
| 384 | Normalizer::setOption(int32_t option, |
| 385 | UBool value) |
| 386 | { |
| 387 | if (value) { |
| 388 | fOptions |= option; |
| 389 | } else { |
| 390 | fOptions &= (~option); |
| 391 | } |
| 392 | init(); |
| 393 | } |
| 394 | |
| 395 | UBool |
| 396 | Normalizer::getOption(int32_t option) const |
| 397 | { |
| 398 | return (fOptions & option) != 0; |
| 399 | } |
| 400 | |
| 401 | /** |
| 402 | * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| 403 | * The iteration position is set to the beginning of the input text. |
| 404 | */ |
| 405 | void |
| 406 | Normalizer::setText(const UnicodeString& newText, |
| 407 | UErrorCode &status) |
| 408 | { |
| 409 | if (U_FAILURE(status)) { |
| 410 | return; |
| 411 | } |
| 412 | CharacterIterator *newIter = new StringCharacterIterator(newText); |
| 413 | if (newIter == NULL) { |
| 414 | status = U_MEMORY_ALLOCATION_ERROR; |
| 415 | return; |
| 416 | } |
| 417 | delete text; |
| 418 | text = newIter; |
| 419 | reset(); |
| 420 | } |
| 421 | |
| 422 | /** |
| 423 | * Set the input text over which this <tt>Normalizer</tt> will iterate. |
| 424 | * The iteration position is set to the beginning of the string. |
| 425 | */ |
| 426 | void |
| 427 | Normalizer::setText(const CharacterIterator& newText, |
| 428 | UErrorCode &status) |
| 429 | { |
| 430 | if (U_FAILURE(status)) { |
| 431 | return; |
| 432 | } |
| 433 | CharacterIterator *newIter = newText.clone(); |
| 434 | if (newIter == NULL) { |
| 435 | status = U_MEMORY_ALLOCATION_ERROR; |
| 436 | return; |
| 437 | } |
| 438 | delete text; |
| 439 | text = newIter; |
| 440 | reset(); |
| 441 | } |
| 442 | |
| 443 | void |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 444 | Normalizer::setText(ConstChar16Ptr newText, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 445 | int32_t length, |
| 446 | UErrorCode &status) |
| 447 | { |
| 448 | if (U_FAILURE(status)) { |
| 449 | return; |
| 450 | } |
| 451 | CharacterIterator *newIter = new UCharCharacterIterator(newText, length); |
| 452 | if (newIter == NULL) { |
| 453 | status = U_MEMORY_ALLOCATION_ERROR; |
| 454 | return; |
| 455 | } |
| 456 | delete text; |
| 457 | text = newIter; |
| 458 | reset(); |
| 459 | } |
| 460 | |
| 461 | /** |
| 462 | * Copies the text under iteration into the UnicodeString referred to by "result". |
| 463 | * @param result Receives a copy of the text under iteration. |
| 464 | */ |
| 465 | void |
| 466 | Normalizer::getText(UnicodeString& result) |
| 467 | { |
| 468 | text->getText(result); |
| 469 | } |
| 470 | |
| 471 | //------------------------------------------------------------------------- |
| 472 | // Private utility methods |
| 473 | //------------------------------------------------------------------------- |
| 474 | |
| 475 | void Normalizer::clearBuffer() { |
| 476 | buffer.remove(); |
| 477 | bufferPos=0; |
| 478 | } |
| 479 | |
| 480 | UBool |
| 481 | Normalizer::nextNormalize() { |
| 482 | clearBuffer(); |
| 483 | currentIndex=nextIndex; |
| 484 | text->setIndex(nextIndex); |
| 485 | if(!text->hasNext()) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 486 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 487 | } |
| 488 | // Skip at least one character so we make progress. |
| 489 | UnicodeString segment(text->next32PostInc()); |
| 490 | while(text->hasNext()) { |
| 491 | UChar32 c; |
| 492 | if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) { |
| 493 | text->move32(-1, CharacterIterator::kCurrent); |
| 494 | break; |
| 495 | } |
| 496 | segment.append(c); |
| 497 | } |
| 498 | nextIndex=text->getIndex(); |
| 499 | UErrorCode errorCode=U_ZERO_ERROR; |
| 500 | fNorm2->normalize(segment, buffer, errorCode); |
| 501 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| 502 | } |
| 503 | |
| 504 | UBool |
| 505 | Normalizer::previousNormalize() { |
| 506 | clearBuffer(); |
| 507 | nextIndex=currentIndex; |
| 508 | text->setIndex(currentIndex); |
| 509 | if(!text->hasPrevious()) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 510 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 511 | } |
| 512 | UnicodeString segment; |
| 513 | while(text->hasPrevious()) { |
| 514 | UChar32 c=text->previous32(); |
| 515 | segment.insert(0, c); |
| 516 | if(fNorm2->hasBoundaryBefore(c)) { |
| 517 | break; |
| 518 | } |
| 519 | } |
| 520 | currentIndex=text->getIndex(); |
| 521 | UErrorCode errorCode=U_ZERO_ERROR; |
| 522 | fNorm2->normalize(segment, buffer, errorCode); |
| 523 | bufferPos=buffer.length(); |
| 524 | return U_SUCCESS(errorCode) && !buffer.isEmpty(); |
| 525 | } |
| 526 | |
| 527 | U_NAMESPACE_END |
| 528 | |
| 529 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |