blob: 8037c09b4f0a97a497c4ae173307145e4e7a24dc [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4*******************************************************************************
5*
Jungshik Shin70f82502016-01-29 00:32:36 -08006* Copyright (C) 2001-2015, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: ustrcase.cpp
Jungshik Shin87232d82017-05-13 21:10:13 -070011* encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000012* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2002feb20
16* created by: Markus W. Scherer
17*
18* Implementation file for string casing C API functions.
19* Uses functions from uchar.c for basic functionality that requires access
20* to the Unicode Character Database (uprops.dat).
21*/
22
23#include "unicode/utypes.h"
24#include "unicode/brkiter.h"
Jungshik Shin87232d82017-05-13 21:10:13 -070025#include "unicode/casemap.h"
26#include "unicode/edits.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080027#include "unicode/stringoptions.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000028#include "unicode/ustring.h"
29#include "unicode/ucasemap.h"
30#include "unicode/ubrk.h"
31#include "unicode/utf.h"
32#include "unicode/utf16.h"
33#include "cmemory.h"
34#include "ucase.h"
Jungshik Shin87232d82017-05-13 21:10:13 -070035#include "ucasemap_imp.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000036#include "ustr_imp.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080037#include "uassert.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000038
Frank Tangd2858cb2022-04-08 20:34:12 -070039/**
40 * Code point for COMBINING ACUTE ACCENT
41 * @internal
42 */
43#define ACUTE u'\u0301'
44
Jungshik Shin87232d82017-05-13 21:10:13 -070045U_NAMESPACE_BEGIN
46
47namespace {
48
49int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
50 Edits *edits, UErrorCode &errorCode) {
51 if (U_SUCCESS(errorCode)) {
52 if (destIndex > destCapacity) {
53 errorCode = U_BUFFER_OVERFLOW_ERROR;
54 } else if (edits != NULL) {
55 edits->copyErrorTo(errorCode);
56 }
57 }
58 return destIndex;
59}
60
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000061/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
Jungshik Shinf61e46d2018-05-04 13:00:45 -070062inline int32_t
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000063appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
Jungshik Shin87232d82017-05-13 21:10:13 -070064 int32_t result, const UChar *s,
65 int32_t cpLength, uint32_t options, icu::Edits *edits) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000066 UChar32 c;
67 int32_t length;
68
69 /* decode the result */
70 if(result<0) {
71 /* (not) original code point */
Jungshik Shin87232d82017-05-13 21:10:13 -070072 if(edits!=NULL) {
73 edits->addUnchanged(cpLength);
Jungshik Shinb3189662017-11-07 11:18:34 -080074 }
75 if(options & U_OMIT_UNCHANGED_TEXT) {
76 return destIndex;
Jungshik Shin87232d82017-05-13 21:10:13 -070077 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000078 c=~result;
Jungshik Shin87232d82017-05-13 21:10:13 -070079 if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
80 dest[destIndex++]=(UChar)c;
81 return destIndex;
82 }
83 length=cpLength;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000084 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -070085 if(result<=UCASE_MAX_STRING_LENGTH) {
86 c=U_SENTINEL;
87 length=result;
88 } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
89 dest[destIndex++]=(UChar)result;
90 if(edits!=NULL) {
91 edits->addReplace(cpLength, 1);
92 }
93 return destIndex;
94 } else {
95 c=result;
96 length=U16_LENGTH(c);
97 }
98 if(edits!=NULL) {
99 edits->addReplace(cpLength, length);
100 }
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700101 }
102 if(length>(INT32_MAX-destIndex)) {
103 return -1; // integer overflow
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000104 }
105
106 if(destIndex<destCapacity) {
107 /* append the result */
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700108 if(c>=0) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000109 /* code point */
Frank Tang1f164ee2022-11-08 12:31:27 -0800110 UBool isError=false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000111 U16_APPEND(dest, destIndex, destCapacity, c, isError);
112 if(isError) {
113 /* overflow, nothing written */
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700114 destIndex+=length;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000115 }
116 } else {
117 /* string */
118 if((destIndex+length)<=destCapacity) {
119 while(length>0) {
120 dest[destIndex++]=*s++;
121 --length;
122 }
123 } else {
124 /* overflow */
125 destIndex+=length;
126 }
127 }
128 } else {
129 /* preflight */
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700130 destIndex+=length;
131 }
132 return destIndex;
133}
134
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700135inline int32_t
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700136appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
137 if(destIndex<destCapacity) {
138 dest[destIndex]=c;
139 } else if(destIndex==INT32_MAX) {
140 return -1; // integer overflow
141 }
142 return destIndex+1;
143}
144
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700145int32_t
146appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
147 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
148 if(edits!=NULL) {
149 edits->addUnchanged(length);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000150 }
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700151 if(options & U_OMIT_UNCHANGED_TEXT) {
152 return destIndex;
153 }
154 if(length>(INT32_MAX-destIndex)) {
155 return -1; // integer overflow
156 }
157 if((destIndex+length)<=destCapacity) {
158 u_memcpy(dest+destIndex, s, length);
159 }
160 return destIndex + length;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000161}
162
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700163inline int32_t
164appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
165 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
166 if (length <= 0) {
167 return destIndex;
168 }
169 return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
170}
171
172UChar32 U_CALLCONV
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000173utf16_caseContextIterator(void *context, int8_t dir) {
174 UCaseContext *csc=(UCaseContext *)context;
175 UChar32 c;
176
177 if(dir<0) {
178 /* reset for backward iteration */
179 csc->index=csc->cpStart;
180 csc->dir=dir;
181 } else if(dir>0) {
182 /* reset for forward iteration */
183 csc->index=csc->cpLimit;
184 csc->dir=dir;
185 } else {
186 /* continue current iteration direction */
187 dir=csc->dir;
188 }
189
190 if(dir<0) {
191 if(csc->start<csc->index) {
192 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
193 return c;
194 }
195 } else {
196 if(csc->index<csc->limit) {
197 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
198 return c;
199 }
200 }
201 return U_SENTINEL;
202}
203
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700204/**
205 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
206 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000207 */
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700208int32_t toLower(int32_t caseLocale, uint32_t options,
209 UChar *dest, int32_t destCapacity,
210 const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
211 icu::Edits *edits, UErrorCode &errorCode) {
212 const int8_t *latinToLower;
213 if (caseLocale == UCASE_LOC_ROOT ||
214 (caseLocale >= 0 ?
215 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
216 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
217 latinToLower = LatinCase::TO_LOWER_NORMAL;
218 } else {
219 latinToLower = LatinCase::TO_LOWER_TR_LT;
220 }
221 const UTrie2 *trie = ucase_getTrie();
222 int32_t destIndex = 0;
223 int32_t prev = srcStart;
224 int32_t srcIndex = srcStart;
225 for (;;) {
226 // fast path for simple cases
Jungshik Shin42d50272018-10-24 01:22:09 -0700227 UChar lead = 0;
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700228 while (srcIndex < srcLimit) {
229 lead = src[srcIndex];
230 int32_t delta;
231 if (lead < LatinCase::LONG_S) {
232 int8_t d = latinToLower[lead];
233 if (d == LatinCase::EXC) { break; }
234 ++srcIndex;
235 if (d == 0) { continue; }
236 delta = d;
237 } else if (lead >= 0xd800) {
238 break; // surrogate or higher
239 } else {
240 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
241 if (UCASE_HAS_EXCEPTION(props)) { break; }
242 ++srcIndex;
243 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
244 continue;
245 }
246 }
Jungshik Shin42d50272018-10-24 01:22:09 -0700247 lead += static_cast<UChar>(delta);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700248 destIndex = appendUnchanged(dest, destIndex, destCapacity,
249 src + prev, srcIndex - 1 - prev, options, edits);
250 if (destIndex >= 0) {
251 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
252 if (edits != nullptr) {
253 edits->addReplace(1, 1);
254 }
255 }
256 if (destIndex < 0) {
257 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
258 return 0;
259 }
260 prev = srcIndex;
261 }
262 if (srcIndex >= srcLimit) {
263 break;
264 }
265 // slow path
266 int32_t cpStart = srcIndex++;
267 UChar trail;
Jungshik Shin87232d82017-05-13 21:10:13 -0700268 UChar32 c;
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700269 if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
270 c = U16_GET_SUPPLEMENTARY(lead, trail);
271 ++srcIndex;
272 } else {
273 c = lead;
274 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700275 const UChar *s;
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700276 if (caseLocale >= 0) {
277 csc->cpStart = cpStart;
278 csc->cpLimit = srcIndex;
279 c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
280 } else {
281 c = ucase_toFullFolding(c, &s, options);
282 }
283 if (c >= 0) {
284 destIndex = appendUnchanged(dest, destIndex, destCapacity,
285 src + prev, cpStart - prev, options, edits);
286 if (destIndex >= 0) {
287 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
288 srcIndex - cpStart, options, edits);
289 }
290 if (destIndex < 0) {
291 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
292 return 0;
293 }
294 prev = srcIndex;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000295 }
296 }
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700297 destIndex = appendUnchanged(dest, destIndex, destCapacity,
298 src + prev, srcIndex - prev, options, edits);
299 if (destIndex < 0) {
300 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
301 return 0;
302 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000303 return destIndex;
304}
305
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700306int32_t toUpper(int32_t caseLocale, uint32_t options,
307 UChar *dest, int32_t destCapacity,
308 const UChar *src, UCaseContext *csc, int32_t srcLength,
309 icu::Edits *edits, UErrorCode &errorCode) {
310 const int8_t *latinToUpper;
311 if (caseLocale == UCASE_LOC_TURKISH) {
312 latinToUpper = LatinCase::TO_UPPER_TR;
313 } else {
314 latinToUpper = LatinCase::TO_UPPER_NORMAL;
315 }
316 const UTrie2 *trie = ucase_getTrie();
317 int32_t destIndex = 0;
318 int32_t prev = 0;
319 int32_t srcIndex = 0;
320 for (;;) {
321 // fast path for simple cases
Jungshik Shin42d50272018-10-24 01:22:09 -0700322 UChar lead = 0;
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700323 while (srcIndex < srcLength) {
324 lead = src[srcIndex];
325 int32_t delta;
326 if (lead < LatinCase::LONG_S) {
327 int8_t d = latinToUpper[lead];
328 if (d == LatinCase::EXC) { break; }
329 ++srcIndex;
330 if (d == 0) { continue; }
331 delta = d;
332 } else if (lead >= 0xd800) {
333 break; // surrogate or higher
334 } else {
335 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
336 if (UCASE_HAS_EXCEPTION(props)) { break; }
337 ++srcIndex;
338 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
339 continue;
340 }
341 }
Jungshik Shin42d50272018-10-24 01:22:09 -0700342 lead += static_cast<UChar>(delta);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700343 destIndex = appendUnchanged(dest, destIndex, destCapacity,
344 src + prev, srcIndex - 1 - prev, options, edits);
345 if (destIndex >= 0) {
346 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
347 if (edits != nullptr) {
348 edits->addReplace(1, 1);
349 }
350 }
351 if (destIndex < 0) {
352 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
353 return 0;
354 }
355 prev = srcIndex;
356 }
357 if (srcIndex >= srcLength) {
358 break;
359 }
360 // slow path
361 int32_t cpStart;
362 csc->cpStart = cpStart = srcIndex++;
363 UChar trail;
364 UChar32 c;
365 if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
366 c = U16_GET_SUPPLEMENTARY(lead, trail);
367 ++srcIndex;
368 } else {
369 c = lead;
370 }
371 csc->cpLimit = srcIndex;
372 const UChar *s;
373 c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
374 if (c >= 0) {
375 destIndex = appendUnchanged(dest, destIndex, destCapacity,
376 src + prev, cpStart - prev, options, edits);
377 if (destIndex >= 0) {
378 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
379 srcIndex - cpStart, options, edits);
380 }
381 if (destIndex < 0) {
382 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
383 return 0;
384 }
385 prev = srcIndex;
386 }
387 }
388 destIndex = appendUnchanged(dest, destIndex, destCapacity,
389 src + prev, srcIndex - prev, options, edits);
390 if (destIndex < 0) {
391 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
392 return 0;
393 }
394 return destIndex;
395}
396
397} // namespace
398
399U_NAMESPACE_END
400
401U_NAMESPACE_USE
402
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000403#if !UCONFIG_NO_BREAK_ITERATION
404
Frank Tangd2858cb2022-04-08 20:34:12 -0700405namespace {
406
407/**
408 * Input: c is a letter I with or without acute accent.
409 * start is the index in src after c, and is less than segmentLimit.
410 * If a plain i/I is followed by a plain j/J,
411 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
412 * then we output accordingly.
413 *
414 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
415 */
416int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
417 UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
418 icu::Edits *edits) {
419 U_ASSERT(start < segmentLimit);
420
421 int32_t index = start;
422 bool withAcute = false;
423
424 // If the conditions are met, then the following variables tell us what to output.
425 int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
426 bool doTitleJ = false; // true if the j needs to be titlecased
427 int32_t unchanged2 = 0; // after the j (0 or 1)
428
429 // next character after the first letter
430 UChar c2 = src[index++];
431
432 // Is the first letter an i/I with accent?
433 if (c == u'I') {
434 if (c2 == ACUTE) {
435 withAcute = true;
436 unchanged1 = 1;
437 if (index == segmentLimit) { return start; }
438 c2 = src[index++];
439 }
440 } else { // Í
441 withAcute = true;
442 }
443
444 // Is the next character a j/J?
445 if (c2 == u'j') {
446 doTitleJ = true;
447 } else if (c2 == u'J') {
448 ++unchanged1;
449 } else {
450 return start;
451 }
452
453 // A plain i/I must be followed by a plain j/J.
454 // An i/I with acute must be followed by a j/J with acute.
455 if (withAcute) {
456 if (index == segmentLimit || src[index++] != ACUTE) { return start; }
457 if (doTitleJ) {
458 unchanged2 = 1;
459 } else {
460 ++unchanged1;
461 }
462 }
463
464 // There must not be another combining mark.
465 if (index < segmentLimit) {
466 int32_t cp;
467 int32_t i = index;
468 U16_NEXT(src, i, segmentLimit, cp);
469 uint32_t typeMask = U_GET_GC_MASK(cp);
470 if ((typeMask & U_GC_M_MASK) != 0) {
471 return start;
472 }
473 }
474
475 // Output the rest of the Dutch IJ.
476 destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
477 start += unchanged1;
478 if (doTitleJ) {
479 destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
480 if (edits != nullptr) {
481 edits->addReplace(1, 1);
482 }
483 ++start;
484 }
485 destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
486
487 U_ASSERT(start + unchanged2 == index);
488 return index;
489}
490
491} // namespace
492
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000493U_CFUNC int32_t U_CALLCONV
Jungshik Shin87232d82017-05-13 21:10:13 -0700494ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000495 UChar *dest, int32_t destCapacity,
496 const UChar *src, int32_t srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -0700497 icu::Edits *edits,
498 UErrorCode &errorCode) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800499 if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000500 return 0;
501 }
502
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000503 /* set up local variables */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000504 UCaseContext csc=UCASECONTEXT_INITIALIZER;
505 csc.p=(void *)src;
506 csc.limit=srcLength;
Jungshik Shin87232d82017-05-13 21:10:13 -0700507 int32_t destIndex=0;
508 int32_t prev=0;
Frank Tangd2858cb2022-04-08 20:34:12 -0700509 bool isFirstIndex=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000510
511 /* titlecasing loop */
512 while(prev<srcLength) {
513 /* find next index where to titlecase */
Jungshik Shin87232d82017-05-13 21:10:13 -0700514 int32_t index;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000515 if(isFirstIndex) {
Frank Tangd2858cb2022-04-08 20:34:12 -0700516 isFirstIndex=false;
Jungshik Shin87232d82017-05-13 21:10:13 -0700517 index=iter->first();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000518 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700519 index=iter->next();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000520 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700521 if(index==UBRK_DONE || index>srcLength) {
522 index=srcLength;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000523 }
524
525 /*
Jungshik Shinb3189662017-11-07 11:18:34 -0800526 * Segment [prev..index[ into 3 parts:
527 * a) skipped characters (copy as-is) [prev..titleStart[
528 * b) first letter (titlecase) [titleStart..titleLimit[
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000529 * c) subsequent characters (lowercase) [titleLimit..index[
530 */
Jungshik Shin87232d82017-05-13 21:10:13 -0700531 if(prev<index) {
Jungshik Shinb3189662017-11-07 11:18:34 -0800532 // Find and copy skipped characters [prev..titleStart[
Jungshik Shin87232d82017-05-13 21:10:13 -0700533 int32_t titleStart=prev;
534 int32_t titleLimit=prev;
535 UChar32 c;
536 U16_NEXT(src, titleLimit, index, c);
Jungshik Shinb3189662017-11-07 11:18:34 -0800537 if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
538 // Adjust the titlecasing index to the next cased character,
539 // or to the next letter/number/symbol/private use.
540 // Stop with titleStart<titleLimit<=index
541 // if there is a character to be titlecased,
542 // or else stop with titleStart==titleLimit==index.
Frank Tangd2858cb2022-04-08 20:34:12 -0700543 bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
Jungshik Shinb3189662017-11-07 11:18:34 -0800544 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000545 titleStart=titleLimit;
Jungshik Shin87232d82017-05-13 21:10:13 -0700546 if(titleLimit==index) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000547 break;
548 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700549 U16_NEXT(src, titleLimit, index, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000550 }
Jungshik Shinb3189662017-11-07 11:18:34 -0800551 if (prev < titleStart) {
552 destIndex=appendUnchanged(dest, destIndex, destCapacity,
553 src+prev, titleStart-prev, options, edits);
554 if(destIndex<0) {
555 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
556 return 0;
557 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000558 }
559 }
560
561 if(titleStart<titleLimit) {
562 /* titlecase c which is from [titleStart..titleLimit[ */
563 csc.cpStart=titleStart;
564 csc.cpLimit=titleLimit;
Jungshik Shin87232d82017-05-13 21:10:13 -0700565 const UChar *s;
566 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
567 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
568 titleLimit-titleStart, options, edits);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700569 if(destIndex<0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700570 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700571 return 0;
572 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000573
574 /* Special case Dutch IJ titlecasing */
Jungshik Shin87232d82017-05-13 21:10:13 -0700575 if (titleStart+1 < index &&
Frank Tangd2858cb2022-04-08 20:34:12 -0700576 caseLocale == UCASE_LOC_DUTCH) {
577 if (c < 0) {
578 c = ~c;
579 }
580
581 if (c == u'I' || c == u'Í') {
582 titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
583 dest, destIndex, destCapacity, options,
584 edits);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700585 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000586 }
587
588 /* lowercase [titleLimit..index[ */
Jungshik Shin87232d82017-05-13 21:10:13 -0700589 if(titleLimit<index) {
590 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000591 /* Normal operation: Lowercase the rest of the word. */
592 destIndex+=
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700593 toLower(
594 caseLocale, options,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000595 dest+destIndex, destCapacity-destIndex,
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700596 src, &csc, titleLimit, index,
Jungshik Shin87232d82017-05-13 21:10:13 -0700597 edits, errorCode);
598 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
599 errorCode=U_ZERO_ERROR;
600 }
601 if(U_FAILURE(errorCode)) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700602 return destIndex;
603 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000604 } else {
605 /* Optionally just copy the rest of the word unchanged. */
Jungshik Shin87232d82017-05-13 21:10:13 -0700606 destIndex=appendUnchanged(dest, destIndex, destCapacity,
607 src+titleLimit, index-titleLimit, options, edits);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700608 if(destIndex<0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700609 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700610 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000611 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000612 }
613 }
614 }
615 }
616
Jungshik Shin87232d82017-05-13 21:10:13 -0700617 prev=index;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000618 }
619
Jungshik Shin87232d82017-05-13 21:10:13 -0700620 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000621}
622
623#endif // !UCONFIG_NO_BREAK_ITERATION
624
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700625U_NAMESPACE_BEGIN
626namespace GreekUpper {
627
628// Data generated by prototype code, see
Frank Tang3e05d9d2021-11-08 14:04:04 -0800629// https://icu.unicode.org/design/case/greek-upper
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700630// TODO: Move this data into ucase.icu.
631static const uint16_t data0370[] = {
632 // U+0370..03FF
633 0x0370,
634 0x0370,
635 0x0372,
636 0x0372,
637 0,
638 0,
639 0x0376,
640 0x0376,
641 0,
642 0,
643 0x037A,
644 0x03FD,
645 0x03FE,
646 0x03FF,
647 0,
648 0x037F,
649 0,
650 0,
651 0,
652 0,
653 0,
654 0,
655 0x0391 | HAS_VOWEL | HAS_ACCENT,
656 0,
657 0x0395 | HAS_VOWEL | HAS_ACCENT,
658 0x0397 | HAS_VOWEL | HAS_ACCENT,
659 0x0399 | HAS_VOWEL | HAS_ACCENT,
660 0,
661 0x039F | HAS_VOWEL | HAS_ACCENT,
662 0,
663 0x03A5 | HAS_VOWEL | HAS_ACCENT,
664 0x03A9 | HAS_VOWEL | HAS_ACCENT,
665 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
666 0x0391 | HAS_VOWEL,
667 0x0392,
668 0x0393,
669 0x0394,
670 0x0395 | HAS_VOWEL,
671 0x0396,
672 0x0397 | HAS_VOWEL,
673 0x0398,
674 0x0399 | HAS_VOWEL,
675 0x039A,
676 0x039B,
677 0x039C,
678 0x039D,
679 0x039E,
680 0x039F | HAS_VOWEL,
681 0x03A0,
682 0x03A1,
683 0,
684 0x03A3,
685 0x03A4,
686 0x03A5 | HAS_VOWEL,
687 0x03A6,
688 0x03A7,
689 0x03A8,
690 0x03A9 | HAS_VOWEL,
691 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
692 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
693 0x0391 | HAS_VOWEL | HAS_ACCENT,
694 0x0395 | HAS_VOWEL | HAS_ACCENT,
695 0x0397 | HAS_VOWEL | HAS_ACCENT,
696 0x0399 | HAS_VOWEL | HAS_ACCENT,
697 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
698 0x0391 | HAS_VOWEL,
699 0x0392,
700 0x0393,
701 0x0394,
702 0x0395 | HAS_VOWEL,
703 0x0396,
704 0x0397 | HAS_VOWEL,
705 0x0398,
706 0x0399 | HAS_VOWEL,
707 0x039A,
708 0x039B,
709 0x039C,
710 0x039D,
711 0x039E,
712 0x039F | HAS_VOWEL,
713 0x03A0,
714 0x03A1,
715 0x03A3,
716 0x03A3,
717 0x03A4,
718 0x03A5 | HAS_VOWEL,
719 0x03A6,
720 0x03A7,
721 0x03A8,
722 0x03A9 | HAS_VOWEL,
723 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
724 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
725 0x039F | HAS_VOWEL | HAS_ACCENT,
726 0x03A5 | HAS_VOWEL | HAS_ACCENT,
727 0x03A9 | HAS_VOWEL | HAS_ACCENT,
728 0x03CF,
729 0x0392,
730 0x0398,
731 0x03D2,
732 0x03D2 | HAS_ACCENT,
733 0x03D2 | HAS_DIALYTIKA,
734 0x03A6,
735 0x03A0,
736 0x03CF,
737 0x03D8,
738 0x03D8,
739 0x03DA,
740 0x03DA,
741 0x03DC,
742 0x03DC,
743 0x03DE,
744 0x03DE,
745 0x03E0,
746 0x03E0,
747 0,
748 0,
749 0,
750 0,
751 0,
752 0,
753 0,
754 0,
755 0,
756 0,
757 0,
758 0,
759 0,
760 0,
761 0x039A,
762 0x03A1,
763 0x03F9,
764 0x037F,
765 0x03F4,
766 0x0395 | HAS_VOWEL,
767 0,
768 0x03F7,
769 0x03F7,
770 0x03F9,
771 0x03FA,
772 0x03FA,
773 0x03FC,
774 0x03FD,
775 0x03FE,
776 0x03FF,
777};
778
779static const uint16_t data1F00[] = {
780 // U+1F00..1FFF
781 0x0391 | HAS_VOWEL,
782 0x0391 | HAS_VOWEL,
783 0x0391 | HAS_VOWEL | HAS_ACCENT,
784 0x0391 | HAS_VOWEL | HAS_ACCENT,
785 0x0391 | HAS_VOWEL | HAS_ACCENT,
786 0x0391 | HAS_VOWEL | HAS_ACCENT,
787 0x0391 | HAS_VOWEL | HAS_ACCENT,
788 0x0391 | HAS_VOWEL | HAS_ACCENT,
789 0x0391 | HAS_VOWEL,
790 0x0391 | HAS_VOWEL,
791 0x0391 | HAS_VOWEL | HAS_ACCENT,
792 0x0391 | HAS_VOWEL | HAS_ACCENT,
793 0x0391 | HAS_VOWEL | HAS_ACCENT,
794 0x0391 | HAS_VOWEL | HAS_ACCENT,
795 0x0391 | HAS_VOWEL | HAS_ACCENT,
796 0x0391 | HAS_VOWEL | HAS_ACCENT,
797 0x0395 | HAS_VOWEL,
798 0x0395 | HAS_VOWEL,
799 0x0395 | HAS_VOWEL | HAS_ACCENT,
800 0x0395 | HAS_VOWEL | HAS_ACCENT,
801 0x0395 | HAS_VOWEL | HAS_ACCENT,
802 0x0395 | HAS_VOWEL | HAS_ACCENT,
803 0,
804 0,
805 0x0395 | HAS_VOWEL,
806 0x0395 | HAS_VOWEL,
807 0x0395 | HAS_VOWEL | HAS_ACCENT,
808 0x0395 | HAS_VOWEL | HAS_ACCENT,
809 0x0395 | HAS_VOWEL | HAS_ACCENT,
810 0x0395 | HAS_VOWEL | HAS_ACCENT,
811 0,
812 0,
813 0x0397 | HAS_VOWEL,
814 0x0397 | HAS_VOWEL,
815 0x0397 | HAS_VOWEL | HAS_ACCENT,
816 0x0397 | HAS_VOWEL | HAS_ACCENT,
817 0x0397 | HAS_VOWEL | HAS_ACCENT,
818 0x0397 | HAS_VOWEL | HAS_ACCENT,
819 0x0397 | HAS_VOWEL | HAS_ACCENT,
820 0x0397 | HAS_VOWEL | HAS_ACCENT,
821 0x0397 | HAS_VOWEL,
822 0x0397 | HAS_VOWEL,
823 0x0397 | HAS_VOWEL | HAS_ACCENT,
824 0x0397 | HAS_VOWEL | HAS_ACCENT,
825 0x0397 | HAS_VOWEL | HAS_ACCENT,
826 0x0397 | HAS_VOWEL | HAS_ACCENT,
827 0x0397 | HAS_VOWEL | HAS_ACCENT,
828 0x0397 | HAS_VOWEL | HAS_ACCENT,
829 0x0399 | HAS_VOWEL,
830 0x0399 | HAS_VOWEL,
831 0x0399 | HAS_VOWEL | HAS_ACCENT,
832 0x0399 | HAS_VOWEL | HAS_ACCENT,
833 0x0399 | HAS_VOWEL | HAS_ACCENT,
834 0x0399 | HAS_VOWEL | HAS_ACCENT,
835 0x0399 | HAS_VOWEL | HAS_ACCENT,
836 0x0399 | HAS_VOWEL | HAS_ACCENT,
837 0x0399 | HAS_VOWEL,
838 0x0399 | HAS_VOWEL,
839 0x0399 | HAS_VOWEL | HAS_ACCENT,
840 0x0399 | HAS_VOWEL | HAS_ACCENT,
841 0x0399 | HAS_VOWEL | HAS_ACCENT,
842 0x0399 | HAS_VOWEL | HAS_ACCENT,
843 0x0399 | HAS_VOWEL | HAS_ACCENT,
844 0x0399 | HAS_VOWEL | HAS_ACCENT,
845 0x039F | HAS_VOWEL,
846 0x039F | HAS_VOWEL,
847 0x039F | HAS_VOWEL | HAS_ACCENT,
848 0x039F | HAS_VOWEL | HAS_ACCENT,
849 0x039F | HAS_VOWEL | HAS_ACCENT,
850 0x039F | HAS_VOWEL | HAS_ACCENT,
851 0,
852 0,
853 0x039F | HAS_VOWEL,
854 0x039F | HAS_VOWEL,
855 0x039F | HAS_VOWEL | HAS_ACCENT,
856 0x039F | HAS_VOWEL | HAS_ACCENT,
857 0x039F | HAS_VOWEL | HAS_ACCENT,
858 0x039F | HAS_VOWEL | HAS_ACCENT,
859 0,
860 0,
861 0x03A5 | HAS_VOWEL,
862 0x03A5 | HAS_VOWEL,
863 0x03A5 | HAS_VOWEL | HAS_ACCENT,
864 0x03A5 | HAS_VOWEL | HAS_ACCENT,
865 0x03A5 | HAS_VOWEL | HAS_ACCENT,
866 0x03A5 | HAS_VOWEL | HAS_ACCENT,
867 0x03A5 | HAS_VOWEL | HAS_ACCENT,
868 0x03A5 | HAS_VOWEL | HAS_ACCENT,
869 0,
870 0x03A5 | HAS_VOWEL,
871 0,
872 0x03A5 | HAS_VOWEL | HAS_ACCENT,
873 0,
874 0x03A5 | HAS_VOWEL | HAS_ACCENT,
875 0,
876 0x03A5 | HAS_VOWEL | HAS_ACCENT,
877 0x03A9 | HAS_VOWEL,
878 0x03A9 | HAS_VOWEL,
879 0x03A9 | HAS_VOWEL | HAS_ACCENT,
880 0x03A9 | HAS_VOWEL | HAS_ACCENT,
881 0x03A9 | HAS_VOWEL | HAS_ACCENT,
882 0x03A9 | HAS_VOWEL | HAS_ACCENT,
883 0x03A9 | HAS_VOWEL | HAS_ACCENT,
884 0x03A9 | HAS_VOWEL | HAS_ACCENT,
885 0x03A9 | HAS_VOWEL,
886 0x03A9 | HAS_VOWEL,
887 0x03A9 | HAS_VOWEL | HAS_ACCENT,
888 0x03A9 | HAS_VOWEL | HAS_ACCENT,
889 0x03A9 | HAS_VOWEL | HAS_ACCENT,
890 0x03A9 | HAS_VOWEL | HAS_ACCENT,
891 0x03A9 | HAS_VOWEL | HAS_ACCENT,
892 0x03A9 | HAS_VOWEL | HAS_ACCENT,
893 0x0391 | HAS_VOWEL | HAS_ACCENT,
894 0x0391 | HAS_VOWEL | HAS_ACCENT,
895 0x0395 | HAS_VOWEL | HAS_ACCENT,
896 0x0395 | HAS_VOWEL | HAS_ACCENT,
897 0x0397 | HAS_VOWEL | HAS_ACCENT,
898 0x0397 | HAS_VOWEL | HAS_ACCENT,
899 0x0399 | HAS_VOWEL | HAS_ACCENT,
900 0x0399 | HAS_VOWEL | HAS_ACCENT,
901 0x039F | HAS_VOWEL | HAS_ACCENT,
902 0x039F | HAS_VOWEL | HAS_ACCENT,
903 0x03A5 | HAS_VOWEL | HAS_ACCENT,
904 0x03A5 | HAS_VOWEL | HAS_ACCENT,
905 0x03A9 | HAS_VOWEL | HAS_ACCENT,
906 0x03A9 | HAS_VOWEL | HAS_ACCENT,
907 0,
908 0,
909 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
910 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
911 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
912 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
913 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
914 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
915 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
916 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
917 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
918 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
919 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
920 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
921 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
922 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
923 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
924 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
925 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
926 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
927 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
928 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
929 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
930 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
931 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
932 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
933 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
934 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
935 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
936 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
937 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
938 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
939 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
940 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
941 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
942 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
943 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
944 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
945 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
946 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
947 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
948 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
949 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
950 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
951 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
952 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
953 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
954 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
955 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
956 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
957 0x0391 | HAS_VOWEL,
958 0x0391 | HAS_VOWEL,
959 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
960 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
961 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
962 0,
963 0x0391 | HAS_VOWEL | HAS_ACCENT,
964 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
965 0x0391 | HAS_VOWEL,
966 0x0391 | HAS_VOWEL,
967 0x0391 | HAS_VOWEL | HAS_ACCENT,
968 0x0391 | HAS_VOWEL | HAS_ACCENT,
969 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
970 0,
971 0x0399 | HAS_VOWEL,
972 0,
973 0,
974 0,
975 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
976 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
977 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
978 0,
979 0x0397 | HAS_VOWEL | HAS_ACCENT,
980 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
981 0x0395 | HAS_VOWEL | HAS_ACCENT,
982 0x0395 | HAS_VOWEL | HAS_ACCENT,
983 0x0397 | HAS_VOWEL | HAS_ACCENT,
984 0x0397 | HAS_VOWEL | HAS_ACCENT,
985 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
986 0,
987 0,
988 0,
989 0x0399 | HAS_VOWEL,
990 0x0399 | HAS_VOWEL,
991 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
992 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
993 0,
994 0,
995 0x0399 | HAS_VOWEL | HAS_ACCENT,
996 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
997 0x0399 | HAS_VOWEL,
998 0x0399 | HAS_VOWEL,
999 0x0399 | HAS_VOWEL | HAS_ACCENT,
1000 0x0399 | HAS_VOWEL | HAS_ACCENT,
1001 0,
1002 0,
1003 0,
1004 0,
1005 0x03A5 | HAS_VOWEL,
1006 0x03A5 | HAS_VOWEL,
1007 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1008 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1009 0x03A1,
1010 0x03A1,
1011 0x03A5 | HAS_VOWEL | HAS_ACCENT,
1012 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1013 0x03A5 | HAS_VOWEL,
1014 0x03A5 | HAS_VOWEL,
1015 0x03A5 | HAS_VOWEL | HAS_ACCENT,
1016 0x03A5 | HAS_VOWEL | HAS_ACCENT,
1017 0x03A1,
1018 0,
1019 0,
1020 0,
1021 0,
1022 0,
1023 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1024 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
1025 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1026 0,
1027 0x03A9 | HAS_VOWEL | HAS_ACCENT,
1028 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1029 0x039F | HAS_VOWEL | HAS_ACCENT,
1030 0x039F | HAS_VOWEL | HAS_ACCENT,
1031 0x03A9 | HAS_VOWEL | HAS_ACCENT,
1032 0x03A9 | HAS_VOWEL | HAS_ACCENT,
1033 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
1034 0,
1035 0,
1036 0,
1037};
1038
1039// U+2126 Ohm sign
1040static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
1041
1042uint32_t getLetterData(UChar32 c) {
1043 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
1044 return 0;
1045 } else if (c <= 0x3ff) {
1046 return data0370[c - 0x370];
1047 } else if (c <= 0x1fff) {
1048 return data1F00[c - 0x1f00];
1049 } else if (c == 0x2126) {
1050 return data2126;
1051 } else {
1052 return 0;
1053 }
1054}
1055
1056uint32_t getDiacriticData(UChar32 c) {
1057 switch (c) {
1058 case 0x0300: // varia
1059 case 0x0301: // tonos = oxia
1060 case 0x0342: // perispomeni
1061 case 0x0302: // circumflex can look like perispomeni
1062 case 0x0303: // tilde can look like perispomeni
1063 case 0x0311: // inverted breve can look like perispomeni
1064 return HAS_ACCENT;
1065 case 0x0308: // dialytika = diaeresis
1066 return HAS_COMBINING_DIALYTIKA;
1067 case 0x0344: // dialytika tonos
1068 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
1069 case 0x0345: // ypogegrammeni = iota subscript
1070 return HAS_YPOGEGRAMMENI;
1071 case 0x0304: // macron
1072 case 0x0306: // breve
1073 case 0x0313: // comma above
1074 case 0x0314: // reversed comma above
1075 case 0x0343: // koronis
1076 return HAS_OTHER_GREEK_DIACRITIC;
1077 default:
1078 return 0;
1079 }
1080}
1081
Jungshik Shin87232d82017-05-13 21:10:13 -07001082UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001083 while (i < length) {
1084 UChar32 c;
1085 U16_NEXT(s, i, length, c);
Jungshik Shin87232d82017-05-13 21:10:13 -07001086 int32_t type = ucase_getTypeOrIgnorable(c);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001087 if ((type & UCASE_IGNORABLE) != 0) {
1088 // Case-ignorable, continue with the loop.
1089 } else if (type != UCASE_NONE) {
Frank Tang1f164ee2022-11-08 12:31:27 -08001090 return true; // Followed by cased letter.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001091 } else {
Frank Tang1f164ee2022-11-08 12:31:27 -08001092 return false; // Uncased and not case-ignorable.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001093 }
1094 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001095 return false; // Not followed by cased letter.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001096}
1097
1098/**
1099 * Greek string uppercasing with a state machine.
1100 * Probably simpler than a stateless function that has to figure out complex context-before
1101 * for each character.
1102 * TODO: Try to re-consolidate one way or another with the non-Greek function.
1103 */
Jungshik Shin87232d82017-05-13 21:10:13 -07001104int32_t toUpper(uint32_t options,
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001105 UChar *dest, int32_t destCapacity,
1106 const UChar *src, int32_t srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001107 Edits *edits,
1108 UErrorCode &errorCode) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001109 int32_t destIndex=0;
1110 uint32_t state = 0;
1111 for (int32_t i = 0; i < srcLength;) {
1112 int32_t nextIndex = i;
1113 UChar32 c;
1114 U16_NEXT(src, nextIndex, srcLength, c);
1115 uint32_t nextState = 0;
Jungshik Shin87232d82017-05-13 21:10:13 -07001116 int32_t type = ucase_getTypeOrIgnorable(c);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001117 if ((type & UCASE_IGNORABLE) != 0) {
1118 // c is case-ignorable
1119 nextState |= (state & AFTER_CASED);
1120 } else if (type != UCASE_NONE) {
1121 // c is cased
1122 nextState |= AFTER_CASED;
1123 }
1124 uint32_t data = getLetterData(c);
1125 if (data > 0) {
1126 uint32_t upper = data & UPPER_MASK;
1127 // Add a dialytika to this iota or ypsilon vowel
1128 // if we removed a tonos from the previous vowel,
1129 // and that previous vowel did not also have (or gain) a dialytika.
1130 // Adding one only to the final vowel in a longer sequence
1131 // (which does not occur in normal writing) would require lookahead.
1132 // Set the same flag as for preserving an existing dialytika.
1133 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
1134 (upper == 0x399 || upper == 0x3A5)) {
1135 data |= HAS_DIALYTIKA;
1136 }
1137 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
1138 if ((data & HAS_YPOGEGRAMMENI) != 0) {
1139 numYpogegrammeni = 1;
1140 }
1141 // Skip combining diacritics after this Greek letter.
1142 while (nextIndex < srcLength) {
1143 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
1144 if (diacriticData != 0) {
1145 data |= diacriticData;
1146 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1147 ++numYpogegrammeni;
1148 }
1149 ++nextIndex;
1150 } else {
1151 break; // not a Greek diacritic
1152 }
1153 }
1154 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1155 nextState |= AFTER_VOWEL_WITH_ACCENT;
1156 }
1157 // Map according to Greek rules.
Frank Tang1f164ee2022-11-08 12:31:27 -08001158 UBool addTonos = false;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001159 if (upper == 0x397 &&
1160 (data & HAS_ACCENT) != 0 &&
1161 numYpogegrammeni == 0 &&
1162 (state & AFTER_CASED) == 0 &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001163 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001164 // Keep disjunctive "or" with (only) a tonos.
1165 // We use the same "word boundary" conditions as for the Final_Sigma test.
1166 if (i == nextIndex) {
1167 upper = 0x389; // Preserve the precomposed form.
1168 } else {
Frank Tang1f164ee2022-11-08 12:31:27 -08001169 addTonos = true;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001170 }
1171 } else if ((data & HAS_DIALYTIKA) != 0) {
1172 // Preserve a vowel with dialytika in precomposed form if it exists.
1173 if (upper == 0x399) {
1174 upper = 0x3AA;
1175 data &= ~HAS_EITHER_DIALYTIKA;
1176 } else if (upper == 0x3A5) {
1177 upper = 0x3AB;
1178 data &= ~HAS_EITHER_DIALYTIKA;
1179 }
1180 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001181
Jungshik Shinb3189662017-11-07 11:18:34 -08001182 UBool change;
1183 if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
Frank Tang1f164ee2022-11-08 12:31:27 -08001184 change = true; // common, simple usage
Jungshik Shinb3189662017-11-07 11:18:34 -08001185 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001186 // Find out first whether we are changing the text.
1187 change = src[i] != upper || numYpogegrammeni > 0;
1188 int32_t i2 = i + 1;
1189 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1190 change |= i2 >= nextIndex || src[i2] != 0x308;
1191 ++i2;
1192 }
1193 if (addTonos) {
1194 change |= i2 >= nextIndex || src[i2] != 0x301;
1195 ++i2;
1196 }
1197 int32_t oldLength = nextIndex - i;
1198 int32_t newLength = (i2 - i) + numYpogegrammeni;
1199 change |= oldLength != newLength;
1200 if (change) {
1201 if (edits != NULL) {
1202 edits->addReplace(oldLength, newLength);
1203 }
1204 } else {
1205 if (edits != NULL) {
1206 edits->addUnchanged(oldLength);
1207 }
1208 // Write unchanged text?
Jungshik Shinb3189662017-11-07 11:18:34 -08001209 change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
Jungshik Shin87232d82017-05-13 21:10:13 -07001210 }
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001211 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001212
1213 if (change) {
1214 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
1215 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
1216 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
1217 }
1218 if (destIndex >= 0 && addTonos) {
1219 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
1220 }
1221 while (destIndex >= 0 && numYpogegrammeni > 0) {
1222 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
1223 --numYpogegrammeni;
1224 }
1225 if(destIndex<0) {
1226 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1227 return 0;
1228 }
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001229 }
1230 } else {
1231 const UChar *s;
Jungshik Shin87232d82017-05-13 21:10:13 -07001232 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
1233 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1234 nextIndex - i, options, edits);
1235 if (destIndex < 0) {
1236 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1237 return 0;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001238 }
1239 }
1240 i = nextIndex;
1241 state = nextState;
1242 }
1243
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001244 return destIndex;
1245}
1246
1247} // namespace GreekUpper
1248U_NAMESPACE_END
1249
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001250/* functions available in the common library (for unistr_case.cpp) */
1251
1252U_CFUNC int32_t U_CALLCONV
Jungshik Shin87232d82017-05-13 21:10:13 -07001253ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001254 UChar *dest, int32_t destCapacity,
1255 const UChar *src, int32_t srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001256 icu::Edits *edits,
1257 UErrorCode &errorCode) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001258 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1259 csc.p=(void *)src;
1260 csc.limit=srcLength;
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001261 int32_t destIndex = toLower(
1262 caseLocale, options,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001263 dest, destCapacity,
1264 src, &csc, 0, srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001265 edits, errorCode);
1266 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001267}
1268
1269U_CFUNC int32_t U_CALLCONV
Jungshik Shin87232d82017-05-13 21:10:13 -07001270ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001271 UChar *dest, int32_t destCapacity,
1272 const UChar *src, int32_t srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001273 icu::Edits *edits,
1274 UErrorCode &errorCode) {
1275 int32_t destIndex;
1276 if (caseLocale == UCASE_LOC_GREEK) {
1277 destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1278 src, srcLength, edits, errorCode);
1279 } else {
1280 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1281 csc.p=(void *)src;
1282 csc.limit=srcLength;
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001283 destIndex = toUpper(
1284 caseLocale, options,
Jungshik Shin87232d82017-05-13 21:10:13 -07001285 dest, destCapacity,
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001286 src, &csc, srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001287 edits, errorCode);
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001288 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001289 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001290}
1291
Jungshik Shin87232d82017-05-13 21:10:13 -07001292U_CFUNC int32_t U_CALLCONV
1293ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1294 UChar *dest, int32_t destCapacity,
1295 const UChar *src, int32_t srcLength,
1296 icu::Edits *edits,
1297 UErrorCode &errorCode) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001298 int32_t destIndex = toLower(
1299 -1, options,
1300 dest, destCapacity,
1301 src, nullptr, 0, srcLength,
1302 edits, errorCode);
Jungshik Shin87232d82017-05-13 21:10:13 -07001303 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001304}
1305
1306U_CFUNC int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -07001307ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001308 UChar *dest, int32_t destCapacity,
1309 const UChar *src, int32_t srcLength,
1310 UStringCaseMapper *stringCaseMapper,
Jungshik Shin87232d82017-05-13 21:10:13 -07001311 icu::Edits *edits,
1312 UErrorCode &errorCode) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001313 int32_t destLength;
1314
1315 /* check argument values */
Jungshik Shin87232d82017-05-13 21:10:13 -07001316 if(U_FAILURE(errorCode)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001317 return 0;
1318 }
1319 if( destCapacity<0 ||
1320 (dest==NULL && destCapacity>0) ||
1321 src==NULL ||
1322 srcLength<-1
1323 ) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001324 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1325 return 0;
1326 }
1327
1328 /* get the string length */
1329 if(srcLength==-1) {
1330 srcLength=u_strlen(src);
1331 }
1332
1333 /* check for overlapping source and destination */
1334 if( dest!=NULL &&
1335 ((src>=dest && src<(dest+destCapacity)) ||
1336 (dest>=src && dest<(src+srcLength)))
1337 ) {
1338 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1339 return 0;
1340 }
1341
Jungshik Shinb3189662017-11-07 11:18:34 -08001342 if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001343 edits->reset();
1344 }
1345 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1346 dest, destCapacity, src, srcLength, edits, errorCode);
1347 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1348}
1349
1350U_CFUNC int32_t
1351ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1352 UChar *dest, int32_t destCapacity,
1353 const UChar *src, int32_t srcLength,
1354 UStringCaseMapper *stringCaseMapper,
1355 UErrorCode &errorCode) {
1356 UChar buffer[300];
1357 UChar *temp;
1358
1359 int32_t destLength;
1360
1361 /* check argument values */
1362 if(U_FAILURE(errorCode)) {
1363 return 0;
1364 }
1365 if( destCapacity<0 ||
1366 (dest==NULL && destCapacity>0) ||
1367 src==NULL ||
1368 srcLength<-1
1369 ) {
1370 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001371 return 0;
1372 }
1373
1374 /* get the string length */
1375 if(srcLength==-1) {
1376 srcLength=u_strlen(src);
1377 }
1378
1379 /* check for overlapping source and destination */
1380 if( dest!=NULL &&
1381 ((src>=dest && src<(dest+destCapacity)) ||
1382 (dest>=src && dest<(src+srcLength)))
1383 ) {
1384 /* overlap: provide a temporary destination buffer and later copy the result */
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001385 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001386 /* the stack buffer is large enough */
1387 temp=buffer;
1388 } else {
1389 /* allocate a buffer */
1390 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1391 if(temp==NULL) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001392 errorCode=U_MEMORY_ALLOCATION_ERROR;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001393 return 0;
1394 }
1395 }
1396 } else {
1397 temp=dest;
1398 }
1399
Jungshik Shin87232d82017-05-13 21:10:13 -07001400 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1401 temp, destCapacity, src, srcLength, NULL, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001402 if(temp!=dest) {
1403 /* copy the result string to the destination buffer */
Jungshik Shin87232d82017-05-13 21:10:13 -07001404 if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1405 u_memmove(dest, temp, destLength);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001406 }
1407 if(temp!=buffer) {
1408 uprv_free(temp);
1409 }
1410 }
1411
Jungshik Shin87232d82017-05-13 21:10:13 -07001412 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001413}
1414
1415/* public API functions */
1416
1417U_CAPI int32_t U_EXPORT2
1418u_strFoldCase(UChar *dest, int32_t destCapacity,
1419 const UChar *src, int32_t srcLength,
1420 uint32_t options,
1421 UErrorCode *pErrorCode) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001422 return ustrcase_mapWithOverlap(
1423 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001424 dest, destCapacity,
1425 src, srcLength,
Jungshik Shin87232d82017-05-13 21:10:13 -07001426 ustrcase_internalFold, *pErrorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001427}
1428
Jungshik Shin87232d82017-05-13 21:10:13 -07001429U_NAMESPACE_BEGIN
1430
1431int32_t CaseMap::fold(
1432 uint32_t options,
1433 const UChar *src, int32_t srcLength,
1434 UChar *dest, int32_t destCapacity, Edits *edits,
1435 UErrorCode &errorCode) {
1436 return ustrcase_map(
1437 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1438 dest, destCapacity,
1439 src, srcLength,
1440 ustrcase_internalFold, edits, errorCode);
1441}
1442
1443U_NAMESPACE_END
1444
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001445/* case-insensitive string comparisons -------------------------------------- */
1446
1447/*
1448 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1449 * canonical equivalence.
1450 * Keep the functions in sync, and see there for how this works.
1451 * The duplication is for modularization:
1452 * It makes caseless (but not canonical caseless) matches independent of
1453 * the normalization code.
1454 */
1455
1456/* stack element for previous-level source/decomposition pointers */
1457struct CmpEquivLevel {
1458 const UChar *start, *s, *limit;
1459};
1460typedef struct CmpEquivLevel CmpEquivLevel;
1461
Jungshik Shin70f82502016-01-29 00:32:36 -08001462/**
1463 * Internal implementation code comparing string with case fold.
1464 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1465 *
1466 * @param s1 input string 1
1467 * @param length1 length of string 1, or -1 (NULL terminated)
1468 * @param s2 input string 2
1469 * @param length2 length of string 2, or -1 (NULL terminated)
1470 * @param options compare options
1471 * @param matchLen1 (output) length of partial prefix match in s1
1472 * @param matchLen2 (output) length of partial prefix match in s2
1473 * @param pErrorCode receives error status
1474 * @return The result of comparison
1475 */
1476static int32_t _cmpFold(
1477 const UChar *s1, int32_t length1,
1478 const UChar *s2, int32_t length2,
1479 uint32_t options,
1480 int32_t *matchLen1, int32_t *matchLen2,
1481 UErrorCode *pErrorCode) {
1482 int32_t cmpRes = 0;
1483
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001484 /* current-level start/limit - s1/s2 as current */
1485 const UChar *start1, *start2, *limit1, *limit2;
1486
Jungshik Shin70f82502016-01-29 00:32:36 -08001487 /* points to the original start address */
1488 const UChar *org1, *org2;
1489
1490 /* points to the end of match + 1 */
1491 const UChar *m1, *m2;
1492
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001493 /* case folding variables */
1494 const UChar *p;
1495 int32_t length;
1496
1497 /* stacks of previous-level start/current/limit */
1498 CmpEquivLevel stack1[2], stack2[2];
1499
1500 /* case folding buffers, only use current-level start/limit */
1501 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1502
1503 /* track which is the current level per string */
1504 int32_t level1, level2;
1505
1506 /* current code units, and code points for lookups */
1507 UChar32 c1, c2, cp1, cp2;
1508
1509 /* no argument error checking because this itself is not an API */
1510
1511 /*
1512 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1513 * otherwise this function would have to behave exactly as uprv_strCompare()
1514 */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001515 if(U_FAILURE(*pErrorCode)) {
1516 return 0;
1517 }
1518
1519 /* initialize */
Jungshik Shin70f82502016-01-29 00:32:36 -08001520 if(matchLen1) {
1521 U_ASSERT(matchLen2 !=NULL);
1522 *matchLen1=0;
1523 *matchLen2=0;
1524 }
1525
1526 start1=m1=org1=s1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001527 if(length1==-1) {
1528 limit1=NULL;
1529 } else {
1530 limit1=s1+length1;
1531 }
1532
Jungshik Shin70f82502016-01-29 00:32:36 -08001533 start2=m2=org2=s2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001534 if(length2==-1) {
1535 limit2=NULL;
1536 } else {
1537 limit2=s2+length2;
1538 }
1539
1540 level1=level2=0;
1541 c1=c2=-1;
1542
1543 /* comparison loop */
1544 for(;;) {
1545 /*
1546 * here a code unit value of -1 means "get another code unit"
1547 * below it will mean "this source is finished"
1548 */
1549
1550 if(c1<0) {
1551 /* get next code unit from string 1, post-increment */
1552 for(;;) {
1553 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1554 if(level1==0) {
1555 c1=-1;
1556 break;
1557 }
1558 } else {
1559 ++s1;
1560 break;
1561 }
1562
1563 /* reached end of level buffer, pop one level */
1564 do {
1565 --level1;
1566 start1=stack1[level1].start; /*Not uninitialized*/
1567 } while(start1==NULL);
1568 s1=stack1[level1].s; /*Not uninitialized*/
1569 limit1=stack1[level1].limit; /*Not uninitialized*/
1570 }
1571 }
1572
1573 if(c2<0) {
1574 /* get next code unit from string 2, post-increment */
1575 for(;;) {
1576 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1577 if(level2==0) {
1578 c2=-1;
1579 break;
1580 }
1581 } else {
1582 ++s2;
1583 break;
1584 }
1585
1586 /* reached end of level buffer, pop one level */
1587 do {
1588 --level2;
1589 start2=stack2[level2].start; /*Not uninitialized*/
1590 } while(start2==NULL);
1591 s2=stack2[level2].s; /*Not uninitialized*/
1592 limit2=stack2[level2].limit; /*Not uninitialized*/
1593 }
1594 }
1595
1596 /*
1597 * compare c1 and c2
1598 * either variable c1, c2 is -1 only if the corresponding string is finished
1599 */
1600 if(c1==c2) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001601 const UChar *next1, *next2;
1602
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001603 if(c1<0) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001604 cmpRes=0; /* c1==c2==-1 indicating end of strings */
1605 break;
1606 }
1607
1608 /*
1609 * Note: Move the match positions in both strings at the same time
1610 * only when corresponding code point(s) in the original strings
1611 * are fully consumed. For example, when comparing s1="Fust" and
1612 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1613 * the first code point in the case-folded data. But the second "s"
1614 * has no matching code point in s1, so this implementation returns
1615 * 2 as the prefix match length ("Fu").
1616 */
1617 next1=next2=NULL;
1618 if(level1==0) {
1619 next1=s1;
1620 } else if(s1==limit1) {
1621 /* Note: This implementation only use a single level of stack.
1622 * If this code needs to be changed to use multiple levels
1623 * of stacks, the code above should check if the current
1624 * code is at the end of all stacks.
1625 */
1626 U_ASSERT(level1==1);
1627
1628 /* is s1 at the end of the current stack? */
1629 next1=stack1[0].s;
1630 }
1631
1632 if (next1!=NULL) {
1633 if(level2==0) {
1634 next2=s2;
1635 } else if(s2==limit2) {
1636 U_ASSERT(level2==1);
1637
1638 /* is s2 at the end of the current stack? */
1639 next2=stack2[0].s;
1640 }
1641 if(next2!=NULL) {
1642 m1=next1;
1643 m2=next2;
1644 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001645 }
1646 c1=c2=-1; /* make us fetch new code units */
1647 continue;
1648 } else if(c1<0) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001649 cmpRes=-1; /* string 1 ends before string 2 */
1650 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001651 } else if(c2<0) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001652 cmpRes=1; /* string 2 ends before string 1 */
1653 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001654 }
1655 /* c1!=c2 && c1>=0 && c2>=0 */
1656
1657 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1658 cp1=c1;
1659 if(U_IS_SURROGATE(c1)) {
1660 UChar c;
1661
1662 if(U_IS_SURROGATE_LEAD(c1)) {
1663 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1664 /* advance ++s1; only below if cp1 decomposes/case-folds */
1665 cp1=U16_GET_SUPPLEMENTARY(c1, c);
1666 }
1667 } else /* isTrail(c1) */ {
1668 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1669 cp1=U16_GET_SUPPLEMENTARY(c, c1);
1670 }
1671 }
1672 }
1673
1674 cp2=c2;
1675 if(U_IS_SURROGATE(c2)) {
1676 UChar c;
1677
1678 if(U_IS_SURROGATE_LEAD(c2)) {
1679 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1680 /* advance ++s2; only below if cp2 decomposes/case-folds */
1681 cp2=U16_GET_SUPPLEMENTARY(c2, c);
1682 }
1683 } else /* isTrail(c2) */ {
1684 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1685 cp2=U16_GET_SUPPLEMENTARY(c, c2);
1686 }
1687 }
1688 }
1689
1690 /*
1691 * go down one level for each string
1692 * continue with the main loop as soon as there is a real change
1693 */
1694
1695 if( level1==0 &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001696 (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001697 ) {
1698 /* cp1 case-folds to the code point "length" or to p[length] */
1699 if(U_IS_SURROGATE(c1)) {
1700 if(U_IS_SURROGATE_LEAD(c1)) {
1701 /* advance beyond source surrogate pair if it case-folds */
1702 ++s1;
1703 } else /* isTrail(c1) */ {
1704 /*
1705 * we got a supplementary code point when hitting its trail surrogate,
1706 * therefore the lead surrogate must have been the same as in the other string;
1707 * compare this decomposition with the lead surrogate in the other string
1708 * remember that this simulates bulk text replacement:
1709 * the decomposition would replace the entire code point
1710 */
1711 --s2;
Jungshik Shin70f82502016-01-29 00:32:36 -08001712 --m2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001713 c2=*(s2-1);
1714 }
1715 }
1716
1717 /* push current level pointers */
1718 stack1[0].start=start1;
1719 stack1[0].s=s1;
1720 stack1[0].limit=limit1;
1721 ++level1;
1722
1723 /* copy the folding result to fold1[] */
1724 if(length<=UCASE_MAX_STRING_LENGTH) {
1725 u_memcpy(fold1, p, length);
1726 } else {
1727 int32_t i=0;
1728 U16_APPEND_UNSAFE(fold1, i, length);
1729 length=i;
1730 }
1731
1732 /* set next level pointers to case folding */
1733 start1=s1=fold1;
1734 limit1=fold1+length;
1735
1736 /* get ready to read from decomposition, continue with loop */
1737 c1=-1;
1738 continue;
1739 }
1740
1741 if( level2==0 &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001742 (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001743 ) {
1744 /* cp2 case-folds to the code point "length" or to p[length] */
1745 if(U_IS_SURROGATE(c2)) {
1746 if(U_IS_SURROGATE_LEAD(c2)) {
1747 /* advance beyond source surrogate pair if it case-folds */
1748 ++s2;
1749 } else /* isTrail(c2) */ {
1750 /*
1751 * we got a supplementary code point when hitting its trail surrogate,
1752 * therefore the lead surrogate must have been the same as in the other string;
1753 * compare this decomposition with the lead surrogate in the other string
1754 * remember that this simulates bulk text replacement:
1755 * the decomposition would replace the entire code point
1756 */
1757 --s1;
Jungshik Shin70f82502016-01-29 00:32:36 -08001758 --m2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001759 c1=*(s1-1);
1760 }
1761 }
1762
1763 /* push current level pointers */
1764 stack2[0].start=start2;
1765 stack2[0].s=s2;
1766 stack2[0].limit=limit2;
1767 ++level2;
1768
1769 /* copy the folding result to fold2[] */
1770 if(length<=UCASE_MAX_STRING_LENGTH) {
1771 u_memcpy(fold2, p, length);
1772 } else {
1773 int32_t i=0;
1774 U16_APPEND_UNSAFE(fold2, i, length);
1775 length=i;
1776 }
1777
1778 /* set next level pointers to case folding */
1779 start2=s2=fold2;
1780 limit2=fold2+length;
1781
1782 /* get ready to read from decomposition, continue with loop */
1783 c2=-1;
1784 continue;
1785 }
1786
1787 /*
1788 * no decomposition/case folding, max level for both sides:
1789 * return difference result
1790 *
1791 * code point order comparison must not just return cp1-cp2
1792 * because when single surrogates are present then the surrogate pairs
1793 * that formed cp1 and cp2 may be from different string indexes
1794 *
1795 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1796 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1797 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1798 *
1799 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1800 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1801 * so we have slightly different pointer/start/limit comparisons here
1802 */
1803
1804 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1805 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1806 if(
1807 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1808 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1809 ) {
1810 /* part of a surrogate pair, leave >=d800 */
1811 } else {
1812 /* BMP code point - may be surrogate code point - make <d800 */
1813 c1-=0x2800;
1814 }
1815
1816 if(
1817 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1818 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1819 ) {
1820 /* part of a surrogate pair, leave >=d800 */
1821 } else {
1822 /* BMP code point - may be surrogate code point - make <d800 */
1823 c2-=0x2800;
1824 }
1825 }
1826
Jungshik Shin70f82502016-01-29 00:32:36 -08001827 cmpRes=c1-c2;
1828 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001829 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001830
1831 if(matchLen1) {
Jungshik Shin42d50272018-10-24 01:22:09 -07001832 *matchLen1=static_cast<int32_t>(m1-org1);
1833 *matchLen2=static_cast<int32_t>(m2-org2);
Jungshik Shin70f82502016-01-29 00:32:36 -08001834 }
1835 return cmpRes;
1836}
1837
1838/* internal function */
1839U_CFUNC int32_t
1840u_strcmpFold(const UChar *s1, int32_t length1,
1841 const UChar *s2, int32_t length2,
1842 uint32_t options,
1843 UErrorCode *pErrorCode) {
1844 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001845}
1846
1847/* public API functions */
1848
1849U_CAPI int32_t U_EXPORT2
1850u_strCaseCompare(const UChar *s1, int32_t length1,
1851 const UChar *s2, int32_t length2,
1852 uint32_t options,
1853 UErrorCode *pErrorCode) {
1854 /* argument checking */
1855 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1856 return 0;
1857 }
1858 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1859 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1860 return 0;
1861 }
1862 return u_strcmpFold(s1, length1, s2, length2,
1863 options|U_COMPARE_IGNORE_CASE,
1864 pErrorCode);
1865}
1866
1867U_CAPI int32_t U_EXPORT2
1868u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1869 UErrorCode errorCode=U_ZERO_ERROR;
1870 return u_strcmpFold(s1, -1, s2, -1,
1871 options|U_COMPARE_IGNORE_CASE,
1872 &errorCode);
1873}
1874
1875U_CAPI int32_t U_EXPORT2
1876u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1877 UErrorCode errorCode=U_ZERO_ERROR;
1878 return u_strcmpFold(s1, length, s2, length,
1879 options|U_COMPARE_IGNORE_CASE,
1880 &errorCode);
1881}
1882
1883U_CAPI int32_t U_EXPORT2
1884u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1885 UErrorCode errorCode=U_ZERO_ERROR;
1886 return u_strcmpFold(s1, n, s2, n,
1887 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1888 &errorCode);
1889}
Jungshik Shin70f82502016-01-29 00:32:36 -08001890
1891/* internal API - detect length of shared prefix */
1892U_CAPI void
1893u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1894 const UChar *s2, int32_t length2,
1895 uint32_t options,
1896 int32_t *matchLen1, int32_t *matchLen2,
1897 UErrorCode *pErrorCode) {
1898 _cmpFold(s1, length1, s2, length2, options,
1899 matchLen1, matchLen2, pErrorCode);
1900}