blob: 388c86b1bba79105c1a08787768f6d10f732f646 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4*******************************************************************************
5*
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08006* Copyright (C) 2004-2014, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: ucase.cpp
Jungshik Shin87232d82017-05-13 21:10:13 -070011* encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000012* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2004aug30
16* created by: Markus W. Scherer
17*
18* Low-level Unicode character/string case mapping code.
19* Much code moved here (and modified) from uchar.c.
20*/
21
22#include "unicode/utypes.h"
23#include "unicode/unistr.h"
24#include "unicode/uset.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000025#include "unicode/utf16.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000026#include "cmemory.h"
Frank Tangd2858cb2022-04-08 20:34:12 -070027#include "uassert.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000028#include "ucase.h"
Frank Tangd2858cb2022-04-08 20:34:12 -070029#include "umutex.h"
30#include "utrie2.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000031
Frank Tangd2858cb2022-04-08 20:34:12 -070032/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000033#define INCLUDED_FROM_UCASE_CPP
34#include "ucase_props_data.h"
35
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000036/* set of property starts for UnicodeSet ------------------------------------ */
37
38static UBool U_CALLCONV
39_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40 /* add the start code point to the USet */
41 const USetAdder *sa=(const USetAdder *)context;
42 sa->add(sa->set, start);
43 return TRUE;
44}
45
46U_CFUNC void U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -070047ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000048 if(U_FAILURE(*pErrorCode)) {
49 return;
50 }
51
52 /* add the start code point of each same-value range of the trie */
Jungshik Shin87232d82017-05-13 21:10:13 -070053 utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000054
55 /* add code points with hardcoded properties, plus the ones following them */
56
57 /* (none right now, see comment below) */
58
59 /*
60 * Omit code points with hardcoded specialcasing properties
61 * because we do not build property UnicodeSets for them right now.
62 */
63}
64
65/* data access primitives --------------------------------------------------- */
66
Frank Tangd2858cb2022-04-08 20:34:12 -070067U_CAPI const struct UCaseProps * U_EXPORT2
68ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69 *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70 *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71 return &ucase_props_singleton;
72}
73
Jungshik Shinf61e46d2018-05-04 13:00:45 -070074U_CFUNC const UTrie2 * U_EXPORT2
75ucase_getTrie() {
76 return &ucase_props_singleton.trie;
77}
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000078
/*
 * Pointer to the exceptions data for a properties word:
 * csp->exceptions offset by the props bits above UCASE_EXC_SHIFT.
 * Callers in this file use it only after UCASE_HAS_EXCEPTION(props).
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000080
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * The 256 entries are generated rather than listed: each helper macro
 * appends two more low-order bits to the prefix whose bit count is n.
 */
#define FLAGS_OFFSET_2(n) (n), (n)+1, (n)+1, (n)+2
#define FLAGS_OFFSET_4(n) FLAGS_OFFSET_2(n), FLAGS_OFFSET_2((n)+1), FLAGS_OFFSET_2((n)+1), FLAGS_OFFSET_2((n)+2)
#define FLAGS_OFFSET_6(n) FLAGS_OFFSET_4(n), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+1), FLAGS_OFFSET_4((n)+2)
static const uint8_t flagsOffset[256]={
    FLAGS_OFFSET_6(0), FLAGS_OFFSET_6(1), FLAGS_OFFSET_6(1), FLAGS_OFFSET_6(2)
};
#undef FLAGS_OFFSET_6
#undef FLAGS_OFFSET_4
#undef FLAGS_OFFSET_2
100
/* Nonzero if bit idx is set in flags. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * flagsOffset[] entry for the flags bits below idx,
 * i.e. the count of set bits that precede bit idx.
 * Only the low 8 bits of the masked value can index the 256-entry table,
 * so idx is presumably always <=8 -- confirm against the UCASE_EXC_* indexes.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
103
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Slots are single uint16_t units unless excWord has UCASE_EXC_DOUBLE_SLOTS,
 * in which case each slot is two units combined as (first<<16)|second.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 *
 * Every macro parameter is parenthesized so that non-trivial argument
 * expressions expand with the intended precedence.
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    } \
} UPRV_BLOCK_MACRO_END
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000123
124/* simple case mappings ----------------------------------------------------- */
125
126U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700127ucase_tolower(UChar32 c) {
128 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700129 if(!UCASE_HAS_EXCEPTION(props)) {
130 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000131 c+=UCASE_GET_DELTA(props);
132 }
133 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700134 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000135 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700136 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137 int32_t delta;
138 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000141 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143 }
144 }
145 return c;
146}
147
148U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700149ucase_toupper(UChar32 c) {
150 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700151 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000152 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153 c+=UCASE_GET_DELTA(props);
154 }
155 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700156 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000157 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700158 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159 int32_t delta;
160 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000163 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165 }
166 }
167 return c;
168}
169
170U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700171ucase_totitle(UChar32 c) {
172 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700173 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000174 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175 c+=UCASE_GET_DELTA(props);
176 }
177 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700178 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000179 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700180 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181 int32_t delta;
182 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000185 int32_t idx;
186 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187 idx=UCASE_EXC_TITLE;
188 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189 idx=UCASE_EXC_UPPER;
190 } else {
191 return c;
192 }
193 GET_SLOT_VALUE(excWord, idx, pe, c);
194 }
195 return c;
196}
197
/*
 * Fixed UTF-16 sequences starting with a dotted letter followed by
 * U+0307 (combining dot above); going by their names, the 3-unit ones
 * append a grave (U+0300), acute (U+0301) or tilde (U+0303) accent.
 * In the visible part of this file only iDot is used
 * (by ucase_addCaseClosure() for U+0130).
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
/*
 * NOTE(review): declared with 3 elements but only 2 initializers, so the
 * third element is implicitly 0. Confirm whether [2] was intended.
 */
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
204
205
206U_CFUNC void U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700207ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000208 uint16_t props;
209
210 /*
211 * Hardcode the case closure of i and its relatives and ignore the
212 * data file data for these characters.
213 * The Turkic dotless i and dotted I with their case mapping conditions
214 * and case folding option make the related characters behave specially.
215 * This code matches their closure behavior to their case folding behavior.
216 */
217
218 switch(c) {
219 case 0x49:
220 /* regular i and I are in one equivalence class */
221 sa->add(sa->set, 0x69);
222 return;
223 case 0x69:
224 sa->add(sa->set, 0x49);
225 return;
226 case 0x130:
227 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
228 sa->addString(sa->set, iDot, 2);
229 return;
230 case 0x131:
231 /* dotless i is in a class by itself */
232 return;
233 default:
234 /* otherwise use the data file data */
235 break;
236 }
237
Jungshik Shin87232d82017-05-13 21:10:13 -0700238 props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700239 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000240 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
241 /* add the one simple case mapping, no matter what type it is */
242 int32_t delta=UCASE_GET_DELTA(props);
243 if(delta!=0) {
244 sa->add(sa->set, c+delta);
245 }
246 }
247 } else {
248 /*
249 * c has exceptions, so there may be multiple simple and/or
250 * full case mappings. Add them all.
251 */
Jungshik Shin87232d82017-05-13 21:10:13 -0700252 const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000253 const UChar *closure;
254 uint16_t excWord=*pe++;
255 int32_t idx, closureLength, fullLength, length;
256
257 pe0=pe;
258
259 /* add all simple case mappings */
260 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
261 if(HAS_SLOT(excWord, idx)) {
262 pe=pe0;
263 GET_SLOT_VALUE(excWord, idx, pe, c);
264 sa->add(sa->set, c);
265 }
266 }
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700267 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
268 pe=pe0;
269 int32_t delta;
270 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
271 sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
272 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000273
274 /* get the closure string pointer & length */
275 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
276 pe=pe0;
277 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
278 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
279 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
280 } else {
281 closureLength=0;
282 closure=NULL;
283 }
284
285 /* add the full case folding */
286 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
287 pe=pe0;
288 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
289
290 /* start of full case mapping strings */
291 ++pe;
292
293 fullLength&=0xffff; /* bits 16 and higher are reserved */
294
295 /* skip the lowercase result string */
296 pe+=fullLength&UCASE_FULL_LOWER;
297 fullLength>>=4;
298
299 /* add the full case folding string */
300 length=fullLength&0xf;
301 if(length!=0) {
302 sa->addString(sa->set, (const UChar *)pe, length);
303 pe+=length;
304 }
305
306 /* skip the uppercase and titlecase strings */
307 fullLength>>=4;
308 pe+=fullLength&0xf;
309 fullLength>>=4;
310 pe+=fullLength;
311
312 closure=(const UChar *)pe; /* behind full case mappings */
313 }
314
315 /* add each code point in the closure string */
316 for(idx=0; idx<closureLength;) {
317 U16_NEXT_UNSAFE(closure, idx, c);
318 sa->add(sa->set, c);
319 }
320 }
321}
322
323/*
324 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
325 * must be length>0 and max>0 and length<=max
326 */
327static inline int32_t
328strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
329 int32_t c1, c2;
330
331 max-=length; /* we require length<=max, so no need to decrement max in the loop */
332 do {
333 c1=*s++;
334 c2=*t++;
335 if(c2==0) {
336 return 1; /* reached the end of t but not of s */
337 }
338 c1-=c2;
339 if(c1!=0) {
340 return c1; /* return difference result */
341 }
342 } while(--length>0);
343 /* ends with length==0 */
344
345 if(max==0 || *t==0) {
346 return 0; /* equal to length of both strings */
347 } else {
Frank Tang3e05d9d2021-11-08 14:04:04 -0800348 return -max; /* return length difference */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000349 }
350}
351
352U_CFUNC UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700353ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000354 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
355
Jungshik Shin87232d82017-05-13 21:10:13 -0700356 if(ucase_props_singleton.unfold==NULL || s==NULL) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000357 return FALSE; /* no reverse case folding data, or no string */
358 }
359 if(length<=1) {
360 /* the string is too short to find any match */
361 /*
362 * more precise would be:
363 * if(!u_strHasMoreChar32Than(s, length, 1))
364 * but this does not make much practical difference because
365 * a single supplementary code point would just not be found
366 */
367 return FALSE;
368 }
369
Jungshik Shin87232d82017-05-13 21:10:13 -0700370 const uint16_t *unfold=ucase_props_singleton.unfold;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000371 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
372 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
373 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
374 unfold+=unfoldRowWidth;
375
376 if(length>unfoldStringWidth) {
377 /* the string is too long to find any match */
378 return FALSE;
379 }
380
381 /* do a binary search for the string */
382 start=0;
383 limit=unfoldRows;
384 while(start<limit) {
385 i=(start+limit)/2;
386 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
387 result=strcmpMax(s, length, p, unfoldStringWidth);
388
389 if(result==0) {
390 /* found the string: add each code point, and its case closure */
391 UChar32 c;
392
393 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
394 U16_NEXT_UNSAFE(p, i, c);
395 sa->add(sa->set, c);
Jungshik Shin87232d82017-05-13 21:10:13 -0700396 ucase_addCaseClosure(c, sa);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000397 }
398 return TRUE;
399 } else if(result<0) {
400 limit=i;
401 } else /* result>0 */ {
402 start=i+1;
403 }
404 }
405
406 return FALSE; /* string not found */
407}
408
409U_NAMESPACE_BEGIN
410
/*
 * Sets up iteration over the unfold table of the case properties singleton.
 * The table dimensions are read from the header entries at the start of the
 * table, then `unfold` is advanced by one row width so that it points past
 * the header. Iteration starts in row 0, at the first unit after the
 * string column.
 *
 * NOTE(review): the initializers read through `unfold`, so they rely on
 * `unfold` being declared before the other members in the class -- confirm
 * in ucase.h. Unlike ucase_addStringCaseClosure(), there is no NULL check
 * on ucase_props_singleton.unfold here.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
420
421UChar32
422FullCaseFoldingIterator::next(UnicodeString &full) {
423 // Advance past the last-delivered code point.
424 const UChar *p=unfold+(currentRow*unfoldRowWidth);
425 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
426 ++currentRow;
427 p+=unfoldRowWidth;
428 rowCpIndex=unfoldStringWidth;
429 }
430 if(currentRow>=unfoldRows) { return U_SENTINEL; }
431 // Set "full" to the NUL-terminated string in the first unfold column.
432 int32_t length=unfoldStringWidth;
433 while(length>0 && p[length-1]==0) { --length; }
434 full.setTo(FALSE, p, length);
435 // Return the code point.
436 UChar32 c;
437 U16_NEXT_UNSAFE(p, rowCpIndex, c);
438 return c;
439}
440
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700441namespace LatinCase {
442
// Lookup table with one int8_t entry per index; 24 rows of 16 entries are
// listed here, i.e. indexes 0x000..0x17F.
// NOTE(review): going by the name, entries are presumably lowercase deltas
// for the default (non-Turkic, non-Lithuanian) case and EXC presumably marks
// entries that need the general code path -- confirm against ucase.h.
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    // indexes 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x40..0x7F
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0xC0..0xFF
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x100..0x13F
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    // indexes 0x140..0x17F
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
474
// Lookup table with one int8_t entry per index; 24 rows of 16 entries are
// listed here, i.e. indexes 0x000..0x17F.
// Same values as TO_LOWER_NORMAL except that indexes 0x49, 0x4A, 0xCC, 0xCD,
// 0x128 and 0x12E hold EXC here.
// NOTE(review): going by the name, this is presumably the lowercase delta
// table for Turkic and Lithuanian locales -- confirm against ucase.h.
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    // indexes 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x40..0x7F
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0xC0..0xFF
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x100..0x13F
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    // indexes 0x140..0x17F
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
506
// Lookup table with one int8_t entry per index; 24 rows of 16 entries are
// listed here, i.e. indexes 0x000..0x17F.
// NOTE(review): going by the name, entries are presumably uppercase deltas
// for the default (non-Turkic) case and EXC presumably marks entries that
// need the general code path -- confirm against ucase.h.
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    // indexes 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x40..0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    // indexes 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0xC0..0xFF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    // indexes 0x100..0x13F
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    // indexes 0x140..0x17F
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
538
// Lookup table with one int8_t entry per index; 24 rows of 16 entries are
// listed here, i.e. indexes 0x000..0x17F.
// Same values as TO_UPPER_NORMAL except that index 0x69 holds EXC here.
// NOTE(review): going by the name, this is presumably the uppercase delta
// table for Turkic locales -- confirm against ucase.h.
const int8_t TO_UPPER_TR[LIMIT] = {
    // indexes 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0x40..0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    // indexes 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // indexes 0xC0..0xFF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    // indexes 0x100..0x13F
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    // indexes 0x140..0x17F
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
570
571} // namespace LatinCase
572
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000573U_NAMESPACE_END
574
575/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
576U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700577ucase_getType(UChar32 c) {
578 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000579 return UCASE_GET_TYPE(props);
580}
581
582/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
583U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700584ucase_getTypeOrIgnorable(UChar32 c) {
585 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000586 return UCASE_GET_TYPE_AND_IGNORABLE(props);
587}
588
589/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
590static inline int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -0700591getDotType(UChar32 c) {
592 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700593 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000594 return props&UCASE_DOT_MASK;
595 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700596 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000597 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
598 }
599}
600
601U_CAPI UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700602ucase_isSoftDotted(UChar32 c) {
603 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000604}
605
606U_CAPI UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700607ucase_isCaseSensitive(UChar32 c) {
608 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700609 if(!UCASE_HAS_EXCEPTION(props)) {
610 return (UBool)((props&UCASE_SENSITIVE)!=0);
611 } else {
612 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
613 return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
614 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000615}
616
617/* string casing ------------------------------------------------------------ */
618
619/*
620 * These internal functions form the core of string case mappings.
621 * They map single code points to result code points or strings and take
622 * all necessary conditions (context, locale ID, options) into account.
623 *
624 * They do not iterate over the source or write to the destination
625 * so that the same functions are useful for non-standard string storage,
626 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
627 * For the same reason, the "surrounding text" context is passed in as a
628 * UCaseContextIterator which does not make any assumptions about
629 * the underlying storage.
630 *
631 * This section contains helper functions that check for conditions
632 * in the input text surrounding the current code point
633 * according to SpecialCasing.txt.
634 *
635 * Each helper function gets the index
636 * - after the current code point if it looks at following text
637 * - before the current code point if it looks at preceding text
638 *
639 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
640 *
641 * Final_Sigma
642 * C is preceded by a sequence consisting of
643 * a cased letter and a case-ignorable sequence,
644 * and C is not followed by a sequence consisting of
645 * an ignorable sequence and then a cased letter.
646 *
647 * More_Above
648 * C is followed by one or more characters of combining class 230 (ABOVE)
649 * in the combining character sequence.
650 *
651 * After_Soft_Dotted
652 * The last preceding character with combining class of zero before C
653 * was Soft_Dotted,
654 * and there is no intervening combining character class 230 (ABOVE).
655 *
656 * Before_Dot
657 * C is followed by combining dot above (U+0307).
658 * Any sequence of characters with a combining class that is neither 0 nor 230
659 * may intervene between the current character and the combining dot above.
660 *
661 * The erratum from 2002-10-31 adds the condition
662 *
663 * After_I
664 * The last preceding base character was an uppercase I, and there is no
665 * intervening combining character class 230 (ABOVE).
666 *
667 * (See Jitterbug 2344 and the comments on After_I below.)
668 *
669 * Helper definitions in Unicode 3.2 UAX 21:
670 *
671 * D1. A character C is defined to be cased
672 * if it meets any of the following criteria:
673 *
674 * - The general category of C is Titlecase Letter (Lt)
675 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
676 * - Given D = NFD(C), then it is not the case that:
677 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
Frank Tang7e7574b2021-04-13 21:19:13 -0700678 * (This third criterion does not add any characters to the list
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000679 * for Unicode 3.2. Ignored.)
680 *
681 * D2. A character C is defined to be case-ignorable
682 * if it meets either of the following criteria:
683 *
684 * - The general category of C is
685 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
686 * Letter Modifier (Lm), or Symbol Modifier (Sk)
Frank Tangd2858cb2022-04-08 20:34:12 -0700687 * - C is one of the following characters
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000688 * U+0027 APOSTROPHE
689 * U+00AD SOFT HYPHEN (SHY)
690 * U+2019 RIGHT SINGLE QUOTATION MARK
691 * (the preferred character for apostrophe)
692 *
693 * D3. A case-ignorable sequence is a sequence of
694 * zero or more case-ignorable characters.
695 */
696
/*
 * Case-insensitive tests for single letters of a locale ID, used by
 * ucase_getCaseLocale(). Each macro compares against both the lowercase and
 * the uppercase character literal.
 * Each macro evaluates its argument twice, so pass only side-effect-free
 * expressions.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_y(c) ((c)=='y' || (c)=='Y')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the terminating NUL; evaluates c up to three times. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
709
710/**
711 * Requires non-NULL locale ID but otherwise does the equivalent of
712 * checking for language codes as if uloc_getLanguage() were called:
713 * Accepts both 2- and 3-letter codes and accepts case variants.
714 */
715U_CFUNC int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -0700716ucase_getCaseLocale(const char *locale) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000717 /*
718 * This function used to use uloc_getLanguage(), but the current code
719 * removes the dependency of this low-level code on uloc implementation code
720 * and is faster because not the whole locale ID has to be
721 * examined and copied/transformed.
722 *
723 * Because this code does not want to depend on uloc, the caller must
724 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
725 */
Jungshik Shin87232d82017-05-13 21:10:13 -0700726 char c=*locale++;
727 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
728 // and for Chinese "zh": Very common but no special case mapping behavior.
729 // Then check lowercase vs. uppercase to reduce the number of comparisons
730 // for other locales without special behavior.
731 if(c=='e') {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700732 /* el or ell? */
733 c=*locale++;
734 if(is_l(c)) {
735 c=*locale++;
736 if(is_l(c)) {
737 c=*locale;
738 }
739 if(is_sep(c)) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700740 return UCASE_LOC_GREEK;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700741 }
742 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700743 // en, es, ... -> root
744 } else if(c=='z') {
745 return UCASE_LOC_ROOT;
746#if U_CHARSET_FAMILY==U_ASCII_FAMILY
747 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
748#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
749 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
750#else
751# error Unknown charset family!
752#endif
753 // lowercase c
754 if(c=='t') {
755 /* tr or tur? */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000756 c=*locale++;
Jungshik Shin87232d82017-05-13 21:10:13 -0700757 if(is_u(c)) {
758 c=*locale++;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000759 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700760 if(is_r(c)) {
761 c=*locale;
762 if(is_sep(c)) {
763 return UCASE_LOC_TURKISH;
764 }
765 }
766 } else if(c=='a') {
767 /* az or aze? */
768 c=*locale++;
769 if(is_z(c)) {
770 c=*locale++;
771 if(is_e(c)) {
772 c=*locale;
773 }
774 if(is_sep(c)) {
775 return UCASE_LOC_TURKISH;
776 }
777 }
778 } else if(c=='l') {
779 /* lt or lit? */
780 c=*locale++;
781 if(is_i(c)) {
782 c=*locale++;
783 }
784 if(is_t(c)) {
785 c=*locale;
786 if(is_sep(c)) {
787 return UCASE_LOC_LITHUANIAN;
788 }
789 }
790 } else if(c=='n') {
791 /* nl or nld? */
792 c=*locale++;
793 if(is_l(c)) {
794 c=*locale++;
795 if(is_d(c)) {
796 c=*locale;
797 }
798 if(is_sep(c)) {
799 return UCASE_LOC_DUTCH;
800 }
801 }
Frank Tangf90543d2020-10-30 19:02:04 -0700802 } else if(c=='h') {
803 /* hy or hye? *not* hyw */
804 c=*locale++;
805 if(is_y(c)) {
806 c=*locale++;
807 if(is_e(c)) {
808 c=*locale;
809 }
810 if(is_sep(c)) {
811 return UCASE_LOC_ARMENIAN;
812 }
813 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700814 }
815 } else {
816 // uppercase c
817 // Same code as for lowercase c but also check for 'E'.
818 if(c=='T') {
819 /* tr or tur? */
820 c=*locale++;
821 if(is_u(c)) {
822 c=*locale++;
823 }
824 if(is_r(c)) {
825 c=*locale;
826 if(is_sep(c)) {
827 return UCASE_LOC_TURKISH;
828 }
829 }
830 } else if(c=='A') {
831 /* az or aze? */
832 c=*locale++;
833 if(is_z(c)) {
834 c=*locale++;
835 if(is_e(c)) {
836 c=*locale;
837 }
838 if(is_sep(c)) {
839 return UCASE_LOC_TURKISH;
840 }
841 }
842 } else if(c=='L') {
843 /* lt or lit? */
844 c=*locale++;
845 if(is_i(c)) {
846 c=*locale++;
847 }
848 if(is_t(c)) {
849 c=*locale;
850 if(is_sep(c)) {
851 return UCASE_LOC_LITHUANIAN;
852 }
853 }
854 } else if(c=='E') {
855 /* el or ell? */
856 c=*locale++;
857 if(is_l(c)) {
858 c=*locale++;
859 if(is_l(c)) {
860 c=*locale;
861 }
862 if(is_sep(c)) {
863 return UCASE_LOC_GREEK;
864 }
865 }
866 } else if(c=='N') {
867 /* nl or nld? */
868 c=*locale++;
869 if(is_l(c)) {
870 c=*locale++;
871 if(is_d(c)) {
872 c=*locale;
873 }
874 if(is_sep(c)) {
875 return UCASE_LOC_DUTCH;
876 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000877 }
Frank Tangf90543d2020-10-30 19:02:04 -0700878 } else if(c=='H') {
879 /* hy or hye? *not* hyw */
880 c=*locale++;
881 if(is_y(c)) {
882 c=*locale++;
883 if(is_e(c)) {
884 c=*locale;
885 }
886 if(is_sep(c)) {
887 return UCASE_LOC_ARMENIAN;
888 }
889 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000890 }
891 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700892 return UCASE_LOC_ROOT;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000893}
894
895/*
896 * Is followed by
897 * {case-ignorable}* cased
898 * ?
899 * (dir determines looking forward/backward)
900 * If a character is case-ignorable, it is skipped regardless of whether
901 * it is also cased or not.
902 */
903static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700904isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000905 UChar32 c;
906
907 if(iter==NULL) {
908 return FALSE;
909 }
910
911 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700912 int32_t type=ucase_getTypeOrIgnorable(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000913 if(type&4) {
914 /* case-ignorable, continue with the loop */
915 } else if(type!=UCASE_NONE) {
916 return TRUE; /* followed by cased letter */
917 } else {
918 return FALSE; /* uncased and not case-ignorable */
919 }
920 }
921
922 return FALSE; /* not followed by cased letter */
923}
924
925/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
926static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700927isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000928 UChar32 c;
929 int32_t dotType;
930 int8_t dir;
931
932 if(iter==NULL) {
933 return FALSE;
934 }
935
936 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700937 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000938 if(dotType==UCASE_SOFT_DOTTED) {
939 return TRUE; /* preceded by TYPE_i */
940 } else if(dotType!=UCASE_OTHER_ACCENT) {
941 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
942 }
943 }
944
945 return FALSE; /* not preceded by TYPE_i */
946}
947
/*
 * See Jitterbug 2344:
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 * we made those releases compatible with Unicode 3.2 which had not fixed
 * a related bug in SpecialCasing.txt.
 *
 * From the Jitterbug 2344 text:
 * ... this bug is listed as a Unicode erratum
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 * <quote>
 * There are two errors in SpecialCasing.txt.
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 * 2. An incorrect context definition. Correct as follows:
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 * ---
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 * where the context After_I is defined as:
 * The last preceding base character was an uppercase I, and there is no
 * intervening combining character class 230 (ABOVE).
 * </quote>
 *
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 *
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
 * # This matches the behavior of the canonically equivalent I-dot_above
 *
 * See also the description in this place in older versions of uchar.c (revision 1.100).
 *
 * Markus W. Scherer 2003-feb-15
 */
981
982/* Is preceded by base character 'I' with no intervening cc=230 ? */
983static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700984isPrecededBy_I(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000985 UChar32 c;
986 int32_t dotType;
987 int8_t dir;
988
989 if(iter==NULL) {
990 return FALSE;
991 }
992
993 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
994 if(c==0x49) {
995 return TRUE; /* preceded by I */
996 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700997 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000998 if(dotType!=UCASE_OTHER_ACCENT) {
999 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1000 }
1001 }
1002
1003 return FALSE; /* not preceded by I */
1004}
1005
1006/* Is followed by one or more cc==230 ? */
1007static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -07001008isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001009 UChar32 c;
1010 int32_t dotType;
1011 int8_t dir;
1012
1013 if(iter==NULL) {
1014 return FALSE;
1015 }
1016
1017 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001018 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001019 if(dotType==UCASE_ABOVE) {
1020 return TRUE; /* at least one cc==230 following */
1021 } else if(dotType!=UCASE_OTHER_ACCENT) {
1022 return FALSE; /* next base character, no more cc==230 following */
1023 }
1024 }
1025
1026 return FALSE; /* no more cc==230 following */
1027}
1028
1029/* Is followed by a dot above (without cc==230 in between) ? */
1030static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -07001031isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001032 UChar32 c;
1033 int32_t dotType;
1034 int8_t dir;
1035
1036 if(iter==NULL) {
1037 return FALSE;
1038 }
1039
1040 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1041 if(c==0x307) {
1042 return TRUE;
1043 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001044 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001045 if(dotType!=UCASE_OTHER_ACCENT) {
1046 return FALSE; /* next base character or cc==230 in between */
1047 }
1048 }
1049
1050 return FALSE; /* no dot above following */
1051}
1052
1053U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001054ucase_toFullLower(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001055 UCaseContextIterator *iter, void *context,
1056 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001057 int32_t loc) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001058 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1059 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001060 UChar32 result=c;
Frank Tangd2858cb2022-04-08 20:34:12 -07001061 // Reset the output pointer in case it was uninitialized.
1062 *pString=nullptr;
Jungshik Shin87232d82017-05-13 21:10:13 -07001063 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001064 if(!UCASE_HAS_EXCEPTION(props)) {
1065 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001066 result=c+UCASE_GET_DELTA(props);
1067 }
1068 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001069 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001070 uint16_t excWord=*pe++;
1071 int32_t full;
1072
1073 pe2=pe;
1074
1075 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1076 /* use hardcoded conditions and mappings */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001077
1078 /*
1079 * Test for conditional mappings first
1080 * (otherwise the unconditional default mappings are always taken),
1081 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1082 * then get the UnicodeData.txt mappings.
1083 */
1084 if( loc==UCASE_LOC_LITHUANIAN &&
1085 /* base characters, find accents above */
1086 (((c==0x49 || c==0x4a || c==0x12e) &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001087 isFollowedByMoreAbove(iter, context)) ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001088 /* precomposed with accent above, no need to find one */
1089 (c==0xcc || c==0xcd || c==0x128))
1090 ) {
1091 /*
1092 # Lithuanian
1093
1094 # Lithuanian retains the dot in a lowercase i when followed by accents.
1095
1096 # Introduce an explicit dot above when lowercasing capital I's and J's
1097 # whenever there are more accents above.
1098 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1099
1100 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1101 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1102 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1103 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1104 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1105 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1106 */
1107 switch(c) {
1108 case 0x49: /* LATIN CAPITAL LETTER I */
1109 *pString=iDot;
1110 return 2;
1111 case 0x4a: /* LATIN CAPITAL LETTER J */
1112 *pString=jDot;
1113 return 2;
1114 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1115 *pString=iOgonekDot;
1116 return 2;
1117 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1118 *pString=iDotGrave;
1119 return 3;
1120 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1121 *pString=iDotAcute;
1122 return 3;
1123 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1124 *pString=iDotTilde;
1125 return 3;
1126 default:
1127 return 0; /* will not occur */
1128 }
1129 /* # Turkish and Azeri */
1130 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1131 /*
1132 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1133 # The following rules handle those cases.
1134
1135 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1136 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1137 */
1138 return 0x69;
Jungshik Shin87232d82017-05-13 21:10:13 -07001139 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001140 /*
1141 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1142 # This matches the behavior of the canonically equivalent I-dot_above
1143
1144 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1145 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1146 */
1147 return 0; /* remove the dot (continue without output) */
Jungshik Shin87232d82017-05-13 21:10:13 -07001148 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001149 /*
1150 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1151
1152 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1153 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1154 */
1155 return 0x131;
1156 } else if(c==0x130) {
1157 /*
1158 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1159
1160 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1161 */
1162 *pString=iDot;
1163 return 2;
1164 } else if( c==0x3a3 &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001165 !isFollowedByCasedLetter(iter, context, 1) &&
1166 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001167 ) {
1168 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1169 /*
1170 # Special case for final form of sigma
1171
1172 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1173 */
1174 return 0x3c2; /* greek small final sigma */
1175 } else {
1176 /* no known conditional special case mapping, use a normal mapping */
1177 }
1178 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1179 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1180 full&=UCASE_FULL_LOWER;
1181 if(full!=0) {
1182 /* set the output pointer to the lowercase mapping */
1183 *pString=reinterpret_cast<const UChar *>(pe+1);
1184
1185 /* return the string length */
1186 return full;
1187 }
1188 }
1189
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001190 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1191 int32_t delta;
1192 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1193 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1194 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001195 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1196 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1197 }
1198 }
1199
1200 return (result==c) ? ~result : result;
1201}
1202
1203/* internal */
1204static int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -07001205toUpperOrTitle(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001206 UCaseContextIterator *iter, void *context,
1207 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001208 int32_t loc,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001209 UBool upperNotTitle) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001210 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1211 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001212 UChar32 result=c;
Frank Tangd2858cb2022-04-08 20:34:12 -07001213 // Reset the output pointer in case it was uninitialized.
1214 *pString=nullptr;
Jungshik Shin87232d82017-05-13 21:10:13 -07001215 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001216 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001217 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1218 result=c+UCASE_GET_DELTA(props);
1219 }
1220 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001221 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001222 uint16_t excWord=*pe++;
1223 int32_t full, idx;
1224
1225 pe2=pe;
1226
1227 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1228 /* use hardcoded conditions and mappings */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001229 if(loc==UCASE_LOC_TURKISH && c==0x69) {
1230 /*
1231 # Turkish and Azeri
1232
1233 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234 # The following rules handle those cases.
1235
1236 # When uppercasing, i turns into a dotted capital I
1237
1238 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1240 */
1241 return 0x130;
Jungshik Shin87232d82017-05-13 21:10:13 -07001242 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001243 /*
1244 # Lithuanian
1245
1246 # Lithuanian retains the dot in a lowercase i when followed by accents.
1247
1248 # Remove DOT ABOVE after "i" with upper or titlecase
1249
1250 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1251 */
1252 return 0; /* remove the dot (continue without output) */
Frank Tangf90543d2020-10-30 19:02:04 -07001253 } else if(c==0x0587) {
1254 // See ICU-13416:
1255 // և ligature ech-yiwn
1256 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1257 // but to ԵՎ=ech+vew in Eastern Armenian.
1258 if(loc==UCASE_LOC_ARMENIAN) {
1259 *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1260 } else {
1261 *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1262 }
1263 return 2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001264 } else {
1265 /* no known conditional special case mapping, use a normal mapping */
1266 }
1267 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1268 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1269
1270 /* start of full case mapping strings */
1271 ++pe;
1272
1273 /* skip the lowercase and case-folding result strings */
1274 pe+=full&UCASE_FULL_LOWER;
1275 full>>=4;
1276 pe+=full&0xf;
1277 full>>=4;
1278
1279 if(upperNotTitle) {
1280 full&=0xf;
1281 } else {
1282 /* skip the uppercase result string */
1283 pe+=full&0xf;
1284 full=(full>>4)&0xf;
1285 }
1286
1287 if(full!=0) {
1288 /* set the output pointer to the result string */
1289 *pString=reinterpret_cast<const UChar *>(pe);
1290
1291 /* return the string length */
1292 return full;
1293 }
1294 }
1295
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001296 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1297 int32_t delta;
1298 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1299 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1300 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001301 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1302 idx=UCASE_EXC_TITLE;
1303 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1304 /* here, titlecase is same as uppercase */
1305 idx=UCASE_EXC_UPPER;
1306 } else {
1307 return ~c;
1308 }
1309 GET_SLOT_VALUE(excWord, idx, pe2, result);
1310 }
1311
1312 return (result==c) ? ~result : result;
1313}
1314
1315U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001316ucase_toFullUpper(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001317 UCaseContextIterator *iter, void *context,
1318 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001319 int32_t caseLocale) {
1320 return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001321}
1322
1323U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001324ucase_toFullTitle(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001325 UCaseContextIterator *iter, void *context,
1326 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001327 int32_t caseLocale) {
1328 return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001329}
1330
/* case folding ------------------------------------------------------------- */

/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
#    The mappings with status T can be used or omitted depending on the desired case-folding
#    behavior. (The default option is to exclude them.)

 * Unicode 3.2 has 'T' mappings as follows:

0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * while the default mappings for these code points are:

0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE

 * U+0130 has no simple case folding (simple-case-folds to itself).
 */
1371
1372/* return the simple case folding mapping for c */
1373U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001374ucase_fold(UChar32 c, uint32_t options) {
1375 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001376 if(!UCASE_HAS_EXCEPTION(props)) {
1377 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001378 c+=UCASE_GET_DELTA(props);
1379 }
1380 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001381 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001382 uint16_t excWord=*pe++;
1383 int32_t idx;
1384 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1385 /* special case folding mappings, hardcoded */
1386 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1387 /* default mappings */
1388 if(c==0x49) {
1389 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1390 return 0x69;
1391 } else if(c==0x130) {
1392 /* no simple case folding for U+0130 */
1393 return c;
1394 }
1395 } else {
1396 /* Turkic mappings */
1397 if(c==0x49) {
1398 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1399 return 0x131;
1400 } else if(c==0x130) {
1401 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1402 return 0x69;
1403 }
1404 }
1405 }
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001406 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1407 return c;
1408 }
1409 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1410 int32_t delta;
1411 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1412 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1413 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001414 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1415 idx=UCASE_EXC_FOLD;
1416 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1417 idx=UCASE_EXC_LOWER;
1418 } else {
1419 return c;
1420 }
1421 GET_SLOT_VALUE(excWord, idx, pe, c);
1422 }
1423 return c;
1424}
1425
/*
 * Issue for canonical caseless match (UAX #21):
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
 * canonical equivalence, unlike default-option casefolding.
 * For example, I-grave and I + grave fold to strings that are not canonically
 * equivalent.
 * For more details, see the comment in unorm_compare() in unorm.cpp
 * and the intermediate prototype changes for Jitterbug 2021.
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
 *
 * This did not get fixed because it appears that it is not possible to fix
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
 * together in a way that they still fold to common result strings.
 */
1440
1441U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001442ucase_toFullFolding(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001443 const UChar **pString,
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001444 uint32_t options) {
1445 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1446 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001447 UChar32 result=c;
Frank Tangd2858cb2022-04-08 20:34:12 -07001448 // Reset the output pointer in case it was uninitialized.
1449 *pString=nullptr;
Jungshik Shin87232d82017-05-13 21:10:13 -07001450 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001451 if(!UCASE_HAS_EXCEPTION(props)) {
1452 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001453 result=c+UCASE_GET_DELTA(props);
1454 }
1455 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001456 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001457 uint16_t excWord=*pe++;
1458 int32_t full, idx;
1459
1460 pe2=pe;
1461
1462 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1463 /* use hardcoded conditions and mappings */
1464 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1465 /* default mappings */
1466 if(c==0x49) {
1467 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1468 return 0x69;
1469 } else if(c==0x130) {
1470 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1471 *pString=iDot;
1472 return 2;
1473 }
1474 } else {
1475 /* Turkic mappings */
1476 if(c==0x49) {
1477 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1478 return 0x131;
1479 } else if(c==0x130) {
1480 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1481 return 0x69;
1482 }
1483 }
1484 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1485 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1486
1487 /* start of full case mapping strings */
1488 ++pe;
1489
1490 /* skip the lowercase result string */
1491 pe+=full&UCASE_FULL_LOWER;
1492 full=(full>>4)&0xf;
1493
1494 if(full!=0) {
1495 /* set the output pointer to the result string */
1496 *pString=reinterpret_cast<const UChar *>(pe);
1497
1498 /* return the string length */
1499 return full;
1500 }
1501 }
1502
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001503 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1504 return ~c;
1505 }
1506 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1507 int32_t delta;
1508 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1509 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1510 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001511 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1512 idx=UCASE_EXC_FOLD;
1513 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1514 idx=UCASE_EXC_LOWER;
1515 } else {
1516 return ~c;
1517 }
1518 GET_SLOT_VALUE(excWord, idx, pe2, result);
1519 }
1520
1521 return (result==c) ? ~result : result;
1522}
1523
1524/* case mapping properties API ---------------------------------------------- */
1525
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001526/* public API (see uchar.h) */
1527
1528U_CAPI UBool U_EXPORT2
1529u_isULowercase(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001530 return (UBool)(UCASE_LOWER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001531}
1532
1533U_CAPI UBool U_EXPORT2
1534u_isUUppercase(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001535 return (UBool)(UCASE_UPPER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001536}
1537
1538/* Transforms the Unicode character to its lower case equivalent.*/
1539U_CAPI UChar32 U_EXPORT2
1540u_tolower(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001541 return ucase_tolower(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001542}
Frank Tangd2858cb2022-04-08 20:34:12 -07001543
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001544/* Transforms the Unicode character to its upper case equivalent.*/
1545U_CAPI UChar32 U_EXPORT2
1546u_toupper(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001547 return ucase_toupper(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001548}
1549
1550/* Transforms the Unicode character to its title case equivalent.*/
1551U_CAPI UChar32 U_EXPORT2
1552u_totitle(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001553 return ucase_totitle(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001554}
1555
1556/* return the simple case folding mapping for c */
1557U_CAPI UChar32 U_EXPORT2
1558u_foldCase(UChar32 c, uint32_t options) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001559 return ucase_fold(c, options);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001560}
1561
1562U_CFUNC int32_t U_EXPORT2
1563ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1564 /* case mapping properties */
1565 const UChar *resultString;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001566 switch(which) {
1567 case UCHAR_LOWERCASE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001568 return (UBool)(UCASE_LOWER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001569 case UCHAR_UPPERCASE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001570 return (UBool)(UCASE_UPPER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001571 case UCHAR_SOFT_DOTTED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001572 return ucase_isSoftDotted(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001573 case UCHAR_CASE_SENSITIVE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001574 return ucase_isCaseSensitive(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001575 case UCHAR_CASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001576 return (UBool)(UCASE_NONE!=ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001577 case UCHAR_CASE_IGNORABLE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001578 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001579 /*
1580 * Note: The following Changes_When_Xyz are defined as testing whether
1581 * the NFD form of the input changes when Xyz-case-mapped.
1582 * However, this simpler implementation of these properties,
1583 * ignoring NFD, passes the tests.
1584 * The implementation needs to be changed if the tests start failing.
1585 * When that happens, optimizations should be used to work with the
1586 * per-single-code point ucase_toFullXyz() functions unless
1587 * the NFD form has more than one code point,
1588 * and the property starts set needs to be the union of the
1589 * start sets for normalization and case mappings.
1590 */
1591 case UCHAR_CHANGES_WHEN_LOWERCASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001592 return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001593 case UCHAR_CHANGES_WHEN_UPPERCASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001594 return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001595 case UCHAR_CHANGES_WHEN_TITLECASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001596 return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001597 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1598 case UCHAR_CHANGES_WHEN_CASEMAPPED:
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001599 return (UBool)(
Jungshik Shin87232d82017-05-13 21:10:13 -07001600 ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1601 ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1602 ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001603 default:
1604 return FALSE;
1605 }
1606}