// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2004-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucase.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004aug30
*   created by: Markus W. Scherer
*
*   Low-level Unicode character/string case mapping code.
*   Much code moved here (and modified) from uchar.c.
*/
21
22#include "unicode/utypes.h"
23#include "unicode/unistr.h"
24#include "unicode/uset.h"
25#include "unicode/udata.h" /* UDataInfo */
26#include "unicode/utf16.h"
27#include "ucmndata.h" /* DataHeader */
28#include "udatamem.h"
29#include "umutex.h"
30#include "uassert.h"
31#include "cmemory.h"
32#include "utrie2.h"
33#include "ucase.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000034
/*
 * Holder for the case-property data used throughout this file.
 * All functions here read the instance named ucase_props_singleton
 * (presumably defined by the generated ucase_props_data.h -- confirm there).
 * NOTE(review): that instance is presumably aggregate-initialized by the
 * generated header, so the field order should not be changed -- confirm.
 */
struct UCaseProps {
    UDataMemory *mem;
    const int32_t *indexes;
    const uint16_t *exceptions;   /* base of the exception words; see GET_EXCEPTIONS() */
    const uint16_t *unfold;       /* reverse case folding table; NULL is tolerated by ucase_addStringCaseClosure() */

    UTrie2 trie;                  /* maps a code point to its 16-bit properties word (UTRIE2_GET16) */
    uint8_t formatVersion[4];
};
44
45/* ucase_props_data.h is machine-generated by gencase --csource */
46#define INCLUDED_FROM_UCASE_CPP
47#include "ucase_props_data.h"
48
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000049/* set of property starts for UnicodeSet ------------------------------------ */
50
51static UBool U_CALLCONV
52_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
53 /* add the start code point to the USet */
54 const USetAdder *sa=(const USetAdder *)context;
55 sa->add(sa->set, start);
56 return TRUE;
57}
58
59U_CFUNC void U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -070060ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000061 if(U_FAILURE(*pErrorCode)) {
62 return;
63 }
64
65 /* add the start code point of each same-value range of the trie */
Jungshik Shin87232d82017-05-13 21:10:13 -070066 utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000067
68 /* add code points with hardcoded properties, plus the ones following them */
69
70 /* (none right now, see comment below) */
71
72 /*
73 * Omit code points with hardcoded specialcasing properties
74 * because we do not build property UnicodeSets for them right now.
75 */
76}
77
78/* data access primitives --------------------------------------------------- */
79
Jungshik Shinf61e46d2018-05-04 13:00:45 -070080U_CFUNC const UTrie2 * U_EXPORT2
81ucase_getTrie() {
82 return &ucase_props_singleton.trie;
83}
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000084
/*
 * Pointer to the first exception word for a properties word that has
 * UCASE_HAS_EXCEPTION(props): the bits of props above UCASE_EXC_SHIFT are
 * used as an index into (csp)->exceptions.
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000086
/*
 * flagsOffset[b] is the number of 1-bits (population count) in the 8-bit value b.
 * SLOT_OFFSET() uses it to count how many slots precede a given slot index.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
106
/* Nonzero if bit idx is set in flags, i.e. the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of present slots with an index lower than idx: the population count
 * of the flag bits below bit idx. The mask keeps at most 8 bits for the
 * 256-entry flagsOffset[] lookup as long as idx<=8.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
109
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 * The macro does not test HAS_SLOT() itself; the caller must do so first.
 * A slot is one uint16_t, or two (upper half first) when
 * UCASE_EXC_DOUBLE_SLOTS is set in excWord.
 *
 * @param excWord (in) initial exceptions word
 * @param idx     (in) desired slot index
 * @param pExc16  (in/out) const uint16_t * after excWord=*pExc16++;
 *                moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value   (out) int32_t or uint32_t lvalue that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000129
130/* simple case mappings ----------------------------------------------------- */
131
132U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700133ucase_tolower(UChar32 c) {
134 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700135 if(!UCASE_HAS_EXCEPTION(props)) {
136 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000137 c+=UCASE_GET_DELTA(props);
138 }
139 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700140 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000141 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700142 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
143 int32_t delta;
144 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
145 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
146 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000147 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
148 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
149 }
150 }
151 return c;
152}
153
154U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700155ucase_toupper(UChar32 c) {
156 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700157 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000158 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
159 c+=UCASE_GET_DELTA(props);
160 }
161 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700162 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000163 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700164 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
165 int32_t delta;
166 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
167 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
168 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000169 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
170 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
171 }
172 }
173 return c;
174}
175
176U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700177ucase_totitle(UChar32 c) {
178 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700179 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000180 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
181 c+=UCASE_GET_DELTA(props);
182 }
183 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700184 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000185 uint16_t excWord=*pe++;
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700186 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
187 int32_t delta;
188 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
189 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
190 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000191 int32_t idx;
192 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
193 idx=UCASE_EXC_TITLE;
194 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
195 idx=UCASE_EXC_UPPER;
196 } else {
197 return c;
198 }
199 GET_SLOT_VALUE(excWord, idx, pe, c);
200 }
201 return c;
202}
203
/*
 * Fixed UTF-16 strings made of a base letter followed by U+0307
 * (and, for the last three, one more combining mark).
 * iDot is used by ucase_addCaseClosure() for U+0130.
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
/* NOTE(review): declared with 3 elements but only 2 are initialized (the third is implicitly 0) -- confirm whether [2] was intended */
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
210
211
212U_CFUNC void U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700213ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000214 uint16_t props;
215
216 /*
217 * Hardcode the case closure of i and its relatives and ignore the
218 * data file data for these characters.
219 * The Turkic dotless i and dotted I with their case mapping conditions
220 * and case folding option make the related characters behave specially.
221 * This code matches their closure behavior to their case folding behavior.
222 */
223
224 switch(c) {
225 case 0x49:
226 /* regular i and I are in one equivalence class */
227 sa->add(sa->set, 0x69);
228 return;
229 case 0x69:
230 sa->add(sa->set, 0x49);
231 return;
232 case 0x130:
233 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
234 sa->addString(sa->set, iDot, 2);
235 return;
236 case 0x131:
237 /* dotless i is in a class by itself */
238 return;
239 default:
240 /* otherwise use the data file data */
241 break;
242 }
243
Jungshik Shin87232d82017-05-13 21:10:13 -0700244 props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700245 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000246 if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
247 /* add the one simple case mapping, no matter what type it is */
248 int32_t delta=UCASE_GET_DELTA(props);
249 if(delta!=0) {
250 sa->add(sa->set, c+delta);
251 }
252 }
253 } else {
254 /*
255 * c has exceptions, so there may be multiple simple and/or
256 * full case mappings. Add them all.
257 */
Jungshik Shin87232d82017-05-13 21:10:13 -0700258 const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000259 const UChar *closure;
260 uint16_t excWord=*pe++;
261 int32_t idx, closureLength, fullLength, length;
262
263 pe0=pe;
264
265 /* add all simple case mappings */
266 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
267 if(HAS_SLOT(excWord, idx)) {
268 pe=pe0;
269 GET_SLOT_VALUE(excWord, idx, pe, c);
270 sa->add(sa->set, c);
271 }
272 }
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700273 if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
274 pe=pe0;
275 int32_t delta;
276 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
277 sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
278 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000279
280 /* get the closure string pointer & length */
281 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
282 pe=pe0;
283 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
284 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
285 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
286 } else {
287 closureLength=0;
288 closure=NULL;
289 }
290
291 /* add the full case folding */
292 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
293 pe=pe0;
294 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
295
296 /* start of full case mapping strings */
297 ++pe;
298
299 fullLength&=0xffff; /* bits 16 and higher are reserved */
300
301 /* skip the lowercase result string */
302 pe+=fullLength&UCASE_FULL_LOWER;
303 fullLength>>=4;
304
305 /* add the full case folding string */
306 length=fullLength&0xf;
307 if(length!=0) {
308 sa->addString(sa->set, (const UChar *)pe, length);
309 pe+=length;
310 }
311
312 /* skip the uppercase and titlecase strings */
313 fullLength>>=4;
314 pe+=fullLength&0xf;
315 fullLength>>=4;
316 pe+=fullLength;
317
318 closure=(const UChar *)pe; /* behind full case mappings */
319 }
320
321 /* add each code point in the closure string */
322 for(idx=0; idx<closureLength;) {
323 U16_NEXT_UNSAFE(closure, idx, c);
324 sa->add(sa->set, c);
325 }
326 }
327}
328
329/*
330 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
331 * must be length>0 and max>0 and length<=max
332 */
333static inline int32_t
334strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
335 int32_t c1, c2;
336
337 max-=length; /* we require length<=max, so no need to decrement max in the loop */
338 do {
339 c1=*s++;
340 c2=*t++;
341 if(c2==0) {
342 return 1; /* reached the end of t but not of s */
343 }
344 c1-=c2;
345 if(c1!=0) {
346 return c1; /* return difference result */
347 }
348 } while(--length>0);
349 /* ends with length==0 */
350
351 if(max==0 || *t==0) {
352 return 0; /* equal to length of both strings */
353 } else {
Frank Tang3e05d9d2021-11-08 14:04:04 -0800354 return -max; /* return length difference */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000355 }
356}
357
358U_CFUNC UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700359ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000360 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
361
Jungshik Shin87232d82017-05-13 21:10:13 -0700362 if(ucase_props_singleton.unfold==NULL || s==NULL) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000363 return FALSE; /* no reverse case folding data, or no string */
364 }
365 if(length<=1) {
366 /* the string is too short to find any match */
367 /*
368 * more precise would be:
369 * if(!u_strHasMoreChar32Than(s, length, 1))
370 * but this does not make much practical difference because
371 * a single supplementary code point would just not be found
372 */
373 return FALSE;
374 }
375
Jungshik Shin87232d82017-05-13 21:10:13 -0700376 const uint16_t *unfold=ucase_props_singleton.unfold;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000377 unfoldRows=unfold[UCASE_UNFOLD_ROWS];
378 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
379 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
380 unfold+=unfoldRowWidth;
381
382 if(length>unfoldStringWidth) {
383 /* the string is too long to find any match */
384 return FALSE;
385 }
386
387 /* do a binary search for the string */
388 start=0;
389 limit=unfoldRows;
390 while(start<limit) {
391 i=(start+limit)/2;
392 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
393 result=strcmpMax(s, length, p, unfoldStringWidth);
394
395 if(result==0) {
396 /* found the string: add each code point, and its case closure */
397 UChar32 c;
398
399 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
400 U16_NEXT_UNSAFE(p, i, c);
401 sa->add(sa->set, c);
Jungshik Shin87232d82017-05-13 21:10:13 -0700402 ucase_addCaseClosure(c, sa);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000403 }
404 return TRUE;
405 } else if(result<0) {
406 limit=i;
407 } else /* result>0 */ {
408 start=i+1;
409 }
410 }
411
412 return FALSE; /* string not found */
413}
414
415U_NAMESPACE_BEGIN
416
/*
 * Sets up iteration over the reverse case folding ("unfold") table of the
 * singleton data: reads the row count, row width and string width from the
 * start of the table, then advances unfold by one row width so that it
 * points at the first data row.
 * NOTE(review): unlike ucase_addStringCaseClosure(), there is no check for a
 * NULL unfold pointer here -- confirm the data always provides the table.
 * NOTE(review): the initializers read unfold, so they rely on the class
 * declaring unfold before the other members -- confirm in the header.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
426
427UChar32
428FullCaseFoldingIterator::next(UnicodeString &full) {
429 // Advance past the last-delivered code point.
430 const UChar *p=unfold+(currentRow*unfoldRowWidth);
431 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
432 ++currentRow;
433 p+=unfoldRowWidth;
434 rowCpIndex=unfoldStringWidth;
435 }
436 if(currentRow>=unfoldRows) { return U_SENTINEL; }
437 // Set "full" to the NUL-terminated string in the first unfold column.
438 int32_t length=unfoldStringWidth;
439 while(length>0 && p[length-1]==0) { --length; }
440 full.setTo(FALSE, p, length);
441 // Return the code point.
442 UChar32 c;
443 U16_NEXT_UNSAFE(p, rowCpIndex, c);
444 return c;
445}
446
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700447namespace LatinCase {
448
// Per-index signed deltas, 16 entries per line, blank line every 0x40 entries.
// Presumably indexed by code point, with the delta added to get the lowercase
// mapping (entries 0x41..0x5A are +32) -- confirm against the callers.
// EXC is defined elsewhere; presumably it marks entries that need the regular
// exception handling -- confirm.
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    // 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x40..0x7F
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0..0xFF
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x100..0x13F
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    // 0x140..0x17F
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
480
// Same layout as TO_LOWER_NORMAL, but more entries are EXC
// (0x49, 0x4A, 0xCC, 0xCD, 0x128, 0x12E in addition to those marked there).
// The name suggests it is the lowercasing table for Turkic and Lithuanian
// locales -- confirm against the callers.
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    // 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x40..0x7F
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0..0xFF
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x100..0x13F
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    // 0x140..0x17F
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};
512
// Per-index signed deltas, 16 entries per line, blank line every 0x40 entries.
// Presumably indexed by code point, with the delta added to get the uppercase
// mapping (entries 0x61..0x7A are -32) -- confirm against the callers.
// EXC is defined elsewhere; presumably it marks entries that need the regular
// exception handling -- confirm.
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    // 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x40..0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    // 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0..0xFF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    // 0x100..0x13F
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    // 0x140..0x17F
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
544
// Same layout as TO_UPPER_NORMAL; the only difference is that entry 0x69
// is EXC instead of -32.
// The name suggests it is the uppercasing table for Turkic locales --
// confirm against the callers.
const int8_t TO_UPPER_TR[LIMIT] = {
    // 0x00..0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x40..0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    // 0x80..0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    // 0xC0..0xFF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    // 0x100..0x13F
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    // 0x140..0x17F
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};
576
577} // namespace LatinCase
578
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000579U_NAMESPACE_END
580
581/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
582U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700583ucase_getType(UChar32 c) {
584 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000585 return UCASE_GET_TYPE(props);
586}
587
588/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
589U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700590ucase_getTypeOrIgnorable(UChar32 c) {
591 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000592 return UCASE_GET_TYPE_AND_IGNORABLE(props);
593}
594
595/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
596static inline int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -0700597getDotType(UChar32 c) {
598 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700599 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000600 return props&UCASE_DOT_MASK;
601 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -0700602 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000603 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
604 }
605}
606
607U_CAPI UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700608ucase_isSoftDotted(UChar32 c) {
609 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000610}
611
612U_CAPI UBool U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -0700613ucase_isCaseSensitive(UChar32 c) {
614 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shina9a2bd32018-07-07 03:36:01 -0700615 if(!UCASE_HAS_EXCEPTION(props)) {
616 return (UBool)((props&UCASE_SENSITIVE)!=0);
617 } else {
618 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
619 return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
620 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000621}
622
623/* string casing ------------------------------------------------------------ */
624
625/*
626 * These internal functions form the core of string case mappings.
627 * They map single code points to result code points or strings and take
628 * all necessary conditions (context, locale ID, options) into account.
629 *
630 * They do not iterate over the source or write to the destination
631 * so that the same functions are useful for non-standard string storage,
632 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
633 * For the same reason, the "surrounding text" context is passed in as a
634 * UCaseContextIterator which does not make any assumptions about
635 * the underlying storage.
636 *
637 * This section contains helper functions that check for conditions
638 * in the input text surrounding the current code point
639 * according to SpecialCasing.txt.
640 *
641 * Each helper function gets the index
642 * - after the current code point if it looks at following text
643 * - before the current code point if it looks at preceding text
644 *
645 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
646 *
647 * Final_Sigma
648 * C is preceded by a sequence consisting of
649 * a cased letter and a case-ignorable sequence,
650 * and C is not followed by a sequence consisting of
651 * an ignorable sequence and then a cased letter.
652 *
653 * More_Above
654 * C is followed by one or more characters of combining class 230 (ABOVE)
655 * in the combining character sequence.
656 *
657 * After_Soft_Dotted
658 * The last preceding character with combining class of zero before C
659 * was Soft_Dotted,
660 * and there is no intervening combining character class 230 (ABOVE).
661 *
662 * Before_Dot
663 * C is followed by combining dot above (U+0307).
664 * Any sequence of characters with a combining class that is neither 0 nor 230
665 * may intervene between the current character and the combining dot above.
666 *
667 * The erratum from 2002-10-31 adds the condition
668 *
669 * After_I
670 * The last preceding base character was an uppercase I, and there is no
671 * intervening combining character class 230 (ABOVE).
672 *
673 * (See Jitterbug 2344 and the comments on After_I below.)
674 *
675 * Helper definitions in Unicode 3.2 UAX 21:
676 *
677 * D1. A character C is defined to be cased
678 * if it meets any of the following criteria:
679 *
680 * - The general category of C is Titlecase Letter (Lt)
681 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase
682 * - Given D = NFD(C), then it is not the case that:
683 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
Frank Tang7e7574b2021-04-13 21:19:13 -0700684 * (This third criterion does not add any characters to the list
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000685 * for Unicode 3.2. Ignored.)
686 *
687 * D2. A character C is defined to be case-ignorable
688 * if it meets either of the following criteria:
689 *
690 * - The general category of C is
691 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
692 * Letter Modifier (Lm), or Symbol Modifier (Sk)
693 * - C is one of the following characters
694 * U+0027 APOSTROPHE
695 * U+00AD SOFT HYPHEN (SHY)
696 * U+2019 RIGHT SINGLE QUOTATION MARK
697 * (the preferred character for apostrophe)
698 *
699 * D3. A case-ignorable sequence is a sequence of
700 * zero or more case-ignorable characters.
701 */
702
/*
 * Case-insensitive tests for single letters of a language code, used by
 * ucase_getCaseLocale(). Both cases are spelled out as character literals
 * rather than computed. Each macro evaluates its argument up to twice.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_y(c) ((c)=='y' || (c)=='Y')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the terminating NUL; evaluates its argument up to three times. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
715
716/**
717 * Requires non-NULL locale ID but otherwise does the equivalent of
718 * checking for language codes as if uloc_getLanguage() were called:
719 * Accepts both 2- and 3-letter codes and accepts case variants.
720 */
721U_CFUNC int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -0700722ucase_getCaseLocale(const char *locale) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000723 /*
724 * This function used to use uloc_getLanguage(), but the current code
725 * removes the dependency of this low-level code on uloc implementation code
726 * and is faster because not the whole locale ID has to be
727 * examined and copied/transformed.
728 *
729 * Because this code does not want to depend on uloc, the caller must
730 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
731 */
Jungshik Shin87232d82017-05-13 21:10:13 -0700732 char c=*locale++;
733 // Fastpath for English "en" which is often used for default (=root locale) case mappings,
734 // and for Chinese "zh": Very common but no special case mapping behavior.
735 // Then check lowercase vs. uppercase to reduce the number of comparisons
736 // for other locales without special behavior.
737 if(c=='e') {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700738 /* el or ell? */
739 c=*locale++;
740 if(is_l(c)) {
741 c=*locale++;
742 if(is_l(c)) {
743 c=*locale;
744 }
745 if(is_sep(c)) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700746 return UCASE_LOC_GREEK;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700747 }
748 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700749 // en, es, ... -> root
750 } else if(c=='z') {
751 return UCASE_LOC_ROOT;
752#if U_CHARSET_FAMILY==U_ASCII_FAMILY
753 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
754#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
755 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
756#else
757# error Unknown charset family!
758#endif
759 // lowercase c
760 if(c=='t') {
761 /* tr or tur? */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000762 c=*locale++;
Jungshik Shin87232d82017-05-13 21:10:13 -0700763 if(is_u(c)) {
764 c=*locale++;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000765 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700766 if(is_r(c)) {
767 c=*locale;
768 if(is_sep(c)) {
769 return UCASE_LOC_TURKISH;
770 }
771 }
772 } else if(c=='a') {
773 /* az or aze? */
774 c=*locale++;
775 if(is_z(c)) {
776 c=*locale++;
777 if(is_e(c)) {
778 c=*locale;
779 }
780 if(is_sep(c)) {
781 return UCASE_LOC_TURKISH;
782 }
783 }
784 } else if(c=='l') {
785 /* lt or lit? */
786 c=*locale++;
787 if(is_i(c)) {
788 c=*locale++;
789 }
790 if(is_t(c)) {
791 c=*locale;
792 if(is_sep(c)) {
793 return UCASE_LOC_LITHUANIAN;
794 }
795 }
796 } else if(c=='n') {
797 /* nl or nld? */
798 c=*locale++;
799 if(is_l(c)) {
800 c=*locale++;
801 if(is_d(c)) {
802 c=*locale;
803 }
804 if(is_sep(c)) {
805 return UCASE_LOC_DUTCH;
806 }
807 }
Frank Tangf90543d2020-10-30 19:02:04 -0700808 } else if(c=='h') {
809 /* hy or hye? *not* hyw */
810 c=*locale++;
811 if(is_y(c)) {
812 c=*locale++;
813 if(is_e(c)) {
814 c=*locale;
815 }
816 if(is_sep(c)) {
817 return UCASE_LOC_ARMENIAN;
818 }
819 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700820 }
821 } else {
822 // uppercase c
823 // Same code as for lowercase c but also check for 'E'.
824 if(c=='T') {
825 /* tr or tur? */
826 c=*locale++;
827 if(is_u(c)) {
828 c=*locale++;
829 }
830 if(is_r(c)) {
831 c=*locale;
832 if(is_sep(c)) {
833 return UCASE_LOC_TURKISH;
834 }
835 }
836 } else if(c=='A') {
837 /* az or aze? */
838 c=*locale++;
839 if(is_z(c)) {
840 c=*locale++;
841 if(is_e(c)) {
842 c=*locale;
843 }
844 if(is_sep(c)) {
845 return UCASE_LOC_TURKISH;
846 }
847 }
848 } else if(c=='L') {
849 /* lt or lit? */
850 c=*locale++;
851 if(is_i(c)) {
852 c=*locale++;
853 }
854 if(is_t(c)) {
855 c=*locale;
856 if(is_sep(c)) {
857 return UCASE_LOC_LITHUANIAN;
858 }
859 }
860 } else if(c=='E') {
861 /* el or ell? */
862 c=*locale++;
863 if(is_l(c)) {
864 c=*locale++;
865 if(is_l(c)) {
866 c=*locale;
867 }
868 if(is_sep(c)) {
869 return UCASE_LOC_GREEK;
870 }
871 }
872 } else if(c=='N') {
873 /* nl or nld? */
874 c=*locale++;
875 if(is_l(c)) {
876 c=*locale++;
877 if(is_d(c)) {
878 c=*locale;
879 }
880 if(is_sep(c)) {
881 return UCASE_LOC_DUTCH;
882 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000883 }
Frank Tangf90543d2020-10-30 19:02:04 -0700884 } else if(c=='H') {
885 /* hy or hye? *not* hyw */
886 c=*locale++;
887 if(is_y(c)) {
888 c=*locale++;
889 if(is_e(c)) {
890 c=*locale;
891 }
892 if(is_sep(c)) {
893 return UCASE_LOC_ARMENIAN;
894 }
895 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000896 }
897 }
Jungshik Shin87232d82017-05-13 21:10:13 -0700898 return UCASE_LOC_ROOT;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000899}
900
901/*
902 * Is followed by
903 * {case-ignorable}* cased
904 * ?
905 * (dir determines looking forward/backward)
906 * If a character is case-ignorable, it is skipped regardless of whether
907 * it is also cased or not.
908 */
909static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700910isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000911 UChar32 c;
912
913 if(iter==NULL) {
914 return FALSE;
915 }
916
917 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700918 int32_t type=ucase_getTypeOrIgnorable(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000919 if(type&4) {
920 /* case-ignorable, continue with the loop */
921 } else if(type!=UCASE_NONE) {
922 return TRUE; /* followed by cased letter */
923 } else {
924 return FALSE; /* uncased and not case-ignorable */
925 }
926 }
927
928 return FALSE; /* not followed by cased letter */
929}
930
931/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
932static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700933isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000934 UChar32 c;
935 int32_t dotType;
936 int8_t dir;
937
938 if(iter==NULL) {
939 return FALSE;
940 }
941
942 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -0700943 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000944 if(dotType==UCASE_SOFT_DOTTED) {
945 return TRUE; /* preceded by TYPE_i */
946 } else if(dotType!=UCASE_OTHER_ACCENT) {
947 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
948 }
949 }
950
951 return FALSE; /* not preceded by TYPE_i */
952}
953
954/*
955 * See Jitterbug 2344:
956 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
957 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
958 * we made those releases compatible with Unicode 3.2 which had not fixed
959 * a related bug in SpecialCasing.txt.
960 *
961 * From the Jitterbug 2344 text:
962 * ... this bug is listed as a Unicode erratum
963 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
964 * <quote>
965 * There are two errors in SpecialCasing.txt.
966 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
967 * 2. An incorrect context definition. Correct as follows:
968 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
969 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
970 * ---
971 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
972 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
973 * where the context After_I is defined as:
974 * The last preceding base character was an uppercase I, and there is no
975 * intervening combining character class 230 (ABOVE).
976 * </quote>
977 *
978 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
979 *
980 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
981 * # This matches the behavior of the canonically equivalent I-dot_above
982 *
983 * See also the description in this place in older versions of uchar.c (revision 1.100).
984 *
985 * Markus W. Scherer 2003-feb-15
986 */
987
988/* Is preceded by base character 'I' with no intervening cc=230 ? */
989static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -0700990isPrecededBy_I(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000991 UChar32 c;
992 int32_t dotType;
993 int8_t dir;
994
995 if(iter==NULL) {
996 return FALSE;
997 }
998
999 for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1000 if(c==0x49) {
1001 return TRUE; /* preceded by I */
1002 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001003 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001004 if(dotType!=UCASE_OTHER_ACCENT) {
1005 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1006 }
1007 }
1008
1009 return FALSE; /* not preceded by I */
1010}
1011
1012/* Is followed by one or more cc==230 ? */
1013static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -07001014isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001015 UChar32 c;
1016 int32_t dotType;
1017 int8_t dir;
1018
1019 if(iter==NULL) {
1020 return FALSE;
1021 }
1022
1023 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001024 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001025 if(dotType==UCASE_ABOVE) {
1026 return TRUE; /* at least one cc==230 following */
1027 } else if(dotType!=UCASE_OTHER_ACCENT) {
1028 return FALSE; /* next base character, no more cc==230 following */
1029 }
1030 }
1031
1032 return FALSE; /* no more cc==230 following */
1033}
1034
1035/* Is followed by a dot above (without cc==230 in between) ? */
1036static UBool
Jungshik Shin87232d82017-05-13 21:10:13 -07001037isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001038 UChar32 c;
1039 int32_t dotType;
1040 int8_t dir;
1041
1042 if(iter==NULL) {
1043 return FALSE;
1044 }
1045
1046 for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1047 if(c==0x307) {
1048 return TRUE;
1049 }
Jungshik Shin87232d82017-05-13 21:10:13 -07001050 dotType=getDotType(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001051 if(dotType!=UCASE_OTHER_ACCENT) {
1052 return FALSE; /* next base character or cc==230 in between */
1053 }
1054 }
1055
1056 return FALSE; /* no dot above following */
1057}
1058
1059U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001060ucase_toFullLower(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001061 UCaseContextIterator *iter, void *context,
1062 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001063 int32_t loc) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001064 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1065 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001066 UChar32 result=c;
Jungshik Shin87232d82017-05-13 21:10:13 -07001067 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001068 if(!UCASE_HAS_EXCEPTION(props)) {
1069 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001070 result=c+UCASE_GET_DELTA(props);
1071 }
1072 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001073 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001074 uint16_t excWord=*pe++;
1075 int32_t full;
1076
1077 pe2=pe;
1078
1079 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1080 /* use hardcoded conditions and mappings */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001081
1082 /*
1083 * Test for conditional mappings first
1084 * (otherwise the unconditional default mappings are always taken),
1085 * then test for characters that have unconditional mappings in SpecialCasing.txt,
1086 * then get the UnicodeData.txt mappings.
1087 */
1088 if( loc==UCASE_LOC_LITHUANIAN &&
1089 /* base characters, find accents above */
1090 (((c==0x49 || c==0x4a || c==0x12e) &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001091 isFollowedByMoreAbove(iter, context)) ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001092 /* precomposed with accent above, no need to find one */
1093 (c==0xcc || c==0xcd || c==0x128))
1094 ) {
1095 /*
1096 # Lithuanian
1097
1098 # Lithuanian retains the dot in a lowercase i when followed by accents.
1099
1100 # Introduce an explicit dot above when lowercasing capital I's and J's
1101 # whenever there are more accents above.
1102 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1103
1104 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1105 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1106 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1107 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1108 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1109 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1110 */
1111 switch(c) {
1112 case 0x49: /* LATIN CAPITAL LETTER I */
1113 *pString=iDot;
1114 return 2;
1115 case 0x4a: /* LATIN CAPITAL LETTER J */
1116 *pString=jDot;
1117 return 2;
1118 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1119 *pString=iOgonekDot;
1120 return 2;
1121 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
1122 *pString=iDotGrave;
1123 return 3;
1124 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
1125 *pString=iDotAcute;
1126 return 3;
1127 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1128 *pString=iDotTilde;
1129 return 3;
1130 default:
1131 return 0; /* will not occur */
1132 }
1133 /* # Turkish and Azeri */
1134 } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1135 /*
1136 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1137 # The following rules handle those cases.
1138
1139 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141 */
1142 return 0x69;
Jungshik Shin87232d82017-05-13 21:10:13 -07001143 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001144 /*
1145 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1146 # This matches the behavior of the canonically equivalent I-dot_above
1147
1148 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1149 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1150 */
Jungshik Shinfd2abab2017-05-15 16:17:01 -07001151 *pString=nullptr;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001152 return 0; /* remove the dot (continue without output) */
Jungshik Shin87232d82017-05-13 21:10:13 -07001153 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001154 /*
1155 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1156
1157 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1158 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1159 */
1160 return 0x131;
1161 } else if(c==0x130) {
1162 /*
1163 # Preserve canonical equivalence for I with dot. Turkic is handled below.
1164
1165 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1166 */
1167 *pString=iDot;
1168 return 2;
1169 } else if( c==0x3a3 &&
Jungshik Shin87232d82017-05-13 21:10:13 -07001170 !isFollowedByCasedLetter(iter, context, 1) &&
1171 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001172 ) {
1173 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1174 /*
1175 # Special case for final form of sigma
1176
1177 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1178 */
1179 return 0x3c2; /* greek small final sigma */
1180 } else {
1181 /* no known conditional special case mapping, use a normal mapping */
1182 }
1183 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1184 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1185 full&=UCASE_FULL_LOWER;
1186 if(full!=0) {
1187 /* set the output pointer to the lowercase mapping */
1188 *pString=reinterpret_cast<const UChar *>(pe+1);
1189
1190 /* return the string length */
1191 return full;
1192 }
1193 }
1194
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001195 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1196 int32_t delta;
1197 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1198 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1199 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001200 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1201 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1202 }
1203 }
1204
1205 return (result==c) ? ~result : result;
1206}
1207
1208/* internal */
1209static int32_t
Jungshik Shin87232d82017-05-13 21:10:13 -07001210toUpperOrTitle(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001211 UCaseContextIterator *iter, void *context,
1212 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001213 int32_t loc,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001214 UBool upperNotTitle) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001215 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1216 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001217 UChar32 result=c;
Jungshik Shin87232d82017-05-13 21:10:13 -07001218 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001219 if(!UCASE_HAS_EXCEPTION(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001220 if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1221 result=c+UCASE_GET_DELTA(props);
1222 }
1223 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001224 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001225 uint16_t excWord=*pe++;
1226 int32_t full, idx;
1227
1228 pe2=pe;
1229
1230 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1231 /* use hardcoded conditions and mappings */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001232 if(loc==UCASE_LOC_TURKISH && c==0x69) {
1233 /*
1234 # Turkish and Azeri
1235
1236 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1237 # The following rules handle those cases.
1238
1239 # When uppercasing, i turns into a dotted capital I
1240
1241 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1242 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1243 */
1244 return 0x130;
Jungshik Shin87232d82017-05-13 21:10:13 -07001245 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001246 /*
1247 # Lithuanian
1248
1249 # Lithuanian retains the dot in a lowercase i when followed by accents.
1250
1251 # Remove DOT ABOVE after "i" with upper or titlecase
1252
1253 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1254 */
Jungshik Shinfd2abab2017-05-15 16:17:01 -07001255 *pString=nullptr;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001256 return 0; /* remove the dot (continue without output) */
Frank Tangf90543d2020-10-30 19:02:04 -07001257 } else if(c==0x0587) {
1258 // See ICU-13416:
1259 // և ligature ech-yiwn
1260 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1261 // but to ԵՎ=ech+vew in Eastern Armenian.
1262 if(loc==UCASE_LOC_ARMENIAN) {
1263 *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1264 } else {
1265 *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1266 }
1267 return 2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001268 } else {
1269 /* no known conditional special case mapping, use a normal mapping */
1270 }
1271 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1272 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1273
1274 /* start of full case mapping strings */
1275 ++pe;
1276
1277 /* skip the lowercase and case-folding result strings */
1278 pe+=full&UCASE_FULL_LOWER;
1279 full>>=4;
1280 pe+=full&0xf;
1281 full>>=4;
1282
1283 if(upperNotTitle) {
1284 full&=0xf;
1285 } else {
1286 /* skip the uppercase result string */
1287 pe+=full&0xf;
1288 full=(full>>4)&0xf;
1289 }
1290
1291 if(full!=0) {
1292 /* set the output pointer to the result string */
1293 *pString=reinterpret_cast<const UChar *>(pe);
1294
1295 /* return the string length */
1296 return full;
1297 }
1298 }
1299
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001300 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1301 int32_t delta;
1302 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1303 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1304 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001305 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1306 idx=UCASE_EXC_TITLE;
1307 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1308 /* here, titlecase is same as uppercase */
1309 idx=UCASE_EXC_UPPER;
1310 } else {
1311 return ~c;
1312 }
1313 GET_SLOT_VALUE(excWord, idx, pe2, result);
1314 }
1315
1316 return (result==c) ? ~result : result;
1317}
1318
1319U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001320ucase_toFullUpper(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001321 UCaseContextIterator *iter, void *context,
1322 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001323 int32_t caseLocale) {
1324 return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001325}
1326
1327U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001328ucase_toFullTitle(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001329 UCaseContextIterator *iter, void *context,
1330 const UChar **pString,
Jungshik Shin87232d82017-05-13 21:10:13 -07001331 int32_t caseLocale) {
1332 return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001333}
1334
1335/* case folding ------------------------------------------------------------- */
1336
1337/*
1338 * Case folding is similar to lowercasing.
1339 * The result may be a simple mapping, i.e., a single code point, or
1340 * a full mapping, i.e., a string.
1341 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1342 * then only the lowercase mapping is stored.
1343 *
1344 * Some special cases are hardcoded because their conditions cannot be
1345 * parsed and processed from CaseFolding.txt.
1346 *
1347 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1348
1349# C: common case folding, common mappings shared by both simple and full mappings.
1350# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1351# S: simple case folding, mappings to single characters where different from F.
1352# T: special case for uppercase I and dotted uppercase I
1353# - For non-Turkic languages, this mapping is normally not used.
1354# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1355#
1356# Usage:
1357# A. To do a simple case folding, use the mappings with status C + S.
1358# B. To do a full case folding, use the mappings with status C + F.
1359#
1360# The mappings with status T can be used or omitted depending on the desired case-folding
1361# behavior. (The default option is to exclude them.)
1362
1363 * Unicode 3.2 has 'T' mappings as follows:
1364
0049; T; 0131; # LATIN CAPITAL LETTER I
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1367
1368 * while the default mappings for these code points are:
1369
0049; C; 0069; # LATIN CAPITAL LETTER I
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1372
1373 * U+0130 has no simple case folding (simple-case-folds to itself).
1374 */
1375
1376/* return the simple case folding mapping for c */
1377U_CAPI UChar32 U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001378ucase_fold(UChar32 c, uint32_t options) {
1379 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001380 if(!UCASE_HAS_EXCEPTION(props)) {
1381 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001382 c+=UCASE_GET_DELTA(props);
1383 }
1384 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001385 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001386 uint16_t excWord=*pe++;
1387 int32_t idx;
1388 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1389 /* special case folding mappings, hardcoded */
1390 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1391 /* default mappings */
1392 if(c==0x49) {
1393 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1394 return 0x69;
1395 } else if(c==0x130) {
1396 /* no simple case folding for U+0130 */
1397 return c;
1398 }
1399 } else {
1400 /* Turkic mappings */
1401 if(c==0x49) {
1402 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1403 return 0x131;
1404 } else if(c==0x130) {
1405 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1406 return 0x69;
1407 }
1408 }
1409 }
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001410 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1411 return c;
1412 }
1413 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1414 int32_t delta;
1415 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1416 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1417 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001418 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1419 idx=UCASE_EXC_FOLD;
1420 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1421 idx=UCASE_EXC_LOWER;
1422 } else {
1423 return c;
1424 }
1425 GET_SLOT_VALUE(excWord, idx, pe, c);
1426 }
1427 return c;
1428}
1429
1430/*
1431 * Issue for canonical caseless match (UAX #21):
1432 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1433 * canonical equivalence, unlike default-option casefolding.
1434 * For example, I-grave and I + grave fold to strings that are not canonically
1435 * equivalent.
1436 * For more details, see the comment in unorm_compare() in unorm.cpp
1437 * and the intermediate prototype changes for Jitterbug 2021.
1438 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1439 *
1440 * This did not get fixed because it appears that it is not possible to fix
1441 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1442 * together in a way that they still fold to common result strings.
1443 */
1444
1445U_CAPI int32_t U_EXPORT2
Jungshik Shin87232d82017-05-13 21:10:13 -07001446ucase_toFullFolding(UChar32 c,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001447 const UChar **pString,
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07001448 uint32_t options) {
1449 // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1450 U_ASSERT(c >= 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001451 UChar32 result=c;
Jungshik Shin87232d82017-05-13 21:10:13 -07001452 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
Jungshik Shinf61e46d2018-05-04 13:00:45 -07001453 if(!UCASE_HAS_EXCEPTION(props)) {
1454 if(UCASE_IS_UPPER_OR_TITLE(props)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001455 result=c+UCASE_GET_DELTA(props);
1456 }
1457 } else {
Jungshik Shin87232d82017-05-13 21:10:13 -07001458 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001459 uint16_t excWord=*pe++;
1460 int32_t full, idx;
1461
1462 pe2=pe;
1463
1464 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1465 /* use hardcoded conditions and mappings */
1466 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1467 /* default mappings */
1468 if(c==0x49) {
1469 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1470 return 0x69;
1471 } else if(c==0x130) {
1472 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1473 *pString=iDot;
1474 return 2;
1475 }
1476 } else {
1477 /* Turkic mappings */
1478 if(c==0x49) {
1479 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1480 return 0x131;
1481 } else if(c==0x130) {
1482 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1483 return 0x69;
1484 }
1485 }
1486 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1487 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1488
1489 /* start of full case mapping strings */
1490 ++pe;
1491
1492 /* skip the lowercase result string */
1493 pe+=full&UCASE_FULL_LOWER;
1494 full=(full>>4)&0xf;
1495
1496 if(full!=0) {
1497 /* set the output pointer to the result string */
1498 *pString=reinterpret_cast<const UChar *>(pe);
1499
1500 /* return the string length */
1501 return full;
1502 }
1503 }
1504
Jungshik Shina9a2bd32018-07-07 03:36:01 -07001505 if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1506 return ~c;
1507 }
1508 if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1509 int32_t delta;
1510 GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1511 return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1512 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001513 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1514 idx=UCASE_EXC_FOLD;
1515 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1516 idx=UCASE_EXC_LOWER;
1517 } else {
1518 return ~c;
1519 }
1520 GET_SLOT_VALUE(excWord, idx, pe2, result);
1521 }
1522
1523 return (result==c) ? ~result : result;
1524}
1525
1526/* case mapping properties API ---------------------------------------------- */
1527
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001528/* public API (see uchar.h) */
1529
1530U_CAPI UBool U_EXPORT2
1531u_isULowercase(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001532 return (UBool)(UCASE_LOWER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001533}
1534
1535U_CAPI UBool U_EXPORT2
1536u_isUUppercase(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001537 return (UBool)(UCASE_UPPER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001538}
1539
1540/* Transforms the Unicode character to its lower case equivalent.*/
1541U_CAPI UChar32 U_EXPORT2
1542u_tolower(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001543 return ucase_tolower(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001544}
1545
1546/* Transforms the Unicode character to its upper case equivalent.*/
1547U_CAPI UChar32 U_EXPORT2
1548u_toupper(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001549 return ucase_toupper(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001550}
1551
1552/* Transforms the Unicode character to its title case equivalent.*/
1553U_CAPI UChar32 U_EXPORT2
1554u_totitle(UChar32 c) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001555 return ucase_totitle(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001556}
1557
1558/* return the simple case folding mapping for c */
1559U_CAPI UChar32 U_EXPORT2
1560u_foldCase(UChar32 c, uint32_t options) {
Jungshik Shin87232d82017-05-13 21:10:13 -07001561 return ucase_fold(c, options);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001562}
1563
1564U_CFUNC int32_t U_EXPORT2
1565ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1566 /* case mapping properties */
1567 const UChar *resultString;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001568 switch(which) {
1569 case UCHAR_LOWERCASE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001570 return (UBool)(UCASE_LOWER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001571 case UCHAR_UPPERCASE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001572 return (UBool)(UCASE_UPPER==ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001573 case UCHAR_SOFT_DOTTED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001574 return ucase_isSoftDotted(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001575 case UCHAR_CASE_SENSITIVE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001576 return ucase_isCaseSensitive(c);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001577 case UCHAR_CASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001578 return (UBool)(UCASE_NONE!=ucase_getType(c));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001579 case UCHAR_CASE_IGNORABLE:
Jungshik Shin87232d82017-05-13 21:10:13 -07001580 return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001581 /*
1582 * Note: The following Changes_When_Xyz are defined as testing whether
1583 * the NFD form of the input changes when Xyz-case-mapped.
1584 * However, this simpler implementation of these properties,
1585 * ignoring NFD, passes the tests.
1586 * The implementation needs to be changed if the tests start failing.
1587 * When that happens, optimizations should be used to work with the
1588 * per-single-code point ucase_toFullXyz() functions unless
1589 * the NFD form has more than one code point,
1590 * and the property starts set needs to be the union of the
1591 * start sets for normalization and case mappings.
1592 */
1593 case UCHAR_CHANGES_WHEN_LOWERCASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001594 return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001595 case UCHAR_CHANGES_WHEN_UPPERCASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001596 return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001597 case UCHAR_CHANGES_WHEN_TITLECASED:
Jungshik Shin87232d82017-05-13 21:10:13 -07001598 return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001599 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1600 case UCHAR_CHANGES_WHEN_CASEMAPPED:
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001601 return (UBool)(
Jungshik Shin87232d82017-05-13 21:10:13 -07001602 ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1603 ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1604 ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001605 default:
1606 return FALSE;
1607 }
1608}