blob: 01238b35f5bb06775373d6752d730d55a4823cd0 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4 *******************************************************************************
5 *
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07006 * Copyright (C) 2003-2016, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: usprep.cpp
Jungshik Shin87232d82017-05-13 21:10:13 -070011 * encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000012 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003jul2
16 * created by: Ram Viswanadha
17 */
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_IDNA
22
23#include "unicode/usprep.h"
24
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080025#include "unicode/normalizer2.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000026#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/uversion.h"
29#include "umutex.h"
30#include "cmemory.h"
31#include "sprpimpl.h"
32#include "ustr_imp.h"
33#include "uhash.h"
34#include "cstring.h"
35#include "udataswp.h"
36#include "ucln_cmn.h"
37#include "ubidi_props.h"
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080038#include "uprops.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000039
40U_NAMESPACE_USE
41
42U_CDECL_BEGIN
43
44/*
45Static cache for already opened StringPrep profiles
46*/
47static UHashtable *SHARED_DATA_HASHTABLE = NULL;
48static icu::UInitOnce gSharedDataInitOnce;
49
50static UMutex usprepMutex = U_MUTEX_INITIALIZER;
51
52/* format version of spp file */
53//static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
54
55/* the Unicode version of the sprep data */
56static UVersionInfo dataVersion={ 0, 0, 0, 0 };
57
/*
 * Data file base names, indexed by UStringPrepProfileType.
 * The order of the entries must stay aligned with that enum.
 */
static const char * const PROFILE_NAMES[] = {
    /* USPREP_RFC3491_NAMEPREP */                "rfc3491",
    /* USPREP_RFC3530_NFS4_CS_PREP */            "rfc3530cs",
    /* USPREP_RFC3530_NFS4_CS_PREP_CI */         "rfc3530csci",
    /* USPREP_RFC3530_NFS4_CIS_PREP */           "rfc3491",
    /* USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX */  "rfc3530mixp",
    /* USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX */  "rfc3491",
    /* USPREP_RFC3722_ISCSI */                   "rfc3722",
    /* USPREP_RFC3920_NODEPREP */                "rfc3920node",
    /* USPREP_RFC3920_RESOURCEPREP */            "rfc3920res",
    /* USPREP_RFC4011_MIB */                     "rfc4011",
    /* USPREP_RFC4013_SASLPREP */                "rfc4013",
    /* USPREP_RFC4505_TRACE */                   "rfc4505",
    /* USPREP_RFC4518_LDAP */                    "rfc4518",
    /* USPREP_RFC4518_LDAP_CI */                 "rfc4518ci",
};
75
76static UBool U_CALLCONV
77isSPrepAcceptable(void * /* context */,
78 const char * /* type */,
79 const char * /* name */,
80 const UDataInfo *pInfo) {
81 if(
82 pInfo->size>=20 &&
83 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
84 pInfo->charsetFamily==U_CHARSET_FAMILY &&
85 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
86 pInfo->dataFormat[1]==0x50 &&
87 pInfo->dataFormat[2]==0x52 &&
88 pInfo->dataFormat[3]==0x50 &&
89 pInfo->formatVersion[0]==3 &&
90 pInfo->formatVersion[2]==UTRIE_SHIFT &&
91 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
92 ) {
93 //uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
94 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
95 return TRUE;
96 } else {
97 return FALSE;
98 }
99}
100
101static int32_t U_CALLCONV
102getSPrepFoldingOffset(uint32_t data) {
103
104 return (int32_t)data;
105
106}
107
108/* hashes an entry */
109static int32_t U_CALLCONV
110hashEntry(const UHashTok parm) {
111 UStringPrepKey *b = (UStringPrepKey *)parm.pointer;
112 UHashTok namekey, pathkey;
113 namekey.pointer = b->name;
114 pathkey.pointer = b->path;
Jungshik Shin42d50272018-10-24 01:22:09 -0700115 uint32_t unsignedHash = static_cast<uint32_t>(uhash_hashChars(namekey)) +
116 37u * static_cast<uint32_t>(uhash_hashChars(pathkey));
117 return static_cast<int32_t>(unsignedHash);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000118}
119
120/* compares two entries */
121static UBool U_CALLCONV
122compareEntries(const UHashTok p1, const UHashTok p2) {
123 UStringPrepKey *b1 = (UStringPrepKey *)p1.pointer;
124 UStringPrepKey *b2 = (UStringPrepKey *)p2.pointer;
125 UHashTok name1, name2, path1, path2;
126 name1.pointer = b1->name;
127 name2.pointer = b2->name;
128 path1.pointer = b1->path;
129 path2.pointer = b2->path;
130 return ((UBool)(uhash_compareChars(name1, name2) &
131 uhash_compareChars(path1, path2)));
132}
133
134static void
135usprep_unload(UStringPrepProfile* data){
136 udata_close(data->sprepData);
137}
138
139static int32_t
140usprep_internal_flushCache(UBool noRefCount){
141 UStringPrepProfile *profile = NULL;
142 UStringPrepKey *key = NULL;
Jungshik Shin70f82502016-01-29 00:32:36 -0800143 int32_t pos = UHASH_FIRST;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000144 int32_t deletedNum = 0;
145 const UHashElement *e;
146
147 /*
148 * if shared data hasn't even been lazy evaluated yet
149 * return 0
150 */
151 umtx_lock(&usprepMutex);
152 if (SHARED_DATA_HASHTABLE == NULL) {
153 umtx_unlock(&usprepMutex);
154 return 0;
155 }
156
157 /*creates an enumeration to iterate through every element in the table */
158 while ((e = uhash_nextElement(SHARED_DATA_HASHTABLE, &pos)) != NULL)
159 {
160 profile = (UStringPrepProfile *) e->value.pointer;
161 key = (UStringPrepKey *) e->key.pointer;
162
163 if ((noRefCount== FALSE && profile->refCount == 0) ||
164 noRefCount== TRUE) {
165 deletedNum++;
166 uhash_removeElement(SHARED_DATA_HASHTABLE, e);
167
168 /* unload the data */
169 usprep_unload(profile);
170
171 if(key->name != NULL) {
172 uprv_free(key->name);
173 key->name=NULL;
174 }
175 if(key->path != NULL) {
176 uprv_free(key->path);
177 key->path=NULL;
178 }
179 uprv_free(profile);
180 uprv_free(key);
181 }
182
183 }
184 umtx_unlock(&usprepMutex);
185
186 return deletedNum;
187}
188
189/* Works just like ucnv_flushCache()
190static int32_t
191usprep_flushCache(){
192 return usprep_internal_flushCache(FALSE);
193}
194*/
195
196static UBool U_CALLCONV usprep_cleanup(void){
197 if (SHARED_DATA_HASHTABLE != NULL) {
198 usprep_internal_flushCache(TRUE);
199 if (SHARED_DATA_HASHTABLE != NULL && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
200 uhash_close(SHARED_DATA_HASHTABLE);
201 SHARED_DATA_HASHTABLE = NULL;
202 }
203 }
204 gSharedDataInitOnce.reset();
205 return (SHARED_DATA_HASHTABLE == NULL);
206}
207U_CDECL_END
208
209
210/** Initializes the cache for resources */
211static void U_CALLCONV
212createCache(UErrorCode &status) {
213 SHARED_DATA_HASHTABLE = uhash_open(hashEntry, compareEntries, NULL, &status);
214 if (U_FAILURE(status)) {
215 SHARED_DATA_HASHTABLE = NULL;
216 }
217 ucln_common_registerCleanup(UCLN_COMMON_USPREP, usprep_cleanup);
218}
219
220static void
221initCache(UErrorCode *status) {
222 umtx_initOnce(gSharedDataInitOnce, &createCache, *status);
223}
224
225static UBool U_CALLCONV
226loadData(UStringPrepProfile* profile,
227 const char* path,
228 const char* name,
229 const char* type,
230 UErrorCode* errorCode) {
231 /* load Unicode SPREP data from file */
232 UTrie _sprepTrie={ 0,0,0,0,0,0,0 };
233 UDataMemory *dataMemory;
234 const int32_t *p=NULL;
235 const uint8_t *pb;
236 UVersionInfo normUnicodeVersion;
237 int32_t normUniVer, sprepUniVer, normCorrVer;
238
239 if(errorCode==NULL || U_FAILURE(*errorCode)) {
240 return 0;
241 }
242
243 /* open the data outside the mutex block */
244 //TODO: change the path
245 dataMemory=udata_openChoice(path, type, name, isSPrepAcceptable, NULL, errorCode);
246 if(U_FAILURE(*errorCode)) {
247 return FALSE;
248 }
249
250 p=(const int32_t *)udata_getMemory(dataMemory);
251 pb=(const uint8_t *)(p+_SPREP_INDEX_TOP);
252 utrie_unserialize(&_sprepTrie, pb, p[_SPREP_INDEX_TRIE_SIZE], errorCode);
253 _sprepTrie.getFoldingOffset=getSPrepFoldingOffset;
254
255
256 if(U_FAILURE(*errorCode)) {
257 udata_close(dataMemory);
258 return FALSE;
259 }
260
261 /* in the mutex block, set the data for this process */
262 umtx_lock(&usprepMutex);
263 if(profile->sprepData==NULL) {
264 profile->sprepData=dataMemory;
265 dataMemory=NULL;
266 uprv_memcpy(&profile->indexes, p, sizeof(profile->indexes));
267 uprv_memcpy(&profile->sprepTrie, &_sprepTrie, sizeof(UTrie));
268 } else {
269 p=(const int32_t *)udata_getMemory(profile->sprepData);
270 }
271 umtx_unlock(&usprepMutex);
272 /* initialize some variables */
273 profile->mappingData=(uint16_t *)((uint8_t *)(p+_SPREP_INDEX_TOP)+profile->indexes[_SPREP_INDEX_TRIE_SIZE]);
274
275 u_getUnicodeVersion(normUnicodeVersion);
276 normUniVer = (normUnicodeVersion[0] << 24) + (normUnicodeVersion[1] << 16) +
277 (normUnicodeVersion[2] << 8 ) + (normUnicodeVersion[3]);
278 sprepUniVer = (dataVersion[0] << 24) + (dataVersion[1] << 16) +
279 (dataVersion[2] << 8 ) + (dataVersion[3]);
280 normCorrVer = profile->indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION];
281
282 if(U_FAILURE(*errorCode)){
283 udata_close(dataMemory);
284 return FALSE;
285 }
286 if( normUniVer < sprepUniVer && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
287 normUniVer < normCorrVer && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
288 ((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0) /* normalization turned on*/
289 ){
290 *errorCode = U_INVALID_FORMAT_ERROR;
291 udata_close(dataMemory);
292 return FALSE;
293 }
294 profile->isDataLoaded = TRUE;
295
296 /* if a different thread set it first, then close the extra data */
297 if(dataMemory!=NULL) {
298 udata_close(dataMemory); /* NULL if it was set correctly */
299 }
300
301
302 return profile->isDataLoaded;
303}
304
305static UStringPrepProfile*
306usprep_getProfile(const char* path,
307 const char* name,
308 UErrorCode *status){
309
310 UStringPrepProfile* profile = NULL;
311
312 initCache(status);
313
314 if(U_FAILURE(*status)){
315 return NULL;
316 }
317
318 UStringPrepKey stackKey;
319 /*
320 * const is cast way to save malloc, strcpy and free calls
321 * we use the passed in pointers for fetching the data from the
322 * hash table which is safe
323 */
324 stackKey.name = (char*) name;
325 stackKey.path = (char*) path;
326
327 /* fetch the data from the cache */
328 umtx_lock(&usprepMutex);
329 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
330 if(profile != NULL) {
331 profile->refCount++;
332 }
333 umtx_unlock(&usprepMutex);
334
335 if(profile == NULL) {
336 /* else load the data and put the data in the cache */
337 LocalMemory<UStringPrepProfile> newProfile;
338 if(newProfile.allocateInsteadAndReset() == NULL) {
339 *status = U_MEMORY_ALLOCATION_ERROR;
340 return NULL;
341 }
342
343 /* load the data */
344 if(!loadData(newProfile.getAlias(), path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
345 return NULL;
346 }
347
348 /* get the options */
349 newProfile->doNFKC = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
350 newProfile->checkBiDi = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
351
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000352 LocalMemory<UStringPrepKey> key;
353 LocalMemory<char> keyName;
354 LocalMemory<char> keyPath;
355 if( key.allocateInsteadAndReset() == NULL ||
Jungshik Shin42d50272018-10-24 01:22:09 -0700356 keyName.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(name)+1)) == NULL ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000357 (path != NULL &&
Jungshik Shin42d50272018-10-24 01:22:09 -0700358 keyPath.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(path)+1)) == NULL)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000359 ) {
360 *status = U_MEMORY_ALLOCATION_ERROR;
361 usprep_unload(newProfile.getAlias());
362 return NULL;
363 }
364
365 umtx_lock(&usprepMutex);
366 // If another thread already inserted the same key/value, refcount and cleanup our thread data
367 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
368 if(profile != NULL) {
369 profile->refCount++;
370 usprep_unload(newProfile.getAlias());
371 }
372 else {
373 /* initialize the key members */
374 key->name = keyName.orphan();
375 uprv_strcpy(key->name, name);
376 if(path != NULL){
377 key->path = keyPath.orphan();
378 uprv_strcpy(key->path, path);
379 }
380 profile = newProfile.orphan();
381
382 /* add the data object to the cache */
383 profile->refCount = 1;
384 uhash_put(SHARED_DATA_HASHTABLE, key.orphan(), profile, status);
385 }
386 umtx_unlock(&usprepMutex);
387 }
388
389 return profile;
390}
391
392U_CAPI UStringPrepProfile* U_EXPORT2
393usprep_open(const char* path,
394 const char* name,
395 UErrorCode* status){
396
397 if(status == NULL || U_FAILURE(*status)){
398 return NULL;
399 }
400
401 /* initialize the profile struct members */
402 return usprep_getProfile(path,name,status);
403}
404
405U_CAPI UStringPrepProfile* U_EXPORT2
406usprep_openByType(UStringPrepProfileType type,
407 UErrorCode* status) {
408 if(status == NULL || U_FAILURE(*status)){
409 return NULL;
410 }
411 int32_t index = (int32_t)type;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700412 if (index < 0 || index >= UPRV_LENGTHOF(PROFILE_NAMES)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000413 *status = U_ILLEGAL_ARGUMENT_ERROR;
414 return NULL;
415 }
416 return usprep_open(NULL, PROFILE_NAMES[index], status);
417}
418
419U_CAPI void U_EXPORT2
420usprep_close(UStringPrepProfile* profile){
421 if(profile==NULL){
422 return;
423 }
424
425 umtx_lock(&usprepMutex);
426 /* decrement the ref count*/
427 if(profile->refCount > 0){
428 profile->refCount--;
429 }
430 umtx_unlock(&usprepMutex);
431
432}
433
434U_CFUNC void
435uprv_syntaxError(const UChar* rules,
436 int32_t pos,
437 int32_t rulesLen,
438 UParseError* parseError){
439 if(parseError == NULL){
440 return;
441 }
442 parseError->offset = pos;
443 parseError->line = 0 ; // we are not using line numbers
444
445 // for pre-context
446 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
447 int32_t limit = pos;
448
449 u_memcpy(parseError->preContext,rules+start,limit-start);
450 //null terminate the buffer
451 parseError->preContext[limit-start] = 0;
452
453 // for post-context; include error rules[pos]
454 start = pos;
455 limit = start + (U_PARSE_CONTEXT_LEN-1);
456 if (limit > rulesLen) {
457 limit = rulesLen;
458 }
459 if (start < rulesLen) {
460 u_memcpy(parseError->postContext,rules+start,limit-start);
461 }
462 //null terminate the buffer
463 parseError->postContext[limit-start]= 0;
464}
465
466
467static inline UStringPrepType
468getValues(uint16_t trieWord, int16_t& value, UBool& isIndex){
469
470 UStringPrepType type;
471 if(trieWord == 0){
472 /*
473 * Initial value stored in the mapping table
474 * just return USPREP_TYPE_LIMIT .. so that
475 * the source codepoint is copied to the destination
476 */
477 type = USPREP_TYPE_LIMIT;
478 isIndex =FALSE;
479 value = 0;
480 }else if(trieWord >= _SPREP_TYPE_THRESHOLD){
481 type = (UStringPrepType) (trieWord - _SPREP_TYPE_THRESHOLD);
482 isIndex =FALSE;
483 value = 0;
484 }else{
485 /* get the type */
486 type = USPREP_MAP;
487 /* ascertain if the value is index or delta */
488 if(trieWord & 0x02){
489 isIndex = TRUE;
490 value = trieWord >> 2; //mask off the lower 2 bits and shift
491 }else{
492 isIndex = FALSE;
493 value = (int16_t)trieWord;
494 value = (value >> 2);
495 }
496
497 if((trieWord>>2) == _SPREP_MAX_INDEX_VALUE){
498 type = USPREP_DELETE;
499 isIndex =FALSE;
500 value = 0;
501 }
502 }
503 return type;
504}
505
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800506// TODO: change to writing to UnicodeString not UChar *
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000507static int32_t
508usprep_map( const UStringPrepProfile* profile,
509 const UChar* src, int32_t srcLength,
510 UChar* dest, int32_t destCapacity,
511 int32_t options,
512 UParseError* parseError,
513 UErrorCode* status ){
514
515 uint16_t result;
516 int32_t destIndex=0;
517 int32_t srcIndex;
518 UBool allowUnassigned = (UBool) ((options & USPREP_ALLOW_UNASSIGNED)>0);
519 UStringPrepType type;
520 int16_t value;
521 UBool isIndex;
522 const int32_t* indexes = profile->indexes;
523
524 // no error checking the caller check for error and arguments
525 // no string length check the caller finds out the string length
526
527 for(srcIndex=0;srcIndex<srcLength;){
528 UChar32 ch;
529
530 U16_NEXT(src,srcIndex,srcLength,ch);
531
532 result=0;
533
534 UTRIE_GET16(&profile->sprepTrie,ch,result);
535
536 type = getValues(result, value, isIndex);
537
538 // check if the source codepoint is unassigned
539 if(type == USPREP_UNASSIGNED && allowUnassigned == FALSE){
540
541 uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
542 *status = U_STRINGPREP_UNASSIGNED_ERROR;
543 return 0;
544
545 }else if(type == USPREP_MAP){
546
547 int32_t index, length;
548
549 if(isIndex){
550 index = value;
551 if(index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
552 index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
553 length = 1;
554 }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
555 index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
556 length = 2;
557 }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
558 index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
559 length = 3;
560 }else{
561 length = profile->mappingData[index++];
562
563 }
564
565 /* copy mapping to destination */
566 for(int32_t i=0; i< length; i++){
567 if(destIndex < destCapacity ){
568 dest[destIndex] = profile->mappingData[index+i];
569 }
570 destIndex++; /* for pre-flighting */
571 }
572 continue;
573 }else{
574 // subtract the delta to arrive at the code point
575 ch -= value;
576 }
577
578 }else if(type==USPREP_DELETE){
579 // just consume the codepoint and contine
580 continue;
581 }
582 //copy the code point into destination
583 if(ch <= 0xFFFF){
584 if(destIndex < destCapacity ){
585 dest[destIndex] = (UChar)ch;
586 }
587 destIndex++;
588 }else{
589 if(destIndex+1 < destCapacity ){
590 dest[destIndex] = U16_LEAD(ch);
591 dest[destIndex+1] = U16_TRAIL(ch);
592 }
593 destIndex +=2;
594 }
595
596 }
597
598 return u_terminateUChars(dest, destCapacity, destIndex, status);
599}
600
/*
   1) Map -- For each character in the input, check if it has a mapping
      and, if so, replace it with its mapping.

   2) Normalize -- Possibly normalize the result of step 1 using Unicode
      normalization.

   3) Prohibit -- Check for any characters that are not allowed in the
      output.  If any are found, return an error.

   4) Check bidi -- Possibly check for right-to-left characters, and if
      any are found, make sure that the whole string satisfies the
      requirements for bidirectional strings.  If the string does not
      satisfy the requirements for bidirectional strings, return an
      error.
      [Unicode3.2] defines several bidirectional categories; each character
      has one bidirectional category assigned to it.  For the purposes of
      the requirements below, an "RandALCat character" is a character that
      has Unicode bidirectional categories "R" or "AL"; an "LCat character"
      is a character that has Unicode bidirectional category "L".  Note
      that there are many characters which fall in neither of the above
      definitions; Latin digits (<U+0030> through <U+0039>) are examples of
      this because they have bidirectional category "EN".

      In any profile that specifies bidirectional character handling, all
      three of the following requirements MUST be met:

      1) The characters in section 5.8 MUST be prohibited.

      2) If a string contains any RandALCat character, the string MUST NOT
         contain any LCat character.

      3) If a string contains any RandALCat character, a RandALCat
         character MUST be the first character of the string, and a
         RandALCat character MUST be the last character of the string.
*/
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000639U_CAPI int32_t U_EXPORT2
640usprep_prepare( const UStringPrepProfile* profile,
641 const UChar* src, int32_t srcLength,
642 UChar* dest, int32_t destCapacity,
643 int32_t options,
644 UParseError* parseError,
645 UErrorCode* status ){
646
647 // check error status
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800648 if(U_FAILURE(*status)){
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000649 return 0;
650 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800651
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000652 //check arguments
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800653 if(profile==NULL ||
654 (src==NULL ? srcLength!=0 : srcLength<-1) ||
655 (dest==NULL ? destCapacity!=0 : destCapacity<0)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000656 *status=U_ILLEGAL_ARGUMENT_ERROR;
657 return 0;
658 }
659
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000660 //get the string length
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800661 if(srcLength < 0){
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000662 srcLength = u_strlen(src);
663 }
664 // map
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800665 UnicodeString s1;
666 UChar *b1 = s1.getBuffer(srcLength);
667 if(b1==NULL){
668 *status = U_MEMORY_ALLOCATION_ERROR;
669 return 0;
670 }
671 int32_t b1Len = usprep_map(profile, src, srcLength,
672 b1, s1.getCapacity(), options, parseError, status);
673 s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000674
675 if(*status == U_BUFFER_OVERFLOW_ERROR){
676 // redo processing of string
677 /* we do not have enough room so grow the buffer*/
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800678 b1 = s1.getBuffer(b1Len);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000679 if(b1==NULL){
680 *status = U_MEMORY_ALLOCATION_ERROR;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800681 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000682 }
683
684 *status = U_ZERO_ERROR; // reset error
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800685 b1Len = usprep_map(profile, src, srcLength,
686 b1, s1.getCapacity(), options, parseError, status);
687 s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
688 }
689 if(U_FAILURE(*status)){
690 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000691 }
692
693 // normalize
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800694 UnicodeString s2;
695 if(profile->doNFKC){
696 const Normalizer2 *n2 = Normalizer2::getNFKCInstance(*status);
697 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*status));
698 if(U_FAILURE(*status)){
699 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000700 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800701 fn2.normalize(s1, s2, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000702 }else{
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800703 s2.fastCopyFrom(s1);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000704 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000705 if(U_FAILURE(*status)){
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800706 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000707 }
708
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000709 // Prohibit and checkBiDi in one pass
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800710 const UChar *b2 = s2.getBuffer();
711 int32_t b2Len = s2.length();
712 UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
713 UBool leftToRight=FALSE, rightToLeft=FALSE;
714 int32_t rtlPos =-1, ltrPos =-1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000715
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800716 for(int32_t b2Index=0; b2Index<b2Len;){
717 UChar32 ch = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000718 U16_NEXT(b2, b2Index, b2Len, ch);
719
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800720 uint16_t result;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000721 UTRIE_GET16(&profile->sprepTrie,ch,result);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800722
723 int16_t value;
724 UBool isIndex;
725 UStringPrepType type = getValues(result, value, isIndex);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000726
727 if( type == USPREP_PROHIBITED ||
728 ((result < _SPREP_TYPE_THRESHOLD) && (result & 0x01) /* first bit says it the code point is prohibited*/)
729 ){
730 *status = U_STRINGPREP_PROHIBITED_ERROR;
Jungshik Shin42d50272018-10-24 01:22:09 -0700731 uprv_syntaxError(b2, b2Index-U16_LENGTH(ch), b2Len, parseError);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800732 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000733 }
734
735 if(profile->checkBiDi) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700736 direction = ubidi_getClass(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000737 if(firstCharDir == U_CHAR_DIRECTION_COUNT){
738 firstCharDir = direction;
739 }
740 if(direction == U_LEFT_TO_RIGHT){
741 leftToRight = TRUE;
742 ltrPos = b2Index-1;
743 }
744 if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
745 rightToLeft = TRUE;
746 rtlPos = b2Index-1;
747 }
748 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800749 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000750 if(profile->checkBiDi == TRUE){
751 // satisfy 2
752 if( leftToRight == TRUE && rightToLeft == TRUE){
753 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
754 uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800755 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000756 }
757
758 //satisfy 3
759 if( rightToLeft == TRUE &&
760 !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
761 (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
762 ){
763 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
764 uprv_syntaxError(b2, rtlPos, b2Len, parseError);
765 return FALSE;
766 }
767 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800768 return s2.extract(dest, destCapacity, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000769}
770
771
772/* data swapping ------------------------------------------------------------ */
773
774U_CAPI int32_t U_EXPORT2
775usprep_swap(const UDataSwapper *ds,
776 const void *inData, int32_t length, void *outData,
777 UErrorCode *pErrorCode) {
778 const UDataInfo *pInfo;
779 int32_t headerSize;
780
781 const uint8_t *inBytes;
782 uint8_t *outBytes;
783
784 const int32_t *inIndexes;
785 int32_t indexes[16];
786
787 int32_t i, offset, count, size;
788
789 /* udata_swapDataHeader checks the arguments */
790 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
791 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
792 return 0;
793 }
794
795 /* check data format and format version */
796 pInfo=(const UDataInfo *)((const char *)inData+4);
797 if(!(
798 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
799 pInfo->dataFormat[1]==0x50 &&
800 pInfo->dataFormat[2]==0x52 &&
801 pInfo->dataFormat[3]==0x50 &&
802 pInfo->formatVersion[0]==3
803 )) {
804 udata_printError(ds, "usprep_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as StringPrep .spp data\n",
805 pInfo->dataFormat[0], pInfo->dataFormat[1],
806 pInfo->dataFormat[2], pInfo->dataFormat[3],
807 pInfo->formatVersion[0]);
808 *pErrorCode=U_UNSUPPORTED_ERROR;
809 return 0;
810 }
811
812 inBytes=(const uint8_t *)inData+headerSize;
813 outBytes=(uint8_t *)outData+headerSize;
814
815 inIndexes=(const int32_t *)inBytes;
816
817 if(length>=0) {
818 length-=headerSize;
819 if(length<16*4) {
820 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for StringPrep .spp data\n",
821 length);
822 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
823 return 0;
824 }
825 }
826
827 /* read the first 16 indexes (ICU 2.8/format version 3: _SPREP_INDEX_TOP==16, might grow) */
828 for(i=0; i<16; ++i) {
829 indexes[i]=udata_readInt32(ds, inIndexes[i]);
830 }
831
832 /* calculate the total length of the data */
833 size=
834 16*4+ /* size of indexes[] */
835 indexes[_SPREP_INDEX_TRIE_SIZE]+
836 indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
837
838 if(length>=0) {
839 if(length<size) {
840 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for all of StringPrep .spp data\n",
841 length);
842 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
843 return 0;
844 }
845
846 /* copy the data for inaccessible bytes */
847 if(inBytes!=outBytes) {
848 uprv_memcpy(outBytes, inBytes, size);
849 }
850
851 offset=0;
852
853 /* swap the int32_t indexes[] */
854 count=16*4;
855 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
856 offset+=count;
857
858 /* swap the UTrie */
859 count=indexes[_SPREP_INDEX_TRIE_SIZE];
860 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
861 offset+=count;
862
863 /* swap the uint16_t mappingTable[] */
864 count=indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
865 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800866 //offset+=count;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000867 }
868
869 return headerSize+size;
870}
871
872#endif /* #if !UCONFIG_NO_IDNA */