blob: d8325133f76283128dfe96952ecdf4ea3b25a181 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4 *******************************************************************************
5 *
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07006 * Copyright (C) 2003-2016, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: usprep.cpp
Jungshik Shin87232d82017-05-13 21:10:13 -070011 * encoding: UTF-8
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000012 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003jul2
16 * created by: Ram Viswanadha
17 */
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_IDNA
22
23#include "unicode/usprep.h"
24
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080025#include "unicode/normalizer2.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000026#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/uversion.h"
29#include "umutex.h"
30#include "cmemory.h"
31#include "sprpimpl.h"
32#include "ustr_imp.h"
33#include "uhash.h"
34#include "cstring.h"
35#include "udataswp.h"
36#include "ucln_cmn.h"
37#include "ubidi_props.h"
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080038#include "uprops.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000039
40U_NAMESPACE_USE
41
42U_CDECL_BEGIN
43
44/*
45Static cache for already opened StringPrep profiles
46*/
47static UHashtable *SHARED_DATA_HASHTABLE = NULL;
48static icu::UInitOnce gSharedDataInitOnce;
49
Frank Tang69c72a62019-04-03 21:41:21 -070050static UMutex *usprepMutex() {
51 static UMutex *m = new UMutex();
52 return m;
53}
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000054
55/* format version of spp file */
56//static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
57
58/* the Unicode version of the sprep data */
59static UVersionInfo dataVersion={ 0, 0, 0, 0 };
60
/*
 * Data-file base names, one per UStringPrepProfileType value.
 * usprep_openByType() indexes this table directly with the enum value, so the
 * entries must stay in the same order as the enum constants named on the right.
 */
static const char * const PROFILE_NAMES[] = {
    "rfc3491",      /* USPREP_RFC3491_NAMEPREP */
    "rfc3530cs",    /* USPREP_RFC3530_NFS4_CS_PREP */
    "rfc3530csci",  /* USPREP_RFC3530_NFS4_CS_PREP_CI */
    "rfc3491",      /* USPREP_RFC3530_NFS4_CIS_PREP */
    "rfc3530mixp",  /* USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX */
    "rfc3491",      /* USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX */
    "rfc3722",      /* USPREP_RFC3722_ISCSI */
    "rfc3920node",  /* USPREP_RFC3920_NODEPREP */
    "rfc3920res",   /* USPREP_RFC3920_RESOURCEPREP */
    "rfc4011",      /* USPREP_RFC4011_MIB */
    "rfc4013",      /* USPREP_RFC4013_SASLPREP */
    "rfc4505",      /* USPREP_RFC4505_TRACE */
    "rfc4518",      /* USPREP_RFC4518_LDAP */
    "rfc4518ci",    /* USPREP_RFC4518_LDAP_CI */
};
78
79static UBool U_CALLCONV
80isSPrepAcceptable(void * /* context */,
81 const char * /* type */,
82 const char * /* name */,
83 const UDataInfo *pInfo) {
84 if(
85 pInfo->size>=20 &&
86 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
87 pInfo->charsetFamily==U_CHARSET_FAMILY &&
88 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
89 pInfo->dataFormat[1]==0x50 &&
90 pInfo->dataFormat[2]==0x52 &&
91 pInfo->dataFormat[3]==0x50 &&
92 pInfo->formatVersion[0]==3 &&
93 pInfo->formatVersion[2]==UTRIE_SHIFT &&
94 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
95 ) {
96 //uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
97 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
98 return TRUE;
99 } else {
100 return FALSE;
101 }
102}
103
104static int32_t U_CALLCONV
105getSPrepFoldingOffset(uint32_t data) {
106
107 return (int32_t)data;
108
109}
110
111/* hashes an entry */
112static int32_t U_CALLCONV
113hashEntry(const UHashTok parm) {
114 UStringPrepKey *b = (UStringPrepKey *)parm.pointer;
115 UHashTok namekey, pathkey;
116 namekey.pointer = b->name;
117 pathkey.pointer = b->path;
Jungshik Shin42d50272018-10-24 01:22:09 -0700118 uint32_t unsignedHash = static_cast<uint32_t>(uhash_hashChars(namekey)) +
119 37u * static_cast<uint32_t>(uhash_hashChars(pathkey));
120 return static_cast<int32_t>(unsignedHash);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000121}
122
123/* compares two entries */
124static UBool U_CALLCONV
125compareEntries(const UHashTok p1, const UHashTok p2) {
126 UStringPrepKey *b1 = (UStringPrepKey *)p1.pointer;
127 UStringPrepKey *b2 = (UStringPrepKey *)p2.pointer;
128 UHashTok name1, name2, path1, path2;
129 name1.pointer = b1->name;
130 name2.pointer = b2->name;
131 path1.pointer = b1->path;
132 path2.pointer = b2->path;
133 return ((UBool)(uhash_compareChars(name1, name2) &
134 uhash_compareChars(path1, path2)));
135}
136
137static void
138usprep_unload(UStringPrepProfile* data){
139 udata_close(data->sprepData);
140}
141
142static int32_t
143usprep_internal_flushCache(UBool noRefCount){
144 UStringPrepProfile *profile = NULL;
145 UStringPrepKey *key = NULL;
Jungshik Shin70f82502016-01-29 00:32:36 -0800146 int32_t pos = UHASH_FIRST;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000147 int32_t deletedNum = 0;
148 const UHashElement *e;
149
150 /*
151 * if shared data hasn't even been lazy evaluated yet
152 * return 0
153 */
Frank Tang69c72a62019-04-03 21:41:21 -0700154 umtx_lock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000155 if (SHARED_DATA_HASHTABLE == NULL) {
Frank Tang69c72a62019-04-03 21:41:21 -0700156 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000157 return 0;
158 }
159
160 /*creates an enumeration to iterate through every element in the table */
161 while ((e = uhash_nextElement(SHARED_DATA_HASHTABLE, &pos)) != NULL)
162 {
163 profile = (UStringPrepProfile *) e->value.pointer;
164 key = (UStringPrepKey *) e->key.pointer;
165
166 if ((noRefCount== FALSE && profile->refCount == 0) ||
167 noRefCount== TRUE) {
168 deletedNum++;
169 uhash_removeElement(SHARED_DATA_HASHTABLE, e);
170
171 /* unload the data */
172 usprep_unload(profile);
173
174 if(key->name != NULL) {
175 uprv_free(key->name);
176 key->name=NULL;
177 }
178 if(key->path != NULL) {
179 uprv_free(key->path);
180 key->path=NULL;
181 }
182 uprv_free(profile);
183 uprv_free(key);
184 }
185
186 }
Frank Tang69c72a62019-04-03 21:41:21 -0700187 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000188
189 return deletedNum;
190}
191
192/* Works just like ucnv_flushCache()
193static int32_t
194usprep_flushCache(){
195 return usprep_internal_flushCache(FALSE);
196}
197*/
198
199static UBool U_CALLCONV usprep_cleanup(void){
200 if (SHARED_DATA_HASHTABLE != NULL) {
201 usprep_internal_flushCache(TRUE);
202 if (SHARED_DATA_HASHTABLE != NULL && uhash_count(SHARED_DATA_HASHTABLE) == 0) {
203 uhash_close(SHARED_DATA_HASHTABLE);
204 SHARED_DATA_HASHTABLE = NULL;
205 }
206 }
207 gSharedDataInitOnce.reset();
208 return (SHARED_DATA_HASHTABLE == NULL);
209}
210U_CDECL_END
211
212
213/** Initializes the cache for resources */
214static void U_CALLCONV
215createCache(UErrorCode &status) {
216 SHARED_DATA_HASHTABLE = uhash_open(hashEntry, compareEntries, NULL, &status);
217 if (U_FAILURE(status)) {
218 SHARED_DATA_HASHTABLE = NULL;
219 }
220 ucln_common_registerCleanup(UCLN_COMMON_USPREP, usprep_cleanup);
221}
222
223static void
224initCache(UErrorCode *status) {
225 umtx_initOnce(gSharedDataInitOnce, &createCache, *status);
226}
227
228static UBool U_CALLCONV
229loadData(UStringPrepProfile* profile,
230 const char* path,
231 const char* name,
232 const char* type,
233 UErrorCode* errorCode) {
234 /* load Unicode SPREP data from file */
235 UTrie _sprepTrie={ 0,0,0,0,0,0,0 };
236 UDataMemory *dataMemory;
237 const int32_t *p=NULL;
238 const uint8_t *pb;
239 UVersionInfo normUnicodeVersion;
240 int32_t normUniVer, sprepUniVer, normCorrVer;
241
242 if(errorCode==NULL || U_FAILURE(*errorCode)) {
243 return 0;
244 }
245
246 /* open the data outside the mutex block */
247 //TODO: change the path
248 dataMemory=udata_openChoice(path, type, name, isSPrepAcceptable, NULL, errorCode);
249 if(U_FAILURE(*errorCode)) {
250 return FALSE;
251 }
252
253 p=(const int32_t *)udata_getMemory(dataMemory);
254 pb=(const uint8_t *)(p+_SPREP_INDEX_TOP);
255 utrie_unserialize(&_sprepTrie, pb, p[_SPREP_INDEX_TRIE_SIZE], errorCode);
256 _sprepTrie.getFoldingOffset=getSPrepFoldingOffset;
257
258
259 if(U_FAILURE(*errorCode)) {
260 udata_close(dataMemory);
261 return FALSE;
262 }
263
264 /* in the mutex block, set the data for this process */
Frank Tang69c72a62019-04-03 21:41:21 -0700265 umtx_lock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000266 if(profile->sprepData==NULL) {
267 profile->sprepData=dataMemory;
268 dataMemory=NULL;
269 uprv_memcpy(&profile->indexes, p, sizeof(profile->indexes));
270 uprv_memcpy(&profile->sprepTrie, &_sprepTrie, sizeof(UTrie));
271 } else {
272 p=(const int32_t *)udata_getMemory(profile->sprepData);
273 }
Frank Tang69c72a62019-04-03 21:41:21 -0700274 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000275 /* initialize some variables */
276 profile->mappingData=(uint16_t *)((uint8_t *)(p+_SPREP_INDEX_TOP)+profile->indexes[_SPREP_INDEX_TRIE_SIZE]);
277
278 u_getUnicodeVersion(normUnicodeVersion);
279 normUniVer = (normUnicodeVersion[0] << 24) + (normUnicodeVersion[1] << 16) +
280 (normUnicodeVersion[2] << 8 ) + (normUnicodeVersion[3]);
281 sprepUniVer = (dataVersion[0] << 24) + (dataVersion[1] << 16) +
282 (dataVersion[2] << 8 ) + (dataVersion[3]);
283 normCorrVer = profile->indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION];
284
285 if(U_FAILURE(*errorCode)){
286 udata_close(dataMemory);
287 return FALSE;
288 }
289 if( normUniVer < sprepUniVer && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
290 normUniVer < normCorrVer && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
291 ((profile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0) /* normalization turned on*/
292 ){
293 *errorCode = U_INVALID_FORMAT_ERROR;
294 udata_close(dataMemory);
295 return FALSE;
296 }
297 profile->isDataLoaded = TRUE;
298
299 /* if a different thread set it first, then close the extra data */
300 if(dataMemory!=NULL) {
301 udata_close(dataMemory); /* NULL if it was set correctly */
302 }
303
304
305 return profile->isDataLoaded;
306}
307
308static UStringPrepProfile*
309usprep_getProfile(const char* path,
310 const char* name,
311 UErrorCode *status){
312
313 UStringPrepProfile* profile = NULL;
314
315 initCache(status);
316
317 if(U_FAILURE(*status)){
318 return NULL;
319 }
320
321 UStringPrepKey stackKey;
322 /*
323 * const is cast way to save malloc, strcpy and free calls
324 * we use the passed in pointers for fetching the data from the
325 * hash table which is safe
326 */
327 stackKey.name = (char*) name;
328 stackKey.path = (char*) path;
329
330 /* fetch the data from the cache */
Frank Tang69c72a62019-04-03 21:41:21 -0700331 umtx_lock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000332 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
333 if(profile != NULL) {
334 profile->refCount++;
335 }
Frank Tang69c72a62019-04-03 21:41:21 -0700336 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000337
338 if(profile == NULL) {
339 /* else load the data and put the data in the cache */
340 LocalMemory<UStringPrepProfile> newProfile;
341 if(newProfile.allocateInsteadAndReset() == NULL) {
342 *status = U_MEMORY_ALLOCATION_ERROR;
343 return NULL;
344 }
345
346 /* load the data */
347 if(!loadData(newProfile.getAlias(), path, name, _SPREP_DATA_TYPE, status) || U_FAILURE(*status) ){
348 return NULL;
349 }
350
351 /* get the options */
352 newProfile->doNFKC = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_NORMALIZATION_ON) > 0);
353 newProfile->checkBiDi = (UBool)((newProfile->indexes[_SPREP_OPTIONS] & _SPREP_CHECK_BIDI_ON) > 0);
354
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000355 LocalMemory<UStringPrepKey> key;
356 LocalMemory<char> keyName;
357 LocalMemory<char> keyPath;
358 if( key.allocateInsteadAndReset() == NULL ||
Jungshik Shin42d50272018-10-24 01:22:09 -0700359 keyName.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(name)+1)) == NULL ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000360 (path != NULL &&
Jungshik Shin42d50272018-10-24 01:22:09 -0700361 keyPath.allocateInsteadAndCopy(static_cast<int32_t>(uprv_strlen(path)+1)) == NULL)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000362 ) {
363 *status = U_MEMORY_ALLOCATION_ERROR;
364 usprep_unload(newProfile.getAlias());
365 return NULL;
366 }
367
Frank Tang69c72a62019-04-03 21:41:21 -0700368 umtx_lock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000369 // If another thread already inserted the same key/value, refcount and cleanup our thread data
370 profile = (UStringPrepProfile*) (uhash_get(SHARED_DATA_HASHTABLE,&stackKey));
371 if(profile != NULL) {
372 profile->refCount++;
373 usprep_unload(newProfile.getAlias());
374 }
375 else {
376 /* initialize the key members */
377 key->name = keyName.orphan();
378 uprv_strcpy(key->name, name);
379 if(path != NULL){
380 key->path = keyPath.orphan();
381 uprv_strcpy(key->path, path);
382 }
383 profile = newProfile.orphan();
384
385 /* add the data object to the cache */
386 profile->refCount = 1;
387 uhash_put(SHARED_DATA_HASHTABLE, key.orphan(), profile, status);
388 }
Frank Tang69c72a62019-04-03 21:41:21 -0700389 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000390 }
391
392 return profile;
393}
394
395U_CAPI UStringPrepProfile* U_EXPORT2
396usprep_open(const char* path,
397 const char* name,
398 UErrorCode* status){
399
400 if(status == NULL || U_FAILURE(*status)){
401 return NULL;
402 }
403
404 /* initialize the profile struct members */
405 return usprep_getProfile(path,name,status);
406}
407
408U_CAPI UStringPrepProfile* U_EXPORT2
409usprep_openByType(UStringPrepProfileType type,
410 UErrorCode* status) {
411 if(status == NULL || U_FAILURE(*status)){
412 return NULL;
413 }
414 int32_t index = (int32_t)type;
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700415 if (index < 0 || index >= UPRV_LENGTHOF(PROFILE_NAMES)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000416 *status = U_ILLEGAL_ARGUMENT_ERROR;
417 return NULL;
418 }
419 return usprep_open(NULL, PROFILE_NAMES[index], status);
420}
421
422U_CAPI void U_EXPORT2
423usprep_close(UStringPrepProfile* profile){
424 if(profile==NULL){
425 return;
426 }
427
Frank Tang69c72a62019-04-03 21:41:21 -0700428 umtx_lock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000429 /* decrement the ref count*/
430 if(profile->refCount > 0){
431 profile->refCount--;
432 }
Frank Tang69c72a62019-04-03 21:41:21 -0700433 umtx_unlock(usprepMutex());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000434
435}
436
437U_CFUNC void
438uprv_syntaxError(const UChar* rules,
439 int32_t pos,
440 int32_t rulesLen,
441 UParseError* parseError){
442 if(parseError == NULL){
443 return;
444 }
445 parseError->offset = pos;
446 parseError->line = 0 ; // we are not using line numbers
447
448 // for pre-context
449 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
450 int32_t limit = pos;
451
452 u_memcpy(parseError->preContext,rules+start,limit-start);
453 //null terminate the buffer
454 parseError->preContext[limit-start] = 0;
455
456 // for post-context; include error rules[pos]
457 start = pos;
458 limit = start + (U_PARSE_CONTEXT_LEN-1);
459 if (limit > rulesLen) {
460 limit = rulesLen;
461 }
462 if (start < rulesLen) {
463 u_memcpy(parseError->postContext,rules+start,limit-start);
464 }
465 //null terminate the buffer
466 parseError->postContext[limit-start]= 0;
467}
468
469
470static inline UStringPrepType
471getValues(uint16_t trieWord, int16_t& value, UBool& isIndex){
472
473 UStringPrepType type;
474 if(trieWord == 0){
475 /*
476 * Initial value stored in the mapping table
477 * just return USPREP_TYPE_LIMIT .. so that
478 * the source codepoint is copied to the destination
479 */
480 type = USPREP_TYPE_LIMIT;
481 isIndex =FALSE;
482 value = 0;
483 }else if(trieWord >= _SPREP_TYPE_THRESHOLD){
484 type = (UStringPrepType) (trieWord - _SPREP_TYPE_THRESHOLD);
485 isIndex =FALSE;
486 value = 0;
487 }else{
488 /* get the type */
489 type = USPREP_MAP;
490 /* ascertain if the value is index or delta */
491 if(trieWord & 0x02){
492 isIndex = TRUE;
493 value = trieWord >> 2; //mask off the lower 2 bits and shift
494 }else{
495 isIndex = FALSE;
496 value = (int16_t)trieWord;
497 value = (value >> 2);
498 }
499
500 if((trieWord>>2) == _SPREP_MAX_INDEX_VALUE){
501 type = USPREP_DELETE;
502 isIndex =FALSE;
503 value = 0;
504 }
505 }
506 return type;
507}
508
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800509// TODO: change to writing to UnicodeString not UChar *
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000510static int32_t
511usprep_map( const UStringPrepProfile* profile,
512 const UChar* src, int32_t srcLength,
513 UChar* dest, int32_t destCapacity,
514 int32_t options,
515 UParseError* parseError,
516 UErrorCode* status ){
517
518 uint16_t result;
519 int32_t destIndex=0;
520 int32_t srcIndex;
521 UBool allowUnassigned = (UBool) ((options & USPREP_ALLOW_UNASSIGNED)>0);
522 UStringPrepType type;
523 int16_t value;
524 UBool isIndex;
525 const int32_t* indexes = profile->indexes;
526
527 // no error checking the caller check for error and arguments
528 // no string length check the caller finds out the string length
529
530 for(srcIndex=0;srcIndex<srcLength;){
531 UChar32 ch;
532
533 U16_NEXT(src,srcIndex,srcLength,ch);
534
535 result=0;
536
537 UTRIE_GET16(&profile->sprepTrie,ch,result);
538
539 type = getValues(result, value, isIndex);
540
541 // check if the source codepoint is unassigned
542 if(type == USPREP_UNASSIGNED && allowUnassigned == FALSE){
543
544 uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
545 *status = U_STRINGPREP_UNASSIGNED_ERROR;
546 return 0;
547
548 }else if(type == USPREP_MAP){
549
550 int32_t index, length;
551
552 if(isIndex){
553 index = value;
554 if(index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
555 index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
556 length = 1;
557 }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
558 index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
559 length = 2;
560 }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
561 index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
562 length = 3;
563 }else{
564 length = profile->mappingData[index++];
565
566 }
567
568 /* copy mapping to destination */
569 for(int32_t i=0; i< length; i++){
570 if(destIndex < destCapacity ){
571 dest[destIndex] = profile->mappingData[index+i];
572 }
573 destIndex++; /* for pre-flighting */
574 }
575 continue;
576 }else{
577 // subtract the delta to arrive at the code point
578 ch -= value;
579 }
580
581 }else if(type==USPREP_DELETE){
582 // just consume the codepoint and contine
583 continue;
584 }
585 //copy the code point into destination
586 if(ch <= 0xFFFF){
587 if(destIndex < destCapacity ){
588 dest[destIndex] = (UChar)ch;
589 }
590 destIndex++;
591 }else{
592 if(destIndex+1 < destCapacity ){
593 dest[destIndex] = U16_LEAD(ch);
594 dest[destIndex+1] = U16_TRAIL(ch);
595 }
596 destIndex +=2;
597 }
598
599 }
600
601 return u_terminateUChars(dest, destCapacity, destIndex, status);
602}
603
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800604/*
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000605 1) Map -- For each character in the input, check if it has a mapping
606 and, if so, replace it with its mapping.
607
608 2) Normalize -- Possibly normalize the result of step 1 using Unicode
609 normalization.
610
611 3) Prohibit -- Check for any characters that are not allowed in the
612 output. If any are found, return an error.
613
614 4) Check bidi -- Possibly check for right-to-left characters, and if
615 any are found, make sure that the whole string satisfies the
616 requirements for bidirectional strings. If the string does not
617 satisfy the requirements for bidirectional strings, return an
618 error.
619 [Unicode3.2] defines several bidirectional categories; each character
620 has one bidirectional category assigned to it. For the purposes of
621 the requirements below, an "RandALCat character" is a character that
622 has Unicode bidirectional categories "R" or "AL"; an "LCat character"
623 is a character that has Unicode bidirectional category "L". Note
624
625
626 that there are many characters which fall in neither of the above
627 definitions; Latin digits (<U+0030> through <U+0039>) are examples of
628 this because they have bidirectional category "EN".
629
630 In any profile that specifies bidirectional character handling, all
631 three of the following requirements MUST be met:
632
633 1) The characters in section 5.8 MUST be prohibited.
634
635 2) If a string contains any RandALCat character, the string MUST NOT
636 contain any LCat character.
637
638 3) If a string contains any RandALCat character, a RandALCat
639 character MUST be the first character of the string, and a
640 RandALCat character MUST be the last character of the string.
641*/
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000642U_CAPI int32_t U_EXPORT2
643usprep_prepare( const UStringPrepProfile* profile,
644 const UChar* src, int32_t srcLength,
645 UChar* dest, int32_t destCapacity,
646 int32_t options,
647 UParseError* parseError,
648 UErrorCode* status ){
649
650 // check error status
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800651 if(U_FAILURE(*status)){
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000652 return 0;
653 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800654
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000655 //check arguments
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800656 if(profile==NULL ||
657 (src==NULL ? srcLength!=0 : srcLength<-1) ||
658 (dest==NULL ? destCapacity!=0 : destCapacity<0)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000659 *status=U_ILLEGAL_ARGUMENT_ERROR;
660 return 0;
661 }
662
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000663 //get the string length
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800664 if(srcLength < 0){
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000665 srcLength = u_strlen(src);
666 }
667 // map
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800668 UnicodeString s1;
669 UChar *b1 = s1.getBuffer(srcLength);
670 if(b1==NULL){
671 *status = U_MEMORY_ALLOCATION_ERROR;
672 return 0;
673 }
674 int32_t b1Len = usprep_map(profile, src, srcLength,
675 b1, s1.getCapacity(), options, parseError, status);
676 s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000677
678 if(*status == U_BUFFER_OVERFLOW_ERROR){
679 // redo processing of string
680 /* we do not have enough room so grow the buffer*/
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800681 b1 = s1.getBuffer(b1Len);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000682 if(b1==NULL){
683 *status = U_MEMORY_ALLOCATION_ERROR;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800684 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000685 }
686
687 *status = U_ZERO_ERROR; // reset error
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800688 b1Len = usprep_map(profile, src, srcLength,
689 b1, s1.getCapacity(), options, parseError, status);
690 s1.releaseBuffer(U_SUCCESS(*status) ? b1Len : 0);
691 }
692 if(U_FAILURE(*status)){
693 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000694 }
695
696 // normalize
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800697 UnicodeString s2;
698 if(profile->doNFKC){
699 const Normalizer2 *n2 = Normalizer2::getNFKCInstance(*status);
700 FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*status));
701 if(U_FAILURE(*status)){
702 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000703 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800704 fn2.normalize(s1, s2, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000705 }else{
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800706 s2.fastCopyFrom(s1);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000707 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000708 if(U_FAILURE(*status)){
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800709 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000710 }
711
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000712 // Prohibit and checkBiDi in one pass
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800713 const UChar *b2 = s2.getBuffer();
714 int32_t b2Len = s2.length();
715 UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
716 UBool leftToRight=FALSE, rightToLeft=FALSE;
717 int32_t rtlPos =-1, ltrPos =-1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000718
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800719 for(int32_t b2Index=0; b2Index<b2Len;){
720 UChar32 ch = 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000721 U16_NEXT(b2, b2Index, b2Len, ch);
722
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800723 uint16_t result;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000724 UTRIE_GET16(&profile->sprepTrie,ch,result);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800725
726 int16_t value;
727 UBool isIndex;
728 UStringPrepType type = getValues(result, value, isIndex);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000729
730 if( type == USPREP_PROHIBITED ||
731 ((result < _SPREP_TYPE_THRESHOLD) && (result & 0x01) /* first bit says it the code point is prohibited*/)
732 ){
733 *status = U_STRINGPREP_PROHIBITED_ERROR;
Jungshik Shin42d50272018-10-24 01:22:09 -0700734 uprv_syntaxError(b2, b2Index-U16_LENGTH(ch), b2Len, parseError);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800735 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000736 }
737
738 if(profile->checkBiDi) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700739 direction = ubidi_getClass(ch);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000740 if(firstCharDir == U_CHAR_DIRECTION_COUNT){
741 firstCharDir = direction;
742 }
743 if(direction == U_LEFT_TO_RIGHT){
744 leftToRight = TRUE;
745 ltrPos = b2Index-1;
746 }
747 if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
748 rightToLeft = TRUE;
749 rtlPos = b2Index-1;
750 }
751 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800752 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000753 if(profile->checkBiDi == TRUE){
754 // satisfy 2
755 if( leftToRight == TRUE && rightToLeft == TRUE){
756 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
757 uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800758 return 0;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000759 }
760
761 //satisfy 3
762 if( rightToLeft == TRUE &&
763 !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
764 (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
765 ){
766 *status = U_STRINGPREP_CHECK_BIDI_ERROR;
767 uprv_syntaxError(b2, rtlPos, b2Len, parseError);
768 return FALSE;
769 }
770 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800771 return s2.extract(dest, destCapacity, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000772}
773
774
775/* data swapping ------------------------------------------------------------ */
776
777U_CAPI int32_t U_EXPORT2
778usprep_swap(const UDataSwapper *ds,
779 const void *inData, int32_t length, void *outData,
780 UErrorCode *pErrorCode) {
781 const UDataInfo *pInfo;
782 int32_t headerSize;
783
784 const uint8_t *inBytes;
785 uint8_t *outBytes;
786
787 const int32_t *inIndexes;
788 int32_t indexes[16];
789
790 int32_t i, offset, count, size;
791
792 /* udata_swapDataHeader checks the arguments */
793 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
794 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
795 return 0;
796 }
797
798 /* check data format and format version */
799 pInfo=(const UDataInfo *)((const char *)inData+4);
800 if(!(
801 pInfo->dataFormat[0]==0x53 && /* dataFormat="SPRP" */
802 pInfo->dataFormat[1]==0x50 &&
803 pInfo->dataFormat[2]==0x52 &&
804 pInfo->dataFormat[3]==0x50 &&
805 pInfo->formatVersion[0]==3
806 )) {
807 udata_printError(ds, "usprep_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as StringPrep .spp data\n",
808 pInfo->dataFormat[0], pInfo->dataFormat[1],
809 pInfo->dataFormat[2], pInfo->dataFormat[3],
810 pInfo->formatVersion[0]);
811 *pErrorCode=U_UNSUPPORTED_ERROR;
812 return 0;
813 }
814
815 inBytes=(const uint8_t *)inData+headerSize;
816 outBytes=(uint8_t *)outData+headerSize;
817
818 inIndexes=(const int32_t *)inBytes;
819
820 if(length>=0) {
821 length-=headerSize;
822 if(length<16*4) {
823 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for StringPrep .spp data\n",
824 length);
825 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
826 return 0;
827 }
828 }
829
830 /* read the first 16 indexes (ICU 2.8/format version 3: _SPREP_INDEX_TOP==16, might grow) */
831 for(i=0; i<16; ++i) {
832 indexes[i]=udata_readInt32(ds, inIndexes[i]);
833 }
834
835 /* calculate the total length of the data */
836 size=
837 16*4+ /* size of indexes[] */
838 indexes[_SPREP_INDEX_TRIE_SIZE]+
839 indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
840
841 if(length>=0) {
842 if(length<size) {
843 udata_printError(ds, "usprep_swap(): too few bytes (%d after header) for all of StringPrep .spp data\n",
844 length);
845 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
846 return 0;
847 }
848
849 /* copy the data for inaccessible bytes */
850 if(inBytes!=outBytes) {
851 uprv_memcpy(outBytes, inBytes, size);
852 }
853
854 offset=0;
855
856 /* swap the int32_t indexes[] */
857 count=16*4;
858 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
859 offset+=count;
860
861 /* swap the UTrie */
862 count=indexes[_SPREP_INDEX_TRIE_SIZE];
863 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
864 offset+=count;
865
866 /* swap the uint16_t mappingTable[] */
867 count=indexes[_SPREP_INDEX_MAPPING_DATA_SIZE];
868 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800869 //offset+=count;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000870 }
871
872 return headerSize+size;
873}
874
875#endif /* #if !UCONFIG_NO_IDNA */