Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * |
| 6 | * Copyright (C) 2003-2014, International Business Machines |
| 7 | * Corporation and others. All Rights Reserved. |
| 8 | * |
| 9 | ******************************************************************************* |
| 10 | * file name: nptrans.h |
| 11 | * encoding: UTF-8 |
| 12 | * tab size: 8 (not used) |
| 13 | * indentation:4 |
| 14 | * |
| 15 | * created on: 2003feb1 |
| 16 | * created by: Ram Viswanadha |
| 17 | */ |
| 18 | |
| 19 | #include "unicode/utypes.h" |
| 20 | |
| 21 | #if !UCONFIG_NO_TRANSLITERATION |
| 22 | #if !UCONFIG_NO_IDNA |
| 23 | |
| 24 | #include "nptrans.h" |
| 25 | #include "unicode/resbund.h" |
| 26 | #include "unicode/uniset.h" |
| 27 | #include "sprpimpl.h" |
| 28 | #include "cmemory.h" |
| 29 | #include "ustr_imp.h" |
| 30 | #include "intltest.h" |
| 31 | |
| 32 | #ifdef NPTRANS_DEBUG |
| 33 | #include <stdio.h> |
| 34 | #endif |
| 35 | |
| 36 | const char NamePrepTransform::fgClassID=0; |
| 37 | |
| 38 | //Factory method |
| 39 | NamePrepTransform* NamePrepTransform::createInstance(UParseError& parseError, UErrorCode& status){ |
| 40 | NamePrepTransform* transform = new NamePrepTransform(parseError, status); |
| 41 | if(U_FAILURE(status)){ |
| 42 | delete transform; |
| 43 | return NULL; |
| 44 | } |
| 45 | return transform; |
| 46 | } |
| 47 | |
| 48 | //constructor |
| 49 | NamePrepTransform::NamePrepTransform(UParseError& parseError, UErrorCode& status) |
| 50 | : mapping(nullptr), unassigned(), prohibited(), labelSeparatorSet(), bundle(nullptr) { |
| 51 | |
| 52 | LocalPointer<Transliterator> lmapping; |
| 53 | LocalUResourceBundlePointer lbundle; |
| 54 | |
| 55 | const char* testDataName = IntlTest::loadTestData(status); |
| 56 | |
| 57 | if(U_FAILURE(status)){ |
| 58 | return; |
| 59 | } |
| 60 | |
| 61 | lbundle.adoptInstead(ures_openDirect(testDataName,"idna_rules",&status)); |
| 62 | |
| 63 | if(lbundle.isValid() && U_SUCCESS(status)){ |
| 64 | // create the mapping transliterator |
| 65 | int32_t ruleLen = 0; |
| 66 | const UChar* ruleUChar = ures_getStringByKey(lbundle.getAlias(), "MapNFKC",&ruleLen, &status); |
| 67 | int32_t mapRuleLen = 0; |
| 68 | const UChar *mapRuleUChar = ures_getStringByKey(lbundle.getAlias(), "MapNoNormalization", &mapRuleLen, &status); |
| 69 | UnicodeString rule(mapRuleUChar, mapRuleLen); |
| 70 | rule.append(ruleUChar, ruleLen); |
| 71 | |
| 72 | lmapping.adoptInstead( Transliterator::createFromRules(UnicodeString("NamePrepTransform", ""), rule, |
| 73 | UTRANS_FORWARD, parseError,status)); |
| 74 | if(U_FAILURE(status)) { |
| 75 | return; |
| 76 | } |
| 77 | |
| 78 | //create the unassigned set |
| 79 | int32_t patternLen =0; |
| 80 | const UChar* pattern = ures_getStringByKey(lbundle.getAlias(),"UnassignedSet",&patternLen, &status); |
| 81 | unassigned.applyPattern(UnicodeString(pattern, patternLen), status); |
| 82 | |
| 83 | //create prohibited set |
| 84 | patternLen=0; |
| 85 | pattern = ures_getStringByKey(lbundle.getAlias(),"ProhibitedSet",&patternLen, &status); |
| 86 | UnicodeString test(pattern,patternLen); |
| 87 | prohibited.applyPattern(test,status); |
| 88 | #ifdef NPTRANS_DEBUG |
| 89 | if(U_FAILURE(status)){ |
| 90 | printf("Construction of Unicode set failed\n"); |
| 91 | } |
| 92 | |
| 93 | if(U_SUCCESS(status)){ |
| 94 | if(prohibited.contains((UChar) 0x644)){ |
| 95 | printf("The string contains 0x644 ... !!\n"); |
| 96 | } |
| 97 | UnicodeString temp; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 98 | prohibited.toPattern(temp,true); |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 99 | |
| 100 | for(int32_t i=0;i<temp.length();i++){ |
| 101 | printf("%c", (char)temp.charAt(i)); |
| 102 | } |
| 103 | printf("\n"); |
| 104 | } |
| 105 | #endif |
| 106 | |
| 107 | //create label separator set |
| 108 | patternLen=0; |
| 109 | pattern = ures_getStringByKey(lbundle.getAlias(), "LabelSeparatorSet", &patternLen, &status); |
| 110 | labelSeparatorSet.applyPattern(UnicodeString(pattern,patternLen),status); |
| 111 | } |
| 112 | |
| 113 | if(U_SUCCESS(status) && (lmapping.isNull())) { |
| 114 | status = U_MEMORY_ALLOCATION_ERROR; |
| 115 | } |
| 116 | if (U_FAILURE(status)) { |
| 117 | return; |
| 118 | } |
| 119 | mapping = lmapping.orphan(); |
| 120 | bundle = lbundle.orphan(); |
| 121 | } |
| 122 | |
| 123 | |
| 124 | UBool NamePrepTransform::isProhibited(UChar32 ch){ |
| 125 | return (UBool)(ch != ASCII_SPACE); |
| 126 | } |
| 127 | |
| 128 | NamePrepTransform::~NamePrepTransform(){ |
| 129 | delete mapping; |
| 130 | mapping = NULL; |
| 131 | |
| 132 | //close the bundle |
| 133 | ures_close(bundle); |
| 134 | bundle = NULL; |
| 135 | } |
| 136 | |
| 137 | |
| 138 | int32_t NamePrepTransform::map(const UChar* src, int32_t srcLength, |
| 139 | UChar* dest, int32_t destCapacity, |
| 140 | UBool allowUnassigned, |
| 141 | UParseError* /*parseError*/, |
| 142 | UErrorCode& status ){ |
| 143 | |
| 144 | if(U_FAILURE(status)){ |
| 145 | return 0; |
| 146 | } |
| 147 | //check arguments |
| 148 | if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { |
| 149 | status=U_ILLEGAL_ARGUMENT_ERROR; |
| 150 | return 0; |
| 151 | } |
| 152 | |
| 153 | UnicodeString rsource(src,srcLength); |
| 154 | // map the code points |
| 155 | // transliteration also performs NFKC |
| 156 | mapping->transliterate(rsource); |
| 157 | |
| 158 | const UChar* buffer = rsource.getBuffer(); |
| 159 | int32_t bufLen = rsource.length(); |
| 160 | // check if unassigned |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 161 | if(allowUnassigned == false){ |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 162 | int32_t bufIndex=0; |
| 163 | UChar32 ch =0 ; |
| 164 | for(;bufIndex<bufLen;){ |
| 165 | U16_NEXT(buffer, bufIndex, bufLen, ch); |
| 166 | if(unassigned.contains(ch)){ |
| 167 | status = U_IDNA_UNASSIGNED_ERROR; |
| 168 | return 0; |
| 169 | } |
| 170 | } |
| 171 | } |
| 172 | // check if there is enough room in the output |
| 173 | if(bufLen < destCapacity){ |
| 174 | u_memcpy(dest, buffer, bufLen); |
| 175 | } |
| 176 | |
| 177 | return u_terminateUChars(dest, destCapacity, bufLen, &status); |
| 178 | } |
| 179 | |
| 180 | |
| 181 | #define MAX_BUFFER_SIZE 300 |
| 182 | |
| 183 | int32_t NamePrepTransform::process( const UChar* src, int32_t srcLength, |
| 184 | UChar* dest, int32_t destCapacity, |
| 185 | UBool allowUnassigned, |
| 186 | UParseError* parseError, |
| 187 | UErrorCode& status ){ |
| 188 | // check error status |
| 189 | if(U_FAILURE(status)){ |
| 190 | return 0; |
| 191 | } |
| 192 | |
| 193 | //check arguments |
| 194 | if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { |
| 195 | status=U_ILLEGAL_ARGUMENT_ERROR; |
| 196 | return 0; |
| 197 | } |
| 198 | |
| 199 | UnicodeString b1String; |
| 200 | UChar *b1 = b1String.getBuffer(MAX_BUFFER_SIZE); |
| 201 | int32_t b1Len; |
| 202 | |
| 203 | int32_t b1Index = 0; |
| 204 | UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 205 | UBool leftToRight=false, rightToLeft=false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 206 | |
| 207 | b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status); |
| 208 | b1String.releaseBuffer(b1Len); |
| 209 | |
| 210 | if(status == U_BUFFER_OVERFLOW_ERROR){ |
| 211 | // redo processing of string |
| 212 | /* we do not have enough room so grow the buffer*/ |
| 213 | b1 = b1String.getBuffer(b1Len); |
| 214 | status = U_ZERO_ERROR; // reset error |
| 215 | b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status); |
| 216 | b1String.releaseBuffer(b1Len); |
| 217 | } |
| 218 | |
| 219 | if(U_FAILURE(status)){ |
| 220 | b1Len = 0; |
| 221 | goto CLEANUP; |
| 222 | } |
| 223 | |
| 224 | |
| 225 | for(; b1Index<b1Len; ){ |
| 226 | |
| 227 | UChar32 ch = 0; |
| 228 | |
| 229 | U16_NEXT(b1, b1Index, b1Len, ch); |
| 230 | |
| 231 | if(prohibited.contains(ch) && ch!=0x0020){ |
| 232 | status = U_IDNA_PROHIBITED_ERROR; |
| 233 | b1Len = 0; |
| 234 | goto CLEANUP; |
| 235 | } |
| 236 | |
| 237 | direction = u_charDirection(ch); |
| 238 | if(firstCharDir==U_CHAR_DIRECTION_COUNT){ |
| 239 | firstCharDir = direction; |
| 240 | } |
| 241 | if(direction == U_LEFT_TO_RIGHT){ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 242 | leftToRight = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 243 | } |
| 244 | if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 245 | rightToLeft = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 246 | } |
| 247 | } |
| 248 | |
| 249 | // satisfy 2 |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 250 | if( leftToRight == true && rightToLeft == true){ |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 251 | status = U_IDNA_CHECK_BIDI_ERROR; |
| 252 | b1Len = 0; |
| 253 | goto CLEANUP; |
| 254 | } |
| 255 | |
| 256 | //satisfy 3 |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 257 | if( rightToLeft == true && |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 258 | !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) && |
| 259 | (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC)) |
| 260 | ){ |
| 261 | status = U_IDNA_CHECK_BIDI_ERROR; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 262 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 263 | } |
| 264 | |
| 265 | if(b1Len <= destCapacity){ |
| 266 | u_memmove(dest, b1, b1Len); |
| 267 | } |
| 268 | |
| 269 | CLEANUP: |
| 270 | return u_terminateUChars(dest, destCapacity, b1Len, &status); |
| 271 | } |
| 272 | |
| 273 | UBool NamePrepTransform::isLabelSeparator(UChar32 ch, UErrorCode& status){ |
| 274 | // check error status |
| 275 | if(U_FAILURE(status)){ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 276 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 277 | } |
| 278 | |
| 279 | return labelSeparatorSet.contains(ch); |
| 280 | } |
| 281 | |
| 282 | #endif /* #if !UCONFIG_NO_IDNA */ |
| 283 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |