Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * |
| 6 | * Copyright (C) 2003-2013, International Business Machines |
| 7 | * Corporation and others. All Rights Reserved. |
| 8 | * |
| 9 | ******************************************************************************* |
| 10 | * file name: ucm.c |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 11 | * encoding: UTF-8 |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 12 | * tab size: 8 (not used) |
| 13 | * indentation:4 |
| 14 | * |
| 15 | * created on: 2003jun20 |
| 16 | * created by: Markus W. Scherer |
| 17 | * |
| 18 | * This file reads a .ucm file, stores its mappings and sorts them. |
| 19 | * It implements handling of Unicode conversion mappings from .ucm files |
| 20 | * for makeconv, canonucm, rptp2ucm, etc. |
| 21 | * |
| 22 | * Unicode code point sequences with a length of more than 1, |
| 23 | * as well as byte sequences with more than 4 bytes or more than one complete |
| 24 | * character sequence are handled to support m:n mappings. |
| 25 | */ |
| 26 | |
| 27 | #include "unicode/utypes.h" |
| 28 | #include "unicode/ustring.h" |
| 29 | #include "cstring.h" |
| 30 | #include "cmemory.h" |
| 31 | #include "filestrm.h" |
| 32 | #include "uarrsort.h" |
| 33 | #include "ucnvmbcs.h" |
| 34 | #include "ucnv_bld.h" |
| 35 | #include "ucnv_ext.h" |
| 36 | #include "uparse.h" |
| 37 | #include "ucm.h" |
| 38 | #include <stdio.h> |
| 39 | |
| 40 | #if !UCONFIG_NO_CONVERSION |
| 41 | |
| 42 | /* -------------------------------------------------------------------------- */ |
| 43 | |
| 44 | static void |
| 45 | printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { |
| 46 | int32_t j; |
| 47 | |
| 48 | for(j=0; j<m->uLen; ++j) { |
| 49 | fprintf(f, "<U%04lX>", (long)codePoints[j]); |
| 50 | } |
| 51 | |
| 52 | fputc(' ', f); |
| 53 | |
| 54 | for(j=0; j<m->bLen; ++j) { |
| 55 | fprintf(f, "\\x%02X", bytes[j]); |
| 56 | } |
| 57 | |
| 58 | if(m->f>=0) { |
| 59 | fprintf(f, " |%u\n", m->f); |
| 60 | } else { |
| 61 | fputs("\n", f); |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | U_CAPI void U_EXPORT2 |
| 66 | ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { |
| 67 | printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); |
| 68 | } |
| 69 | |
| 70 | U_CAPI void U_EXPORT2 |
| 71 | ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { |
| 72 | UCMapping *m; |
| 73 | int32_t i, length; |
| 74 | |
| 75 | m=table->mappings; |
| 76 | length=table->mappingsLength; |
| 77 | if(byUnicode) { |
| 78 | for(i=0; i<length; ++m, ++i) { |
| 79 | ucm_printMapping(table, m, f); |
| 80 | } |
| 81 | } else { |
| 82 | const int32_t *map=table->reverseMap; |
| 83 | for(i=0; i<length; ++i) { |
| 84 | ucm_printMapping(table, m+map[i], f); |
| 85 | } |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | /* mapping comparisons ------------------------------------------------------ */ |
| 90 | |
| 91 | static int32_t |
| 92 | compareUnicode(UCMTable *lTable, const UCMapping *l, |
| 93 | UCMTable *rTable, const UCMapping *r) { |
| 94 | const UChar32 *lu, *ru; |
| 95 | int32_t result, i, length; |
| 96 | |
| 97 | if(l->uLen==1 && r->uLen==1) { |
| 98 | /* compare two single code points */ |
| 99 | return l->u-r->u; |
| 100 | } |
| 101 | |
| 102 | /* get pointers to the code point sequences */ |
| 103 | lu=UCM_GET_CODE_POINTS(lTable, l); |
| 104 | ru=UCM_GET_CODE_POINTS(rTable, r); |
| 105 | |
| 106 | /* get the minimum length */ |
| 107 | if(l->uLen<=r->uLen) { |
| 108 | length=l->uLen; |
| 109 | } else { |
| 110 | length=r->uLen; |
| 111 | } |
| 112 | |
| 113 | /* compare the code points */ |
| 114 | for(i=0; i<length; ++i) { |
| 115 | result=lu[i]-ru[i]; |
| 116 | if(result!=0) { |
| 117 | return result; |
| 118 | } |
| 119 | } |
| 120 | |
| 121 | /* compare the lengths */ |
| 122 | return l->uLen-r->uLen; |
| 123 | } |
| 124 | |
| 125 | static int32_t |
| 126 | compareBytes(UCMTable *lTable, const UCMapping *l, |
| 127 | UCMTable *rTable, const UCMapping *r, |
| 128 | UBool lexical) { |
| 129 | const uint8_t *lb, *rb; |
| 130 | int32_t result, i, length; |
| 131 | |
| 132 | /* |
| 133 | * A lexical comparison is used for sorting in the builder, to allow |
| 134 | * an efficient search for a byte sequence that could be a prefix |
| 135 | * of a previously entered byte sequence. |
| 136 | * |
| 137 | * Comparing by lengths first is for compatibility with old .ucm tools |
| 138 | * like canonucm and rptp2ucm. |
| 139 | */ |
| 140 | if(lexical) { |
| 141 | /* get the minimum length and continue */ |
| 142 | if(l->bLen<=r->bLen) { |
| 143 | length=l->bLen; |
| 144 | } else { |
| 145 | length=r->bLen; |
| 146 | } |
| 147 | } else { |
| 148 | /* compare lengths first */ |
| 149 | result=l->bLen-r->bLen; |
| 150 | if(result!=0) { |
| 151 | return result; |
| 152 | } else { |
| 153 | length=l->bLen; |
| 154 | } |
| 155 | } |
| 156 | |
| 157 | /* get pointers to the byte sequences */ |
| 158 | lb=UCM_GET_BYTES(lTable, l); |
| 159 | rb=UCM_GET_BYTES(rTable, r); |
| 160 | |
| 161 | /* compare the bytes */ |
| 162 | for(i=0; i<length; ++i) { |
| 163 | result=lb[i]-rb[i]; |
| 164 | if(result!=0) { |
| 165 | return result; |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | /* compare the lengths */ |
| 170 | return l->bLen-r->bLen; |
| 171 | } |
| 172 | |
| 173 | /* compare UCMappings for sorting */ |
| 174 | static int32_t |
| 175 | compareMappings(UCMTable *lTable, const UCMapping *l, |
| 176 | UCMTable *rTable, const UCMapping *r, |
| 177 | UBool uFirst) { |
| 178 | int32_t result; |
| 179 | |
| 180 | /* choose which side to compare first */ |
| 181 | if(uFirst) { |
| 182 | /* Unicode then bytes */ |
| 183 | result=compareUnicode(lTable, l, rTable, r); |
| 184 | if(result==0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 185 | result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 186 | } |
| 187 | } else { |
| 188 | /* bytes then Unicode */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 189 | result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 190 | if(result==0) { |
| 191 | result=compareUnicode(lTable, l, rTable, r); |
| 192 | } |
| 193 | } |
| 194 | |
| 195 | if(result!=0) { |
| 196 | return result; |
| 197 | } |
| 198 | |
| 199 | /* compare the flags */ |
| 200 | return l->f-r->f; |
| 201 | } |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 202 | U_CDECL_BEGIN |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 203 | /* sorting by Unicode first sorts mappings directly */ |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 204 | static int32_t U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 205 | compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { |
| 206 | return compareMappings( |
| 207 | (UCMTable *)context, (const UCMapping *)left, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 208 | (UCMTable *)context, (const UCMapping *)right, true); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 209 | } |
| 210 | |
| 211 | /* sorting by bytes first sorts the reverseMap; use indirection to mappings */ |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 212 | static int32_t U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 213 | compareMappingsBytesFirst(const void *context, const void *left, const void *right) { |
| 214 | UCMTable *table=(UCMTable *)context; |
| 215 | int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; |
| 216 | return compareMappings( |
| 217 | table, table->mappings+l, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 218 | table, table->mappings+r, false); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 219 | } |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 220 | U_CDECL_END |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 221 | |
| 222 | U_CAPI void U_EXPORT2 |
| 223 | ucm_sortTable(UCMTable *t) { |
| 224 | UErrorCode errorCode; |
| 225 | int32_t i; |
| 226 | |
| 227 | if(t->isSorted) { |
| 228 | return; |
| 229 | } |
| 230 | |
| 231 | errorCode=U_ZERO_ERROR; |
| 232 | |
| 233 | /* 1. sort by Unicode first */ |
| 234 | uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), |
| 235 | compareMappingsUnicodeFirst, t, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 236 | false, &errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 237 | |
| 238 | /* build the reverseMap */ |
| 239 | if(t->reverseMap==NULL) { |
| 240 | /* |
| 241 | * allocate mappingsCapacity instead of mappingsLength so that |
| 242 | * if mappings are added, the reverseMap need not be |
| 243 | * reallocated each time |
| 244 | * (see ucm_moveMappings() and ucm_addMapping()) |
| 245 | */ |
| 246 | t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); |
| 247 | if(t->reverseMap==NULL) { |
| 248 | fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); |
| 249 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 250 | } |
| 251 | } |
| 252 | for(i=0; i<t->mappingsLength; ++i) { |
| 253 | t->reverseMap[i]=i; |
| 254 | } |
| 255 | |
| 256 | /* 2. sort reverseMap by mappings bytes first */ |
| 257 | uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), |
| 258 | compareMappingsBytesFirst, t, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 259 | false, &errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 260 | |
| 261 | if(U_FAILURE(errorCode)) { |
| 262 | fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", |
| 263 | u_errorName(errorCode)); |
| 264 | exit(errorCode); |
| 265 | } |
| 266 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 267 | t->isSorted=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 268 | } |
| 269 | |
| 270 | /* |
| 271 | * remove mappings with their move flag set from the base table |
| 272 | * and move some of them (with UCM_MOVE_TO_EXT) to the extension table |
| 273 | */ |
| 274 | U_CAPI void U_EXPORT2 |
| 275 | ucm_moveMappings(UCMTable *base, UCMTable *ext) { |
| 276 | UCMapping *mb, *mbLimit; |
| 277 | int8_t flag; |
| 278 | |
| 279 | mb=base->mappings; |
| 280 | mbLimit=mb+base->mappingsLength; |
| 281 | |
| 282 | while(mb<mbLimit) { |
| 283 | flag=mb->moveFlag; |
| 284 | if(flag!=0) { |
| 285 | /* reset the move flag */ |
| 286 | mb->moveFlag=0; |
| 287 | |
| 288 | if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) { |
| 289 | /* add the mapping to the extension table */ |
| 290 | ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); |
| 291 | } |
| 292 | |
| 293 | /* remove this mapping: move the last base mapping down and overwrite the current one */ |
| 294 | if(mb<(mbLimit-1)) { |
| 295 | uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); |
| 296 | } |
| 297 | --mbLimit; |
| 298 | --base->mappingsLength; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 299 | base->isSorted=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 300 | } else { |
| 301 | ++mb; |
| 302 | } |
| 303 | } |
| 304 | } |
| 305 | |
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    NEEDS_MOVE=0x1, /* at least one mapping had a move flag set */
    HAS_ERRORS=0x2  /* at least one error was reported */
};
| 310 | |
| 311 | static uint8_t |
| 312 | checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
| 313 | UBool moveToExt, UBool intersectBase) { |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 314 | (void)baseStates; |
| 315 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 316 | UCMapping *mb, *me, *mbLimit, *meLimit; |
| 317 | int32_t cmp; |
| 318 | uint8_t result; |
| 319 | |
| 320 | mb=base->mappings; |
| 321 | mbLimit=mb+base->mappingsLength; |
| 322 | |
| 323 | me=ext->mappings; |
| 324 | meLimit=me+ext->mappingsLength; |
| 325 | |
| 326 | result=0; |
| 327 | |
| 328 | for(;;) { |
| 329 | /* skip irrelevant mappings on both sides */ |
| 330 | for(;;) { |
| 331 | if(mb==mbLimit) { |
| 332 | return result; |
| 333 | } |
| 334 | |
| 335 | if((0<=mb->f && mb->f<=2) || mb->f==4) { |
| 336 | break; |
| 337 | } |
| 338 | |
| 339 | ++mb; |
| 340 | } |
| 341 | |
| 342 | for(;;) { |
| 343 | if(me==meLimit) { |
| 344 | return result; |
| 345 | } |
| 346 | |
| 347 | if((0<=me->f && me->f<=2) || me->f==4) { |
| 348 | break; |
| 349 | } |
| 350 | |
| 351 | ++me; |
| 352 | } |
| 353 | |
| 354 | /* compare the base and extension mappings */ |
| 355 | cmp=compareUnicode(base, mb, ext, me); |
| 356 | if(cmp<0) { |
| 357 | if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { |
| 358 | /* |
| 359 | * mapping in base but not in ext, move it |
| 360 | * |
| 361 | * if ext is DBCS, move DBCS mappings here |
| 362 | * and check SBCS ones for Unicode prefix below |
| 363 | */ |
| 364 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 365 | result|=NEEDS_MOVE; |
| 366 | |
| 367 | /* does mb map from an input sequence that is a prefix of me's? */ |
| 368 | } else if( mb->uLen<me->uLen && |
| 369 | 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
| 370 | ) { |
| 371 | if(moveToExt) { |
| 372 | /* mark this mapping to be moved to the extension table */ |
| 373 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 374 | result|=NEEDS_MOVE; |
| 375 | } else { |
| 376 | fprintf(stderr, |
| 377 | "ucm error: the base table contains a mapping whose input sequence\n" |
| 378 | " is a prefix of the input sequence of an extension mapping\n"); |
| 379 | ucm_printMapping(base, mb, stderr); |
| 380 | ucm_printMapping(ext, me, stderr); |
| 381 | result|=HAS_ERRORS; |
| 382 | } |
| 383 | } |
| 384 | |
| 385 | ++mb; |
| 386 | } else if(cmp==0) { |
| 387 | /* |
| 388 | * same output: remove the extension mapping, |
| 389 | * otherwise treat as an error |
| 390 | */ |
| 391 | if( mb->f==me->f && mb->bLen==me->bLen && |
| 392 | 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
| 393 | ) { |
| 394 | me->moveFlag|=UCM_REMOVE_MAPPING; |
| 395 | result|=NEEDS_MOVE; |
| 396 | } else if(intersectBase) { |
| 397 | /* mapping in base but not in ext, move it */ |
| 398 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 399 | result|=NEEDS_MOVE; |
| 400 | } else { |
| 401 | fprintf(stderr, |
| 402 | "ucm error: the base table contains a mapping whose input sequence\n" |
| 403 | " is the same as the input sequence of an extension mapping\n" |
| 404 | " but it maps differently\n"); |
| 405 | ucm_printMapping(base, mb, stderr); |
| 406 | ucm_printMapping(ext, me, stderr); |
| 407 | result|=HAS_ERRORS; |
| 408 | } |
| 409 | |
| 410 | ++mb; |
| 411 | } else /* cmp>0 */ { |
| 412 | ++me; |
| 413 | } |
| 414 | } |
| 415 | } |
| 416 | |
| 417 | static uint8_t |
| 418 | checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, |
| 419 | UBool moveToExt, UBool intersectBase) { |
| 420 | UCMapping *mb, *me; |
| 421 | int32_t *baseMap, *extMap; |
| 422 | int32_t b, e, bLimit, eLimit, cmp; |
| 423 | uint8_t result; |
| 424 | UBool isSISO; |
| 425 | |
| 426 | baseMap=base->reverseMap; |
| 427 | extMap=ext->reverseMap; |
| 428 | |
| 429 | b=e=0; |
| 430 | bLimit=base->mappingsLength; |
| 431 | eLimit=ext->mappingsLength; |
| 432 | |
| 433 | result=0; |
| 434 | |
| 435 | isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); |
| 436 | |
| 437 | for(;;) { |
| 438 | /* skip irrelevant mappings on both sides */ |
| 439 | for(;; ++b) { |
| 440 | if(b==bLimit) { |
| 441 | return result; |
| 442 | } |
| 443 | mb=base->mappings+baseMap[b]; |
| 444 | |
| 445 | if(intersectBase==2 && mb->bLen==1) { |
| 446 | /* |
| 447 | * comparing a base against a DBCS extension: |
| 448 | * leave SBCS base mappings alone |
| 449 | */ |
| 450 | continue; |
| 451 | } |
| 452 | |
| 453 | if(mb->f==0 || mb->f==3) { |
| 454 | break; |
| 455 | } |
| 456 | } |
| 457 | |
| 458 | for(;;) { |
| 459 | if(e==eLimit) { |
| 460 | return result; |
| 461 | } |
| 462 | me=ext->mappings+extMap[e]; |
| 463 | |
| 464 | if(me->f==0 || me->f==3) { |
| 465 | break; |
| 466 | } |
| 467 | |
| 468 | ++e; |
| 469 | } |
| 470 | |
| 471 | /* compare the base and extension mappings */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 472 | cmp=compareBytes(base, mb, ext, me, true); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 473 | if(cmp<0) { |
| 474 | if(intersectBase) { |
| 475 | /* mapping in base but not in ext, move it */ |
| 476 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 477 | result|=NEEDS_MOVE; |
| 478 | |
| 479 | /* |
| 480 | * does mb map from an input sequence that is a prefix of me's? |
| 481 | * for SI/SO tables, a single byte is never a prefix because it |
| 482 | * occurs in a separate single-byte state |
| 483 | */ |
| 484 | } else if( mb->bLen<me->bLen && |
| 485 | (!isSISO || mb->bLen>1) && |
| 486 | 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) |
| 487 | ) { |
| 488 | if(moveToExt) { |
| 489 | /* mark this mapping to be moved to the extension table */ |
| 490 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 491 | result|=NEEDS_MOVE; |
| 492 | } else { |
| 493 | fprintf(stderr, |
| 494 | "ucm error: the base table contains a mapping whose input sequence\n" |
| 495 | " is a prefix of the input sequence of an extension mapping\n"); |
| 496 | ucm_printMapping(base, mb, stderr); |
| 497 | ucm_printMapping(ext, me, stderr); |
| 498 | result|=HAS_ERRORS; |
| 499 | } |
| 500 | } |
| 501 | |
| 502 | ++b; |
| 503 | } else if(cmp==0) { |
| 504 | /* |
| 505 | * same output: remove the extension mapping, |
| 506 | * otherwise treat as an error |
| 507 | */ |
| 508 | if( mb->f==me->f && mb->uLen==me->uLen && |
| 509 | 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) |
| 510 | ) { |
| 511 | me->moveFlag|=UCM_REMOVE_MAPPING; |
| 512 | result|=NEEDS_MOVE; |
| 513 | } else if(intersectBase) { |
| 514 | /* mapping in base but not in ext, move it */ |
| 515 | mb->moveFlag|=UCM_MOVE_TO_EXT; |
| 516 | result|=NEEDS_MOVE; |
| 517 | } else { |
| 518 | fprintf(stderr, |
| 519 | "ucm error: the base table contains a mapping whose input sequence\n" |
| 520 | " is the same as the input sequence of an extension mapping\n" |
| 521 | " but it maps differently\n"); |
| 522 | ucm_printMapping(base, mb, stderr); |
| 523 | ucm_printMapping(ext, me, stderr); |
| 524 | result|=HAS_ERRORS; |
| 525 | } |
| 526 | |
| 527 | ++b; |
| 528 | } else /* cmp>0 */ { |
| 529 | ++e; |
| 530 | } |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | U_CAPI UBool U_EXPORT2 |
| 535 | ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { |
| 536 | UCMapping *m, *mLimit; |
| 537 | int32_t count; |
| 538 | UBool isOK; |
| 539 | |
| 540 | m=table->mappings; |
| 541 | mLimit=m+table->mappingsLength; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 542 | isOK=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 543 | |
| 544 | while(m<mLimit) { |
| 545 | count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); |
| 546 | if(count<1) { |
| 547 | ucm_printMapping(table, m, stderr); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 548 | isOK=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 549 | } |
| 550 | ++m; |
| 551 | } |
| 552 | |
| 553 | return isOK; |
| 554 | } |
| 555 | |
| 556 | U_CAPI UBool U_EXPORT2 |
| 557 | ucm_checkBaseExt(UCMStates *baseStates, |
| 558 | UCMTable *base, UCMTable *ext, UCMTable *moveTarget, |
| 559 | UBool intersectBase) { |
| 560 | uint8_t result; |
| 561 | |
| 562 | /* if we have an extension table, we must always use precision flags */ |
| 563 | if(base->flagsType&UCM_FLAGS_IMPLICIT) { |
| 564 | fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 565 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 566 | } |
| 567 | if(ext->flagsType&UCM_FLAGS_IMPLICIT) { |
| 568 | fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 569 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 570 | } |
| 571 | |
| 572 | /* checking requires both tables to be sorted */ |
| 573 | ucm_sortTable(base); |
| 574 | ucm_sortTable(ext); |
| 575 | |
| 576 | /* check */ |
| 577 | result= |
| 578 | checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)| |
| 579 | checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase); |
| 580 | |
| 581 | if(result&HAS_ERRORS) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 582 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 583 | } |
| 584 | |
| 585 | if(result&NEEDS_MOVE) { |
| 586 | ucm_moveMappings(ext, NULL); |
| 587 | ucm_moveMappings(base, moveTarget); |
| 588 | ucm_sortTable(base); |
| 589 | ucm_sortTable(ext); |
| 590 | if(moveTarget!=NULL) { |
| 591 | ucm_sortTable(moveTarget); |
| 592 | } |
| 593 | } |
| 594 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 595 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 596 | } |
| 597 | |
| 598 | /* merge tables for rptp2ucm ------------------------------------------------ */ |
| 599 | |
| 600 | U_CAPI void U_EXPORT2 |
| 601 | ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, |
| 602 | const uint8_t *subchar, int32_t subcharLength, |
| 603 | uint8_t subchar1) { |
| 604 | UCMapping *fromUMapping, *toUMapping; |
| 605 | int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; |
| 606 | |
| 607 | ucm_sortTable(fromUTable); |
| 608 | ucm_sortTable(toUTable); |
| 609 | |
| 610 | fromUMapping=fromUTable->mappings; |
| 611 | toUMapping=toUTable->mappings; |
| 612 | |
| 613 | fromUTop=fromUTable->mappingsLength; |
| 614 | toUTop=toUTable->mappingsLength; |
| 615 | |
| 616 | fromUIndex=toUIndex=0; |
| 617 | |
| 618 | while(fromUIndex<fromUTop && toUIndex<toUTop) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 619 | cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 620 | if(cmp==0) { |
| 621 | /* equal: roundtrip, nothing to do (flags are initially 0) */ |
| 622 | ++fromUMapping; |
| 623 | ++toUMapping; |
| 624 | |
| 625 | ++fromUIndex; |
| 626 | ++toUIndex; |
| 627 | } else if(cmp<0) { |
| 628 | /* |
| 629 | * the fromU mapping does not have a toU counterpart: |
| 630 | * fallback Unicode->codepage |
| 631 | */ |
| 632 | if( (fromUMapping->bLen==subcharLength && |
| 633 | 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
| 634 | (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
| 635 | ) { |
| 636 | fromUMapping->f=2; /* SUB mapping */ |
| 637 | } else { |
| 638 | fromUMapping->f=1; /* normal fallback */ |
| 639 | } |
| 640 | |
| 641 | ++fromUMapping; |
| 642 | ++fromUIndex; |
| 643 | } else { |
| 644 | /* |
| 645 | * the toU mapping does not have a fromU counterpart: |
| 646 | * (reverse) fallback codepage->Unicode, copy it to the fromU table |
| 647 | */ |
| 648 | |
| 649 | /* ignore reverse fallbacks to Unicode SUB */ |
| 650 | if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
| 651 | toUMapping->f=3; /* reverse fallback */ |
| 652 | ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
| 653 | |
| 654 | /* the table may have been reallocated */ |
| 655 | fromUMapping=fromUTable->mappings+fromUIndex; |
| 656 | } |
| 657 | |
| 658 | ++toUMapping; |
| 659 | ++toUIndex; |
| 660 | } |
| 661 | } |
| 662 | |
| 663 | /* either one or both tables are exhausted */ |
| 664 | while(fromUIndex<fromUTop) { |
| 665 | /* leftover fromU mappings are fallbacks */ |
| 666 | if( (fromUMapping->bLen==subcharLength && |
| 667 | 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || |
| 668 | (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) |
| 669 | ) { |
| 670 | fromUMapping->f=2; /* SUB mapping */ |
| 671 | } else { |
| 672 | fromUMapping->f=1; /* normal fallback */ |
| 673 | } |
| 674 | |
| 675 | ++fromUMapping; |
| 676 | ++fromUIndex; |
| 677 | } |
| 678 | |
| 679 | while(toUIndex<toUTop) { |
| 680 | /* leftover toU mappings are reverse fallbacks */ |
| 681 | |
| 682 | /* ignore reverse fallbacks to Unicode SUB */ |
| 683 | if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { |
| 684 | toUMapping->f=3; /* reverse fallback */ |
| 685 | ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); |
| 686 | } |
| 687 | |
| 688 | ++toUMapping; |
| 689 | ++toUIndex; |
| 690 | } |
| 691 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 692 | fromUTable->isSorted=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 693 | } |
| 694 | |
| 695 | /* separate extension mappings out of base table for rptp2ucm --------------- */ |
| 696 | |
| 697 | U_CAPI UBool U_EXPORT2 |
| 698 | ucm_separateMappings(UCMFile *ucm, UBool isSISO) { |
| 699 | UCMTable *table; |
| 700 | UCMapping *m, *mLimit; |
| 701 | int32_t type; |
| 702 | UBool needsMove, isOK; |
| 703 | |
| 704 | table=ucm->base; |
| 705 | m=table->mappings; |
| 706 | mLimit=m+table->mappingsLength; |
| 707 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 708 | needsMove=false; |
| 709 | isOK=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 710 | |
| 711 | for(; m<mLimit; ++m) { |
| 712 | if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { |
| 713 | fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); |
| 714 | ucm_printMapping(table, m, stderr); |
| 715 | m->moveFlag|=UCM_REMOVE_MAPPING; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 716 | needsMove=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 717 | continue; |
| 718 | } |
| 719 | |
| 720 | type=ucm_mappingType( |
| 721 | &ucm->states, m, |
| 722 | UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); |
| 723 | if(type<0) { |
| 724 | /* illegal byte sequence */ |
| 725 | printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 726 | isOK=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 727 | } else if(type>0) { |
| 728 | m->moveFlag|=UCM_MOVE_TO_EXT; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 729 | needsMove=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 730 | } |
| 731 | } |
| 732 | |
| 733 | if(!isOK) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 734 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 735 | } |
| 736 | if(needsMove) { |
| 737 | ucm_moveMappings(ucm->base, ucm->ext); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 738 | return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 739 | } else { |
| 740 | ucm_sortTable(ucm->base); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 741 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 742 | } |
| 743 | } |
| 744 | |
| 745 | /* ucm parser --------------------------------------------------------------- */ |
| 746 | |
| 747 | U_CAPI int8_t U_EXPORT2 |
| 748 | ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { |
| 749 | const char *s=*ps; |
| 750 | char *end; |
| 751 | uint8_t byte; |
| 752 | int8_t bLen; |
| 753 | |
| 754 | bLen=0; |
| 755 | for(;;) { |
| 756 | /* skip an optional plus sign */ |
| 757 | if(bLen>0 && *s=='+') { |
| 758 | ++s; |
| 759 | } |
| 760 | if(*s!='\\') { |
| 761 | break; |
| 762 | } |
| 763 | |
| 764 | if( s[1]!='x' || |
| 765 | (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 |
| 766 | ) { |
| 767 | fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); |
| 768 | return -1; |
| 769 | } |
| 770 | |
| 771 | if(bLen==UCNV_EXT_MAX_BYTES) { |
| 772 | fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); |
| 773 | return -1; |
| 774 | } |
| 775 | bytes[bLen++]=byte; |
| 776 | s=end; |
| 777 | } |
| 778 | |
| 779 | *ps=s; |
| 780 | return bLen; |
| 781 | } |
| 782 | |
| 783 | /* parse a mapping line; must not be empty */ |
| 784 | U_CAPI UBool U_EXPORT2 |
| 785 | ucm_parseMappingLine(UCMapping *m, |
| 786 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| 787 | uint8_t bytes[UCNV_EXT_MAX_BYTES], |
| 788 | const char *line) { |
| 789 | const char *s; |
| 790 | char *end; |
| 791 | UChar32 cp; |
| 792 | int32_t u16Length; |
| 793 | int8_t uLen, bLen, f; |
| 794 | |
| 795 | s=line; |
| 796 | uLen=bLen=0; |
| 797 | |
| 798 | /* parse code points */ |
| 799 | for(;;) { |
| 800 | /* skip an optional plus sign */ |
| 801 | if(uLen>0 && *s=='+') { |
| 802 | ++s; |
| 803 | } |
| 804 | if(*s!='<') { |
| 805 | break; |
| 806 | } |
| 807 | |
| 808 | if( s[1]!='U' || |
| 809 | (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || |
| 810 | *end!='>' |
| 811 | ) { |
| 812 | fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 813 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 814 | } |
| 815 | if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { |
| 816 | fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 817 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 818 | } |
| 819 | |
| 820 | if(uLen==UCNV_EXT_MAX_UCHARS) { |
| 821 | fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 822 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 823 | } |
| 824 | codePoints[uLen++]=cp; |
| 825 | s=end+1; |
| 826 | } |
| 827 | |
| 828 | if(uLen==0) { |
| 829 | fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 830 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 831 | } else if(uLen==1) { |
| 832 | m->u=codePoints[0]; |
| 833 | } else { |
| 834 | UErrorCode errorCode=U_ZERO_ERROR; |
| 835 | u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); |
| 836 | if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || |
| 837 | u16Length>UCNV_EXT_MAX_UCHARS |
| 838 | ) { |
| 839 | fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 840 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 841 | } |
| 842 | } |
| 843 | |
| 844 | s=u_skipWhitespace(s); |
| 845 | |
| 846 | /* parse bytes */ |
| 847 | bLen=ucm_parseBytes(bytes, line, &s); |
| 848 | |
| 849 | if(bLen<0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 850 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 851 | } else if(bLen==0) { |
| 852 | fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 853 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 854 | } else if(bLen<=4) { |
| 855 | uprv_memcpy(m->b.bytes, bytes, bLen); |
| 856 | } |
| 857 | |
| 858 | /* skip everything until the fallback indicator, even the start of a comment */ |
| 859 | for(;;) { |
| 860 | if(*s==0) { |
| 861 | f=-1; /* no fallback indicator */ |
| 862 | break; |
| 863 | } else if(*s=='|') { |
| 864 | f=(int8_t)(s[1]-'0'); |
| 865 | if((uint8_t)f>4) { |
| 866 | fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 867 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 868 | } |
| 869 | break; |
| 870 | } |
| 871 | ++s; |
| 872 | } |
| 873 | |
| 874 | m->uLen=uLen; |
| 875 | m->bLen=bLen; |
| 876 | m->f=f; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 877 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 878 | } |
| 879 | |
| 880 | /* general APIs ------------------------------------------------------------- */ |
| 881 | |
| 882 | U_CAPI UCMTable * U_EXPORT2 |
| 883 | ucm_openTable() { |
| 884 | UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); |
| 885 | if(table==NULL) { |
| 886 | fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); |
| 887 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 888 | } |
| 889 | |
| 890 | memset(table, 0, sizeof(UCMTable)); |
| 891 | return table; |
| 892 | } |
| 893 | |
| 894 | U_CAPI void U_EXPORT2 |
| 895 | ucm_closeTable(UCMTable *table) { |
| 896 | if(table!=NULL) { |
| 897 | uprv_free(table->mappings); |
| 898 | uprv_free(table->codePoints); |
| 899 | uprv_free(table->bytes); |
| 900 | uprv_free(table->reverseMap); |
| 901 | uprv_free(table); |
| 902 | } |
| 903 | } |
| 904 | |
| 905 | U_CAPI void U_EXPORT2 |
| 906 | ucm_resetTable(UCMTable *table) { |
| 907 | if(table!=NULL) { |
| 908 | table->mappingsLength=0; |
| 909 | table->flagsType=0; |
| 910 | table->unicodeMask=0; |
| 911 | table->bytesLength=table->codePointsLength=0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 912 | table->isSorted=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 913 | } |
| 914 | } |
| 915 | |
| 916 | U_CAPI void U_EXPORT2 |
| 917 | ucm_addMapping(UCMTable *table, |
| 918 | UCMapping *m, |
| 919 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| 920 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
| 921 | UCMapping *tm; |
| 922 | UChar32 c; |
| 923 | int32_t idx; |
| 924 | |
| 925 | if(table->mappingsLength>=table->mappingsCapacity) { |
| 926 | /* make the mappings array larger */ |
| 927 | if(table->mappingsCapacity==0) { |
| 928 | table->mappingsCapacity=1000; |
| 929 | } else { |
| 930 | table->mappingsCapacity*=10; |
| 931 | } |
| 932 | table->mappings=(UCMapping *)uprv_realloc(table->mappings, |
| 933 | table->mappingsCapacity*sizeof(UCMapping)); |
| 934 | if(table->mappings==NULL) { |
| 935 | fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", |
| 936 | (int)table->mappingsCapacity); |
| 937 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 938 | } |
| 939 | |
| 940 | if(table->reverseMap!=NULL) { |
| 941 | /* the reverseMap must be reallocated in a new sort */ |
| 942 | uprv_free(table->reverseMap); |
| 943 | table->reverseMap=NULL; |
| 944 | } |
| 945 | } |
| 946 | |
| 947 | if(m->uLen>1 && table->codePointsCapacity==0) { |
| 948 | table->codePointsCapacity=10000; |
| 949 | table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); |
| 950 | if(table->codePoints==NULL) { |
| 951 | fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", |
| 952 | (int)table->codePointsCapacity); |
| 953 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 954 | } |
| 955 | } |
| 956 | |
| 957 | if(m->bLen>4 && table->bytesCapacity==0) { |
| 958 | table->bytesCapacity=10000; |
| 959 | table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); |
| 960 | if(table->bytes==NULL) { |
| 961 | fprintf(stderr, "ucm error: unable to allocate %d bytes\n", |
| 962 | (int)table->bytesCapacity); |
| 963 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 964 | } |
| 965 | } |
| 966 | |
| 967 | if(m->uLen>1) { |
| 968 | idx=table->codePointsLength; |
| 969 | table->codePointsLength+=m->uLen; |
| 970 | if(table->codePointsLength>table->codePointsCapacity) { |
| 971 | fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); |
| 972 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 973 | } |
| 974 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 975 | uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 976 | m->u=idx; |
| 977 | } |
| 978 | |
| 979 | if(m->bLen>4) { |
| 980 | idx=table->bytesLength; |
| 981 | table->bytesLength+=m->bLen; |
| 982 | if(table->bytesLength>table->bytesCapacity) { |
| 983 | fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); |
| 984 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 985 | } |
| 986 | |
| 987 | uprv_memcpy(table->bytes+idx, bytes, m->bLen); |
| 988 | m->b.idx=idx; |
| 989 | } |
| 990 | |
| 991 | /* set unicodeMask */ |
| 992 | for(idx=0; idx<m->uLen; ++idx) { |
| 993 | c=codePoints[idx]; |
| 994 | if(c>=0x10000) { |
| 995 | table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ |
| 996 | } else if(U_IS_SURROGATE(c)) { |
| 997 | table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ |
| 998 | } |
| 999 | } |
| 1000 | |
| 1001 | /* set flagsType */ |
| 1002 | if(m->f<0) { |
| 1003 | table->flagsType|=UCM_FLAGS_IMPLICIT; |
| 1004 | } else { |
| 1005 | table->flagsType|=UCM_FLAGS_EXPLICIT; |
| 1006 | } |
| 1007 | |
| 1008 | tm=table->mappings+table->mappingsLength++; |
| 1009 | uprv_memcpy(tm, m, sizeof(UCMapping)); |
| 1010 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1011 | table->isSorted=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1012 | } |
| 1013 | |
| 1014 | U_CAPI UCMFile * U_EXPORT2 |
| 1015 | ucm_open() { |
| 1016 | UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); |
| 1017 | if(ucm==NULL) { |
| 1018 | fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); |
| 1019 | exit(U_MEMORY_ALLOCATION_ERROR); |
| 1020 | } |
| 1021 | |
| 1022 | memset(ucm, 0, sizeof(UCMFile)); |
| 1023 | |
| 1024 | ucm->base=ucm_openTable(); |
| 1025 | ucm->ext=ucm_openTable(); |
| 1026 | |
| 1027 | ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; |
| 1028 | ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; |
| 1029 | ucm->states.outputType=-1; |
| 1030 | ucm->states.minCharLength=ucm->states.maxCharLength=1; |
| 1031 | |
| 1032 | return ucm; |
| 1033 | } |
| 1034 | |
| 1035 | U_CAPI void U_EXPORT2 |
| 1036 | ucm_close(UCMFile *ucm) { |
| 1037 | if(ucm!=NULL) { |
| 1038 | ucm_closeTable(ucm->base); |
| 1039 | ucm_closeTable(ucm->ext); |
| 1040 | uprv_free(ucm); |
| 1041 | } |
| 1042 | } |
| 1043 | |
| 1044 | U_CAPI int32_t U_EXPORT2 |
| 1045 | ucm_mappingType(UCMStates *baseStates, |
| 1046 | UCMapping *m, |
| 1047 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| 1048 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1049 | (void)codePoints; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1050 | /* check validity of the bytes and count the characters in them */ |
| 1051 | int32_t count=ucm_countChars(baseStates, bytes, m->bLen); |
| 1052 | if(count<1) { |
| 1053 | /* illegal byte sequence */ |
| 1054 | return -1; |
| 1055 | } |
| 1056 | |
| 1057 | /* |
| 1058 | * Suitable for an ICU conversion base table means: |
| 1059 | * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) |
| 1060 | * - precision flag 0..3 |
| 1061 | * - SBCS: any 1:1 mapping |
| 1062 | * (the table stores additional bits to distinguish mapping types) |
| 1063 | * - MBCS: not a |2 SUB mapping for <subchar1> |
| 1064 | * - MBCS: not a |1 fallback to 0x00 |
| 1065 | * - MBCS: not a multi-byte mapping with leading 0x00 bytes |
| 1066 | * |
| 1067 | * Further restrictions for fromUnicode tables |
| 1068 | * are enforced in makeconv (MBCSOkForBaseFromUnicode()). |
| 1069 | * |
| 1070 | * All of the MBCS fromUnicode specific tests could be removed from here, |
| 1071 | * but the ones above are for unusual mappings, and removing the tests |
| 1072 | * from here would change canonucm output which seems gratuitous. |
| 1073 | * (Markus Scherer 2006-nov-28) |
| 1074 | * |
| 1075 | * Exception: All implicit mappings (f<0) that need to be moved |
| 1076 | * because of fromUnicode restrictions _must_ be moved here because |
| 1077 | * makeconv uses a hack for moving mappings only for the fromUnicode table |
| 1078 | * that only works with non-negative values of f. |
| 1079 | */ |
| 1080 | if( m->uLen==1 && count==1 && m->f<=3 && |
| 1081 | (baseStates->maxCharLength==1 || |
| 1082 | !((m->f==2 && m->bLen==1) || |
| 1083 | (m->f==1 && bytes[0]==0) || |
| 1084 | (m->f<=1 && m->bLen>1 && bytes[0]==0))) |
| 1085 | ) { |
| 1086 | return 0; /* suitable for a base table */ |
| 1087 | } else { |
| 1088 | return 1; /* needs to go into an extension table */ |
| 1089 | } |
| 1090 | } |
| 1091 | |
| 1092 | U_CAPI UBool U_EXPORT2 |
| 1093 | ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, |
| 1094 | UCMapping *m, |
| 1095 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS], |
| 1096 | uint8_t bytes[UCNV_EXT_MAX_BYTES]) { |
| 1097 | int32_t type; |
| 1098 | |
| 1099 | if(m->f==2 && m->uLen>1) { |
| 1100 | fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); |
| 1101 | printMapping(m, codePoints, bytes, stderr); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1102 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1103 | } |
| 1104 | |
| 1105 | if(baseStates!=NULL) { |
| 1106 | /* check validity of the bytes and count the characters in them */ |
| 1107 | type=ucm_mappingType(baseStates, m, codePoints, bytes); |
| 1108 | if(type<0) { |
| 1109 | /* illegal byte sequence */ |
| 1110 | printMapping(m, codePoints, bytes, stderr); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1111 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1112 | } |
| 1113 | } else { |
| 1114 | /* not used - adding a mapping for an extension-only table before its base table is read */ |
| 1115 | type=1; |
| 1116 | } |
| 1117 | |
| 1118 | /* |
| 1119 | * Add the mapping to the base table if this is requested and suitable. |
| 1120 | * Otherwise, add it to the extension table. |
| 1121 | */ |
| 1122 | if(forBase && type==0) { |
| 1123 | ucm_addMapping(ucm->base, m, codePoints, bytes); |
| 1124 | } else { |
| 1125 | ucm_addMapping(ucm->ext, m, codePoints, bytes); |
| 1126 | } |
| 1127 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1128 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1129 | } |
| 1130 | |
| 1131 | U_CAPI UBool U_EXPORT2 |
| 1132 | ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 1133 | UCMapping m={ 0, {0}, 0, 0, 0, 0 }; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1134 | UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; |
| 1135 | uint8_t bytes[UCNV_EXT_MAX_BYTES]; |
| 1136 | |
| 1137 | const char *s; |
| 1138 | |
| 1139 | /* ignore empty and comment lines */ |
| 1140 | if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1141 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1142 | } |
| 1143 | |
| 1144 | return |
| 1145 | ucm_parseMappingLine(&m, codePoints, bytes, line) && |
| 1146 | ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); |
| 1147 | } |
| 1148 | |
| 1149 | U_CAPI void U_EXPORT2 |
| 1150 | ucm_readTable(UCMFile *ucm, FileStream* convFile, |
| 1151 | UBool forBase, UCMStates *baseStates, |
| 1152 | UErrorCode *pErrorCode) { |
| 1153 | char line[500]; |
| 1154 | char *end; |
| 1155 | UBool isOK; |
| 1156 | |
| 1157 | if(U_FAILURE(*pErrorCode)) { |
| 1158 | return; |
| 1159 | } |
| 1160 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1161 | isOK=true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1162 | |
| 1163 | for(;;) { |
| 1164 | /* read the next line */ |
| 1165 | if(!T_FileStream_readLine(convFile, line, sizeof(line))) { |
| 1166 | fprintf(stderr, "incomplete charmap section\n"); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 1167 | isOK=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1168 | break; |
| 1169 | } |
| 1170 | |
| 1171 | /* remove CR LF */ |
| 1172 | end=uprv_strchr(line, 0); |
| 1173 | while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { |
| 1174 | --end; |
| 1175 | } |
| 1176 | *end=0; |
| 1177 | |
| 1178 | /* ignore empty and comment lines */ |
| 1179 | if(line[0]==0 || line[0]=='#') { |
| 1180 | continue; |
| 1181 | } |
| 1182 | |
| 1183 | /* stop at the end of the mapping table */ |
| 1184 | if(0==uprv_strcmp(line, "END CHARMAP")) { |
| 1185 | break; |
| 1186 | } |
| 1187 | |
| 1188 | isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); |
| 1189 | } |
| 1190 | |
| 1191 | if(!isOK) { |
| 1192 | *pErrorCode=U_INVALID_TABLE_FORMAT; |
| 1193 | } |
| 1194 | } |
| 1195 | #endif |