Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ******************************************************************************* |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 5 | * Copyright (C) 2011-2014, International Business Machines |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 6 | * Corporation and others. All Rights Reserved. |
| 7 | ******************************************************************************* |
| 8 | * file name: ppucd.cpp |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 9 | * encoding: UTF-8 |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 10 | * tab size: 8 (not used) |
| 11 | * indentation:4 |
| 12 | * |
| 13 | * created on: 2011dec11 |
| 14 | * created by: Markus W. Scherer |
| 15 | */ |
| 16 | |
| 17 | #include "unicode/utypes.h" |
| 18 | #include "unicode/uchar.h" |
| 19 | #include "charstr.h" |
| 20 | #include "cstring.h" |
| 21 | #include "ppucd.h" |
| 22 | #include "uassert.h" |
| 23 | #include "uparse.h" |
| 24 | |
| 25 | #include <stdio.h> |
| 26 | #include <string.h> |
| 27 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 28 | U_NAMESPACE_BEGIN |
| 29 | |
| 30 | PropertyNames::~PropertyNames() {} |
| 31 | |
// TODO: Create a concrete subclass for the default PropertyNames implementation
// using the ICU library's built-in property names API & data.
// Currently only the genprops tool uses PreparsedUCD, and it provides its own
// PropertyNames implementation using its just-built property names data and its own code.
// At some point, we should use PreparsedUCD in tests, and then we will need the
// default implementation somewhere.
#if 0
// Disabled default implementation that forwards to the ICU library functions.
int32_t PropertyNames::getPropertyEnum(const char *name) const {
    return u_getPropertyEnum(name);
}

int32_t PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
    const UProperty which=(UProperty)property;
    return u_getPropertyValueEnum(which, name);
}
#endif
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 49 | |
| 50 | UniProps::UniProps() |
| 51 | : start(U_SENTINEL), end(U_SENTINEL), |
| 52 | bmg(U_SENTINEL), bpb(U_SENTINEL), |
| 53 | scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), |
| 54 | digitValue(-1), numericValue(NULL), |
| 55 | name(NULL), nameAlias(NULL) { |
| 56 | memset(binProps, 0, sizeof(binProps)); |
| 57 | memset(intProps, 0, sizeof(intProps)); |
| 58 | memset(age, 0, 4); |
| 59 | } |
| 60 | |
| 61 | UniProps::~UniProps() {} |
| 62 | |
| 63 | const int32_t PreparsedUCD::kNumLineBuffers; |
| 64 | |
| 65 | PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 66 | : pnames(nullptr), |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 67 | file(NULL), |
| 68 | defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), |
| 69 | lineNumber(0), |
| 70 | lineType(NO_LINE), |
| 71 | fieldLimit(NULL), lineLimit(NULL) { |
| 72 | if(U_FAILURE(errorCode)) { return; } |
| 73 | |
| 74 | if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { |
| 75 | filename=NULL; |
| 76 | file=stdin; |
| 77 | } else { |
| 78 | file=fopen(filename, "r"); |
| 79 | } |
| 80 | if(file==NULL) { |
| 81 | perror("error opening preparsed UCD"); |
| 82 | fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); |
| 83 | errorCode=U_FILE_ACCESS_ERROR; |
| 84 | return; |
| 85 | } |
| 86 | |
| 87 | memset(ucdVersion, 0, 4); |
| 88 | lines[0][0]=0; |
| 89 | } |
| 90 | |
| 91 | PreparsedUCD::~PreparsedUCD() { |
| 92 | if(file!=stdin) { |
| 93 | fclose(file); |
| 94 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 95 | } |
| 96 | |
// First-field keywords, in the same order as the LineType values.
// The first two entries have no keyword; readLine() starts matching
// at the entry after EMPTY_LINE.
static const char *lineTypeStrings[]={
    nullptr, nullptr,
    "ucd", "property", "binary", "value",
    "defaults", "block", "cp", "unassigned",
    "algnamesrange"
};
| 111 | |
| 112 | PreparsedUCD::LineType |
| 113 | PreparsedUCD::readLine(UErrorCode &errorCode) { |
| 114 | if(U_FAILURE(errorCode)) { return NO_LINE; } |
| 115 | // Select the next available line buffer. |
| 116 | while(!isLineBufferAvailable(lineIndex)) { |
| 117 | ++lineIndex; |
| 118 | if (lineIndex == kNumLineBuffers) { |
| 119 | lineIndex = 0; |
| 120 | } |
| 121 | } |
| 122 | char *line=lines[lineIndex]; |
| 123 | *line=0; |
| 124 | lineLimit=fieldLimit=line; |
| 125 | lineType=NO_LINE; |
| 126 | char *result=fgets(line, sizeof(lines[0]), file); |
| 127 | if(result==NULL) { |
| 128 | if(ferror(file)) { |
| 129 | perror("error reading preparsed UCD"); |
| 130 | fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); |
| 131 | errorCode=U_FILE_ACCESS_ERROR; |
| 132 | } |
| 133 | return NO_LINE; |
| 134 | } |
| 135 | ++lineNumber; |
| 136 | if(*line=='#') { |
| 137 | fieldLimit=strchr(line, 0); |
| 138 | return lineType=EMPTY_LINE; |
| 139 | } |
| 140 | // Remove trailing /r/n. |
| 141 | char c; |
| 142 | char *limit=strchr(line, 0); |
| 143 | while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } |
| 144 | // Remove trailing white space. |
| 145 | while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } |
| 146 | *limit=0; |
| 147 | lineLimit=limit; |
| 148 | if(line==limit) { |
| 149 | fieldLimit=limit; |
| 150 | return lineType=EMPTY_LINE; |
| 151 | } |
| 152 | // Split by ';'. |
| 153 | char *semi=line; |
| 154 | while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; } |
| 155 | fieldLimit=strchr(line, 0); |
| 156 | // Determine the line type. |
| 157 | int32_t type; |
| 158 | for(type=EMPTY_LINE+1;; ++type) { |
| 159 | if(type==LINE_TYPE_COUNT) { |
| 160 | fprintf(stderr, |
| 161 | "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n", |
| 162 | line, (long)lineNumber); |
| 163 | errorCode=U_PARSE_ERROR; |
| 164 | return NO_LINE; |
| 165 | } |
| 166 | if(0==strcmp(line, lineTypeStrings[type])) { |
| 167 | break; |
| 168 | } |
| 169 | } |
| 170 | lineType=(LineType)type; |
| 171 | if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) { |
| 172 | u_versionFromString(ucdVersion, fieldLimit+1); |
| 173 | } |
| 174 | return lineType; |
| 175 | } |
| 176 | |
| 177 | const char * |
| 178 | PreparsedUCD::firstField() { |
| 179 | char *field=lines[lineIndex]; |
| 180 | fieldLimit=strchr(field, 0); |
| 181 | return field; |
| 182 | } |
| 183 | |
| 184 | const char * |
| 185 | PreparsedUCD::nextField() { |
| 186 | if(fieldLimit==lineLimit) { return NULL; } |
| 187 | char *field=fieldLimit+1; |
| 188 | fieldLimit=strchr(field, 0); |
| 189 | return field; |
| 190 | } |
| 191 | |
| 192 | const UniProps * |
| 193 | PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) { |
| 194 | if(U_FAILURE(errorCode)) { return NULL; } |
| 195 | newValues.clear(); |
| 196 | if(!lineHasPropertyValues()) { |
| 197 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 198 | return NULL; |
| 199 | } |
| 200 | firstField(); |
| 201 | const char *field=nextField(); |
| 202 | if(field==NULL) { |
| 203 | // No range field after the type. |
| 204 | fprintf(stderr, |
| 205 | "error in preparsed UCD: missing default/block/cp range field " |
| 206 | "(no second field) on line %ld\n", |
| 207 | (long)lineNumber); |
| 208 | errorCode=U_PARSE_ERROR; |
| 209 | return NULL; |
| 210 | } |
| 211 | UChar32 start, end; |
| 212 | if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; } |
| 213 | UniProps *props; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 214 | UBool insideBlock=false; // true if cp or unassigned range inside the block range. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 215 | switch(lineType) { |
| 216 | case DEFAULTS_LINE: |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 217 | // Should occur before any block/cp/unassigned line. |
| 218 | if(blockLineIndex>=0) { |
| 219 | fprintf(stderr, |
| 220 | "error in preparsed UCD: default line %ld after one or more block lines\n", |
| 221 | (long)lineNumber); |
| 222 | errorCode=U_PARSE_ERROR; |
| 223 | return NULL; |
| 224 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 225 | if(defaultLineIndex>=0) { |
| 226 | fprintf(stderr, |
| 227 | "error in preparsed UCD: second line with default properties on line %ld\n", |
| 228 | (long)lineNumber); |
| 229 | errorCode=U_PARSE_ERROR; |
| 230 | return NULL; |
| 231 | } |
| 232 | if(start!=0 || end!=0x10ffff) { |
| 233 | fprintf(stderr, |
| 234 | "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", |
| 235 | field, (long)lineNumber); |
| 236 | errorCode=U_PARSE_ERROR; |
| 237 | return NULL; |
| 238 | } |
| 239 | props=&defaultProps; |
| 240 | defaultLineIndex=lineIndex; |
| 241 | break; |
| 242 | case BLOCK_LINE: |
| 243 | blockProps=defaultProps; // Block inherits default properties. |
| 244 | props=&blockProps; |
| 245 | blockLineIndex=lineIndex; |
| 246 | break; |
| 247 | case CP_LINE: |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 248 | case UNASSIGNED_LINE: |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 249 | if(blockProps.start<=start && end<=blockProps.end) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 250 | insideBlock=true; |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 251 | if(lineType==CP_LINE) { |
| 252 | // Code point range fully inside the last block inherits the block properties. |
| 253 | cpProps=blockProps; |
| 254 | } else { |
| 255 | // Unassigned line inside the block is based on default properties |
| 256 | // which override block properties. |
| 257 | cpProps=defaultProps; |
| 258 | newValues=blockValues; |
| 259 | // Except, it inherits the one blk=Block property. |
| 260 | int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START; |
| 261 | cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex]; |
| 262 | newValues.remove((UChar32)UCHAR_BLOCK); |
| 263 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 264 | } else if(start>blockProps.end || end<blockProps.start) { |
| 265 | // Code point range fully outside the last block inherits the default properties. |
| 266 | cpProps=defaultProps; |
| 267 | } else { |
| 268 | // Code point range partially overlapping with the last block is illegal. |
| 269 | fprintf(stderr, |
| 270 | "error in preparsed UCD: cp range %s on line %ld only " |
| 271 | "partially overlaps with block range %04lX..%04lX\n", |
| 272 | field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end); |
| 273 | errorCode=U_PARSE_ERROR; |
| 274 | return NULL; |
| 275 | } |
| 276 | props=&cpProps; |
| 277 | break; |
| 278 | default: |
| 279 | // Will not occur because of the range check above. |
| 280 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 281 | return NULL; |
| 282 | } |
| 283 | props->start=start; |
| 284 | props->end=end; |
| 285 | while((field=nextField())!=NULL) { |
| 286 | if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; } |
| 287 | } |
Jungshik Shin | b318966 | 2017-11-07 11:18:34 -0800 | [diff] [blame] | 288 | if(lineType==BLOCK_LINE) { |
| 289 | blockValues=newValues; |
| 290 | } else if(lineType==UNASSIGNED_LINE && insideBlock) { |
| 291 | // Unset newValues for values that are the same as the block values. |
| 292 | for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) { |
| 293 | if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) { |
| 294 | newValues.remove(prop); |
| 295 | } |
| 296 | } |
| 297 | for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) { |
| 298 | int32_t index=prop-UCHAR_INT_START; |
| 299 | if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) { |
| 300 | newValues.remove(prop); |
| 301 | } |
| 302 | } |
| 303 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 304 | return props; |
| 305 | } |
| 306 | |
| 307 | static const struct { |
| 308 | const char *name; |
| 309 | int32_t prop; |
| 310 | } ppucdProperties[]={ |
| 311 | { "Name_Alias", PPUCD_NAME_ALIAS }, |
| 312 | { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, |
| 313 | { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } |
| 314 | }; |
| 315 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 316 | // Returns true for "ok to continue parsing fields". |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 317 | UBool |
| 318 | PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, |
| 319 | UErrorCode &errorCode) { |
| 320 | CharString pBuffer; |
| 321 | const char *p=field; |
| 322 | const char *v=strchr(p, '='); |
| 323 | int binaryValue; |
| 324 | if(*p=='-') { |
| 325 | if(v!=NULL) { |
| 326 | fprintf(stderr, |
| 327 | "error in preparsed UCD: mix of binary-property-no and " |
| 328 | "enum-property syntax '%s' on line %ld\n", |
| 329 | field, (long)lineNumber); |
| 330 | errorCode=U_PARSE_ERROR; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 331 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 332 | } |
| 333 | binaryValue=0; |
| 334 | ++p; |
| 335 | } else if(v==NULL) { |
| 336 | binaryValue=1; |
| 337 | } else { |
| 338 | binaryValue=-1; |
| 339 | // Copy out the property name rather than modifying the field (writing a NUL). |
| 340 | pBuffer.append(p, (int32_t)(v-p), errorCode); |
| 341 | p=pBuffer.data(); |
| 342 | ++v; |
| 343 | } |
| 344 | int32_t prop=pnames->getPropertyEnum(p); |
| 345 | if(prop<0) { |
| 346 | for(int32_t i=0;; ++i) { |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 347 | if(i==UPRV_LENGTHOF(ppucdProperties)) { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 348 | // Ignore unknown property names. |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 349 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 350 | } |
| 351 | if(0==uprv_stricmp(p, ppucdProperties[i].name)) { |
| 352 | prop=ppucdProperties[i].prop; |
| 353 | U_ASSERT(prop>=0); |
| 354 | break; |
| 355 | } |
| 356 | } |
| 357 | } |
| 358 | if(prop<UCHAR_BINARY_LIMIT) { |
| 359 | if(binaryValue>=0) { |
| 360 | props.binProps[prop]=(UBool)binaryValue; |
| 361 | } else { |
| 362 | // No binary value for a binary property. |
| 363 | fprintf(stderr, |
| 364 | "error in preparsed UCD: enum-property syntax '%s' " |
| 365 | "for binary property on line %ld\n", |
| 366 | field, (long)lineNumber); |
| 367 | errorCode=U_PARSE_ERROR; |
| 368 | } |
| 369 | } else if(binaryValue>=0) { |
| 370 | // Binary value for a non-binary property. |
| 371 | fprintf(stderr, |
| 372 | "error in preparsed UCD: binary-property syntax '%s' " |
| 373 | "for non-binary property on line %ld\n", |
| 374 | field, (long)lineNumber); |
| 375 | errorCode=U_PARSE_ERROR; |
| 376 | } else if (prop < UCHAR_INT_START) { |
| 377 | fprintf(stderr, |
| 378 | "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", |
| 379 | prop, (long)lineNumber); |
| 380 | errorCode=U_PARSE_ERROR; |
| 381 | } else if(prop<UCHAR_INT_LIMIT) { |
| 382 | int32_t value=pnames->getPropertyValueEnum(prop, v); |
| 383 | if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { |
| 384 | // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. |
| 385 | char *end; |
| 386 | unsigned long ccc=uprv_strtoul(v, &end, 10); |
| 387 | if(v<end && *end==0 && ccc<=254) { |
| 388 | value=(int32_t)ccc; |
| 389 | } |
| 390 | } |
| 391 | if(value==UCHAR_INVALID_CODE) { |
| 392 | fprintf(stderr, |
| 393 | "error in preparsed UCD: '%s' is not a valid value on line %ld\n", |
| 394 | field, (long)lineNumber); |
| 395 | errorCode=U_PARSE_ERROR; |
| 396 | } else { |
| 397 | props.intProps[prop-UCHAR_INT_START]=value; |
| 398 | } |
| 399 | } else if(*v=='<') { |
| 400 | // Do not parse default values like <code point>, just set null values. |
| 401 | switch(prop) { |
| 402 | case UCHAR_BIDI_MIRRORING_GLYPH: |
| 403 | props.bmg=U_SENTINEL; |
| 404 | break; |
| 405 | case UCHAR_BIDI_PAIRED_BRACKET: |
| 406 | props.bpb=U_SENTINEL; |
| 407 | break; |
| 408 | case UCHAR_SIMPLE_CASE_FOLDING: |
| 409 | props.scf=U_SENTINEL; |
| 410 | break; |
| 411 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
| 412 | props.slc=U_SENTINEL; |
| 413 | break; |
| 414 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
| 415 | props.stc=U_SENTINEL; |
| 416 | break; |
| 417 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
| 418 | props.suc=U_SENTINEL; |
| 419 | break; |
| 420 | case UCHAR_CASE_FOLDING: |
| 421 | props.cf.remove(); |
| 422 | break; |
| 423 | case UCHAR_LOWERCASE_MAPPING: |
| 424 | props.lc.remove(); |
| 425 | break; |
| 426 | case UCHAR_TITLECASE_MAPPING: |
| 427 | props.tc.remove(); |
| 428 | break; |
| 429 | case UCHAR_UPPERCASE_MAPPING: |
| 430 | props.uc.remove(); |
| 431 | break; |
| 432 | case UCHAR_SCRIPT_EXTENSIONS: |
| 433 | props.scx.clear(); |
| 434 | break; |
| 435 | default: |
| 436 | fprintf(stderr, |
| 437 | "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", |
| 438 | field, (long)lineNumber); |
| 439 | errorCode=U_PARSE_ERROR; |
| 440 | } |
| 441 | } else { |
| 442 | char c; |
| 443 | switch(prop) { |
| 444 | case UCHAR_NUMERIC_VALUE: |
| 445 | props.numericValue=v; |
| 446 | c=*v; |
| 447 | if('0'<=c && c<='9' && v[1]==0) { |
| 448 | props.digitValue=c-'0'; |
| 449 | } else { |
| 450 | props.digitValue=-1; |
| 451 | } |
| 452 | break; |
| 453 | case UCHAR_NAME: |
| 454 | props.name=v; |
| 455 | break; |
| 456 | case UCHAR_AGE: |
| 457 | u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. |
| 458 | break; |
| 459 | case UCHAR_BIDI_MIRRORING_GLYPH: |
| 460 | props.bmg=parseCodePoint(v, errorCode); |
| 461 | break; |
| 462 | case UCHAR_BIDI_PAIRED_BRACKET: |
| 463 | props.bpb=parseCodePoint(v, errorCode); |
| 464 | break; |
| 465 | case UCHAR_SIMPLE_CASE_FOLDING: |
| 466 | props.scf=parseCodePoint(v, errorCode); |
| 467 | break; |
| 468 | case UCHAR_SIMPLE_LOWERCASE_MAPPING: |
| 469 | props.slc=parseCodePoint(v, errorCode); |
| 470 | break; |
| 471 | case UCHAR_SIMPLE_TITLECASE_MAPPING: |
| 472 | props.stc=parseCodePoint(v, errorCode); |
| 473 | break; |
| 474 | case UCHAR_SIMPLE_UPPERCASE_MAPPING: |
| 475 | props.suc=parseCodePoint(v, errorCode); |
| 476 | break; |
| 477 | case UCHAR_CASE_FOLDING: |
| 478 | parseString(v, props.cf, errorCode); |
| 479 | break; |
| 480 | case UCHAR_LOWERCASE_MAPPING: |
| 481 | parseString(v, props.lc, errorCode); |
| 482 | break; |
| 483 | case UCHAR_TITLECASE_MAPPING: |
| 484 | parseString(v, props.tc, errorCode); |
| 485 | break; |
| 486 | case UCHAR_UPPERCASE_MAPPING: |
| 487 | parseString(v, props.uc, errorCode); |
| 488 | break; |
| 489 | case PPUCD_NAME_ALIAS: |
| 490 | props.nameAlias=v; |
| 491 | break; |
| 492 | case PPUCD_CONDITIONAL_CASE_MAPPINGS: |
| 493 | case PPUCD_TURKIC_CASE_FOLDING: |
| 494 | // No need to parse their values: They are hardcoded in the runtime library. |
| 495 | break; |
| 496 | case UCHAR_SCRIPT_EXTENSIONS: |
| 497 | parseScriptExtensions(v, props.scx, errorCode); |
| 498 | break; |
| 499 | default: |
| 500 | // Ignore unhandled properties. |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 501 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 502 | } |
| 503 | } |
| 504 | if(U_SUCCESS(errorCode)) { |
| 505 | newValues.add((UChar32)prop); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 506 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 507 | } else { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 508 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 509 | } |
| 510 | } |
| 511 | |
| 512 | UBool |
| 513 | PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 514 | if(U_FAILURE(errorCode)) { return false; } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 515 | if(lineType!=ALG_NAMES_RANGE_LINE) { |
| 516 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 517 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 518 | } |
| 519 | firstField(); |
| 520 | const char *field=nextField(); |
| 521 | if(field==NULL) { |
| 522 | // No range field after the type. |
| 523 | fprintf(stderr, |
| 524 | "error in preparsed UCD: missing algnamesrange range field " |
| 525 | "(no second field) on line %ld\n", |
| 526 | (long)lineNumber); |
| 527 | errorCode=U_PARSE_ERROR; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 528 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 529 | } |
| 530 | return parseCodePointRange(field, start, end, errorCode); |
| 531 | } |
| 532 | |
| 533 | UChar32 |
| 534 | PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { |
| 535 | char *end; |
| 536 | uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); |
| 537 | if(end<=s || *end!=0 || value>=0x110000) { |
| 538 | fprintf(stderr, |
| 539 | "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", |
| 540 | s, (long)lineNumber); |
| 541 | errorCode=U_PARSE_ERROR; |
| 542 | return U_SENTINEL; |
| 543 | } |
| 544 | return (UChar32)value; |
| 545 | } |
| 546 | |
| 547 | UBool |
| 548 | PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { |
| 549 | uint32_t st, e; |
| 550 | u_parseCodePointRange(s, &st, &e, &errorCode); |
| 551 | if(U_FAILURE(errorCode)) { |
| 552 | fprintf(stderr, |
| 553 | "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", |
| 554 | s, (long)lineNumber); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 555 | return false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 556 | } |
| 557 | start=(UChar32)st; |
| 558 | end=(UChar32)e; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 559 | return true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 560 | } |
| 561 | |
| 562 | void |
| 563 | PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 564 | UChar *buffer=toUCharPtr(uni.getBuffer(-1)); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 565 | int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); |
| 566 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 567 | errorCode=U_ZERO_ERROR; |
| 568 | uni.releaseBuffer(0); |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 569 | buffer=toUCharPtr(uni.getBuffer(length)); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 570 | length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); |
| 571 | } |
| 572 | uni.releaseBuffer(length); |
| 573 | if(U_FAILURE(errorCode)) { |
| 574 | fprintf(stderr, |
| 575 | "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", |
| 576 | s, (long)lineNumber); |
| 577 | } |
| 578 | } |
| 579 | |
| 580 | void |
| 581 | PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { |
| 582 | if(U_FAILURE(errorCode)) { return; } |
| 583 | scx.clear(); |
| 584 | CharString scString; |
| 585 | for(;;) { |
| 586 | const char *scs; |
| 587 | const char *scLimit=strchr(s, ' '); |
| 588 | if(scLimit!=NULL) { |
| 589 | scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); |
| 590 | if(U_FAILURE(errorCode)) { return; } |
| 591 | } else { |
| 592 | scs=s; |
| 593 | } |
| 594 | int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); |
| 595 | if(script==UCHAR_INVALID_CODE) { |
| 596 | fprintf(stderr, |
| 597 | "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", |
| 598 | scs, (long)lineNumber); |
| 599 | errorCode=U_PARSE_ERROR; |
| 600 | return; |
| 601 | } else if(scx.contains(script)) { |
| 602 | fprintf(stderr, |
| 603 | "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", |
| 604 | scs, (long)lineNumber); |
| 605 | errorCode=U_PARSE_ERROR; |
| 606 | return; |
| 607 | } else { |
| 608 | scx.add(script); |
| 609 | } |
| 610 | if(scLimit!=NULL) { |
| 611 | s=scLimit+1; |
| 612 | } else { |
| 613 | break; |
| 614 | } |
| 615 | } |
| 616 | if(scx.isEmpty()) { |
| 617 | fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); |
| 618 | errorCode=U_PARSE_ERROR; |
| 619 | } |
| 620 | } |
| 621 | |
| 622 | U_NAMESPACE_END |