blob: 2c5d49524369b225be756802a34fbcceb6b9c7c0 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4*******************************************************************************
5*
Jungshik Shin70f82502016-01-29 00:32:36 -08006* Copyright (C) 1998-2015, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10*
11* File parse.cpp
12*
13* Modification History:
14*
15* Date Name Description
16* 05/26/99 stephen Creation.
17* 02/25/00 weiv Overhaul to write udata
18* 5/10/01 Ram removed ustdio dependency
19* 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten
20*******************************************************************************
21*/
22
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080023// Safer use of UnicodeString.
Frank Tang1f164ee2022-11-08 12:31:27 -080024#include <cstdint>
25#include "unicode/umachine.h"
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080026#ifndef UNISTR_FROM_CHAR_EXPLICIT
27# define UNISTR_FROM_CHAR_EXPLICIT explicit
28#endif
29
30// Less important, but still a good idea.
31#ifndef UNISTR_FROM_STRING_EXPLICIT
32# define UNISTR_FROM_STRING_EXPLICIT explicit
33#endif
34
Jungshik Shin70f82502016-01-29 00:32:36 -080035#include <assert.h>
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000036#include "parse.h"
37#include "errmsg.h"
38#include "uhash.h"
39#include "cmemory.h"
40#include "cstring.h"
41#include "uinvchar.h"
42#include "read.h"
43#include "ustr.h"
44#include "reslist.h"
45#include "rbt_pars.h"
46#include "genrb.h"
Frank Tang1f164ee2022-11-08 12:31:27 -080047#include "unicode/normalizer2.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080048#include "unicode/stringpiece.h"
49#include "unicode/unistr.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000050#include "unicode/ustring.h"
51#include "unicode/uscript.h"
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080052#include "unicode/utf16.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000053#include "unicode/putil.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080054#include "charstr.h"
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080055#include "collationbuilder.h"
56#include "collationdata.h"
57#include "collationdatareader.h"
58#include "collationdatawriter.h"
59#include "collationfastlatinbuilder.h"
60#include "collationinfo.h"
61#include "collationroot.h"
62#include "collationruleparser.h"
63#include "collationtailoring.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000064#include <stdio.h>
Frank Tang1f164ee2022-11-08 12:31:27 -080065#include "writesrc.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000066
67/* Number of tokens to read ahead of the current stream position */
68#define MAX_LOOKAHEAD 3
69
70#define CR 0x000D
71#define LF 0x000A
72#define SPACE 0x0020
73#define TAB 0x0009
74#define ESCAPE 0x005C
75#define HASH 0x0023
76#define QUOTE 0x0027
77#define ZERO 0x0030
78#define STARTCOMMAND 0x005B
79#define ENDCOMMAND 0x005D
80#define OPENSQBRACKET 0x005B
81#define CLOSESQBRACKET 0x005D
82
Frank Tang1f164ee2022-11-08 12:31:27 -080083#define ICU4X_DIACRITIC_BASE 0x0300
84#define ICU4X_DIACRITIC_LIMIT 0x034F
85
Jungshik Shin70f82502016-01-29 00:32:36 -080086using icu::CharString;
87using icu::LocalMemory;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080088using icu::LocalPointer;
Jungshik Shin70f82502016-01-29 00:32:36 -080089using icu::LocalUCHARBUFPointer;
90using icu::StringPiece;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -080091using icu::UnicodeString;
92
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000093struct Lookahead
94{
95 enum ETokenType type;
96 struct UString value;
97 struct UString comment;
98 uint32_t line;
99};
100
101/* keep in sync with token defines in read.h */
102const char *tokenNames[TOK_TOKEN_COUNT] =
103{
104 "string", /* A string token, such as "MonthNames" */
105 "'{'", /* An opening brace character */
106 "'}'", /* A closing brace character */
107 "','", /* A comma */
108 "':'", /* A colon */
109
110 "<end of file>", /* End of the file has been reached successfully */
111 "<end of line>"
112};
113
114/* Just to store "TRUE" */
115//static const UChar trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000};
116
117typedef struct {
118 struct Lookahead lookahead[MAX_LOOKAHEAD + 1];
119 uint32_t lookaheadPosition;
120 UCHARBUF *buffer;
121 struct SRBRoot *bundle;
122 const char *inputdir;
123 uint32_t inputdirLength;
124 const char *outputdir;
125 uint32_t outputdirLength;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800126 const char *filename;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000127 UBool makeBinaryCollation;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800128 UBool omitCollationRules;
Frank Tang1f164ee2022-11-08 12:31:27 -0800129 UBool icu4xMode;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000130} ParseState;
131
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000132typedef struct SResource *
133ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status);
134
135static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status);
136
137/* The nature of the lookahead buffer:
138 There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides
139 MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value.
140 When getToken is called, the current pointer is moved to the next slot and the
141 old slot is filled with the next token from the reader by calling getNextToken.
142 The token values are stored in the slot, which means that token values don't
143 survive a call to getToken, ie.
144
145 UString *value;
146
147 getToken(&value, NULL, status);
148 getToken(NULL, NULL, status); bad - value is now a different string
149*/
150static void
151initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status)
152{
153 static uint32_t initTypeStrings = 0;
154 uint32_t i;
155
156 if (!initTypeStrings)
157 {
158 initTypeStrings = 1;
159 }
160
161 state->lookaheadPosition = 0;
162 state->buffer = buf;
163
164 resetLineNumber();
165
166 for (i = 0; i < MAX_LOOKAHEAD; i++)
167 {
168 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
169 if (U_FAILURE(*status))
170 {
171 return;
172 }
173 }
174
175 *status = U_ZERO_ERROR;
176}
177
178static void
179cleanupLookahead(ParseState* state)
180{
181 uint32_t i;
182 for (i = 0; i <= MAX_LOOKAHEAD; i++)
183 {
184 ustr_deinit(&state->lookahead[i].value);
185 ustr_deinit(&state->lookahead[i].comment);
186 }
187
188}
189
190static enum ETokenType
191getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status)
192{
193 enum ETokenType result;
194 uint32_t i;
195
196 result = state->lookahead[state->lookaheadPosition].type;
197
198 if (tokenValue != NULL)
199 {
200 *tokenValue = &state->lookahead[state->lookaheadPosition].value;
201 }
202
203 if (linenumber != NULL)
204 {
205 *linenumber = state->lookahead[state->lookaheadPosition].line;
206 }
207
208 if (comment != NULL)
209 {
210 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
211 }
212
213 i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1);
214 state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1);
215 ustr_setlen(&state->lookahead[i].comment, 0, status);
216 ustr_setlen(&state->lookahead[i].value, 0, status);
217 state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status);
218
219 /* printf("getToken, returning %s\n", tokenNames[result]); */
220
221 return result;
222}
223
224static enum ETokenType
225peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status)
226{
227 uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1);
228
229 if (U_FAILURE(*status))
230 {
231 return TOK_ERROR;
232 }
233
234 if (lookaheadCount >= MAX_LOOKAHEAD)
235 {
236 *status = U_INTERNAL_PROGRAM_ERROR;
237 return TOK_ERROR;
238 }
239
240 if (tokenValue != NULL)
241 {
242 *tokenValue = &state->lookahead[i].value;
243 }
244
245 if (linenumber != NULL)
246 {
247 *linenumber = state->lookahead[i].line;
248 }
249
250 if(comment != NULL){
251 ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status);
252 }
253
254 return state->lookahead[i].type;
255}
256
257static void
258expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status)
259{
260 uint32_t line;
261
262 enum ETokenType token = getToken(state, tokenValue, comment, &line, status);
263
264 if (linenumber != NULL)
265 {
266 *linenumber = line;
267 }
268
269 if (U_FAILURE(*status))
270 {
271 return;
272 }
273
274 if (token != expectedToken)
275 {
276 *status = U_INVALID_FORMAT_ERROR;
277 error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]);
278 }
279 else
280 {
281 *status = U_ZERO_ERROR;
282 }
283}
284
Frank Tangb8696612019-10-25 14:58:21 -0700285static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment,
286 int32_t &stringLength, UErrorCode *status)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000287{
288 struct UString *tokenValue;
289 char *result;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000290
291 expect(state, TOK_STRING, &tokenValue, comment, line, status);
292
293 if (U_FAILURE(*status))
294 {
295 return NULL;
296 }
297
Frank Tangb8696612019-10-25 14:58:21 -0700298 if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000299 *status = U_INVALID_FORMAT_ERROR;
300 error(*line, "invariant characters required for table keys, binary data, etc.");
301 return NULL;
302 }
303
Frank Tangb8696612019-10-25 14:58:21 -0700304 result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000305
306 if (result == NULL)
307 {
308 *status = U_MEMORY_ALLOCATION_ERROR;
309 return NULL;
310 }
311
Frank Tangb8696612019-10-25 14:58:21 -0700312 u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1);
313 stringLength = tokenValue->fLength;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000314 return result;
315}
316
317static struct SResource *
318parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
319{
320 struct SResource *result = NULL;
321 struct UString *tokenValue;
322 FileStream *file = NULL;
323 char filename[256] = { '\0' };
324 char cs[128] = { '\0' };
325 uint32_t line;
Frank Tang1f164ee2022-11-08 12:31:27 -0800326 UBool quoted = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000327 UCHARBUF *ucbuf=NULL;
328 UChar32 c = 0;
329 const char* cp = NULL;
330 UChar *pTarget = NULL;
331 UChar *target = NULL;
332 UChar *targetLimit = NULL;
333 int32_t size = 0;
334
335 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
336
337 if(isVerbose()){
338 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
339 }
340
341 if (U_FAILURE(*status))
342 {
343 return NULL;
344 }
345 /* make the filename including the directory */
346 if (state->inputdir != NULL)
347 {
348 uprv_strcat(filename, state->inputdir);
349
350 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
351 {
352 uprv_strcat(filename, U_FILE_SEP_STRING);
353 }
354 }
355
356 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
357
358 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
359
360 if (U_FAILURE(*status))
361 {
362 return NULL;
363 }
364 uprv_strcat(filename, cs);
365
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800366 if(state->omitCollationRules) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000367 return res_none();
368 }
369
Frank Tang1f164ee2022-11-08 12:31:27 -0800370 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000371
372 if (U_FAILURE(*status)) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700373 error(line, "An error occurred while opening the input file %s\n", filename);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000374 return NULL;
375 }
376
377 /* We allocate more space than actually required
378 * since the actual size needed for storing UChars
379 * is not known in UTF-8 byte stream
380 */
381 size = ucbuf_size(ucbuf) + 1;
382 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * size);
383 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
384 target = pTarget;
385 targetLimit = pTarget+size;
386
387 /* read the rules into the buffer */
388 while (target < targetLimit)
389 {
390 c = ucbuf_getc(ucbuf, status);
391 if(c == QUOTE) {
392 quoted = (UBool)!quoted;
393 }
394 /* weiv (06/26/2002): adding the following:
395 * - preserving spaces in commands [...]
396 * - # comments until the end of line
397 */
398 if (c == STARTCOMMAND && !quoted)
399 {
400 /* preserve commands
401 * closing bracket will be handled by the
402 * append at the end of the loop
403 */
404 while(c != ENDCOMMAND) {
405 U_APPEND_CHAR32_ONLY(c, target);
406 c = ucbuf_getc(ucbuf, status);
407 }
408 }
409 else if (c == HASH && !quoted) {
410 /* skip comments */
411 while(c != CR && c != LF) {
412 c = ucbuf_getc(ucbuf, status);
413 }
414 continue;
415 }
416 else if (c == ESCAPE)
417 {
418 c = unescape(ucbuf, status);
419
420 if (c == (UChar32)U_ERR)
421 {
422 uprv_free(pTarget);
423 T_FileStream_close(file);
424 return NULL;
425 }
426 }
427 else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF))
428 {
429 /* ignore spaces carriage returns
430 * and line feed unless in the form \uXXXX
431 */
432 continue;
433 }
434
435 /* Append UChar * after dissembling if c > 0xffff*/
436 if (c != (UChar32)U_EOF)
437 {
438 U_APPEND_CHAR32_ONLY(c, target);
439 }
440 else
441 {
442 break;
443 }
444 }
445
446 /* terminate the string */
447 if(target < targetLimit){
448 *target = 0x0000;
449 }
450
451 result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), NULL, status);
452
453
454 ucbuf_close(ucbuf);
455 uprv_free(pTarget);
456 T_FileStream_close(file);
457
458 return result;
459}
460
461static struct SResource *
462parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status)
463{
464 struct SResource *result = NULL;
465 struct UString *tokenValue;
466 FileStream *file = NULL;
467 char filename[256] = { '\0' };
468 char cs[128] = { '\0' };
469 uint32_t line;
470 UCHARBUF *ucbuf=NULL;
471 const char* cp = NULL;
472 UChar *pTarget = NULL;
473 const UChar *pSource = NULL;
474 int32_t size = 0;
475
476 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
477
478 if(isVerbose()){
479 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
480 }
481
482 if (U_FAILURE(*status))
483 {
484 return NULL;
485 }
486 /* make the filename including the directory */
487 if (state->inputdir != NULL)
488 {
489 uprv_strcat(filename, state->inputdir);
490
491 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
492 {
493 uprv_strcat(filename, U_FILE_SEP_STRING);
494 }
495 }
496
497 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
498
499 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
500
501 if (U_FAILURE(*status))
502 {
503 return NULL;
504 }
505 uprv_strcat(filename, cs);
506
507
Frank Tang1f164ee2022-11-08 12:31:27 -0800508 ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000509
510 if (U_FAILURE(*status)) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700511 error(line, "An error occurred while opening the input file %s\n", filename);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000512 return NULL;
513 }
514
515 /* We allocate more space than actually required
516 * since the actual size needed for storing UChars
517 * is not known in UTF-8 byte stream
518 */
519 pSource = ucbuf_getBuffer(ucbuf, &size, status);
520 pTarget = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1));
521 uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR);
522
523#if !UCONFIG_NO_TRANSLITERATION
524 size = utrans_stripRules(pSource, size, pTarget, status);
525#else
526 size = 0;
527 fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n");
528#endif
529 result = string_open(state->bundle, tag, pTarget, size, NULL, status);
530
531 ucbuf_close(ucbuf);
532 uprv_free(pTarget);
533 T_FileStream_close(file);
534
535 return result;
536}
Jungshik Shin70f82502016-01-29 00:32:36 -0800537static ArrayResource* dependencyArray = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000538
539static struct SResource *
540parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
541{
542 struct SResource *result = NULL;
543 struct SResource *elem = NULL;
544 struct UString *tokenValue;
545 uint32_t line;
546 char filename[256] = { '\0' };
547 char cs[128] = { '\0' };
Frank Tang69c72a62019-04-03 21:41:21 -0700548
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000549 expect(state, TOK_STRING, &tokenValue, NULL, &line, status);
550
551 if(isVerbose()){
552 printf(" %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
553 }
554
555 if (U_FAILURE(*status))
556 {
557 return NULL;
558 }
559 /* make the filename including the directory */
560 if (state->outputdir != NULL)
561 {
562 uprv_strcat(filename, state->outputdir);
563
564 if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR)
565 {
566 uprv_strcat(filename, U_FILE_SEP_STRING);
567 }
568 }
Frank Tang69c72a62019-04-03 21:41:21 -0700569
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000570 u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength);
571
572 if (U_FAILURE(*status))
573 {
574 return NULL;
575 }
576 uprv_strcat(filename, cs);
577 if(!T_FileStream_file_exists(filename)){
578 if(isStrict()){
579 error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
580 }else{
Frank Tang69c72a62019-04-03 21:41:21 -0700581 warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000582 }
583 }
584 if(dependencyArray==NULL){
585 dependencyArray = array_open(state->bundle, "%%DEPENDENCY", NULL, status);
586 }
587 if(tag!=NULL){
588 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
589 }
590 elem = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, comment, status);
591
Jungshik Shin70f82502016-01-29 00:32:36 -0800592 dependencyArray->add(elem);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000593
594 if (U_FAILURE(*status))
595 {
596 return NULL;
597 }
598 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
599 return result;
600}
601static struct SResource *
602parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
603{
604 struct UString *tokenValue;
605 struct SResource *result = NULL;
606
607/* if (tag != NULL && uprv_strcmp(tag, "%%UCARULES") == 0)
608 {
609 return parseUCARules(tag, startline, status);
610 }*/
611 if(isVerbose()){
612 printf(" string %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
613 }
614 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
615
616 if (U_SUCCESS(*status))
617 {
618 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
619 doesn't survive expect either) */
620
621 result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
622 if(U_SUCCESS(*status) && result) {
623 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
624
625 if (U_FAILURE(*status))
626 {
627 res_close(result);
628 return NULL;
629 }
630 }
631 }
632
633 return result;
634}
635
636static struct SResource *
637parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
638{
639 struct UString *tokenValue;
640 struct SResource *result = NULL;
641
642 expect(state, TOK_STRING, &tokenValue, NULL, NULL, status);
643
644 if(isVerbose()){
645 printf(" alias %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
646 }
647
648 if (U_SUCCESS(*status))
649 {
650 /* create the string now - tokenValue doesn't survive a call to getToken (and therefore
651 doesn't survive expect either) */
652
653 result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status);
654
655 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
656
657 if (U_FAILURE(*status))
658 {
659 res_close(result);
660 return NULL;
661 }
662 }
663
664 return result;
665}
666
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800667#if !UCONFIG_NO_COLLATION
668
669namespace {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000670
671static struct SResource* resLookup(struct SResource* res, const char* key){
Jungshik Shin70f82502016-01-29 00:32:36 -0800672 if (res == res_none() || !res->isTable()) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000673 return NULL;
674 }
675
Jungshik Shin70f82502016-01-29 00:32:36 -0800676 TableResource *list = static_cast<TableResource *>(res);
677 SResource *current = list->fFirst;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000678 while (current != NULL) {
679 if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) {
680 return current;
681 }
682 current = current->fNext;
683 }
684 return NULL;
685}
686
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800687class GenrbImporter : public icu::CollationRuleParser::Importer {
688public:
689 GenrbImporter(const char *in, const char *out) : inputDir(in), outputDir(out) {}
690 virtual ~GenrbImporter();
691 virtual void getRules(
692 const char *localeID, const char *collationType,
693 UnicodeString &rules,
Frank Tang3e05d9d2021-11-08 14:04:04 -0800694 const char *&errorReason, UErrorCode &errorCode) override;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800695
696private:
697 const char *inputDir;
698 const char *outputDir;
699};
700
701GenrbImporter::~GenrbImporter() {}
702
703void
704GenrbImporter::getRules(
705 const char *localeID, const char *collationType,
706 UnicodeString &rules,
707 const char *& /*errorReason*/, UErrorCode &errorCode) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800708 CharString filename(localeID, errorCode);
709 for(int32_t i = 0; i < filename.length(); i++){
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000710 if(filename[i] == '-'){
Jungshik Shin70f82502016-01-29 00:32:36 -0800711 filename.data()[i] = '_';
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000712 }
713 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800714 filename.append(".txt", errorCode);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800715 if (U_FAILURE(errorCode)) {
716 return;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000717 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800718 CharString inputDirBuf;
719 CharString openFileName;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800720 if(inputDir == NULL) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800721 const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000722 if (filenameBegin != NULL) {
723 /*
724 * When a filename ../../../data/root.txt is specified,
725 * we presume that the input directory is ../../../data
726 * This is very important when the resource file includes
727 * another file, like UCARules.txt or thaidict.brk.
728 */
Jungshik Shin70f82502016-01-29 00:32:36 -0800729 StringPiece dir = filename.toStringPiece();
730 const char *filenameLimit = filename.data() + filename.length();
731 dir.remove_suffix((int32_t)(filenameLimit - filenameBegin));
732 inputDirBuf.append(dir, errorCode);
733 inputDir = inputDirBuf.data();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000734 }
735 }else{
Jungshik Shin70f82502016-01-29 00:32:36 -0800736 int32_t dirlen = (int32_t)uprv_strlen(inputDir);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000737
Jungshik Shin70f82502016-01-29 00:32:36 -0800738 if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000739 /*
740 * append the input dir to openFileName if the first char in
Jungshik Shin70f82502016-01-29 00:32:36 -0800741 * filename is not file separator char and the last char input directory is not '.'.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000742 * This is to support :
743 * genrb -s. /home/icu/data
744 * genrb -s. icu/data
745 * The user cannot mix notations like
746 * genrb -s. /icu/data --- the absolute path specified. -s redundant
747 * user should use
748 * genrb -s. icu/data --- start from CWD and look in icu/data dir
749 */
Jungshik Shin70f82502016-01-29 00:32:36 -0800750 openFileName.append(inputDir, dirlen, errorCode);
751 if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
752 openFileName.append(U_FILE_SEP_CHAR, errorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000753 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000754 }
755 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800756 openFileName.append(filename, errorCode);
757 if(U_FAILURE(errorCode)) {
758 return;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000759 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800760 // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data());
761 const char* cp = "";
762 LocalUCHARBUFPointer ucbuf(
Frank Tang1f164ee2022-11-08 12:31:27 -0800763 ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode));
Jungshik Shin70f82502016-01-29 00:32:36 -0800764 if(errorCode == U_FILE_ACCESS_ERROR) {
765 fprintf(stderr, "couldn't open file %s\n", openFileName.data());
766 return;
767 }
768 if (ucbuf.isNull() || U_FAILURE(errorCode)) {
Jungshik Shinf61e46d2018-05-04 13:00:45 -0700769 fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode));
Jungshik Shin70f82502016-01-29 00:32:36 -0800770 return;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000771 }
772
773 /* Parse the data into an SRBRoot */
Jungshik Shinb3189662017-11-07 11:18:34 -0800774 LocalPointer<SRBRoot> data(
Frank Tang1f164ee2022-11-08 12:31:27 -0800775 parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode));
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800776 if (U_FAILURE(errorCode)) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800777 return;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800778 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000779
Jungshik Shin70f82502016-01-29 00:32:36 -0800780 struct SResource *root = data->fRoot;
781 struct SResource *collations = resLookup(root, "collations");
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000782 if (collations != NULL) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800783 struct SResource *collation = resLookup(collations, collationType);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000784 if (collation != NULL) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800785 struct SResource *sequence = resLookup(collation, "Sequence");
786 if (sequence != NULL && sequence->isString()) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800787 // No string pointer aliasing so that we need not hold onto the resource bundle.
Jungshik Shin70f82502016-01-29 00:32:36 -0800788 StringResource *sr = static_cast<StringResource *>(sequence);
789 rules = sr->fString;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000790 }
791 }
792 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000793}
794
795// Quick-and-dirty escaping function.
796// Assumes that we are on an ASCII-based platform.
797static void
798escape(const UChar *s, char *buffer) {
799 int32_t length = u_strlen(s);
800 int32_t i = 0;
801 for (;;) {
802 UChar32 c;
803 U16_NEXT(s, i, length, c);
804 if (c == 0) {
805 *buffer = 0;
806 return;
807 } else if (0x20 <= c && c <= 0x7e) {
808 // printable ASCII
809 *buffer++ = (char)c; // assumes ASCII-based platform
810 } else {
811 buffer += sprintf(buffer, "\\u%04X", (int)c);
812 }
813 }
814}
815
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800816} // namespace
817
Frank Tang1f164ee2022-11-08 12:31:27 -0800818static FILE*
819openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) {
820 CharString baseName;
821 baseName.append(name, *status);
822 baseName.append("_", *status);
823 baseName.append(collationType, *status);
824 baseName.append("_", *status);
825 baseName.append(structType, *status);
826
827 CharString outFileName;
828 if (outputdir && *outputdir) {
829 outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status);
830 }
831 outFileName.append(baseName, *status);
832 outFileName.append(".toml", *status);
833 if (U_FAILURE(*status)) {
834 return NULL;
835 }
836
837 FILE* f = fopen(outFileName.data(), "w");
838 if (!f) {
839 *status = U_FILE_ACCESS_ERROR;
840 return NULL;
841 }
842 usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X");
843
844 return f;
845}
846
847static void
848writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) {
849 FILE* f = openTOML(outputdir, name, collationType, "meta", status);
850 if (!f) {
851 return;
852 }
853 // printf("writeCollationMetadataTOML %s %s\n", name, collationType);
854 fprintf(f, "bits = 0x%X\n", metadataBits);
855 fclose(f);
856}
857
858static UChar32
859writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
860 UChar32 limit = ICU4X_DIACRITIC_LIMIT;
861 FILE* f = openTOML(outputdir, name, collationType, "dia", status);
862 if (!f) {
863 return limit;
864 }
865 // printf("writeCollationDiacriticsTOML %s %s\n", name, collationType);
866 uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE];
867 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
868 uint16_t secondary = 0;
869 uint32_t ce32 = data->getCE32(c);
870 if (ce32 == icu::Collation::FALLBACK_CE32) {
871 ce32 = data->base->getCE32(c);
872 }
873 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
874 // These never occur in NFD data
875 } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) {
876 if (uprv_strcmp(name, "root") == 0) {
877 printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c);
878 fclose(f);
879 *status = U_INTERNAL_PROGRAM_ERROR;
880 return limit;
881 }
882 limit = c;
883 break;
884 } else {
885 uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32));
886 if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) {
887 // Not a CE where only the secondary weight differs from the expected
888 // pattern.
889 limit = c;
890 break;
891 }
892 secondary = uint16_t(ce >> 16);
893 }
894 secondaries[c - ICU4X_DIACRITIC_BASE] = secondary;
895
896 }
897 usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n");
898 fclose(f);
899 return limit;
900}
901
902static void
903writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) {
904 FILE* f = openTOML(outputdir, name, collationType, "reord", status);
905 if (!f) {
906 return;
907 }
908 // printf("writeCollationReorderingTOML %s %s\n", name, collationType);
909 fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder);
910 usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n");
911 usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n");
912 fclose(f);
913}
914
915
916static void
917writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
918 FILE* f = openTOML(outputdir, name, collationType, "jamo", status);
919 if (!f) {
920 printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType);
921 return;
922 }
923 uint32_t jamo[0x1200-0x1100];
924 for (UChar32 c = 0x1100; c < 0x1200; ++c) {
925 uint32_t ce32 = data->getCE32(c);
926 if (ce32 == icu::Collation::FALLBACK_CE32) {
927 ce32 = data->base->getCE32(c);
928 }
929 // Can't reject complex CE32s, because search collations have expansions.
930 // These expansions refer to the tailoring, which foils the reuse of the
931 // these jamo tables.
932 // XXX Figure out what to do. Perhaps instead of having Latin mini expansions,
933 // there should be Hangul mini expansions.
934 // XXX in any case, validate that modern jamo are self-contained.
935 jamo[c - 0x1100] = ce32;
936
937 }
938 usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n");
939 fclose(f);
940}
941
942static UBool
943convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) {
944 if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) {
945 // Range entirely in conjoining jamo block.
946 return true;
947 }
948 icu::IcuToolErrorCode status("genrb: convertTrie");
949 umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status);
950 return !U_FAILURE(*status);
951}
952
953static void
954writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) {
955 FILE* f = openTOML(outputdir, name, collationType, "data", status);
956 if (!f) {
957 return;
958 }
959 // printf("writeCollationDataTOML %s %s\n", name, collationType);
960
961 icu::UnicodeSet tailoringSet;
962
963 if (data->base) {
964 tailoringSet.addAll(*(data->unsafeBackwardSet));
965 tailoringSet.removeAll(*(data->base->unsafeBackwardSet));
966 } else {
967 tailoringSet.addAll(*(data->unsafeBackwardSet));
968 }
969
970 // Use the same value for out-of-range and default in the hope of not having to allocate
971 // different blocks, since ICU4X never does out-of-range queries.
972 uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32;
973 icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status));
974
975 utrie2_enum(data->trie, NULL, &convertTrie, builder.getAlias());
976
977 // If the diacritic table was cut short, copy CE32s between the lowered
978 // limit and the max limit from the root to the tailoring. As of June 2022,
979 // no collation in CLDR needs this.
980 for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) {
981 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
982 // These never occur in NFD data.
983 continue;
984 }
985 uint32_t ce32 = data->getCE32(c);
986 if (ce32 == icu::Collation::FALLBACK_CE32) {
987 ce32 = data->base->getCE32(c);
988 umutablecptrie_set(builder.getAlias(), c, ce32, status);
989 }
990 }
991
992 // Ensure that the range covered by the diacritic table isn't duplicated
993 // in the trie.
994 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) {
995 if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) {
996 umutablecptrie_set(builder.getAlias(), c, trieDefault, status);
997 }
998 }
999
1000 icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
1001 builder.getAlias(),
1002 UCPTRIE_TYPE_SMALL,
1003 UCPTRIE_VALUE_BITS_32,
1004 status));
1005 usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n");
1006 usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n");
1007 usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n");
1008 fprintf(f, "[trie]\n");
1009 usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
1010
1011 fclose(f);
1012}
1013
1014static void
1015writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) {
1016 FILE* f = openTOML(outputdir, name, collationType, "prim", status);
1017 if (!f) {
1018 return;
1019 }
1020 // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType);
1021
1022 uint16_t lastPrimaries[4];
1023 for (int32_t i = 0; i < 4; ++i) {
1024 // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one
1025 // back to get a value that fits in 16 bits.
1026 lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16);
1027 }
1028
1029 uint32_t numericPrimary = data->numericPrimary;
1030 if (numericPrimary & 0xFFFFFF) {
1031 printf("Lower 24 bits set in numeric primary");
1032 *status = U_INTERNAL_PROGRAM_ERROR;
1033 return;
1034 }
1035
1036 usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n");
1037 fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24);
1038 fclose(f);
1039}
1040
1041static void
1042writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) {
1043 UBool tailored = false;
1044 UBool tailoredDiacritics = false;
1045 UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0);
1046 UBool reordering = false;
1047 UBool isRoot = uprv_strcmp(name, "root") == 0;
1048 UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT;
1049 if (!data->base && isRoot) {
1050 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1051 if (U_FAILURE(*status)) {
1052 return;
1053 }
1054 writeCollationJamoTOML(outputdir, name, collationType, data, status);
1055 if (U_FAILURE(*status)) {
1056 return;
1057 }
1058 writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status);
1059 if (U_FAILURE(*status)) {
1060 return;
1061 }
1062 } else if (data->base && !lithuanianDotAbove) {
1063 for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) {
1064 if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) {
1065 // These never occur in NFD data.
1066 continue;
1067 }
1068 uint32_t ce32 = data->getCE32(c);
1069 if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) {
1070 tailoredDiacritics = true;
1071 diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status);
1072 if (U_FAILURE(*status)) {
1073 return;
1074 }
1075 break;
1076 }
1077 }
1078 }
1079
1080 if (settings->hasReordering()) {
1081 reordering = true;
1082 // Note: There are duplicate reorderings. Expecting the ICU4X provider
1083 // to take care of deduplication.
1084 writeCollationReorderingTOML(outputdir, name, collationType, settings, status);
1085 if (U_FAILURE(*status)) {
1086 return;
1087 }
1088 }
1089
1090 // Write collation data if either base is non-null or the name is root.
1091 // Languages that only reorder scripts are otherwise root-like and have
1092 // null base.
1093 if (data->base || isRoot) {
1094 tailored = !isRoot;
1095 writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status);
1096 if (U_FAILURE(*status)) {
1097 return;
1098 }
1099 }
1100
1101 uint32_t maxVariable = (uint32_t)settings->getMaxVariable();
1102 if (maxVariable >= 4) {
1103 printf("Max variable out of range");
1104 *status = U_INTERNAL_PROGRAM_ERROR;
1105 return;
1106 }
1107
1108 uint32_t metadataBits = maxVariable;
1109 if (tailored) {
1110 metadataBits |= (1 << 3);
1111 }
1112 if (tailoredDiacritics) {
1113 metadataBits |= (1 << 4);
1114 }
1115 if (reordering) {
1116 metadataBits |= (1 << 5);
1117 }
1118 if (lithuanianDotAbove) {
1119 metadataBits |= (1 << 6);
1120 }
1121 if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) {
1122 metadataBits |= (1 << 7);
1123 }
1124 if (settings->getAlternateHandling() == UCOL_SHIFTED) {
1125 metadataBits |= (1 << 8);
1126 }
1127 switch (settings->getCaseFirst()) {
1128 case UCOL_OFF:
1129 break;
1130 case UCOL_UPPER_FIRST:
1131 metadataBits |= (1 << 9);
1132 metadataBits |= (1 << 10);
1133 break;
1134 case UCOL_LOWER_FIRST:
1135 metadataBits |= (1 << 9);
1136 break;
1137 default:
1138 *status = U_INTERNAL_PROGRAM_ERROR;
1139 return;
1140 }
1141
1142 writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status);
1143}
1144
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001145#endif // !UCONFIG_NO_COLLATION
1146
Jungshik Shin70f82502016-01-29 00:32:36 -08001147static TableResource *
1148addCollation(ParseState* state, TableResource *result, const char *collationType,
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001149 uint32_t startline, UErrorCode *status)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001150{
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001151 // TODO: Use LocalPointer for result, or make caller close it when there is a failure.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001152 struct SResource *member = NULL;
1153 struct UString *tokenValue;
1154 struct UString comment;
1155 enum ETokenType token;
1156 char subtag[1024];
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001157 UnicodeString rules;
Frank Tang1f164ee2022-11-08 12:31:27 -08001158 UBool haveRules = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001159 UVersionInfo version;
1160 uint32_t line;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001161
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001162 /* '{' . (name resource)* '}' */
1163 version[0]=0; version[1]=0; version[2]=0; version[3]=0;
1164
1165 for (;;)
1166 {
1167 ustr_init(&comment);
1168 token = getToken(state, &tokenValue, &comment, &line, status);
1169
1170 if (token == TOK_CLOSE_BRACE)
1171 {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001172 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001173 }
1174
1175 if (token != TOK_STRING)
1176 {
1177 res_close(result);
1178 *status = U_INVALID_FORMAT_ERROR;
1179
1180 if (token == TOK_EOF)
1181 {
1182 error(startline, "unterminated table");
1183 }
1184 else
1185 {
1186 error(line, "Unexpected token %s", tokenNames[token]);
1187 }
1188
1189 return NULL;
1190 }
1191
1192 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1193
1194 if (U_FAILURE(*status))
1195 {
1196 res_close(result);
1197 return NULL;
1198 }
1199
1200 member = parseResource(state, subtag, NULL, status);
1201
1202 if (U_FAILURE(*status))
1203 {
1204 res_close(result);
1205 return NULL;
1206 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001207 if (result == NULL)
1208 {
1209 // Ignore the parsed resources, continue parsing.
1210 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001211 else if (uprv_strcmp(subtag, "Version") == 0 && member->isString())
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001212 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001213 StringResource *sr = static_cast<StringResource *>(member);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001214 char ver[40];
Jungshik Shin70f82502016-01-29 00:32:36 -08001215 int32_t length = sr->length();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001216
Jungshik Shin70f82502016-01-29 00:32:36 -08001217 if (length >= UPRV_LENGTHOF(ver))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001218 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001219 length = UPRV_LENGTHOF(ver) - 1;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001220 }
1221
Jungshik Shin70f82502016-01-29 00:32:36 -08001222 sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001223 u_versionFromString(version, ver);
1224
Jungshik Shin70f82502016-01-29 00:32:36 -08001225 result->add(member, line, *status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001226 member = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001227 }
1228 else if(uprv_strcmp(subtag, "%%CollationBin")==0)
1229 {
1230 /* discard duplicate %%CollationBin if any*/
1231 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001232 else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString())
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001233 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001234 StringResource *sr = static_cast<StringResource *>(member);
1235 rules = sr->fString;
Frank Tang1f164ee2022-11-08 12:31:27 -08001236 haveRules = true;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001237 // Defer building the collator until we have seen
1238 // all sub-elements of the collation table, including the Version.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001239 /* in order to achieve smaller data files, we can direct genrb */
1240 /* to omit collation rules */
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001241 if(!state->omitCollationRules) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001242 result->add(member, line, *status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001243 member = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001244 }
1245 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001246 else // Just copy non-special items.
1247 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001248 result->add(member, line, *status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001249 member = NULL;
1250 }
1251 res_close(member); // TODO: use LocalPointer
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001252 if (U_FAILURE(*status))
1253 {
1254 res_close(result);
1255 return NULL;
1256 }
1257 }
1258
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001259 if (!haveRules) { return result; }
1260
1261#if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO
1262 warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h");
1263 (void)collationType;
1264#else
1265 // CLDR ticket #3949, ICU ticket #8082:
1266 // Do not build collation binary data for for-import-only "private" collation rule strings.
1267 if (uprv_strncmp(collationType, "private-", 8) == 0) {
1268 if(isVerbose()) {
1269 printf("Not building %s~%s collation binary\n", state->filename, collationType);
1270 }
1271 return result;
1272 }
1273
1274 if(!state->makeBinaryCollation) {
1275 if(isVerbose()) {
1276 printf("Not building %s~%s collation binary\n", state->filename, collationType);
1277 }
1278 return result;
1279 }
1280 UErrorCode intStatus = U_ZERO_ERROR;
1281 UParseError parseError;
1282 uprv_memset(&parseError, 0, sizeof(parseError));
1283 GenrbImporter importer(state->inputdir, state->outputdir);
1284 const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus);
1285 if(U_FAILURE(intStatus)) {
1286 error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus));
1287 res_close(result);
1288 return NULL; // TODO: use LocalUResourceBundlePointer for result
1289 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001290 icu::CollationBuilder builder(base, state->icu4xMode, intStatus);
1291 if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) {
1292 builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001293 }
1294 LocalPointer<icu::CollationTailoring> t(
1295 builder.parseAndBuild(rules, version, &importer, &parseError, intStatus));
1296 if(U_FAILURE(intStatus)) {
1297 const char *reason = builder.getErrorReason();
1298 if(reason == NULL) { reason = ""; }
1299 error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s",
1300 state->filename, collationType,
1301 (long)parseError.offset, u_errorName(intStatus), reason);
1302 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1303 // Print pre- and post-context.
1304 char preBuffer[100], postBuffer[100];
1305 escape(parseError.preContext, preBuffer);
1306 escape(parseError.postContext, postBuffer);
1307 error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer);
1308 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001309 if(isStrict() || t.isNull()) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001310 *status = intStatus;
1311 res_close(result);
1312 return NULL;
1313 }
1314 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001315 if (state->icu4xMode) {
1316 char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1));
1317 if (nameWithoutSuffix == NULL) {
1318 *status = U_MEMORY_ALLOCATION_ERROR;
1319 res_close(result);
1320 return NULL;
1321 }
1322 uprv_strcpy(nameWithoutSuffix, state->filename);
1323 *uprv_strrchr(nameWithoutSuffix, '.') = 0;
1324
1325 writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status);
1326 uprv_free(nameWithoutSuffix);
1327 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001328 icu::LocalMemory<uint8_t> buffer;
1329 int32_t capacity = 100000;
1330 uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
1331 if(dest == NULL) {
1332 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1333 (long)capacity);
1334 *status = U_MEMORY_ALLOCATION_ERROR;
1335 res_close(result);
1336 return NULL;
1337 }
1338 int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1];
1339 int32_t totalSize = icu::CollationDataWriter::writeTailoring(
1340 *t, *t->settings, indexes, dest, capacity, intStatus);
1341 if(intStatus == U_BUFFER_OVERFLOW_ERROR) {
1342 intStatus = U_ZERO_ERROR;
1343 capacity = totalSize;
1344 dest = buffer.allocateInsteadAndCopy(capacity);
1345 if(dest == NULL) {
1346 fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
1347 (long)capacity);
1348 *status = U_MEMORY_ALLOCATION_ERROR;
1349 res_close(result);
1350 return NULL;
1351 }
1352 totalSize = icu::CollationDataWriter::writeTailoring(
1353 *t, *t->settings, indexes, dest, capacity, intStatus);
1354 }
1355 if(U_FAILURE(intStatus)) {
1356 fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n",
1357 u_errorName(intStatus));
1358 res_close(result);
1359 return NULL;
1360 }
1361 if(isVerbose()) {
1362 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1363 icu::CollationInfo::printSizes(totalSize, indexes);
Jungshik Shin70f82502016-01-29 00:32:36 -08001364 if(t->settings->hasReordering()) {
1365 printf("%s~%s collation reordering ranges:\n", state->filename, collationType);
1366 icu::CollationInfo::printReorderRanges(
1367 *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength);
1368 }
Jungshik Shin42d50272018-10-24 01:22:09 -07001369#if 0 // debugging output
1370 } else {
1371 printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType);
1372 icu::CollationInfo::printSizes(totalSize, indexes);
1373#endif
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001374 }
1375 struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, NULL, NULL, status);
Jungshik Shin70f82502016-01-29 00:32:36 -08001376 result->add(collationBin, line, *status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001377 if (U_FAILURE(*status)) {
1378 res_close(result);
1379 return NULL;
1380 }
1381#endif
1382 return result;
1383}
1384
1385static UBool
1386keepCollationType(const char * /*type*/) {
Frank Tang1f164ee2022-11-08 12:31:27 -08001387 return true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001388}
1389
1390static struct SResource *
1391parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status)
1392{
Jungshik Shin70f82502016-01-29 00:32:36 -08001393 TableResource *result = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001394 struct SResource *member = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001395 struct UString *tokenValue;
1396 struct UString comment;
1397 enum ETokenType token;
1398 char subtag[1024], typeKeyword[1024];
1399 uint32_t line;
1400
1401 result = table_open(state->bundle, tag, NULL, status);
1402
1403 if (result == NULL || U_FAILURE(*status))
1404 {
1405 return NULL;
1406 }
1407 if(isVerbose()){
1408 printf(" collation elements %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1409 }
1410 if(!newCollation) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001411 return addCollation(state, result, "(no type)", startline, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001412 }
1413 else {
1414 for(;;) {
1415 ustr_init(&comment);
1416 token = getToken(state, &tokenValue, &comment, &line, status);
1417
1418 if (token == TOK_CLOSE_BRACE)
1419 {
1420 return result;
1421 }
1422
1423 if (token != TOK_STRING)
1424 {
1425 res_close(result);
1426 *status = U_INVALID_FORMAT_ERROR;
1427
1428 if (token == TOK_EOF)
1429 {
1430 error(startline, "unterminated table");
1431 }
1432 else
1433 {
1434 error(line, "Unexpected token %s", tokenNames[token]);
1435 }
1436
1437 return NULL;
1438 }
1439
1440 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1441
1442 if (U_FAILURE(*status))
1443 {
1444 res_close(result);
1445 return NULL;
1446 }
1447
1448 if (uprv_strcmp(subtag, "default") == 0)
1449 {
1450 member = parseResource(state, subtag, NULL, status);
1451
1452 if (U_FAILURE(*status))
1453 {
1454 res_close(result);
1455 return NULL;
1456 }
1457
Jungshik Shin70f82502016-01-29 00:32:36 -08001458 result->add(member, line, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001459 }
1460 else
1461 {
1462 token = peekToken(state, 0, &tokenValue, &line, &comment, status);
1463 /* this probably needs to be refactored or recursively use the parser */
1464 /* first we assume that our collation table won't have the explicit type */
1465 /* then, we cannot handle aliases */
1466 if(token == TOK_OPEN_BRACE) {
1467 token = getToken(state, &tokenValue, &comment, &line, status);
Jungshik Shin70f82502016-01-29 00:32:36 -08001468 TableResource *collationRes;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001469 if (keepCollationType(subtag)) {
1470 collationRes = table_open(state->bundle, subtag, NULL, status);
1471 } else {
1472 collationRes = NULL;
1473 }
1474 // need to parse the collation data regardless
1475 collationRes = addCollation(state, collationRes, subtag, startline, status);
1476 if (collationRes != NULL) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001477 result->add(collationRes, startline, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001478 }
1479 } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */
1480 /* we could have a table too */
1481 token = peekToken(state, 1, &tokenValue, &line, &comment, status);
1482 u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1);
1483 if(uprv_strcmp(typeKeyword, "alias") == 0) {
1484 member = parseResource(state, subtag, NULL, status);
1485 if (U_FAILURE(*status))
1486 {
1487 res_close(result);
1488 return NULL;
1489 }
1490
Jungshik Shin70f82502016-01-29 00:32:36 -08001491 result->add(member, line, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001492 } else {
1493 res_close(result);
1494 *status = U_INVALID_FORMAT_ERROR;
1495 return NULL;
1496 }
1497 } else {
1498 res_close(result);
1499 *status = U_INVALID_FORMAT_ERROR;
1500 return NULL;
1501 }
1502 }
1503
1504 /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/
1505
1506 /*expect(TOK_CLOSE_BRACE, NULL, NULL, status);*/
1507
1508 if (U_FAILURE(*status))
1509 {
1510 res_close(result);
1511 return NULL;
1512 }
1513 }
1514 }
1515}
1516
1517/* Necessary, because CollationElements requires the bundle->fRoot member to be present which,
1518 if this weren't special-cased, wouldn't be set until the entire file had been processed. */
1519static struct SResource *
Jungshik Shin70f82502016-01-29 00:32:36 -08001520realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001521{
1522 struct SResource *member = NULL;
1523 struct UString *tokenValue=NULL;
1524 struct UString comment;
1525 enum ETokenType token;
1526 char subtag[1024];
1527 uint32_t line;
Frank Tang1f164ee2022-11-08 12:31:27 -08001528 UBool readToken = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001529
1530 /* '{' . (name resource)* '}' */
1531
1532 if(isVerbose()){
1533 printf(" parsing table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1534 }
1535 for (;;)
1536 {
1537 ustr_init(&comment);
1538 token = getToken(state, &tokenValue, &comment, &line, status);
1539
1540 if (token == TOK_CLOSE_BRACE)
1541 {
Frank Tang7e7574b2021-04-13 21:19:13 -07001542 if (!readToken && isVerbose()) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001543 warning(startline, "Encountered empty table");
1544 }
1545 return table;
1546 }
1547
1548 if (token != TOK_STRING)
1549 {
1550 *status = U_INVALID_FORMAT_ERROR;
1551
1552 if (token == TOK_EOF)
1553 {
1554 error(startline, "unterminated table");
1555 }
1556 else
1557 {
1558 error(line, "unexpected token %s", tokenNames[token]);
1559 }
1560
1561 return NULL;
1562 }
1563
1564 if(uprv_isInvariantUString(tokenValue->fChars, -1)) {
1565 u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1);
1566 } else {
1567 *status = U_INVALID_FORMAT_ERROR;
1568 error(line, "invariant characters required for table keys");
1569 return NULL;
1570 }
1571
1572 if (U_FAILURE(*status))
1573 {
1574 error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status));
1575 return NULL;
1576 }
1577
1578 member = parseResource(state, subtag, &comment, status);
1579
1580 if (member == NULL || U_FAILURE(*status))
1581 {
1582 error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status));
1583 return NULL;
1584 }
1585
Jungshik Shin70f82502016-01-29 00:32:36 -08001586 table->add(member, line, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001587
1588 if (U_FAILURE(*status))
1589 {
1590 error(line, "parse error. Stopped parsing table with %s", u_errorName(*status));
1591 return NULL;
1592 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001593 readToken = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001594 ustr_deinit(&comment);
1595 }
1596
1597 /* not reached */
1598 /* A compiler warning will appear if all paths don't contain a return statement. */
1599/* *status = U_INTERNAL_PROGRAM_ERROR;
1600 return NULL;*/
1601}
1602
1603static struct SResource *
1604parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1605{
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001606 if (tag != NULL && uprv_strcmp(tag, "CollationElements") == 0)
1607 {
Frank Tang1f164ee2022-11-08 12:31:27 -08001608 return parseCollationElements(state, tag, startline, false, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001609 }
1610 if (tag != NULL && uprv_strcmp(tag, "collations") == 0)
1611 {
Frank Tang1f164ee2022-11-08 12:31:27 -08001612 return parseCollationElements(state, tag, startline, true, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001613 }
1614 if(isVerbose()){
1615 printf(" table %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1616 }
1617
Jungshik Shin70f82502016-01-29 00:32:36 -08001618 TableResource *result = table_open(state->bundle, tag, comment, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001619
1620 if (result == NULL || U_FAILURE(*status))
1621 {
1622 return NULL;
1623 }
1624 return realParseTable(state, result, tag, startline, status);
1625}
1626
1627static struct SResource *
1628parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1629{
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001630 struct SResource *member = NULL;
1631 struct UString *tokenValue;
1632 struct UString memberComments;
1633 enum ETokenType token;
Frank Tang1f164ee2022-11-08 12:31:27 -08001634 UBool readToken = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001635
Jungshik Shin70f82502016-01-29 00:32:36 -08001636 ArrayResource *result = array_open(state->bundle, tag, comment, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001637
1638 if (result == NULL || U_FAILURE(*status))
1639 {
1640 return NULL;
1641 }
1642 if(isVerbose()){
1643 printf(" array %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1644 }
1645
1646 ustr_init(&memberComments);
1647
1648 /* '{' . resource [','] '}' */
1649 for (;;)
1650 {
1651 /* reset length */
1652 ustr_setlen(&memberComments, 0, status);
1653
1654 /* check for end of array, but don't consume next token unless it really is the end */
1655 token = peekToken(state, 0, &tokenValue, NULL, &memberComments, status);
1656
1657
1658 if (token == TOK_CLOSE_BRACE)
1659 {
1660 getToken(state, NULL, NULL, NULL, status);
1661 if (!readToken) {
1662 warning(startline, "Encountered empty array");
1663 }
1664 break;
1665 }
1666
1667 if (token == TOK_EOF)
1668 {
1669 res_close(result);
1670 *status = U_INVALID_FORMAT_ERROR;
1671 error(startline, "unterminated array");
1672 return NULL;
1673 }
1674
1675 /* string arrays are a special case */
1676 if (token == TOK_STRING)
1677 {
1678 getToken(state, &tokenValue, &memberComments, NULL, status);
1679 member = string_open(state->bundle, NULL, tokenValue->fChars, tokenValue->fLength, &memberComments, status);
1680 }
1681 else
1682 {
1683 member = parseResource(state, NULL, &memberComments, status);
1684 }
1685
1686 if (member == NULL || U_FAILURE(*status))
1687 {
1688 res_close(result);
1689 return NULL;
1690 }
1691
Jungshik Shin70f82502016-01-29 00:32:36 -08001692 result->add(member);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001693
1694 /* eat optional comma if present */
1695 token = peekToken(state, 0, NULL, NULL, NULL, status);
1696
1697 if (token == TOK_COMMA)
1698 {
1699 getToken(state, NULL, NULL, NULL, status);
1700 }
1701
1702 if (U_FAILURE(*status))
1703 {
1704 res_close(result);
1705 return NULL;
1706 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001707 readToken = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001708 }
1709
1710 ustr_deinit(&memberComments);
1711 return result;
1712}
1713
1714static struct SResource *
1715parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1716{
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001717 enum ETokenType token;
1718 char *string;
1719 int32_t value;
Frank Tang1f164ee2022-11-08 12:31:27 -08001720 UBool readToken = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001721 char *stopstring;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001722 struct UString memberComments;
1723
Jungshik Shin70f82502016-01-29 00:32:36 -08001724 IntVectorResource *result = intvector_open(state->bundle, tag, comment, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001725
1726 if (result == NULL || U_FAILURE(*status))
1727 {
1728 return NULL;
1729 }
1730
1731 if(isVerbose()){
1732 printf(" vector %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1733 }
1734 ustr_init(&memberComments);
1735 /* '{' . string [','] '}' */
1736 for (;;)
1737 {
1738 ustr_setlen(&memberComments, 0, status);
1739
1740 /* check for end of array, but don't consume next token unless it really is the end */
1741 token = peekToken(state, 0, NULL, NULL,&memberComments, status);
1742
1743 if (token == TOK_CLOSE_BRACE)
1744 {
1745 /* it's the end, consume the close brace */
1746 getToken(state, NULL, NULL, NULL, status);
1747 if (!readToken) {
1748 warning(startline, "Encountered empty int vector");
1749 }
1750 ustr_deinit(&memberComments);
1751 return result;
1752 }
1753
Frank Tangb8696612019-10-25 14:58:21 -07001754 int32_t stringLength;
1755 string = getInvariantString(state, NULL, NULL, stringLength, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001756
1757 if (U_FAILURE(*status))
1758 {
1759 res_close(result);
1760 return NULL;
1761 }
1762
1763 /* For handling illegal char in the Intvector */
1764 value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/
Frank Tangb8696612019-10-25 14:58:21 -07001765 int32_t len = (int32_t)(stopstring-string);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001766
Frank Tangb8696612019-10-25 14:58:21 -07001767 if(len==stringLength)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001768 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001769 result->add(value, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001770 uprv_free(string);
1771 token = peekToken(state, 0, NULL, NULL, NULL, status);
1772 }
1773 else
1774 {
1775 uprv_free(string);
1776 *status=U_INVALID_CHAR_FOUND;
1777 }
1778
1779 if (U_FAILURE(*status))
1780 {
1781 res_close(result);
1782 return NULL;
1783 }
1784
1785 /* the comma is optional (even though it is required to prevent the reader from concatenating
1786 consecutive entries) so that a missing comma on the last entry isn't an error */
1787 if (token == TOK_COMMA)
1788 {
1789 getToken(state, NULL, NULL, NULL, status);
1790 }
Frank Tang1f164ee2022-11-08 12:31:27 -08001791 readToken = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001792 }
1793
1794 /* not reached */
1795 /* A compiler warning will appear if all paths don't contain a return statement. */
1796/* intvector_close(result, status);
1797 *status = U_INTERNAL_PROGRAM_ERROR;
1798 return NULL;*/
1799}
1800
1801static struct SResource *
1802parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1803{
Jungshik Shin70f82502016-01-29 00:32:36 -08001804 uint32_t line;
Frank Tangb8696612019-10-25 14:58:21 -07001805 int32_t stringLength;
1806 LocalMemory<char> string(getInvariantString(state, &line, NULL, stringLength, status));
Jungshik Shin70f82502016-01-29 00:32:36 -08001807 if (string.isNull() || U_FAILURE(*status))
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001808 {
1809 return NULL;
1810 }
1811
1812 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001813 if (U_FAILURE(*status))
1814 {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001815 return NULL;
1816 }
1817
1818 if(isVerbose()){
1819 printf(" binary %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1820 }
1821
Frank Tangb8696612019-10-25 14:58:21 -07001822 LocalMemory<uint8_t> value;
1823 int32_t count = 0;
1824 if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == NULL)
1825 {
1826 *status = U_MEMORY_ALLOCATION_ERROR;
1827 return NULL;
1828 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001829
Frank Tangb8696612019-10-25 14:58:21 -07001830 char toConv[3] = {'\0', '\0', '\0'};
1831 for (int32_t i = 0; i < stringLength;)
1832 {
1833 // Skip spaces (which may have been line endings).
1834 char c0 = string[i++];
1835 if (c0 == ' ') { continue; }
1836 if (i == stringLength) {
1837 *status=U_INVALID_CHAR_FOUND;
1838 error(line, "Encountered invalid binary value (odd number of hex digits)");
1839 return NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001840 }
Frank Tangb8696612019-10-25 14:58:21 -07001841 toConv[0] = c0;
1842 toConv[1] = string[i++];
1843
1844 char *stopstring;
1845 value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16);
1846 uint32_t len=(uint32_t)(stopstring-toConv);
1847
1848 if(len!=2)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001849 {
Frank Tangb8696612019-10-25 14:58:21 -07001850 *status=U_INVALID_CHAR_FOUND;
1851 error(line, "Encountered invalid binary value (not all pairs of hex digits)");
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001852 return NULL;
1853 }
1854 }
Frank Tangb8696612019-10-25 14:58:21 -07001855
1856 if (count == 0) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001857 warning(startline, "Encountered empty binary value");
1858 return bin_open(state->bundle, tag, 0, NULL, "", comment, status);
Frank Tangb8696612019-10-25 14:58:21 -07001859 } else {
1860 return bin_open(state->bundle, tag, count, value.getAlias(), NULL, comment, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001861 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001862}
1863
1864static struct SResource *
1865parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status)
1866{
1867 struct SResource *result = NULL;
1868 int32_t value;
1869 char *string;
1870 char *stopstring;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001871
Frank Tangb8696612019-10-25 14:58:21 -07001872 int32_t stringLength;
1873 string = getInvariantString(state, NULL, NULL, stringLength, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001874
1875 if (string == NULL || U_FAILURE(*status))
1876 {
1877 return NULL;
1878 }
1879
1880 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1881
1882 if (U_FAILURE(*status))
1883 {
1884 uprv_free(string);
1885 return NULL;
1886 }
1887
1888 if(isVerbose()){
1889 printf(" integer %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1890 }
1891
Frank Tangb8696612019-10-25 14:58:21 -07001892 if (stringLength == 0)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001893 {
1894 warning(startline, "Encountered empty integer. Default value is 0.");
1895 }
1896
1897 /* Allow integer support for hexdecimal, octal digit and decimal*/
1898 /* and handle illegal char in the integer*/
1899 value = uprv_strtoul(string, &stopstring, 0);
Frank Tangb8696612019-10-25 14:58:21 -07001900 int32_t len = (int32_t)(stopstring-string);
1901 if(len==stringLength)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001902 {
1903 result = int_open(state->bundle, tag, value, comment, status);
1904 }
1905 else
1906 {
1907 *status=U_INVALID_CHAR_FOUND;
1908 }
1909 uprv_free(string);
1910
1911 return result;
1912}
1913
1914static struct SResource *
1915parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1916{
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001917 uint32_t line;
Frank Tangb8696612019-10-25 14:58:21 -07001918 int32_t stringLength;
1919 LocalMemory<char> filename(getInvariantString(state, &line, NULL, stringLength, status));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001920 if (U_FAILURE(*status))
1921 {
1922 return NULL;
1923 }
1924
1925 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1926
1927 if (U_FAILURE(*status))
1928 {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001929 return NULL;
1930 }
1931
1932 if(isVerbose()){
1933 printf(" import %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
1934 }
1935
1936 /* Open the input file for reading */
Jungshik Shin70f82502016-01-29 00:32:36 -08001937 CharString fullname;
1938 if (state->inputdir != NULL) {
1939 fullname.append(state->inputdir, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001940 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001941 fullname.appendPathPart(filename.getAlias(), *status);
1942 if (U_FAILURE(*status)) {
1943 return NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001944 }
1945
Jungshik Shin70f82502016-01-29 00:32:36 -08001946 FileStream *file = T_FileStream_open(fullname.data(), "rb");
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001947 if (file == NULL)
1948 {
Jungshik Shin70f82502016-01-29 00:32:36 -08001949 error(line, "couldn't open input file %s", filename.getAlias());
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001950 *status = U_FILE_ACCESS_ERROR;
1951 return NULL;
1952 }
1953
Jungshik Shin70f82502016-01-29 00:32:36 -08001954 int32_t len = T_FileStream_size(file);
1955 LocalMemory<uint8_t> data;
1956 if(data.allocateInsteadAndCopy(len) == NULL)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001957 {
1958 *status = U_MEMORY_ALLOCATION_ERROR;
1959 T_FileStream_close (file);
1960 return NULL;
1961 }
1962
Jungshik Shin70f82502016-01-29 00:32:36 -08001963 /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001964 T_FileStream_close (file);
1965
Jungshik Shin70f82502016-01-29 00:32:36 -08001966 return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001967}
1968
1969static struct SResource *
1970parseInclude(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status)
1971{
1972 struct SResource *result;
1973 int32_t len=0;
1974 char *filename;
1975 uint32_t line;
1976 UChar *pTarget = NULL;
1977
1978 UCHARBUF *ucbuf;
1979 char *fullname = NULL;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001980 const char* cp = NULL;
1981 const UChar* uBuffer = NULL;
1982
Frank Tangb8696612019-10-25 14:58:21 -07001983 int32_t stringLength;
1984 filename = getInvariantString(state, &line, NULL, stringLength, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001985
1986 if (U_FAILURE(*status))
1987 {
1988 return NULL;
1989 }
1990
1991 expect(state, TOK_CLOSE_BRACE, NULL, NULL, NULL, status);
1992
1993 if (U_FAILURE(*status))
1994 {
1995 uprv_free(filename);
1996 return NULL;
1997 }
1998
1999 if(isVerbose()){
2000 printf(" include %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
2001 }
2002
Frank Tangb8696612019-10-25 14:58:21 -07002003 fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002004 /* test for NULL */
2005 if(fullname == NULL)
2006 {
2007 *status = U_MEMORY_ALLOCATION_ERROR;
2008 uprv_free(filename);
2009 return NULL;
2010 }
2011
2012 if(state->inputdir!=NULL){
2013 if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR)
2014 {
2015
2016 uprv_strcpy(fullname, state->inputdir);
2017
2018 fullname[state->inputdirLength] = U_FILE_SEP_CHAR;
2019 fullname[state->inputdirLength + 1] = '\0';
2020
2021 uprv_strcat(fullname, filename);
2022 }
2023 else
2024 {
2025 uprv_strcpy(fullname, state->inputdir);
2026 uprv_strcat(fullname, filename);
2027 }
2028 }else{
2029 uprv_strcpy(fullname,filename);
2030 }
2031
Frank Tang1f164ee2022-11-08 12:31:27 -08002032 ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002033
2034 if (U_FAILURE(*status)) {
2035 error(line, "couldn't open input file %s\n", filename);
2036 return NULL;
2037 }
2038
2039 uBuffer = ucbuf_getBuffer(ucbuf,&len,status);
2040 result = string_open(state->bundle, tag, uBuffer, len, comment, status);
2041
2042 ucbuf_close(ucbuf);
2043
2044 uprv_free(pTarget);
2045
2046 uprv_free(filename);
2047 uprv_free(fullname);
2048
2049 return result;
2050}
2051
2052
2053
2054
2055
/*
 * UChar spellings of the resource type keywords that can follow ':' in a
 * resource bundle source file.  The constants are only declared here;
 * initParser() initializes each one with a U_STRING_INIT call that repeats
 * the same name, text and length, so the two lists must be kept in step.
 *
 * k_type_bin and k_type_int are not listed in gResourceTypes; they are the
 * alias spellings that parseResourceType() maps to the binary and integer
 * types.
 */
U_STRING_DECL(k_type_string, "string", 6);
U_STRING_DECL(k_type_binary, "binary", 6);
U_STRING_DECL(k_type_bin, "bin", 3);
U_STRING_DECL(k_type_table, "table", 5);
U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 17);
U_STRING_DECL(k_type_int, "int", 3);
U_STRING_DECL(k_type_integer, "integer", 7);
U_STRING_DECL(k_type_array, "array", 5);
U_STRING_DECL(k_type_alias, "alias", 5);
U_STRING_DECL(k_type_intvector, "intvector", 9);
U_STRING_DECL(k_type_import, "import", 6);
U_STRING_DECL(k_type_include, "include", 7);

/* Various non-standard processing plugins that create one or more special resources. */
U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18);
U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18);
U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23);
U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19);
2074
/*
 * Resource types the parser can be asked for.  The numeric values double as
 * indexes into gResourceTypes, so the order here and the order of that table
 * have to match.  RESTYPE_UNKNOWN and RESTYPE_RESERVED are the exclusive lower
 * and upper bounds of the range that parseResourceType() searches.
 */
enum EResourceType
{
    RESTYPE_UNKNOWN = 0,             /* no (or not yet determined) type */
    RESTYPE_STRING,
    RESTYPE_BINARY,
    RESTYPE_TABLE,
    RESTYPE_TABLE_NO_FALLBACK,       /* only valid for the top-level bundle */
    RESTYPE_INTEGER,
    RESTYPE_ARRAY,
    RESTYPE_ALIAS,
    RESTYPE_INTVECTOR,
    RESTYPE_IMPORT,
    RESTYPE_INCLUDE,
    RESTYPE_PROCESS_UCA_RULES,
    RESTYPE_PROCESS_COLLATION,
    RESTYPE_PROCESS_TRANSLITERATOR,
    RESTYPE_PROCESS_DEPENDENCY,
    RESTYPE_RESERVED                 /* end marker, not a real type */
};
2094
/*
 * Per-type parser table, indexed by EResourceType: the rows must stay in the
 * same order as the enumerators.
 *
 * nameUChars is the keyword parseResourceType() compares against the type
 * token; it is NULL for the two sentinel rows, which that search skips.
 * parseFunction is what parseResource() dispatches to; a NULL entry makes
 * parseResource() report an internal "not handled" error for that type.
 */
static struct {
    const char *nameChars; /* only used for debugging */
    const UChar *nameUChars;
    ParseResourceFunction *parseFunction;
} gResourceTypes[] = {
    {"Unknown", NULL, NULL},
    {"string", k_type_string, parseString},
    {"binary", k_type_binary, parseBinary},
    {"table", k_type_table, parseTable},
    {"table(nofallback)", k_type_table_no_fallback, NULL}, /* parseFunction will never be called */
    {"integer", k_type_integer, parseInteger},
    {"array", k_type_array, parseArray},
    {"alias", k_type_alias, parseAlias},
    {"intvector", k_type_intvector, parseIntVector},
    {"import", k_type_import, parseImport},
    {"include", k_type_include, parseInclude},
    {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules},
    {"process(collation)", k_type_plugin_collation, NULL /* not implemented yet */},
    {"process(transliterator)", k_type_plugin_transliterator, parseTransliterator},
    {"process(dependency)", k_type_plugin_dependency, parseDependency},
    {"reserved", NULL, NULL}
};
2117
/*
 * Initializes the k_type_* keyword constants declared with U_STRING_DECL
 * above.  Each U_STRING_INIT call must repeat the name, text and length of
 * its declaration.  parseResourceType() compares tokens against these
 * constants, so this presumably has to run before any parsing starts --
 * NOTE(review): confirm against the callers.
 */
void initParser()
{
    U_STRING_INIT(k_type_string, "string", 6);
    U_STRING_INIT(k_type_binary, "binary", 6);
    U_STRING_INIT(k_type_bin, "bin", 3);
    U_STRING_INIT(k_type_table, "table", 5);
    U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17);
    U_STRING_INIT(k_type_int, "int", 3);
    U_STRING_INIT(k_type_integer, "integer", 7);
    U_STRING_INIT(k_type_array, "array", 5);
    U_STRING_INIT(k_type_alias, "alias", 5);
    U_STRING_INIT(k_type_intvector, "intvector", 9);
    U_STRING_INIT(k_type_import, "import", 6);
    U_STRING_INIT(k_type_include, "include", 7);

    /* the non-standard process(...) plugin types */
    U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18);
    U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18);
    U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23);
    U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19);
}
2138
2139static inline UBool isTable(enum EResourceType type) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002140 return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002141}
2142
2143static enum EResourceType
2144parseResourceType(ParseState* state, UErrorCode *status)
2145{
2146 struct UString *tokenValue;
2147 struct UString comment;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002148 enum EResourceType result = RESTYPE_UNKNOWN;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002149 uint32_t line=0;
2150 ustr_init(&comment);
2151 expect(state, TOK_STRING, &tokenValue, &comment, &line, status);
2152
2153 if (U_FAILURE(*status))
2154 {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002155 return RESTYPE_UNKNOWN;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002156 }
2157
2158 *status = U_ZERO_ERROR;
2159
2160 /* Search for normal types */
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002161 result=RESTYPE_UNKNOWN;
2162 while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002163 if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) {
2164 break;
2165 }
2166 }
2167 /* Now search for the aliases */
2168 if (u_strcmp(tokenValue->fChars, k_type_int) == 0) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002169 result = RESTYPE_INTEGER;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002170 }
2171 else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002172 result = RESTYPE_BINARY;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002173 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002174 else if (result == RESTYPE_RESERVED) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002175 char tokenBuffer[1024];
2176 u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer));
2177 tokenBuffer[sizeof(tokenBuffer) - 1] = 0;
2178 *status = U_INVALID_FORMAT_ERROR;
2179 error(line, "unknown resource type '%s'", tokenBuffer);
2180 }
2181
2182 return result;
2183}
2184
2185/* parse a non-top-level resource */
2186static struct SResource *
2187parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status)
2188{
2189 enum ETokenType token;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002190 enum EResourceType resType = RESTYPE_UNKNOWN;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002191 ParseResourceFunction *parseFunction = NULL;
2192 struct UString *tokenValue;
2193 uint32_t startline;
2194 uint32_t line;
2195
2196
2197 token = getToken(state, &tokenValue, NULL, &startline, status);
2198
2199 if(isVerbose()){
2200 printf(" resource %s at line %i \n", (tag == NULL) ? "(null)" : tag, (int)startline);
2201 }
2202
2203 /* name . [ ':' type ] '{' resource '}' */
2204 /* This function parses from the colon onwards. If the colon is present, parse the
2205 type then try to parse a resource of that type. If there is no explicit type,
2206 work it out using the lookahead tokens. */
2207 switch (token)
2208 {
2209 case TOK_EOF:
2210 *status = U_INVALID_FORMAT_ERROR;
2211 error(startline, "Unexpected EOF encountered");
2212 return NULL;
2213
2214 case TOK_ERROR:
2215 *status = U_INVALID_FORMAT_ERROR;
2216 return NULL;
2217
2218 case TOK_COLON:
2219 resType = parseResourceType(state, status);
2220 expect(state, TOK_OPEN_BRACE, &tokenValue, NULL, &startline, status);
2221
2222 if (U_FAILURE(*status))
2223 {
2224 return NULL;
2225 }
2226
2227 break;
2228
2229 case TOK_OPEN_BRACE:
2230 break;
2231
2232 default:
2233 *status = U_INVALID_FORMAT_ERROR;
2234 error(startline, "syntax error while reading a resource, expected '{' or ':'");
2235 return NULL;
2236 }
2237
2238
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002239 if (resType == RESTYPE_UNKNOWN)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002240 {
2241 /* No explicit type, so try to work it out. At this point, we've read the first '{'.
2242 We could have any of the following:
2243 { { => array (nested)
2244 { :/} => array
2245 { string , => string array
2246
2247 { string { => table
2248
2249 { string :/{ => table
2250 { string } => string
2251 */
2252
2253 token = peekToken(state, 0, NULL, &line, NULL,status);
2254
2255 if (U_FAILURE(*status))
2256 {
2257 return NULL;
2258 }
2259
2260 if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE )
2261 {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002262 resType = RESTYPE_ARRAY;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002263 }
2264 else if (token == TOK_STRING)
2265 {
2266 token = peekToken(state, 1, NULL, &line, NULL, status);
2267
2268 if (U_FAILURE(*status))
2269 {
2270 return NULL;
2271 }
2272
2273 switch (token)
2274 {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002275 case TOK_COMMA: resType = RESTYPE_ARRAY; break;
2276 case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break;
2277 case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break;
2278 case TOK_COLON: resType = RESTYPE_TABLE; break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002279 default:
2280 *status = U_INVALID_FORMAT_ERROR;
2281 error(line, "Unexpected token after string, expected ',', '{' or '}'");
2282 return NULL;
2283 }
2284 }
2285 else
2286 {
2287 *status = U_INVALID_FORMAT_ERROR;
2288 error(line, "Unexpected token after '{'");
2289 return NULL;
2290 }
2291
2292 /* printf("Type guessed as %s\n", resourceNames[resType]); */
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002293 } else if(resType == RESTYPE_TABLE_NO_FALLBACK) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002294 *status = U_INVALID_FORMAT_ERROR;
2295 error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars);
2296 return NULL;
2297 }
2298
2299
2300 /* We should now know what we need to parse next, so call the appropriate parser
2301 function and return. */
2302 parseFunction = gResourceTypes[resType].parseFunction;
2303 if (parseFunction != NULL) {
2304 return parseFunction(state, tag, startline, comment, status);
2305 }
2306 else {
2307 *status = U_INTERNAL_PROGRAM_ERROR;
2308 error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars);
2309 }
2310
2311 return NULL;
2312}
2313
2314/* parse the top-level resource */
2315struct SRBRoot *
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002316parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename,
Frank Tang1f164ee2022-11-08 12:31:27 -08002317 UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002318{
2319 struct UString *tokenValue;
2320 struct UString comment;
2321 uint32_t line;
2322 enum EResourceType bundleType;
2323 enum ETokenType token;
2324 ParseState state;
2325 uint32_t i;
2326
2327
2328 for (i = 0; i < MAX_LOOKAHEAD + 1; i++)
2329 {
2330 ustr_init(&state.lookahead[i].value);
2331 ustr_init(&state.lookahead[i].comment);
2332 }
2333
2334 initLookahead(&state, buf, status);
2335
2336 state.inputdir = inputDir;
2337 state.inputdirLength = (state.inputdir != NULL) ? (uint32_t)uprv_strlen(state.inputdir) : 0;
2338 state.outputdir = outputDir;
2339 state.outputdirLength = (state.outputdir != NULL) ? (uint32_t)uprv_strlen(state.outputdir) : 0;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002340 state.filename = filename;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002341 state.makeBinaryCollation = makeBinaryCollation;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002342 state.omitCollationRules = omitCollationRules;
Frank Tang1f164ee2022-11-08 12:31:27 -08002343 state.icu4xMode = icu4xMode;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002344
2345 ustr_init(&comment);
2346 expect(&state, TOK_STRING, &tokenValue, &comment, NULL, status);
2347
Frank Tang1f164ee2022-11-08 12:31:27 -08002348 state.bundle = new SRBRoot(&comment, false, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002349
2350 if (state.bundle == NULL || U_FAILURE(*status))
2351 {
Frank Tang69c72a62019-04-03 21:41:21 -07002352 delete state.bundle;
2353
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002354 return NULL;
2355 }
2356
2357
Jungshik Shin70f82502016-01-29 00:32:36 -08002358 state.bundle->setLocale(tokenValue->fChars, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002359
2360 /* The following code is to make Empty bundle work no matter with :table specifer or not */
2361 token = getToken(&state, NULL, NULL, &line, status);
2362 if(token==TOK_COLON) {
2363 *status=U_ZERO_ERROR;
2364 bundleType=parseResourceType(&state, status);
2365
2366 if(isTable(bundleType))
2367 {
2368 expect(&state, TOK_OPEN_BRACE, NULL, NULL, &line, status);
2369 }
2370 else
2371 {
2372 *status=U_PARSE_ERROR;
2373 error(line, "parse error. Stopped parsing with %s", u_errorName(*status));
2374 }
2375 }
2376 else
2377 {
2378 /* not a colon */
2379 if(token==TOK_OPEN_BRACE)
2380 {
2381 *status=U_ZERO_ERROR;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002382 bundleType=RESTYPE_TABLE;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002383 }
2384 else
2385 {
2386 /* neither colon nor open brace */
2387 *status=U_PARSE_ERROR;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002388 bundleType=RESTYPE_UNKNOWN;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002389 error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status));
2390 }
2391 }
2392
2393 if (U_FAILURE(*status))
2394 {
Jungshik Shin70f82502016-01-29 00:32:36 -08002395 delete state.bundle;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002396 return NULL;
2397 }
2398
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08002399 if(bundleType==RESTYPE_TABLE_NO_FALLBACK) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002400 /*
2401 * Parse a top-level table with the table(nofallback) declaration.
2402 * This is the same as a regular table, but also sets the
2403 * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] .
2404 */
Frank Tang1f164ee2022-11-08 12:31:27 -08002405 state.bundle->fNoFallback=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002406 }
2407 /* top-level tables need not handle special table names like "collations" */
Jungshik Shin70f82502016-01-29 00:32:36 -08002408 assert(!state.bundle->fIsPoolBundle);
2409 assert(state.bundle->fRoot->fType == URES_TABLE);
2410 TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot);
2411 realParseTable(&state, rootTable, NULL, line, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002412 if(dependencyArray!=NULL){
Jungshik Shin70f82502016-01-29 00:32:36 -08002413 rootTable->add(dependencyArray, 0, *status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002414 dependencyArray = NULL;
2415 }
2416 if (U_FAILURE(*status))
2417 {
Jungshik Shin70f82502016-01-29 00:32:36 -08002418 delete state.bundle;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00002419 res_close(dependencyArray);
2420 return NULL;
2421 }
2422
2423 if (getToken(&state, NULL, NULL, &line, status) != TOK_EOF)
2424 {
2425 warning(line, "extraneous text after resource bundle (perhaps unmatched braces)");
2426 if(isStrict()){
2427 *status = U_INVALID_FORMAT_ERROR;
2428 return NULL;
2429 }
2430 }
2431
2432 cleanupLookahead(&state);
2433 ustr_deinit(&comment);
2434 return state.bundle;
2435}