blob: f007b80c7dbb6153e537e2aaf056cdcd91f95e43 [file] [log] [blame]
Jungshik Shin87232d82017-05-13 21:10:13 -07001// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin5feb9ad2016-10-21 12:52:48 -07002// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00003/*
4 ********************************************************************************
5 *
Jungshik Shin70f82502016-01-29 00:32:36 -08006 * Copyright (C) 1998-2015, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00007 * Corporation and others. All Rights Reserved.
8 *
9 ********************************************************************************
10 *
11 *
Jungshik Shin70f82502016-01-29 00:32:36 -080012 * makeconv.cpp:
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000013 * tool creating a binary (compressed) representation of the conversion mapping
14 * table (IBM NLTC ucmap format).
15 *
16 * 05/04/2000 helena Added fallback mapping into the picture...
17 * 06/29/2000 helena Major rewrite of the callback APIs.
18 */
19
20#include <stdio.h>
21#include "unicode/putil.h"
22#include "unicode/ucnv_err.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080023#include "charstr.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000024#include "ucnv_bld.h"
25#include "ucnv_imp.h"
26#include "ucnv_cnv.h"
27#include "cstring.h"
28#include "cmemory.h"
29#include "uinvchar.h"
30#include "filestrm.h"
31#include "toolutil.h"
32#include "uoptions.h"
33#include "unicode/udata.h"
34#include "unewdata.h"
35#include "uparse.h"
36#include "ucm.h"
37#include "makeconv.h"
38#include "genmbcs.h"
39
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000040#define DEBUG 0
41
42typedef struct ConvData {
43 UCMFile *ucm;
44 NewConverter *cnvData, *extData;
45 UConverterSharedData sharedData;
46 UConverterStaticData staticData;
47} ConvData;
48
49static void
50initConvData(ConvData *data) {
51 uprv_memset(data, 0, sizeof(ConvData));
52 data->sharedData.structSize=sizeof(UConverterSharedData);
53 data->staticData.structSize=sizeof(UConverterStaticData);
54 data->sharedData.staticData=&data->staticData;
55}
56
57static void
58cleanupConvData(ConvData *data) {
59 if(data!=NULL) {
60 if(data->cnvData!=NULL) {
61 data->cnvData->close(data->cnvData);
62 data->cnvData=NULL;
63 }
64 if(data->extData!=NULL) {
65 data->extData->close(data->extData);
66 data->extData=NULL;
67 }
68 ucm_close(data->ucm);
69 data->ucm=NULL;
70 }
71}
72
73/*
74 * from ucnvstat.c - static prototypes of data-based converters
75 */
Jungshik Shin70f82502016-01-29 00:32:36 -080076U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000077
78/*
79 * Global - verbosity
80 */
Frank Tang1f164ee2022-11-08 12:31:27 -080081UBool VERBOSE = false;
82UBool QUIET = false;
83UBool SMALL = false;
84UBool IGNORE_SISO_CHECK = false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000085
86static void
87createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
88
89/*
90 * Set up the UNewData and write the converter..
91 */
92static void
93writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
94
Frank Tang1f164ee2022-11-08 12:31:27 -080095UBool haveCopyright=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000096
97static UDataInfo dataInfo={
98 sizeof(UDataInfo),
99 0,
100
101 U_IS_BIG_ENDIAN,
102 U_CHARSET_FAMILY,
103 sizeof(UChar),
104 0,
105
106 {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */
107 {6, 2, 0, 0}, /* formatVersion */
108 {0, 0, 0, 0} /* dataVersion (calculated at runtime) */
109};
110
111static void
112writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
113{
114 UNewDataMemory *mem = NULL;
115 uint32_t sz2;
116 uint32_t size = 0;
117 int32_t tableType;
118
119 if(U_FAILURE(*status))
120 {
121 return;
122 }
123
124 tableType=TABLE_NONE;
125 if(data->cnvData!=NULL) {
126 tableType|=TABLE_BASE;
127 }
128 if(data->extData!=NULL) {
129 tableType|=TABLE_EXT;
130 }
131
132 mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
133
134 if(U_FAILURE(*status))
135 {
136 fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
137 cnvName,
138 "cnv",
139 u_errorName(*status));
140 return;
141 }
142
143 if(VERBOSE)
144 {
145 printf("- Opened udata %s.%s\n", cnvName, "cnv");
146 }
147
148
149 /* all read only, clean, platform independent data. Mmmm. :) */
150 udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
151 size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
152 /* Now, write the table */
153 if(tableType&TABLE_BASE) {
154 size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
155 }
156 if(tableType&TABLE_EXT) {
157 size += data->extData->write(data->extData, &data->staticData, mem, tableType);
158 }
159
160 sz2 = udata_finish(mem, status);
161 if(size != sz2)
162 {
163 fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
164 *status=U_INTERNAL_PROGRAM_ERROR;
165 }
166 if(VERBOSE)
167 {
168 printf("- Wrote %u bytes to the udata.\n", (int)sz2);
169 }
170}
171
172enum {
173 OPT_HELP_H,
174 OPT_HELP_QUESTION_MARK,
175 OPT_COPYRIGHT,
176 OPT_VERSION,
177 OPT_DESTDIR,
178 OPT_VERBOSE,
179 OPT_SMALL,
180 OPT_IGNORE_SISO_CHECK,
Jungshik Shin70f82502016-01-29 00:32:36 -0800181 OPT_QUIET,
Frank Tang69c72a62019-04-03 21:41:21 -0700182 OPT_SOURCEDIR,
Jungshik Shin70f82502016-01-29 00:32:36 -0800183
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000184 OPT_COUNT
185};
186
187static UOption options[]={
188 UOPTION_HELP_H,
189 UOPTION_HELP_QUESTION_MARK,
190 UOPTION_COPYRIGHT,
191 UOPTION_VERSION,
192 UOPTION_DESTDIR,
193 UOPTION_VERBOSE,
194 { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
Jungshik Shin70f82502016-01-29 00:32:36 -0800195 { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
196 UOPTION_QUIET,
Frank Tang69c72a62019-04-03 21:41:21 -0700197 UOPTION_SOURCEDIR,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000198};
199
200int main(int argc, char* argv[])
201{
202 ConvData data;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000203 char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000204
205 U_MAIN_INIT_ARGS(argc, argv);
206
207 /* Set up the ICU version number */
Jungshik Shin70f82502016-01-29 00:32:36 -0800208 UVersionInfo icuVersion;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000209 u_getVersion(icuVersion);
210 uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
211
212 /* preset then read command line options */
213 options[OPT_DESTDIR].value=u_getDataDirectory();
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800214 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000215
Frank Tang3e05d9d2021-11-08 14:04:04 -0800216 if(options[OPT_VERSION].doesOccur) {
217 printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
218 dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
219 printf("%s\n", U_COPYRIGHT_STRING);
220 exit(0);
221 }
222
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000223 /* error handling, printing usage message */
224 if(argc<0) {
225 fprintf(stderr,
226 "error in command line argument \"%s\"\n",
227 argv[-argc]);
228 } else if(argc<2) {
229 argc=-1;
230 }
231 if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
232 FILE *stdfile=argc<0 ? stderr : stdout;
233 fprintf(stdfile,
234 "usage: %s [-options] files...\n"
235 "\tread .ucm codepage mapping files and write .cnv files\n"
236 "options:\n"
237 "\t-h or -? or --help this usage text\n"
238 "\t-V or --version show a version message\n"
239 "\t-c or --copyright include a copyright notice\n"
240 "\t-d or --destdir destination directory, followed by the path\n"
Jungshik Shin70f82502016-01-29 00:32:36 -0800241 "\t-v or --verbose Turn on verbose output\n"
Frank Tang69c72a62019-04-03 21:41:21 -0700242 "\t-q or --quiet do not display warnings and progress\n"
243 "\t-s or --sourcedir source directory, followed by the path\n",
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000244 argv[0]);
245 fprintf(stdfile,
246 "\t --small Generate smaller .cnv files. They will be\n"
247 "\t significantly smaller but may not be compatible with\n"
248 "\t older versions of ICU and will require heap memory\n"
249 "\t allocation when loaded.\n"
250 "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n");
251 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
252 }
253
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000254 /* get the options values */
255 haveCopyright = options[OPT_COPYRIGHT].doesOccur;
Jungshik Shin70f82502016-01-29 00:32:36 -0800256 const char *destdir = options[OPT_DESTDIR].value;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000257 VERBOSE = options[OPT_VERBOSE].doesOccur;
Jungshik Shin70f82502016-01-29 00:32:36 -0800258 QUIET = options[OPT_QUIET].doesOccur;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000259 SMALL = options[OPT_SMALL].doesOccur;
260
261 if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800262 IGNORE_SISO_CHECK = true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000263 }
264
Jungshik Shin70f82502016-01-29 00:32:36 -0800265 icu::CharString outFileName;
266 UErrorCode err = U_ZERO_ERROR;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000267 if (destdir != NULL && *destdir != 0) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800268 outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
269 if (U_FAILURE(err)) {
270 return err;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000271 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000272 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800273 int32_t outBasenameStart = outFileName.length();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000274
275#if DEBUG
276 {
277 int i;
278 printf("makeconv: processing %d files...\n", argc - 1);
279 for(i=1; i<argc; ++i) {
280 printf("%s ", argv[i]);
281 }
282 printf("\n");
283 fflush(stdout);
284 }
285#endif
286
Jungshik Shin70f82502016-01-29 00:32:36 -0800287 UBool printFilename = (UBool) (argc > 2 || VERBOSE);
Frank Tang69c72a62019-04-03 21:41:21 -0700288 icu::CharString pathBuf;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000289 for (++argv; --argc; ++argv)
290 {
Jungshik Shin70f82502016-01-29 00:32:36 -0800291 UErrorCode localError = U_ZERO_ERROR;
292 const char *arg = getLongPathname(*argv);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000293
Frank Tang69c72a62019-04-03 21:41:21 -0700294 const char* sourcedir = options[OPT_SOURCEDIR].value;
295 if (sourcedir != NULL && *sourcedir != 0 && uprv_strcmp(sourcedir, ".") != 0) {
296 pathBuf.clear();
297 pathBuf.appendPathPart(sourcedir, localError);
298 pathBuf.appendPathPart(arg, localError);
299 arg = pathBuf.data();
300 }
301
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000302 /*produces the right destination path for display*/
Jungshik Shin70f82502016-01-29 00:32:36 -0800303 outFileName.truncate(outBasenameStart);
304 if (outBasenameStart != 0)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000305 {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000306 /* find the last file sepator */
Jungshik Shin70f82502016-01-29 00:32:36 -0800307 const char *basename = findBasename(arg);
308 outFileName.append(basename, localError);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000309 }
310 else
311 {
Jungshik Shin70f82502016-01-29 00:32:36 -0800312 outFileName.append(arg, localError);
313 }
314 if (U_FAILURE(localError)) {
315 return localError;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000316 }
317
318 /*removes the extension if any is found*/
Jungshik Shin70f82502016-01-29 00:32:36 -0800319 int32_t lastDotIndex = outFileName.lastIndexOf('.');
320 if (lastDotIndex >= outBasenameStart) {
321 outFileName.truncate(lastDotIndex);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000322 }
323
324 /* the basename without extension is the converter name */
Jungshik Shin70f82502016-01-29 00:32:36 -0800325 if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
326 fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
327 return U_BUFFER_OVERFLOW_ERROR;
328 }
329 uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000330
331 /*Adds the target extension*/
Jungshik Shin70f82502016-01-29 00:32:36 -0800332 outFileName.append(CONVERTER_FILE_EXTENSION, localError);
333 if (U_FAILURE(localError)) {
334 return localError;
335 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000336
337#if DEBUG
338 printf("makeconv: processing %s ...\n", arg);
339 fflush(stdout);
340#endif
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000341 initConvData(&data);
342 createConverter(&data, arg, &localError);
343
344 if (U_FAILURE(localError))
345 {
346 /* if an error is found, print out an error msg and keep going */
Jungshik Shin70f82502016-01-29 00:32:36 -0800347 fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
348 outFileName.data(), arg, u_errorName(localError));
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000349 if(U_SUCCESS(err)) {
350 err = localError;
351 }
352 }
353 else
354 {
355 /* Insure the static data name matches the file name */
356 /* Changed to ignore directory and only compare base name
357 LDH 1/2/08*/
358 char *p;
359 p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
360
361 if(p == NULL) /* OK, try alternate */
362 {
363 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
364 if(p == NULL)
365 {
366 p=cnvName; /* If no separators, no problem */
367 }
368 }
369 else
370 {
Jungshik Shin70f82502016-01-29 00:32:36 -0800371 p++; /* If found separator, don't include it in compare */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000372 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800373 if(uprv_stricmp(p,data.staticData.name) && !QUIET)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000374 {
375 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
376 cnvName, CONVERTER_FILE_EXTENSION,
377 data.staticData.name);
378 }
379
380 uprv_strcpy((char*)data.staticData.name, cnvName);
381
382 if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
383 fprintf(stderr,
384 "Error: A converter name must contain only invariant characters.\n"
385 "%s is not a valid converter name.\n",
386 data.staticData.name);
387 if(U_SUCCESS(err)) {
388 err = U_INVALID_TABLE_FORMAT;
389 }
390 }
391
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000392 localError = U_ZERO_ERROR;
Jungshik Shin70f82502016-01-29 00:32:36 -0800393 writeConverterData(&data, cnvName, destdir, &localError);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000394
395 if(U_FAILURE(localError))
396 {
397 /* if an error is found, print out an error msg and keep going*/
Jungshik Shin70f82502016-01-29 00:32:36 -0800398 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000399 u_errorName(localError));
400 if(U_SUCCESS(err)) {
401 err = localError;
402 }
403 }
404 else if (printFilename)
405 {
Jungshik Shin70f82502016-01-29 00:32:36 -0800406 puts(outFileName.data() + outBasenameStart);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000407 }
408 }
409 fflush(stdout);
410 fflush(stderr);
411
412 cleanupConvData(&data);
413 }
414
415 return err;
416}
417
418static void
419getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
420 if( (name[0]=='i' || name[0]=='I') &&
421 (name[1]=='b' || name[1]=='B') &&
422 (name[2]=='m' || name[2]=='M')
423 ) {
424 name+=3;
425 if(*name=='-') {
426 ++name;
427 }
428 *pPlatform=UCNV_IBM;
429 *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
430 } else {
431 *pPlatform=UCNV_UNKNOWN;
432 *pCCSID=0;
433 }
434}
435
436static void
437readHeader(ConvData *data,
438 FileStream* convFile,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000439 UErrorCode *pErrorCode) {
440 char line[1024];
441 char *s, *key, *value;
442 const UConverterStaticData *prototype;
443 UConverterStaticData *staticData;
444
445 if(U_FAILURE(*pErrorCode)) {
446 return;
447 }
448
449 staticData=&data->staticData;
450 staticData->platform=UCNV_IBM;
451 staticData->subCharLen=0;
452
453 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
454 /* basic parsing and handling of state-related items */
455 if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
456 continue;
457 }
458
459 /* stop at the beginning of the mapping section */
460 if(uprv_strcmp(line, "CHARMAP")==0) {
461 break;
462 }
463
464 /* collect the information from the header field, ignore unknown keys */
465 if(uprv_strcmp(key, "code_set_name")==0) {
466 if(*value!=0) {
467 uprv_strcpy((char *)staticData->name, value);
468 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
469 }
470 } else if(uprv_strcmp(key, "subchar")==0) {
471 uint8_t bytes[UCNV_EXT_MAX_BYTES];
472 int8_t length;
473
474 s=value;
475 length=ucm_parseBytes(bytes, line, (const char **)&s);
476 if(1<=length && length<=4 && *s==0) {
477 staticData->subCharLen=length;
478 uprv_memcpy(staticData->subChar, bytes, length);
479 } else {
480 fprintf(stderr, "error: illegal <subchar> %s\n", value);
481 *pErrorCode=U_INVALID_TABLE_FORMAT;
482 return;
483 }
484 } else if(uprv_strcmp(key, "subchar1")==0) {
485 uint8_t bytes[UCNV_EXT_MAX_BYTES];
486
487 s=value;
488 if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
489 staticData->subChar1=bytes[0];
490 } else {
491 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
492 *pErrorCode=U_INVALID_TABLE_FORMAT;
493 return;
494 }
495 }
496 }
497
498 /* copy values from the UCMFile to the static data */
499 staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
500 staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
501 staticData->conversionType=data->ucm->states.conversionType;
502
503 if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
504 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
505 *pErrorCode=U_INVALID_TABLE_FORMAT;
506 return;
507 }
508
509 /*
510 * Now that we know the type, copy any 'default' values from the table.
511 * We need not check the type any further because the parser only
512 * recognizes what we have prototypes for.
513 *
514 * For delta (extension-only) tables, copy values from the base file
515 * instead, see createConverter().
516 */
517 if(data->ucm->baseName[0]==0) {
518 prototype=ucnv_converterStaticData[staticData->conversionType];
519 if(prototype!=NULL) {
520 if(staticData->name[0]==0) {
521 uprv_strcpy((char *)staticData->name, prototype->name);
522 }
523
524 if(staticData->codepage==0) {
525 staticData->codepage=prototype->codepage;
526 }
527
528 if(staticData->platform==0) {
529 staticData->platform=prototype->platform;
530 }
531
532 if(staticData->minBytesPerChar==0) {
533 staticData->minBytesPerChar=prototype->minBytesPerChar;
534 }
535
536 if(staticData->maxBytesPerChar==0) {
537 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
538 }
539
540 if(staticData->subCharLen==0) {
541 staticData->subCharLen=prototype->subCharLen;
542 if(prototype->subCharLen>0) {
543 uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
544 }
545 }
546 }
547 }
548
549 if(data->ucm->states.outputType<0) {
550 data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
551 }
552
553 if( staticData->subChar1!=0 &&
554 (staticData->minBytesPerChar>1 ||
555 (staticData->conversionType!=UCNV_MBCS &&
556 staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
557 ) {
558 fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
559 *pErrorCode=U_INVALID_TABLE_FORMAT;
560 }
561}
562
Frank Tang1f164ee2022-11-08 12:31:27 -0800563/* return true if a base table was read, false for an extension table */
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000564static UBool
565readFile(ConvData *data, const char* converterName,
566 UErrorCode *pErrorCode) {
567 char line[1024];
568 char *end;
569 FileStream *convFile;
570
571 UCMStates *baseStates;
572 UBool dataIsBase;
573
574 if(U_FAILURE(*pErrorCode)) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800575 return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000576 }
577
578 data->ucm=ucm_open();
579
580 convFile=T_FileStream_open(converterName, "r");
581 if(convFile==NULL) {
582 *pErrorCode=U_FILE_ACCESS_ERROR;
Frank Tang1f164ee2022-11-08 12:31:27 -0800583 return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000584 }
585
Jungshik Shin70f82502016-01-29 00:32:36 -0800586 readHeader(data, convFile, pErrorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000587 if(U_FAILURE(*pErrorCode)) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800588 return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000589 }
590
591 if(data->ucm->baseName[0]==0) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800592 dataIsBase=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000593 baseStates=&data->ucm->states;
594 ucm_processStates(baseStates, IGNORE_SISO_CHECK);
595 } else {
Frank Tang1f164ee2022-11-08 12:31:27 -0800596 dataIsBase=false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000597 baseStates=NULL;
598 }
599
600 /* read the base table */
601 ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
602 if(U_FAILURE(*pErrorCode)) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800603 return false;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000604 }
605
606 /* read an extension table if there is one */
607 while(T_FileStream_readLine(convFile, line, sizeof(line))) {
608 end=uprv_strchr(line, 0);
609 while(line<end &&
610 (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
611 --end;
612 }
613 *end=0;
614
615 if(line[0]=='#' || u_skipWhitespace(line)==end) {
616 continue; /* ignore empty and comment lines */
617 }
618
619 if(0==uprv_strcmp(line, "CHARMAP")) {
620 /* read the extension table */
Frank Tang1f164ee2022-11-08 12:31:27 -0800621 ucm_readTable(data->ucm, convFile, false, baseStates, pErrorCode);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000622 } else {
623 fprintf(stderr, "unexpected text after the base mapping table\n");
624 }
625 break;
626 }
627
628 T_FileStream_close(convFile);
629
630 if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
631 fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
632 *pErrorCode=U_INVALID_TABLE_FORMAT;
633 }
634
635 return dataIsBase;
636}
637
638static void
639createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
640 ConvData baseData;
641 UBool dataIsBase;
642
643 UConverterStaticData *staticData;
644 UCMStates *states, *baseStates;
645
646 if(U_FAILURE(*pErrorCode)) {
647 return;
648 }
649
650 initConvData(data);
651
652 dataIsBase=readFile(data, converterName, pErrorCode);
653 if(U_FAILURE(*pErrorCode)) {
654 return;
655 }
656
657 staticData=&data->staticData;
658 states=&data->ucm->states;
659
660 if(dataIsBase) {
661 /*
662 * Build a normal .cnv file with a base table
663 * and an optional extension table.
664 */
665 data->cnvData=MBCSOpen(data->ucm);
666 if(data->cnvData==NULL) {
667 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
668
669 } else if(!data->cnvData->isValid(data->cnvData,
670 staticData->subChar, staticData->subCharLen)
671 ) {
672 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
673 *pErrorCode=U_INVALID_TABLE_FORMAT;
674
675 } else if(staticData->subChar1!=0 &&
676 !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
677 ) {
678 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
679 *pErrorCode=U_INVALID_TABLE_FORMAT;
680
681 } else if(
682 data->ucm->ext->mappingsLength>0 &&
Frank Tang1f164ee2022-11-08 12:31:27 -0800683 !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, false)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000684 ) {
685 *pErrorCode=U_INVALID_TABLE_FORMAT;
686 } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
687 /* sort the table so that it can be turned into UTF-8-friendly data */
688 ucm_sortTable(data->ucm->base);
689 }
690
691 if(U_SUCCESS(*pErrorCode)) {
692 if(
693 /* add the base table after ucm_checkBaseExt()! */
694 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
695 ) {
696 *pErrorCode=U_INVALID_TABLE_FORMAT;
697 } else {
698 /*
699 * addTable() may have requested moving more mappings to the extension table
700 * if they fit into the base toUnicode table but not into the
701 * base fromUnicode table.
702 * (Especially for UTF-8-friendly fromUnicode tables.)
703 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
704 * to be excluded from the extension toUnicode data.
705 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
706 * the base fromUnicode table.
707 */
708 ucm_moveMappings(data->ucm->base, data->ucm->ext);
709 ucm_sortTable(data->ucm->ext);
710 if(data->ucm->ext->mappingsLength>0) {
711 /* prepare the extension table, if there is one */
712 data->extData=CnvExtOpen(data->ucm);
713 if(data->extData==NULL) {
714 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
715 } else if(
716 !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
717 ) {
718 *pErrorCode=U_INVALID_TABLE_FORMAT;
719 }
720 }
721 }
722 }
723 } else {
724 /* Build an extension-only .cnv file. */
725 char baseFilename[500];
726 char *basename;
727
728 initConvData(&baseData);
729
730 /* assemble a path/filename for data->ucm->baseName */
731 uprv_strcpy(baseFilename, converterName);
732 basename=(char *)findBasename(baseFilename);
733 uprv_strcpy(basename, data->ucm->baseName);
734 uprv_strcat(basename, ".ucm");
735
736 /* read the base table */
737 dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
738 if(U_FAILURE(*pErrorCode)) {
739 return;
740 } else if(!dataIsBase) {
741 fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
742 *pErrorCode=U_INVALID_TABLE_FORMAT;
743 } else {
744 /* prepare the extension table */
745 data->extData=CnvExtOpen(data->ucm);
746 if(data->extData==NULL) {
747 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
748 } else {
749 /* fill in gaps in extension file header fields */
750 UCMapping *m, *mLimit;
751 uint8_t fallbackFlags;
752
753 baseStates=&baseData.ucm->states;
754 if(states->conversionType==UCNV_DBCS) {
755 staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
756 } else if(states->minCharLength==0) {
757 staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
758 }
759 if(states->maxCharLength<states->minCharLength) {
760 staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
761 }
762
763 if(staticData->subCharLen==0) {
764 uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
765 staticData->subCharLen=baseData.staticData.subCharLen;
766 }
767 /*
768 * do not copy subChar1 -
769 * only use what is explicitly specified
770 * because it cannot be unset in the extension file header
771 */
772
773 /* get the fallback flags */
774 fallbackFlags=0;
775 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
776 m<mLimit && fallbackFlags!=3;
777 ++m
778 ) {
779 if(m->f==1) {
780 fallbackFlags|=1;
781 } else if(m->f==3) {
782 fallbackFlags|=2;
783 }
784 }
785
786 if(fallbackFlags&1) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800787 staticData->hasFromUnicodeFallback=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000788 }
789 if(fallbackFlags&2) {
Frank Tang1f164ee2022-11-08 12:31:27 -0800790 staticData->hasToUnicodeFallback=true;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000791 }
792
793 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
794 fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
795 *pErrorCode=U_INVALID_TABLE_FORMAT;
796
797 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
798 fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n");
799 *pErrorCode=U_INVALID_TABLE_FORMAT;
800
801 } else if(
802 !ucm_checkValidity(data->ucm->ext, baseStates) ||
Frank Tang1f164ee2022-11-08 12:31:27 -0800803 !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, false)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000804 ) {
805 *pErrorCode=U_INVALID_TABLE_FORMAT;
806 } else {
807 if(states->maxCharLength>1) {
808 /*
809 * When building a normal .cnv file with a base table
810 * for an MBCS (not SBCS) table with explicit precision flags,
811 * the MBCSAddTable() function marks some mappings for moving
812 * to the extension table.
813 * They fit into the base toUnicode table but not into the
814 * base fromUnicode table.
815 * (Note: We do have explicit precision flags because they are
816 * required for extension table generation, and
817 * ucm_checkBaseExt() verified it.)
818 *
819 * We do not call MBCSAddTable() here (we probably could)
820 * so we need to do the analysis before building the extension table.
821 * We assume that MBCSAddTable() will build a UTF-8-friendly table.
822 * Redundant mappings in the extension table are ok except they cost some size.
823 *
824 * Do this after ucm_checkBaseExt().
825 */
826 const MBCSData *mbcsData=MBCSGetDummy();
827 int32_t needsMove=0;
828 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
829 m<mLimit;
830 ++m
831 ) {
832 if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
833 m->f|=MBCS_FROM_U_EXT_FLAG;
834 m->moveFlag=UCM_MOVE_TO_EXT;
835 ++needsMove;
836 }
837 }
838
839 if(needsMove!=0) {
840 ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
841 ucm_sortTable(data->ucm->ext);
842 }
843 }
844 if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
845 *pErrorCode=U_INVALID_TABLE_FORMAT;
846 }
847 }
848 }
849 }
850
851 cleanupConvData(&baseData);
852 }
853}
854
855/*
856 * Hey, Emacs, please set the following:
857 *
858 * Local Variables:
859 * indent-tabs-mode: nil
860 * End:
861 *
862 */