Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 1 | /*********************************************************************** |
| 2 | * © 2016 and later: Unicode, Inc. and others. |
| 3 | * License & terms of use: http://www.unicode.org/copyright.html |
| 4 | * |
| 5 | *********************************************************************** |
| 6 | *********************************************************************** |
| 7 | * COPYRIGHT: |
| 8 | * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved. |
| 9 | * |
| 10 | ***********************************************************************/ |
| 11 | /******************************************************************************** |
| 12 | * |
| 13 | * File ubrkperf.cpp |
| 14 | * |
| 15 | * Modification History: |
| 16 | * Name Description |
| 17 | * Vladimir Weinstein First Version, based on collperf |
| 18 | * |
| 19 | ********************************************************************************* |
| 20 | */ |
| 21 | |
| 22 | // |
| 23 | // This program tests break iterator performance |
| 24 | // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs |
| 25 | // (if any) |
| 26 | // A text file is required as input. It must be in utf-8 or utf-16 format, |
| 27 | // and include a byte order mark. Either LE or BE format is OK. |
| 28 | // |
| 29 | |
// Usage text printed by main() for -help / -?, for a rejected command line,
// and when no -file option is given.
// Every option described here has a matching entry in the opts[] table:
// the former "-iloop" entry was dropped because the parser rejects it, and
// "-passes" / "-time" were added because the parser accepts them.
const char gUsageString[] =
    "usage: ubrkperf options...\n"
    "-help Display this message.\n"
    "-file file_name utf-16/utf-8 format file.\n"
    "-locale name ICU locale to use. Default is en_US\n"
    "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
    " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix Run test using Unix word breaking services. (currently not working) \n"
    "-mac Run test using MacOSX word breaking services.\n"
    "-uselen Use API with string lengths. Default is null-terminated strings\n"
    "-char Use character break iterator\n"
    "-word Use word break iterator\n"
    "-line Use line break iterator\n"
    "-sentence Use sentence break iterator\n"
    "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
    " No test is run when the loop count is 0 (the default).\n"
    "-passes n Number of times the selected test is repeated. Default = 1.\n"
    "-time n Accepted, but currently has no effect.\n"
    "-terse Terse numbers-only output. Intended for use by scripts.\n"
    "-dump Display stuff.\n"
    "-capi Use C APIs instead of C++ APIs (currently not working)\n"
    "-next Do the next test\n"
    "-isBound Do the isBound test\n"
    ;
| 54 | |
| 55 | |
| 56 | #include <stdio.h> |
| 57 | #include <string.h> |
| 58 | #include <stdlib.h> |
| 59 | #include <math.h> |
| 60 | #include <locale.h> |
| 61 | #include <errno.h> |
| 62 | #include <sys/stat.h> |
| 63 | |
| 64 | #include <unicode/utypes.h> |
| 65 | #include <unicode/ucol.h> |
| 66 | #include <unicode/ucoleitr.h> |
| 67 | #include <unicode/uloc.h> |
| 68 | #include <unicode/ustring.h> |
| 69 | #include <unicode/ures.h> |
| 70 | #include <unicode/uchar.h> |
| 71 | #include <unicode/ucnv.h> |
| 72 | #include <unicode/utf8.h> |
| 73 | |
| 74 | #include <unicode/brkiter.h> |
| 75 | |
| 76 | |
| 77 | #if U_PLATFORM_HAS_WIN32_API |
| 78 | #include <windows.h> |
| 79 | #else |
| 80 | // |
| 81 | // Stubs for Windows API functions when building on UNIXes. |
| 82 | // |
| 83 | #include <sys/time.h> |
// Millisecond tick counter standing in for the Win32 timeGetTime() call.
// Only the difference between two readings is meaningful: the value is
// allowed to wrap around, and unsigned subtraction of two readings still
// yields the elapsed time.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Convert to unsigned before scaling so the intended wrap-around is well
    // defined; multiplying the signed tv_sec directly could overflow where
    // long is 32 bits wide.
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
| 91 | #define MAKELCID(a,b) 0 |
| 92 | #endif |
| 93 | |
| 94 | |
| 95 | // |
| 96 | // Command line option variables |
| 97 | // These global variables are set according to the options specified |
| 98 | // on the command line by the user. |
| 99 | char * opt_fName = 0; |
| 100 | char * opt_locale = "en_US"; |
| 101 | int opt_langid = 0; // Defaults to value corresponding to opt_locale. |
| 102 | char * opt_rules = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 103 | UBool opt_help = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 104 | int opt_time = 0; |
| 105 | int opt_loopCount = 0; |
| 106 | int opt_passesCount= 1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 107 | UBool opt_terse = false; |
| 108 | UBool opt_icu = true; |
| 109 | UBool opt_win = false; // Run with Windows native functions. |
| 110 | UBool opt_unix = false; // Run with UNIX strcoll, strxfrm functions. |
| 111 | UBool opt_mac = false; // Run with MacOSX word break services. |
| 112 | UBool opt_uselen = false; |
| 113 | UBool opt_dump = false; |
| 114 | UBool opt_char = false; |
| 115 | UBool opt_word = false; |
| 116 | UBool opt_line = false; |
| 117 | UBool opt_sentence = false; |
| 118 | UBool opt_capi = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 119 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 120 | UBool opt_next = false; |
| 121 | UBool opt_isBound = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 122 | |
| 123 | |
| 124 | |
| 125 | // |
| 126 | // Definitions for the command line options |
| 127 | // |
// One entry of the command line option table.
struct OptSpec {
    const char *name;   // Option text as typed by the user, e.g. "-file".
    enum {
        FLAG,           // No value; pVar is treated as a UBool * and set to true.
        NUM,            // Integer value; pVar is treated as an int *.
        STRING          // String value; pVar is treated as a const char **.
    } type;
    void *pVar;         // Address of the global variable receiving the value.
};
| 133 | |
| 134 | OptSpec opts[] = { |
| 135 | {"-file", OptSpec::STRING, &opt_fName}, |
| 136 | {"-locale", OptSpec::STRING, &opt_locale}, |
| 137 | {"-langid", OptSpec::NUM, &opt_langid}, |
| 138 | {"-win", OptSpec::FLAG, &opt_win}, |
| 139 | {"-unix", OptSpec::FLAG, &opt_unix}, |
| 140 | {"-mac", OptSpec::FLAG, &opt_mac}, |
| 141 | {"-uselen", OptSpec::FLAG, &opt_uselen}, |
| 142 | {"-loop", OptSpec::NUM, &opt_loopCount}, |
| 143 | {"-time", OptSpec::NUM, &opt_time}, |
| 144 | {"-passes", OptSpec::NUM, &opt_passesCount}, |
| 145 | {"-char", OptSpec::FLAG, &opt_char}, |
| 146 | {"-word", OptSpec::FLAG, &opt_word}, |
| 147 | {"-line", OptSpec::FLAG, &opt_line}, |
| 148 | {"-sentence", OptSpec::FLAG, &opt_sentence}, |
| 149 | {"-terse", OptSpec::FLAG, &opt_terse}, |
| 150 | {"-dump", OptSpec::FLAG, &opt_dump}, |
| 151 | {"-capi", OptSpec::FLAG, &opt_capi}, |
| 152 | {"-next", OptSpec::FLAG, &opt_next}, |
| 153 | {"-isBound", OptSpec::FLAG, &opt_isBound}, |
| 154 | {"-help", OptSpec::FLAG, &opt_help}, |
| 155 | {"-?", OptSpec::FLAG, &opt_help}, |
| 156 | {0, OptSpec::FLAG, 0} |
| 157 | }; |
| 158 | |
| 159 | |
| 160 | //--------------------------------------------------------------------------- |
| 161 | // |
| 162 | // Global variables pointing to and describing the test file |
| 163 | // |
| 164 | //--------------------------------------------------------------------------- |
| 165 | |
| 166 | //DWORD gWinLCID; |
| 167 | BreakIterator *brkit = NULL; |
| 168 | UChar *text = NULL; |
| 169 | int32_t textSize = 0; |
| 170 | |
| 171 | |
| 172 | |
| 173 | #if U_PLATFORM_IS_DARWIN_BASED |
| 174 | #include <ApplicationServices/ApplicationServices.h> |
| 175 | enum{ |
| 176 | kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask) |
| 177 | }; |
| 178 | UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask}; |
| 179 | TextBreakLocatorRef breakRef; |
| 180 | UCTextBreakType macBreakType; |
| 181 | |
| 182 | void createMACBrkIt() { |
| 183 | OSStatus status = noErr; |
| 184 | LocaleRef lref; |
| 185 | status = LocaleRefFromLocaleString(opt_locale, &lref); |
| 186 | status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 187 | if(opt_char == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 188 | macBreakType = kUCTextBreakClusterMask; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 189 | } else if(opt_word == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 190 | macBreakType = kUCTextBreakWordMask; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 191 | } else if(opt_line == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 192 | macBreakType = kUCTextBreakLineMask; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 193 | } else if(opt_sentence == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 194 | // error |
| 195 | // brkit = BreakIterator::createSentenceInstance(opt_locale, status); |
| 196 | } else { |
| 197 | // default is character iterator |
| 198 | macBreakType = kUCTextBreakClusterMask; |
| 199 | } |
| 200 | } |
| 201 | #endif |
| 202 | |
| 203 | void createICUBrkIt() { |
| 204 | // |
| 205 | // Set up an ICU break iterator |
| 206 | // |
| 207 | UErrorCode status = U_ZERO_ERROR; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 208 | if(opt_char == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 209 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 210 | } else if(opt_word == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 211 | brkit = BreakIterator::createWordInstance(opt_locale, status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 212 | } else if(opt_line == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 213 | brkit = BreakIterator::createLineInstance(opt_locale, status); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 214 | } else if(opt_sentence == true) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 215 | brkit = BreakIterator::createSentenceInstance(opt_locale, status); |
| 216 | } else { |
| 217 | // default is character iterator |
| 218 | brkit = BreakIterator::createCharacterInstance(opt_locale, status); |
| 219 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 220 | if (status==U_USING_DEFAULT_WARNING && opt_terse==false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 221 | fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale); |
| 222 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 223 | if (status==U_USING_FALLBACK_WARNING && opt_terse==false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 224 | fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale); |
| 225 | } |
| 226 | |
| 227 | } |
| 228 | |
| 229 | //--------------------------------------------------------------------------- |
| 230 | // |
| 231 | // ProcessOptions() Function to read the command line options. |
| 232 | // |
| 233 | //--------------------------------------------------------------------------- |
| 234 | UBool ProcessOptions(int argc, const char **argv, OptSpec opts[]) |
| 235 | { |
| 236 | int i; |
| 237 | int argNum; |
| 238 | const char *pArgName; |
| 239 | OptSpec *pOpt; |
| 240 | |
| 241 | for (argNum=1; argNum<argc; argNum++) { |
| 242 | pArgName = argv[argNum]; |
| 243 | for (pOpt = opts; pOpt->name != 0; pOpt++) { |
| 244 | if (strcmp(pOpt->name, pArgName) == 0) { |
| 245 | switch (pOpt->type) { |
| 246 | case OptSpec::FLAG: |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 247 | *(UBool *)(pOpt->pVar) = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 248 | break; |
| 249 | case OptSpec::STRING: |
| 250 | argNum ++; |
| 251 | if (argNum >= argc) { |
| 252 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 253 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 254 | } |
| 255 | *(const char **)(pOpt->pVar) = argv[argNum]; |
| 256 | break; |
| 257 | case OptSpec::NUM: |
| 258 | argNum ++; |
| 259 | if (argNum >= argc) { |
| 260 | fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 261 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 262 | } |
| 263 | char *endp; |
| 264 | i = strtol(argv[argNum], &endp, 0); |
| 265 | if (endp == argv[argNum]) { |
| 266 | fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 267 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 268 | } |
| 269 | *(int *)(pOpt->pVar) = i; |
| 270 | } |
| 271 | break; |
| 272 | } |
| 273 | } |
| 274 | if (pOpt->name == 0) |
| 275 | { |
| 276 | fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 277 | return false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 278 | } |
| 279 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 280 | return true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 281 | } |
| 282 | |
| 283 | |
| 284 | void doForwardTest() { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 285 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 286 | printf("Doing the forward test\n"); |
| 287 | } |
| 288 | int32_t noBreaks = 0; |
| 289 | int32_t i = 0; |
| 290 | unsigned long startTime = timeGetTime(); |
| 291 | unsigned long elapsedTime = 0; |
| 292 | if(opt_icu) { |
| 293 | createICUBrkIt(); |
| 294 | brkit->setText(UnicodeString(text, textSize)); |
| 295 | brkit->first(); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 296 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 297 | printf("Warmup\n"); |
| 298 | } |
| 299 | int j; |
| 300 | while((j = brkit->next()) != BreakIterator::DONE) { |
| 301 | noBreaks++; |
| 302 | //fprintf(stderr, "%d ", j); |
| 303 | } |
| 304 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 305 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 306 | printf("Measure\n"); |
| 307 | } |
| 308 | startTime = timeGetTime(); |
| 309 | for(i = 0; i < opt_loopCount; i++) { |
| 310 | brkit->first(); |
| 311 | while(brkit->next() != BreakIterator::DONE) { |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | elapsedTime = timeGetTime()-startTime; |
| 316 | } else if(opt_mac) { |
| 317 | #if U_PLATFORM_IS_DARWIN_BASED |
| 318 | createMACBrkIt(); |
| 319 | UniChar* filePtr = text; |
| 320 | OSStatus status = noErr; |
| 321 | UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize; |
| 322 | startOffset = 0; |
| 323 | //printf("\t---Search forward--\n"); |
| 324 | |
| 325 | while (startOffset < numUniChars) |
| 326 | { |
| 327 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, |
| 328 | startOffset, &breakOffset); |
| 329 | //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status)); |
| 330 | //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset)); |
| 331 | |
| 332 | // Output break |
| 333 | //printf("\t%d\n", (int)breakOffset); |
| 334 | |
| 335 | // Increment counters |
| 336 | noBreaks++; |
| 337 | startOffset = breakOffset; |
| 338 | } |
| 339 | startTime = timeGetTime(); |
| 340 | for(i = 0; i < opt_loopCount; i++) { |
| 341 | startOffset = 0; |
| 342 | |
| 343 | while (startOffset < numUniChars) |
| 344 | { |
| 345 | status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars, |
| 346 | startOffset, &breakOffset); |
| 347 | // Increment counters |
| 348 | startOffset = breakOffset; |
| 349 | } |
| 350 | } |
| 351 | elapsedTime = timeGetTime()-startTime; |
| 352 | UCDisposeTextBreakLocator(&breakRef); |
| 353 | #endif |
| 354 | |
| 355 | |
| 356 | } |
| 357 | |
| 358 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 359 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 360 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); |
| 361 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); |
| 362 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); |
| 363 | printf("forward break iteration average loop time %d\n", loopTime); |
| 364 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); |
| 365 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); |
| 366 | } else { |
| 367 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| 368 | } |
| 369 | |
| 370 | |
| 371 | } |
| 372 | |
| 373 | void doIsBoundTest() { |
| 374 | int32_t noBreaks = 0, hit = 0; |
| 375 | int32_t i = 0, j = 0; |
| 376 | unsigned long startTime = timeGetTime(); |
| 377 | unsigned long elapsedTime = 0; |
| 378 | createICUBrkIt(); |
| 379 | brkit->setText(UnicodeString(text, textSize)); |
| 380 | brkit->first(); |
| 381 | for(j = 0; j < textSize; j++) { |
| 382 | if(brkit->isBoundary(j)) { |
| 383 | noBreaks++; |
| 384 | //fprintf(stderr, "%d ", j); |
| 385 | } |
| 386 | } |
| 387 | /* |
| 388 | while(brkit->next() != BreakIterator::DONE) { |
| 389 | noBreaks++; |
| 390 | } |
| 391 | */ |
| 392 | |
| 393 | startTime = timeGetTime(); |
| 394 | for(i = 0; i < opt_loopCount; i++) { |
| 395 | for(j = 0; j < textSize; j++) { |
| 396 | if(brkit->isBoundary(j)) { |
| 397 | hit++; |
| 398 | } |
| 399 | } |
| 400 | } |
| 401 | |
| 402 | elapsedTime = timeGetTime()-startTime; |
| 403 | int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount)); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 404 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 405 | int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize)); |
| 406 | int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks)); |
| 407 | printf("forward break iteration average loop time %d\n", loopTime); |
| 408 | printf("number of code units %d average time per code unit %d\n", textSize, timePerCU); |
| 409 | printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak); |
| 410 | } else { |
| 411 | printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize); |
| 412 | } |
| 413 | } |
| 414 | |
| 415 | //---------------------------------------------------------------------------------------- |
| 416 | // |
| 417 | // UnixConvert -- Convert the lines of the file to the encoding for UNIX |
| 418 | // Since it appears that Unicode support is going in the general |
| 419 | // direction of the use of UTF-8 locales, that is the approach |
| 420 | // that is used here. |
| 421 | // |
| 422 | //---------------------------------------------------------------------------------------- |
// Currently a no-op: the whole body is compiled out with #if 0.
// The disabled code is kept as a reference for a per-line conversion of the
// test data to UTF-8; it refers to gNumFileLines and gFileLines, which are
// not defined in this file, so it would need to be reworked before it could
// be enabled.
void UnixConvert() {
#if 0
    int line;

    UConverter *cvrtr;    // An ICU code page converter.
    UErrorCode status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        // Pass the status value itself, not its address, to %d.
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           // source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1,           // source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
| 465 | |
| 466 | |
| 467 | //---------------------------------------------------------------------------------------- |
| 468 | // |
| 469 | // class UCharFile Class to hide all the gorp to read a file in |
| 470 | // and produce a stream of UChars. |
| 471 | // |
| 472 | //---------------------------------------------------------------------------------------- |
| 473 | class UCharFile { |
| 474 | public: |
| 475 | UCharFile(const char *fileName); |
| 476 | ~UCharFile(); |
| 477 | UChar get(); |
| 478 | UBool eof() {return fEof;}; |
| 479 | UBool error() {return fError;}; |
| 480 | int32_t size() { return fFileSize; }; |
| 481 | |
| 482 | private: |
| 483 | UCharFile (const UCharFile &other) {}; // No copy constructor. |
| 484 | UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op |
| 485 | |
| 486 | FILE *fFile; |
| 487 | const char *fName; |
| 488 | UBool fEof; |
| 489 | UBool fError; |
| 490 | UChar fPending2ndSurrogate; |
| 491 | int32_t fFileSize; |
| 492 | |
| 493 | enum {UTF16LE, UTF16BE, UTF8} fEncoding; |
| 494 | }; |
| 495 | |
| 496 | UCharFile::UCharFile(const char * fileName) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 497 | fEof = false; |
| 498 | fError = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 499 | fName = fileName; |
| 500 | struct stat buf; |
| 501 | int32_t result = stat(fileName, &buf); |
| 502 | if(result != 0) { |
| 503 | fprintf(stderr, "Error getting info\n"); |
| 504 | fFileSize = -1; |
| 505 | } else { |
| 506 | fFileSize = buf.st_size; |
| 507 | } |
| 508 | fFile = fopen(fName, "rb"); |
| 509 | fPending2ndSurrogate = 0; |
| 510 | if (fFile == NULL) { |
| 511 | fprintf(stderr, "Can not open file \"%s\"\n", opt_fName); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 512 | fError = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 513 | return; |
| 514 | } |
| 515 | // |
| 516 | // Look for the byte order mark at the start of the file. |
| 517 | // |
| 518 | int BOMC1, BOMC2, BOMC3; |
| 519 | BOMC1 = fgetc(fFile); |
| 520 | BOMC2 = fgetc(fFile); |
| 521 | |
| 522 | if (BOMC1 == 0xff && BOMC2 == 0xfe) { |
| 523 | fEncoding = UTF16LE; } |
| 524 | else if (BOMC1 == 0xfe && BOMC2 == 0xff) { |
| 525 | fEncoding = UTF16BE; } |
| 526 | else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) { |
| 527 | fEncoding = UTF8; } |
| 528 | else |
| 529 | { |
| 530 | fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and " |
| 531 | "must include a BOM.\n", fileName); |
| 532 | fError = true; |
| 533 | return; |
| 534 | } |
| 535 | } |
| 536 | |
| 537 | |
| 538 | UCharFile::~UCharFile() { |
| 539 | fclose(fFile); |
| 540 | } |
| 541 | |
| 542 | |
| 543 | |
| 544 | UChar UCharFile::get() { |
| 545 | UChar c; |
| 546 | switch (fEncoding) { |
| 547 | case UTF16LE: |
| 548 | { |
| 549 | int cL, cH; |
| 550 | cL = fgetc(fFile); |
| 551 | cH = fgetc(fFile); |
| 552 | c = cL | (cH << 8); |
| 553 | if (cH == EOF) { |
| 554 | c = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 555 | fEof = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 556 | } |
| 557 | break; |
| 558 | } |
| 559 | case UTF16BE: |
| 560 | { |
| 561 | int cL, cH; |
| 562 | cH = fgetc(fFile); |
| 563 | cL = fgetc(fFile); |
| 564 | c = cL | (cH << 8); |
| 565 | if (cL == EOF) { |
| 566 | c = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 567 | fEof = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 568 | } |
| 569 | break; |
| 570 | } |
| 571 | case UTF8: |
| 572 | { |
| 573 | if (fPending2ndSurrogate != 0) { |
| 574 | c = fPending2ndSurrogate; |
| 575 | fPending2ndSurrogate = 0; |
| 576 | break; |
| 577 | } |
| 578 | |
| 579 | int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type. |
| 580 | if (ch == EOF) { |
| 581 | c = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 582 | fEof = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 583 | break; |
| 584 | } |
| 585 | |
| 586 | if (ch <= 0x7f) { |
| 587 | // It's ascii. No further utf-8 conversion. |
| 588 | c = ch; |
| 589 | break; |
| 590 | } |
| 591 | |
| 592 | // Figure out the length of the char and read the rest of the bytes |
| 593 | // into a temp array. |
| 594 | int nBytes; |
| 595 | if (ch >= 0xF0) {nBytes=4;} |
| 596 | else if (ch >= 0xE0) {nBytes=3;} |
| 597 | else if (ch >= 0xC0) {nBytes=2;} |
| 598 | else { |
| 599 | fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile)); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 600 | fError = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 601 | return 0; |
| 602 | } |
| 603 | |
| 604 | unsigned char bytes[10]; |
| 605 | bytes[0] = (unsigned char)ch; |
| 606 | int i; |
| 607 | for (i=1; i<nBytes; i++) { |
| 608 | bytes[i] = fgetc(fFile); |
| 609 | if (bytes[i] < 0x80 || bytes[i] >= 0xc0) { |
| 610 | fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 611 | fError = true; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 612 | return 0; |
| 613 | } |
| 614 | } |
| 615 | |
| 616 | // Convert the bytes from the temp array to a Unicode char. |
| 617 | i = 0; |
| 618 | uint32_t cp; |
| 619 | U8_NEXT_UNSAFE(bytes, i, cp); |
| 620 | c = (UChar)cp; |
| 621 | |
| 622 | if (cp >= 0x10000) { |
| 623 | // The code point needs to be broken up into a utf-16 surrogate pair. |
| 624 | // Process first half this time through the main loop, and |
| 625 | // remember the other half for the next time through. |
| 626 | UChar utf16Buf[3]; |
| 627 | i = 0; |
| 628 | UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp); |
| 629 | fPending2ndSurrogate = utf16Buf[1]; |
| 630 | c = utf16Buf[0]; |
| 631 | } |
| 632 | break; |
| 633 | }; |
| 634 | } |
| 635 | return c; |
| 636 | } |
| 637 | |
| 638 | |
| 639 | //---------------------------------------------------------------------------------------- |
| 640 | // |
| 641 | // Main -- process command line, read in and pre-process the test file, |
| 642 | // call other functions to do the actual tests. |
| 643 | // |
| 644 | //---------------------------------------------------------------------------------------- |
| 645 | int main(int argc, const char** argv) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 646 | if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 647 | printf(gUsageString); |
| 648 | exit (1); |
| 649 | } |
| 650 | // Make sure that we've only got one API selected. |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 651 | if (opt_mac || opt_unix || opt_win) opt_icu = false; |
| 652 | if (opt_mac || opt_unix) opt_win = false; |
| 653 | if (opt_mac) opt_unix = false; |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 654 | |
| 655 | UErrorCode status = U_ZERO_ERROR; |
| 656 | |
| 657 | |
| 658 | |
| 659 | // |
| 660 | // Set up a Windows LCID |
| 661 | // |
| 662 | /* |
| 663 | if (opt_langid != 0) { |
| 664 | gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT); |
| 665 | } |
| 666 | else { |
| 667 | gWinLCID = uloc_getLCID(opt_locale); |
| 668 | } |
| 669 | */ |
| 670 | |
| 671 | // |
| 672 | // Set the UNIX locale |
| 673 | // |
| 674 | if (opt_unix) { |
| 675 | if (setlocale(LC_ALL, opt_locale) == 0) { |
| 676 | fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale); |
| 677 | exit(-1); |
| 678 | } |
| 679 | } |
| 680 | |
| 681 | // Read in the input file. |
| 682 | // File assumed to be utf-16. |
| 683 | // Lines go onto heap buffers. Global index array to line starts is created. |
| 684 | // Lines themselves are null terminated. |
| 685 | // |
| 686 | |
| 687 | UCharFile f(opt_fName); |
| 688 | if (f.error()) { |
| 689 | exit(-1); |
| 690 | } |
| 691 | int32_t fileSize = f.size(); |
| 692 | const int STARTSIZE = 70000; |
| 693 | int32_t bufSize = 0; |
| 694 | int32_t charCount = 0; |
| 695 | if(fileSize != -1) { |
| 696 | text = (UChar *)malloc(fileSize*sizeof(UChar)); |
| 697 | bufSize = fileSize; |
| 698 | } else { |
| 699 | text = (UChar *)malloc(STARTSIZE*sizeof(UChar)); |
| 700 | bufSize = STARTSIZE; |
| 701 | } |
| 702 | if(text == NULL) { |
| 703 | fprintf(stderr, "Allocating buffer failed\n"); |
| 704 | exit(-1); |
| 705 | } |
| 706 | |
| 707 | |
| 708 | // Read the file, split into lines, and save in memory. |
| 709 | // Loop runs once per utf-16 value from the input file, |
| 710 | // (The number of bytes read from file per loop iteration depends on external encoding.) |
| 711 | for (;;) { |
| 712 | |
| 713 | UChar c = f.get(); |
| 714 | if(f.eof()) { |
| 715 | break; |
| 716 | } |
| 717 | if (f.error()){ |
| 718 | exit(-1); |
| 719 | } |
| 720 | // We now have a good UTF-16 value in c. |
| 721 | text[charCount++] = c; |
| 722 | if(charCount == bufSize) { |
| 723 | text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar)); |
| 724 | if(text == NULL) { |
| 725 | fprintf(stderr, "Reallocating buffer failed\n"); |
| 726 | exit(-1); |
| 727 | } |
| 728 | bufSize *= 2; |
| 729 | } |
| 730 | } |
| 731 | |
| 732 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 733 | if (opt_terse == false) { |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 734 | printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount); |
| 735 | } |
| 736 | |
| 737 | textSize = charCount; |
| 738 | |
| 739 | |
| 740 | |
| 741 | |
| 742 | // |
| 743 | // Dump file contents if requested. |
| 744 | // |
| 745 | if (opt_dump) { |
| 746 | // dump file, etc... possibly |
| 747 | } |
| 748 | |
| 749 | |
| 750 | // |
| 751 | // We've got the file read into memory. Go do something with it. |
| 752 | // |
| 753 | int32_t i = 0; |
| 754 | for(i = 0; i < opt_passesCount; i++) { |
| 755 | if(opt_loopCount != 0) { |
| 756 | if(opt_next) { |
| 757 | doForwardTest(); |
| 758 | } else if(opt_isBound) { |
| 759 | doIsBoundTest(); |
| 760 | } else { |
| 761 | doForwardTest(); |
| 762 | } |
| 763 | } else if(opt_time != 0) { |
| 764 | |
| 765 | } |
| 766 | } |
| 767 | |
| 768 | if(text != NULL) { |
| 769 | free(text); |
| 770 | } |
| 771 | if(brkit != NULL) { |
| 772 | delete brkit; |
| 773 | } |
| 774 | |
| 775 | return 0; |
| 776 | } |