Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ********************************************************************** |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 5 | * Copyright (C) 2000-2016, International Business Machines |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 6 | * Corporation and others. All Rights Reserved. |
| 7 | ********************************************************************** |
| 8 | * file name: ucnv2022.cpp |
Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 9 | * encoding: UTF-8 |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 10 | * tab size: 8 (not used) |
| 11 | * indentation:4 |
| 12 | * |
| 13 | * created on: 2000feb03 |
| 14 | * created by: Markus W. Scherer |
| 15 | * |
| 16 | * Change history: |
| 17 | * |
| 18 | * 06/29/2000 helena Major rewrite of the callback APIs. |
| 19 | * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
| 20 | * Changed implementation of toUnicode |
| 21 | * function |
| 22 | * 08/21/2000 Ram Added support for ISO-2022-KR |
| 23 | * 08/29/2000 Ram Separated implementation of EBCDIC to |
| 24 | * ucnvebdc.c |
| 25 | * 09/20/2000 Ram Added support for ISO-2022-CN |
| 26 | * Added implementations for getNextUChar() |
| 27 | * for specific 2022 country variants. |
| 28 | * 10/31/2000 Ram Implemented offsets logic functions |
| 29 | */ |
| 30 | |
| 31 | #include "unicode/utypes.h" |
| 32 | |
| 33 | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
| 34 | |
| 35 | #include "unicode/ucnv.h" |
| 36 | #include "unicode/uset.h" |
| 37 | #include "unicode/ucnv_err.h" |
| 38 | #include "unicode/ucnv_cb.h" |
| 39 | #include "unicode/utf16.h" |
| 40 | #include "ucnv_imp.h" |
| 41 | #include "ucnv_bld.h" |
| 42 | #include "ucnv_cnv.h" |
| 43 | #include "ucnvmbcs.h" |
| 44 | #include "cstring.h" |
| 45 | #include "cmemory.h" |
| 46 | #include "uassert.h" |
| 47 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 48 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 49 | /* |
| 50 | * I am disabling the generic ISO-2022 converter after proposing to do so on |
| 51 | * the icu mailing list two days ago. |
| 52 | * |
| 53 | * Reasons: |
| 54 | * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
| 55 | * its designation sequences, single shifts with return to the previous state, |
| 56 | * switch-with-no-return to UTF-16BE or similar, etc. |
| 57 | * This is unlike the language-specific variants like ISO-2022-JP which |
| 58 | * require a much smaller repertoire of ISO-2022 features. |
| 59 | * These variants continue to be supported. |
| 60 | * 2. I believe that no one is really using the generic ISO-2022 converter |
| 61 | * but rather always one of the language-specific variants. |
| 62 | * Note that ICU's generic ISO-2022 converter has always output one escape |
| 63 | * sequence followed by UTF-8 for the whole stream. |
| 64 | * 3. Switching between subcharsets is extremely slow, because each time |
| 65 | * the previous converter is closed and a new one opened, |
| 66 | * without any kind of caching, least-recently-used list, etc. |
| 67 | * 4. The code is currently buggy, and given the above it does not seem |
| 68 | * reasonable to spend the time on maintenance. |
| 69 | * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
| 70 | * This means, for example, that when ISO-8859-7 is designated, the following |
| 71 | * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
| 72 | * The ICU ISO-2022 converter does not handle this - and has no information |
| 73 | * about which subconverter would have to be shifted vs. which is designed |
| 74 | * for 7-bit ISO-2022. |
| 75 | * |
| 76 | * Markus Scherer 2003-dec-03 |
| 77 | */ |
| 78 | #endif |
| 79 | |
#if !UCONFIG_ONLY_HTML_CONVERSION
/* Shift-In (SI, byte 0x0F) as a NUL-terminated byte string. */
static const char SHIFT_IN_STR[] = "\x0F";
/* The Shift-Out counterpart ("\x0E") is intentionally not defined here. */
#endif

/* Byte values of the C0 control / whitespace characters referenced by name. */
#define CR      0x0D    /* carriage return  */
#define LF      0x0A    /* line feed        */
#define H_TAB   0x09    /* horizontal tab   */
#define V_TAB   0x0B    /* vertical tab     */
#define SPACE   0x20    /* space            */
| 90 | |
/* First and last code point of the range labelled HWKANA (U+FF61..U+FF9F). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
| 95 | |
/*
 * Native (GR) byte ranges of the charsets that get mapped into ISO 2022:
 *  - a 94-character set uses native bytes A1..FE and is written as 21..7E;
 *  - a 96-character set uses native bytes A0..FF and is written as 20..7F.
 * In both cases the ISO 2022 byte is the native byte minus 0x80.
 * C1 control codes (native bytes 80..9F) must not be encoded as
 * bytes 00..1F, which are the C0 control codes.
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
| 110 | |
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
 * corresponding to SO, SI, and ESC.
 *
 * The range test is done on the value converted to uint32_t so that a
 * negative signed argument fails the test (and yields 0) instead of reaching
 * the shift with a negative count, which would be undefined behavior.
 * Results for arguments >= 0 are unchanged.
 * Note: the argument is evaluated twice; do not pass expressions with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
| 118 | |
/*
 * Charset/state numbers for the ISO-2022-JP and ISO-2022-CN implementations.
 * The JP and CN groups deliberately reuse the same small integers.
 */
typedef enum {
    /* ---- shared by both variants ---- */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ---- ISO-2022-JP ---- */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* ---- ISO-2022-CN ---- */
    /* These first constants must keep their values: they are indexes into myConverterArray[]. */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The following values appear in StateEnum and ISO2022State variables,
     * but CNS_11643 (above) is the one to use when indexing myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
| 157 | |
/*
 * IS_JP_DBCS(cs): true if the StateEnum charset value cs denotes a DBCS charset.
 * In HTML-only builds that is just JISX208; otherwise it is the
 * contiguous range JISX208..KSC5601.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#   define IS_JP_DBCS(cs) ((cs)==JISX208)
#else
#   define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
#endif

/* CSM(cs): charset mask, i.e. the single bit whose position is the charset number cs. */
#define CSM(cs) ((uint16_t)1 << (cs))
| 166 | |
| 167 | /* |
| 168 | * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence |
| 169 | * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x |
| 170 | * |
| 171 | * Note: The converter uses some leniency: |
| 172 | * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
| 173 | * all versions, not just JIS7 and JIS8. |
| 174 | * - ICU does not distinguish between different versions of JIS X 0208. |
| 175 | */ |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 176 | #if UCONFIG_ONLY_HTML_CONVERSION |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 177 | enum { MAX_JA_VERSION=0 }; |
| 178 | #else |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 179 | enum { MAX_JA_VERSION=4 }; |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 180 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 181 | static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
| 182 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 183 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 184 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
| 185 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| 186 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| 187 | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 188 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 189 | }; |
| 190 | |
/* Kind of converter/charset currently selected (see UConverterDataISO2022::currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
| 199 | |
/* Designation and shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset numbers designated to G0 (SI), G1 (SO), G2 (SS2), G3 (SS3) */
    int8_t cs[4];
    /* currently invoked set: 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* value of g before a single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
| 205 | |
| 206 | #define UCNV_OPTIONS_VERSION_MASK 0xf |
| 207 | #define UCNV_2022_MAX_CONVERTERS 10 |
| 208 | |
| 209 | typedef struct{ |
| 210 | UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
| 211 | UConverter *currentConverter; |
| 212 | Cnv2022Type currentType; |
| 213 | ISO2022State toU2022State, fromU2022State; |
| 214 | uint32_t key; |
| 215 | uint32_t version; |
| 216 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 217 | UBool isFirstBuffer; |
| 218 | #endif |
| 219 | UBool isEmptySegment; |
| 220 | char name[30]; |
| 221 | char locale[3]; |
| 222 | }UConverterDataISO2022; |
| 223 | |
| 224 | /* Protos */ |
| 225 | /* ISO-2022 ----------------------------------------------------------------- */ |
| 226 | |
| 227 | /*Forward declaration */ |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 228 | U_CFUNC void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 229 | ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
| 230 | UErrorCode * err); |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 231 | U_CFUNC void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 232 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
| 233 | UErrorCode * err); |
| 234 | |
/* The escape byte (ESC) that starts every ISO 2022 escape sequence. */
#define ESC_2022 0x1B

/* Classification of the bytes consumed so far while matching an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022              = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence (more bytes needed) */
    VALID_NON_TERMINAL_2022   = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022       = 1,
    /* matches one escape sequence so far, but more bytes might match a different one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
| 244 | |
| 245 | /* |
| 246 | * The way these state transition arrays work is: |
| 247 | * ex : ESC$B is the sequence for JISX208 |
| 248 | * a) First Iteration: char is ESC |
| 249 | * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index |
| 250 | * int x = normalize_esq_chars_2022[27] which is equal to 1 |
| 251 | * ii) Search for this value in escSeqStateTable_Key_2022[] |
| 252 | * value of x is stored at escSeqStateTable_Key_2022[0] |
| 253 | * iii) Save this index as offset |
| 254 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| 255 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
| 256 | * b) Switch on this state and continue to next char |
| 257 | * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index |
| 258 | * which is normalize_esq_chars_2022[36] == 4 |
| 259 | * ii) x is currently 1(from above) |
| 260 | * x<<=5 -- x is now 32 |
| 261 | * x+=normalize_esq_chars_2022[36] |
| 262 | * now x is 36 |
| 263 | * iii) Search for this value in escSeqStateTable_Key_2022[] |
| 264 | * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 |
| 265 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| 266 | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
| 267 | * c) Switch on this state and continue to next char |
| 268 | * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index |
| 269 | * ii) x is currently 36 (from above) |
| 270 | * x<<=5 -- x is now 1152 |
| 271 | * x+=normalize_esq_chars_2022[66] |
| 272 | * now x is 1161 |
| 273 | * iii) Search for this value in escSeqStateTable_Key_2022[] |
| 274 | * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 |
| 275 | * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
| 276 | * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
| 277 | * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208 |
| 278 | */ |
| 279 | |
| 280 | |
/* The following arrays together describe the escape-sequence state transition table. */

/*
 * Maps a byte value (the array index) to a small non-zero code for the bytes
 * that can occur in a recognized escape sequence; every other byte maps to 0.
 * Only indexes 0..99 are spelled out: the remaining elements up to 255 are
 * zero-initialized by the language rules for aggregate initializers.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*          0    1    2    3    4    5    6    7    8    9 */
/*   0 */   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
/*  10 */   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
/*  20 */   0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   /* 27 = ESC */
/*  30 */   0,   0,   0,   0,   0,   0,   4,   7,  29,   0,   /* $ % & */
/*  40 */   2,  24,  26,  27,   0,   3,  23,   6,   0,   0,   /* ( ) * + - . / */
/*  50 */   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
/*  60 */   0,   0,   0,   0,   5,   8,   9,  10,  11,  12,   /* @ A B C D E */
/*  70 */  13,  14,  15,  16,  17,  18,  19,  20,  25,  28,   /* F..O */
/*  80 */   0,   0,  21,   0,   0,   0,   0,   0,   0,   0,   /* R */
/*  90 */  22,   0,   0,   0,   0,   0,   0,   0,   0,   0    /* Z */
};
| 312 | |
| 313 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 314 | /* |
| 315 | * When the generic ISO-2022 converter is completely removed, not just disabled |
| 316 | * per #ifdef, then the following state table and the associated tables that are |
| 317 | * dimensioned with MAX_STATES_2022 should be trimmed. |
| 318 | * |
| 319 | * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
| 320 | * the associated escape sequences starting with ESC ( B should be removed. |
| 321 | * This includes the ones with key values 1097 and all of the ones above 1000000. |
| 322 | * |
| 323 | * For the latter, the tables can simply be truncated. |
| 324 | * For the former, since the tables must be kept parallel, it is probably best |
| 325 | * to simply duplicate an adjacent table cell, parallel in all tables. |
| 326 | * |
| 327 | * It may make sense to restructure the tables, especially by using small search |
| 328 | * tables for the variants instead of indexing them parallel to the table here. |
| 329 | */ |
| 330 | #endif |
| 331 | |
/* Number of entries in each of the parallel escSeqStateTable_*_2022 arrays. */
#define MAX_STATES_2022 74

/*
 * Keys of the escape-sequence state table, in ascending order.
 * A key is built from the normalized codes of the bytes read so far
 * (key = (key << 5) + code for each byte); the index at which a key is found
 * here is the index to use in the parallel value/result tables.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
/*  0 */  1,        34,       36,       39,       55,
/*  5 */  57,       60,       61,       1093,     1096,
/* 10 */  1097,     1098,     1099,     1100,     1101,
/* 15 */  1102,     1103,     1104,     1105,     1106,
/* 20 */  1109,     1154,     1157,     1160,     1161,
/* 25 */  1176,     1178,     1179,     1254,     1257,
/* 30 */  1768,     1773,     1957,     35105,    36933,
/* 35 */  36936,    36937,    36938,    36939,    36940,
/* 40 */  36942,    36943,    36944,    36945,    36946,
/* 45 */  36947,    36948,    37640,    37642,    37644,
/* 50 */  37646,    37711,    37744,    37745,    37746,
/* 55 */  37747,    37748,    40133,    40136,    40138,
/* 60 */  40139,    40140,    40141,    1123363,  35947624,
/* 65 */  35947625, 35947626, 35947627, 35947629, 35947630,
/* 70 */  35947631, 35947635, 35947636, 35947638
};
| 345 | |
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter names, parallel to escSeqStateTable_Key_2022: entry i is the
 * name selected by the escape sequence with key i, or NULL where there is none.
 * Only compiled for the (disabled) generic ISO-2022 converter.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
/*  0 */  NULL,                   NULL,                   NULL,                   NULL,                   NULL,
/*  5 */  NULL,                   NULL,                   NULL,                   "latin1",               "latin1",
/* 10 */  "latin1",               "ibm-865",              "ibm-865",              "ibm-865",              "ibm-865",
/* 15 */  "ibm-865",              "ibm-865",              "JISX0201",             "JISX0201",             "latin1",
/* 20 */  "latin1",               NULL,                   "JISX-208",             "ibm-5478",             "JISX-208",
/* 25 */  NULL,                   NULL,                   NULL,                   NULL,                   "UTF8",
/* 30 */  "ISO-8859-1",           "ISO-8859-7",           "JIS-X-208",            NULL,                   "ibm-955",
/* 35 */  "ibm-367",              "ibm-952",              "ibm-949",              "JISX-212",             "ibm-1383",
/* 40 */  "ibm-952",              "ibm-964",              "ibm-964",              "ibm-964",              "ibm-964",
/* 45 */  "ibm-964",              "ibm-964",              "ibm-5478",             "ibm-949",              "ISO-IR-165",
/* 50 */  "CNS-11643-1992,1",     "CNS-11643-1992,2",     "CNS-11643-1992,3",     "CNS-11643-1992,4",     "CNS-11643-1992,5",
/* 55 */  "CNS-11643-1992,6",     "CNS-11643-1992,7",     "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
/* 60 */  "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL,                   "latin1",
/* 65 */  "ibm-912",              "ibm-913",              "ibm-914",              "ibm-813",              "ibm-1089",
/* 70 */  "ibm-920",              "ibm-915",              "ibm-915",              "latin1"
};

#endif
| 362 | |
| 363 | static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
| 364 | /* 0 1 2 3 4 5 6 7 8 9 */ |
| 365 | VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 366 | ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 367 | ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 368 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 369 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 370 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 371 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 372 | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
| 373 | }; |
| 374 | |
/*
 * Identifies an ISO-2022 variant (type introduced while refactoring the
 * changeState_2022 code). Which enumerators exist depends on the build
 * configuration; the numeric values are fixed.
 */
typedef enum{
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
| 386 | |
| 387 | /*********** ISO 2022 Converter Protos ***********/ |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 388 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 389 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
| 390 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 391 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 392 | _ISO2022Close(UConverter *converter); |
| 393 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 394 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 395 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
| 396 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 397 | U_CDECL_BEGIN |
| 398 | static const char * U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 399 | _ISO2022getName(const UConverter* cnv); |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 400 | U_CDECL_END |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 401 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 402 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 403 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
| 404 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 405 | U_CDECL_BEGIN |
| 406 | static UConverter * U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 407 | _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
| 408 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 409 | U_CDECL_END |
| 410 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 411 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 412 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 413 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
| 414 | #endif |
| 415 | |
| 416 | namespace { |
| 417 | |
| 418 | /*const UConverterSharedData _ISO2022Data;*/ |
| 419 | extern const UConverterSharedData _ISO2022JPData; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 420 | |
| 421 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 422 | extern const UConverterSharedData _ISO2022KRData; |
| 423 | extern const UConverterSharedData _ISO2022CNData; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 424 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 425 | |
| 426 | } // namespace |
| 427 | |
| 428 | /*************** Converter implementations ******************/ |
| 429 | |
| 430 | /* The purpose of this function is to get around gcc compiler warnings. */ |
| 431 | static inline void |
| 432 | fromUWriteUInt8(UConverter *cnv, |
| 433 | const char *bytes, int32_t length, |
| 434 | uint8_t **target, const char *targetLimit, |
| 435 | int32_t **offsets, |
| 436 | int32_t sourceIndex, |
| 437 | UErrorCode *pErrorCode) |
| 438 | { |
| 439 | char *targetChars = (char *)*target; |
| 440 | ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
| 441 | offsets, sourceIndex, pErrorCode); |
| 442 | *target = (uint8_t*)targetChars; |
| 443 | |
| 444 | } |
| 445 | |
| 446 | static inline void |
| 447 | setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ |
| 448 | if(myConverterData->version == 1) { |
| 449 | UConverter *cnv = myConverterData->currentConverter; |
| 450 | |
| 451 | cnv->toUnicodeStatus=0; /* offset */ |
| 452 | cnv->mode=0; /* state */ |
| 453 | cnv->toULength=0; /* byteIndex */ |
| 454 | } |
| 455 | } |
| 456 | |
| 457 | static inline void |
| 458 | setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
| 459 | /* in ISO-2022-KR the designator sequence appears only once |
| 460 | * in a file so we append it only once |
| 461 | */ |
| 462 | if( converter->charErrorBufferLength==0){ |
| 463 | |
| 464 | converter->charErrorBufferLength = 4; |
| 465 | converter->charErrorBuffer[0] = 0x1b; |
| 466 | converter->charErrorBuffer[1] = 0x24; |
| 467 | converter->charErrorBuffer[2] = 0x29; |
| 468 | converter->charErrorBuffer[3] = 0x43; |
| 469 | } |
| 470 | if(myConverterData->version == 1) { |
| 471 | UConverter *cnv = myConverterData->currentConverter; |
| 472 | |
| 473 | cnv->fromUChar32=0; |
| 474 | cnv->fromUnicodeStatus=1; /* prevLength */ |
| 475 | } |
| 476 | } |
| 477 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 478 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 479 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
| 480 | |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 481 | char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'}; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 482 | |
| 483 | cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
| 484 | if(cnv->extraInfo != NULL) { |
| 485 | UConverterNamePieces stackPieces; |
| 486 | UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
| 487 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
| 488 | uint32_t version; |
| 489 | |
| 490 | stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
| 491 | |
| 492 | uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
| 493 | myConverterData->currentType = ASCII1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 494 | cnv->fromUnicodeStatus =false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 495 | if(pArgs->locale){ |
Frank Tang | f222396 | 2020-04-27 18:25:29 -0700 | [diff] [blame] | 496 | uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 497 | } |
| 498 | version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
| 499 | myConverterData->version = version; |
| 500 | if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
| 501 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 502 | { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 503 | /* open the required converters and cache them */ |
| 504 | if(version>MAX_JA_VERSION) { |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 505 | // ICU 55 fails to open a converter for an unsupported version. |
| 506 | // Previously, it fell back to version 0, but that would yield |
| 507 | // unexpected behavior. |
| 508 | *errorCode = U_MISSING_RESOURCE_ERROR; |
| 509 | return; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 510 | } |
| 511 | if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
| 512 | myConverterData->myConverterArray[ISO8859_7] = |
| 513 | ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); |
| 514 | } |
| 515 | myConverterData->myConverterArray[JISX208] = |
Jungshik Shin | 834113a | 2018-11-02 01:24:07 -0700 | [diff] [blame] | 516 | ucnv_loadSharedData("EUC-JP", &stackPieces, &stackArgs, errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 517 | if(jpCharsetMasks[version]&CSM(JISX212)) { |
| 518 | myConverterData->myConverterArray[JISX212] = |
| 519 | ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); |
| 520 | } |
| 521 | if(jpCharsetMasks[version]&CSM(GB2312)) { |
| 522 | myConverterData->myConverterArray[GB2312] = |
jshin@chromium.org | 52e8245 | 2014-09-25 00:17:42 +0000 | [diff] [blame] | 523 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 524 | } |
| 525 | if(jpCharsetMasks[version]&CSM(KSC5601)) { |
| 526 | myConverterData->myConverterArray[KSC5601] = |
| 527 | ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); |
| 528 | } |
| 529 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 530 | /* set the function pointers to appropriate functions */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 531 | cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
| 532 | uprv_strcpy(myConverterData->locale,"ja"); |
| 533 | |
| 534 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 535 | size_t len = uprv_strlen(myConverterData->name); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 536 | myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
| 537 | myConverterData->name[len+1]='\0'; |
| 538 | } |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 539 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 540 | else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
| 541 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 542 | { |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 543 | if(version>1) { |
| 544 | // ICU 55 fails to open a converter for an unsupported version. |
| 545 | // Previously, it fell back to version 0, but that would yield |
| 546 | // unexpected behavior. |
| 547 | *errorCode = U_MISSING_RESOURCE_ERROR; |
| 548 | return; |
| 549 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 550 | const char *cnvName; |
| 551 | if(version==1) { |
| 552 | cnvName="icu-internal-25546"; |
| 553 | } else { |
| 554 | cnvName="ibm-949"; |
| 555 | myConverterData->version=version=0; |
| 556 | } |
| 557 | if(pArgs->onlyTestIsLoadable) { |
| 558 | ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ |
| 559 | uprv_free(cnv->extraInfo); |
| 560 | cnv->extraInfo=NULL; |
| 561 | return; |
| 562 | } else { |
| 563 | myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
| 564 | if (U_FAILURE(*errorCode)) { |
| 565 | _ISO2022Close(cnv); |
| 566 | return; |
| 567 | } |
| 568 | |
| 569 | if(version==1) { |
| 570 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
| 571 | uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); |
| 572 | cnv->subCharLen = myConverterData->currentConverter->subCharLen; |
| 573 | }else{ |
| 574 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); |
| 575 | } |
| 576 | |
| 577 | /* initialize the state variables */ |
| 578 | setInitialStateToUnicodeKR(cnv, myConverterData); |
| 579 | setInitialStateFromUnicodeKR(cnv, myConverterData); |
| 580 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 581 | /* set the function pointers to appropriate functions */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 582 | cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
| 583 | uprv_strcpy(myConverterData->locale,"ko"); |
| 584 | } |
| 585 | } |
| 586 | else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
| 587 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 588 | { |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 589 | if(version>2) { |
| 590 | // ICU 55 fails to open a converter for an unsupported version. |
| 591 | // Previously, it fell back to version 0, but that would yield |
| 592 | // unexpected behavior. |
| 593 | *errorCode = U_MISSING_RESOURCE_ERROR; |
| 594 | return; |
| 595 | } |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 596 | |
| 597 | /* open the required converters and cache them */ |
| 598 | myConverterData->myConverterArray[GB2312_1] = |
jshin@chromium.org | 52e8245 | 2014-09-25 00:17:42 +0000 | [diff] [blame] | 599 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 600 | if(version==1) { |
| 601 | myConverterData->myConverterArray[ISO_IR_165] = |
jshin@chromium.org | 52e8245 | 2014-09-25 00:17:42 +0000 | [diff] [blame] | 602 | ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 603 | } |
| 604 | myConverterData->myConverterArray[CNS_11643] = |
jshin@chromium.org | 52e8245 | 2014-09-25 00:17:42 +0000 | [diff] [blame] | 605 | ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 606 | |
| 607 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 608 | /* set the function pointers to appropriate functions */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 609 | cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
| 610 | uprv_strcpy(myConverterData->locale,"cn"); |
| 611 | |
| 612 | if (version==0){ |
| 613 | myConverterData->version = 0; |
| 614 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
| 615 | }else if (version==1){ |
| 616 | myConverterData->version = 1; |
| 617 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
| 618 | }else { |
| 619 | myConverterData->version = 2; |
| 620 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); |
| 621 | } |
| 622 | } |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 623 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 624 | else{ |
| 625 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 626 | myConverterData->isFirstBuffer = true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 627 | |
| 628 | /* append the UTF-8 escape sequence */ |
| 629 | cnv->charErrorBufferLength = 3; |
| 630 | cnv->charErrorBuffer[0] = 0x1b; |
| 631 | cnv->charErrorBuffer[1] = 0x25; |
| 632 | cnv->charErrorBuffer[2] = 0x42; |
| 633 | |
| 634 | cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
| 635 | /* initialize the state variables */ |
| 636 | uprv_strcpy(myConverterData->name,"ISO_2022"); |
| 637 | #else |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 638 | *errorCode = U_MISSING_RESOURCE_ERROR; |
| 639 | // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard |
| 640 | // data loading error code. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 641 | return; |
| 642 | #endif |
| 643 | } |
| 644 | |
| 645 | cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
| 646 | |
| 647 | if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
| 648 | _ISO2022Close(cnv); |
| 649 | } |
| 650 | } else { |
| 651 | *errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 652 | } |
| 653 | } |
| 654 | |
| 655 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 656 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 657 | _ISO2022Close(UConverter *converter) { |
| 658 | UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
| 659 | UConverterSharedData **array = myData->myConverterArray; |
| 660 | int32_t i; |
| 661 | |
| 662 | if (converter->extraInfo != NULL) { |
| 663 | /*close the array of converter pointers and free the memory*/ |
| 664 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
| 665 | if(array[i]!=NULL) { |
| 666 | ucnv_unloadSharedDataIfReady(array[i]); |
| 667 | } |
| 668 | } |
| 669 | |
| 670 | ucnv_close(myData->currentConverter); |
| 671 | |
| 672 | if(!converter->isExtraLocal){ |
| 673 | uprv_free (converter->extraInfo); |
| 674 | converter->extraInfo = NULL; |
| 675 | } |
| 676 | } |
| 677 | } |
| 678 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 679 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 680 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
| 681 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); |
| 682 | if(choice<=UCNV_RESET_TO_UNICODE) { |
| 683 | uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
| 684 | myConverterData->key = 0; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 685 | myConverterData->isEmptySegment = false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 686 | } |
| 687 | if(choice!=UCNV_RESET_TO_UNICODE) { |
| 688 | uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
| 689 | } |
| 690 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 691 | if(myConverterData->locale[0] == 0){ |
| 692 | if(choice<=UCNV_RESET_TO_UNICODE) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 693 | myConverterData->isFirstBuffer = true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 694 | myConverterData->key = 0; |
| 695 | if (converter->mode == UCNV_SO){ |
| 696 | ucnv_close (myConverterData->currentConverter); |
| 697 | myConverterData->currentConverter=NULL; |
| 698 | } |
| 699 | converter->mode = UCNV_SI; |
| 700 | } |
| 701 | if(choice!=UCNV_RESET_TO_UNICODE) { |
| 702 | /* re-append UTF-8 escape sequence */ |
| 703 | converter->charErrorBufferLength = 3; |
| 704 | converter->charErrorBuffer[0] = 0x1b; |
| 705 | converter->charErrorBuffer[1] = 0x28; |
| 706 | converter->charErrorBuffer[2] = 0x42; |
| 707 | } |
| 708 | } |
| 709 | else |
| 710 | #endif |
| 711 | { |
| 712 | /* reset the state variables */ |
| 713 | if(myConverterData->locale[0] == 'k'){ |
| 714 | if(choice<=UCNV_RESET_TO_UNICODE) { |
| 715 | setInitialStateToUnicodeKR(converter, myConverterData); |
| 716 | } |
| 717 | if(choice!=UCNV_RESET_TO_UNICODE) { |
| 718 | setInitialStateFromUnicodeKR(converter, myConverterData); |
| 719 | } |
| 720 | } |
| 721 | } |
| 722 | } |
| 723 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 724 | U_CDECL_BEGIN |
| 725 | |
| 726 | static const char * U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 727 | _ISO2022getName(const UConverter* cnv){ |
| 728 | if(cnv->extraInfo){ |
| 729 | UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
| 730 | return myData->name; |
| 731 | } |
| 732 | return NULL; |
| 733 | } |
| 734 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 735 | U_CDECL_END |
| 736 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 737 | |
| 738 | /*************** to unicode *******************/ |
| 739 | /**************************************************************************** |
| 740 | * Recognized escape sequences are |
| 741 | * <ESC>(B ASCII |
| 742 | * <ESC>.A ISO-8859-1 |
| 743 | * <ESC>.F ISO-8859-7 |
| 744 | * <ESC>(J JISX-201 |
| 745 | * <ESC>(I JISX-201 |
| 746 | * <ESC>$B JISX-208 |
| 747 | * <ESC>$@ JISX-208 |
| 748 | * <ESC>$(D JISX-212 |
| 749 | * <ESC>$A GB2312 |
| 750 | * <ESC>$(C KSC5601 |
| 751 | */ |
/*
 * ISO-2022-JP state for each escape-sequence table index.
 * changeState_2022() reads nextStateToUnicodeJP[offset], where offset is the index that
 * getKey_2022() reported for a complete escape sequence. INVALID_STATE entries make
 * changeState_2022() set U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The digits in the first row below are the column numbers (index modulo 10).
 */
| 752 | static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
| 753 | /* 0 1 2 3 4 5 6 7 8 9 */ |
| 754 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 755 | ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE |
| 756 | ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 757 | ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE |
| 758 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 759 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 760 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 761 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 762 | }; |
| 763 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 764 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 765 | /*************** to unicode *******************/ |
/*
 * ISO-2022-CN state for each escape-sequence table index.
 * changeState_2022() reads nextStateToUnicodeCN[offset], where offset is the index that
 * getKey_2022() reported for a complete escape sequence. INVALID_STATE entries make
 * changeState_2022() set U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * Only compiled when UCONFIG_ONLY_HTML_CONVERSION is off.
 */
| 766 | static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
| 767 | /* 0 1 2 3 4 5 6 7 8 9 */ |
| 768 | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 769 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 770 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 771 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 772 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
| 773 | ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 774 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 775 | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 776 | }; |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 777 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 778 | |
| 779 | |
| 780 | static UCNV_TableStates_2022 |
| 781 | getKey_2022(char c,int32_t* key,int32_t* offset){ |
| 782 | int32_t togo; |
| 783 | int32_t low = 0; |
| 784 | int32_t hi = MAX_STATES_2022; |
| 785 | int32_t oldmid=0; |
| 786 | |
| 787 | togo = normalize_esq_chars_2022[(uint8_t)c]; |
| 788 | if(togo == 0) { |
| 789 | /* not a valid character anywhere in an escape sequence */ |
| 790 | *key = 0; |
| 791 | *offset = 0; |
| 792 | return INVALID_2022; |
| 793 | } |
| 794 | togo = (*key << 5) + togo; |
| 795 | |
| 796 | while (hi != low) /*binary search*/{ |
| 797 | |
Jungshik Shin (jungshik at google) | 0f8746a | 2015-01-08 15:46:45 -0800 | [diff] [blame] | 798 | int32_t mid = (hi+low) >> 1; /*Finds median*/ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 799 | |
| 800 | if (mid == oldmid) |
| 801 | break; |
| 802 | |
| 803 | if (escSeqStateTable_Key_2022[mid] > togo){ |
| 804 | hi = mid; |
| 805 | } |
| 806 | else if (escSeqStateTable_Key_2022[mid] < togo){ |
| 807 | low = mid; |
| 808 | } |
| 809 | else /*we found it*/{ |
| 810 | *key = togo; |
| 811 | *offset = mid; |
| 812 | return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
| 813 | } |
| 814 | oldmid = mid; |
| 815 | |
| 816 | } |
| 817 | |
| 818 | *key = 0; |
| 819 | *offset = 0; |
| 820 | return INVALID_2022; |
| 821 | } |
| 822 | |
/*
 * Consumes the bytes of an escape sequence from *source and applies it to the toUnicode state.
 *
 * Every consumed byte is appended to _this->toUBytes and passed to getKey_2022(), continuing
 * from the partial key saved in the converter data (key), so a sequence may be split across
 * several calls.
 *
 * Outcomes:
 * - the input ends while the sequence is still incomplete: returns with the partial key
 *   stored and *err not modified;
 * - getKey_2022() reports INVALID_2022: *err is set to U_ILLEGAL_ESCAPE_SEQUENCE;
 * - the sequence is complete: it is interpreted according to var (JP, CN, KR or, if enabled,
 *   generic ISO_2022); sequences that are not accepted set U_UNSUPPORTED_ESCAPE_SEQUENCE
 *   or U_ILLEGAL_ESCAPE_SEQUENCE.
 * On success toULength is reset to 0. On U_ILLEGAL_ESCAPE_SEQUENCE only the first recorded
 * byte is kept in toUBytes and the remaining ones are backed out for re-reading. On
 * U_UNSUPPORTED_ESCAPE_SEQUENCE toUCallbackReason is set to UCNV_UNASSIGNED.
 *
 * @param _this       converter whose extraInfo holds the ISO-2022 data
 * @param source      in/out read position in the input
 * @param sourceLimit end of the input
 * @param var         the ISO-2022 variant whose rules are applied
 * @param err         receives the error code
 */
Frank Tang | 7e7574b | 2021-04-13 21:19:13 -0700 | [diff] [blame] | 823 | /*runs through a state machine to determine the escape sequence - codepage correspondence |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 824 | */ |
| 825 | static void |
| 826 | changeState_2022(UConverter* _this, |
| 827 | const char** source, |
| 828 | const char* sourceLimit, |
| 829 | Variant2022 var, |
| 830 | UErrorCode* err){ |
| 831 | UCNV_TableStates_2022 value; |
| 832 | UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); |
| 833 | uint32_t key = myData2022->key; |
| 834 | int32_t offset = 0; |
| 835 | int8_t initialToULength = _this->toULength; |
| 836 | char c; |
| 837 | |
| 838 | value = VALID_NON_TERMINAL_2022; |
| 839 | while (*source < sourceLimit) { |
| 840 | c = *(*source)++; |
/* record the byte: toUBytes is what the error handling below reports and replays */
| 841 | _this->toUBytes[_this->toULength++]=(uint8_t)c; |
| 842 | value = getKey_2022(c,(int32_t *) &key, &offset); |
| 843 | |
| 844 | switch (value){ |
| 845 | |
| 846 | case VALID_NON_TERMINAL_2022 : |
| 847 | /* continue with the loop */ |
| 848 | break; |
| 849 | |
| 850 | case VALID_TERMINAL_2022: |
| 851 | key = 0; |
| 852 | goto DONE; |
| 853 | |
| 854 | case INVALID_2022: |
| 855 | goto DONE; |
| 856 | |
| 857 | case VALID_MAYBE_TERMINAL_2022: |
| 858 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 859 | /* ESC ( B is ambiguous only for ISO_2022 itself */ |
| 860 | if(var == ISO_2022) { |
| 861 | /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ |
| 862 | _this->toULength = 0; |
| 863 | |
| 864 | /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
| 865 | |
| 866 | /* continue with the loop */ |
| 867 | value = VALID_NON_TERMINAL_2022; |
| 868 | break; |
| 869 | } else |
| 870 | #endif |
| 871 | { |
| 872 | /* not ISO_2022 itself, finish here */ |
| 873 | value = VALID_TERMINAL_2022; |
| 874 | key = 0; |
| 875 | goto DONE; |
| 876 | } |
| 877 | } |
| 878 | } |
| 879 | |
| 880 | DONE: |
/* persist the key so that an incomplete sequence can be continued by the next call */
| 881 | myData2022->key = key; |
| 882 | |
| 883 | if (value == VALID_NON_TERMINAL_2022) { |
| 884 | /* indicate that the escape sequence is incomplete: key!=0 */ |
| 885 | return; |
| 886 | } else if (value == INVALID_2022 ) { |
| 887 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 888 | } else /* value == VALID_TERMINAL_2022 */ { |
| 889 | switch(var){ |
| 890 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 891 | case ISO_2022: |
| 892 | { |
| 893 | const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; |
| 894 | if(chosenConverterName == NULL) { |
| 895 | /* SS2 or SS3 */ |
| 896 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 897 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 898 | return; |
| 899 | } |
| 900 | |
| 901 | _this->mode = UCNV_SI; |
| 902 | ucnv_close(myData2022->currentConverter); |
/* NOTE(review): myUConverter is not declared anywhere in this function as shown -- confirm before enabling U_ENABLE_GENERIC_ISO_2022 */
| 903 | myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); |
| 904 | if(U_SUCCESS(*err)) { |
| 905 | myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
| 906 | _this->mode = UCNV_SO; |
| 907 | } |
| 908 | break; |
| 909 | } |
| 910 | #endif |
| 911 | case ISO_2022_JP: |
| 912 | { |
| 913 | StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
| 914 | switch(tempState) { |
| 915 | case INVALID_STATE: |
| 916 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 917 | break; |
| 918 | case SS2_STATE: |
/* SS2 is accepted only once cs[2] has been designated; a g below 2 is saved in prevG before g becomes 2 */
| 919 | if(myData2022->toU2022State.cs[2]!=0) { |
| 920 | if(myData2022->toU2022State.g<2) { |
| 921 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| 922 | } |
| 923 | myData2022->toU2022State.g=2; |
| 924 | } else { |
| 925 | /* illegal to have SS2 before a matching designator */ |
| 926 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 927 | } |
| 928 | break; |
| 929 | /* case SS3_STATE: not used in ISO-2022-JP-x */ |
| 930 | case ISO8859_1: |
| 931 | case ISO8859_7: |
| 932 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
| 933 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 934 | } else { |
| 935 | /* G2 charset for SS2 */ |
| 936 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| 937 | } |
| 938 | break; |
| 939 | default: |
| 940 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
| 941 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 942 | } else { |
| 943 | /* G0 charset */ |
| 944 | myData2022->toU2022State.cs[0]=(int8_t)tempState; |
| 945 | } |
| 946 | break; |
| 947 | } |
| 948 | } |
| 949 | break; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 950 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 951 | case ISO_2022_CN: |
| 952 | { |
| 953 | StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
| 954 | switch(tempState) { |
| 955 | case INVALID_STATE: |
| 956 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 957 | break; |
| 958 | case SS2_STATE: |
| 959 | if(myData2022->toU2022State.cs[2]!=0) { |
| 960 | if(myData2022->toU2022State.g<2) { |
| 961 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| 962 | } |
| 963 | myData2022->toU2022State.g=2; |
| 964 | } else { |
| 965 | /* illegal to have SS2 before a matching designator */ |
| 966 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 967 | } |
| 968 | break; |
| 969 | case SS3_STATE: |
| 970 | if(myData2022->toU2022State.cs[3]!=0) { |
| 971 | if(myData2022->toU2022State.g<2) { |
| 972 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
| 973 | } |
| 974 | myData2022->toU2022State.g=3; |
| 975 | } else { |
| 976 | /* illegal to have SS3 before a matching designator */ |
| 977 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 978 | } |
| 979 | break; |
| 980 | case ISO_IR_165: |
| 981 | if(myData2022->version==0) { |
| 982 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 983 | break; |
| 984 | } |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 985 | U_FALLTHROUGH; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 986 | case GB2312_1: |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 987 | U_FALLTHROUGH; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 988 | case CNS_11643_1: |
| 989 | myData2022->toU2022State.cs[1]=(int8_t)tempState; |
| 990 | break; |
| 991 | case CNS_11643_2: |
| 992 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| 993 | break; |
| 994 | default: |
| 995 | /* other CNS 11643 planes */ |
| 996 | if(myData2022->version==0) { |
| 997 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 998 | } else { |
| 999 | myData2022->toU2022State.cs[3]=(int8_t)tempState; |
| 1000 | } |
| 1001 | break; |
| 1002 | } |
| 1003 | } |
| 1004 | break; |
| 1005 | case ISO_2022_KR: |
| 1006 | if(offset==0x30){ |
| 1007 | /* nothing to be done, just accept this one escape sequence */ |
| 1008 | } else { |
| 1009 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 1010 | } |
| 1011 | break; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 1012 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1013 | |
| 1014 | default: |
| 1015 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1016 | break; |
| 1017 | } |
| 1018 | } |
| 1019 | if(U_SUCCESS(*err)) { |
| 1020 | _this->toULength = 0; |
| 1021 | } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
| 1022 | if(_this->toULength>1) { |
| 1023 | /* |
| 1024 | * Ticket 5691: consistent illegal sequences: |
| 1025 | * - We include at least the first byte (ESC) in the illegal sequence. |
| 1026 | * - If any of the non-initial bytes could be the start of a character, |
| 1027 | * we stop the illegal sequence before the first one of those. |
| 1028 | * In escape sequences, all following bytes are "printable", that is, |
| 1029 | * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), |
| 1030 | * they are valid single/lead bytes. |
| 1031 | * For simplicity, we always only report the initial ESC byte as the |
| 1032 | * illegal sequence and back out all other bytes we looked at. |
| 1033 | */ |
| 1034 | /* Back out some bytes. */ |
/* backOutDistance: every recorded byte except the first; bytesFromThisBuffer: bytes this call read from *source */
| 1035 | int8_t backOutDistance=_this->toULength-1; |
| 1036 | int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
| 1037 | if(backOutDistance<=bytesFromThisBuffer) { |
| 1038 | /* same as initialToULength<=1 */ |
| 1039 | *source-=backOutDistance; |
| 1040 | } else { |
| 1041 | /* Back out bytes from the previous buffer: Need to replay them. */ |
| 1042 | _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); |
| 1043 | /* same as -(initialToULength-1) */ |
| 1044 | /* preToULength is negative! */ |
| 1045 | uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); |
| 1046 | *source-=bytesFromThisBuffer; |
| 1047 | } |
| 1048 | _this->toULength=1; |
| 1049 | } |
| 1050 | } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
| 1051 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 1052 | } |
| 1053 | } |
| 1054 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 1055 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1056 | /*Checks the characters of the buffer against valid 2022 escape sequences |
| 1057 | *if the match we return a pointer to the initial start of the sequence otherwise |
| 1058 | *we return sourceLimit |
| 1059 | */ |
| 1060 | /*for 2022 looks ahead in the stream |
| 1061 | *to determine the longest possible convertible |
| 1062 | *data stream |
| 1063 | */ |
| 1064 | static inline const char* |
| 1065 | getEndOfBuffer_2022(const char** source, |
| 1066 | const char* sourceLimit, |
| 1067 | UBool /*flush*/){ |
| 1068 | |
| 1069 | const char* mySource = *source; |
| 1070 | |
| 1071 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 1072 | if (*source >= sourceLimit) |
| 1073 | return sourceLimit; |
| 1074 | |
| 1075 | do{ |
| 1076 | |
| 1077 | if (*mySource == ESC_2022){ |
| 1078 | int8_t i; |
| 1079 | int32_t key = 0; |
| 1080 | int32_t offset; |
| 1081 | UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
| 1082 | |
| 1083 | /* Kludge: I could not |
| 1084 | * figure out the reason for validating an escape sequence |
| 1085 | * twice - once here and once in changeState_2022(). |
| 1086 | * is it possible to have an ESC character in a ISO2022 |
| 1087 | * byte stream which is valid in a code page? Is it legal? |
| 1088 | */ |
| 1089 | for (i=0; |
| 1090 | (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
| 1091 | i++) { |
| 1092 | value = getKey_2022(*(mySource+i), &key, &offset); |
| 1093 | } |
| 1094 | if (value > 0 || *mySource==ESC_2022) |
| 1095 | return mySource; |
| 1096 | |
| 1097 | if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
| 1098 | return sourceLimit; |
| 1099 | } |
| 1100 | }while (++mySource < sourceLimit); |
| 1101 | |
| 1102 | return sourceLimit; |
| 1103 | #else |
| 1104 | while(mySource < sourceLimit && *mySource != ESC_2022) { |
| 1105 | ++mySource; |
| 1106 | } |
| 1107 | return mySource; |
| 1108 | #endif |
| 1109 | } |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 1110 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1111 | |
| 1112 | /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c |
| 1113 | * any future change in _MBCSFromUChar32() function should be reflected here. |
| 1114 | * @return number of bytes in *value; negative number if fallback; 0 if no mapping |
| 1115 | */ |
| 1116 | static inline int32_t |
| 1117 | MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
| 1118 | UChar32 c, |
| 1119 | uint32_t* value, |
| 1120 | UBool useFallback, |
| 1121 | int outputType) |
| 1122 | { |
| 1123 | const int32_t *cx; |
| 1124 | const uint16_t *table; |
| 1125 | uint32_t stage2Entry; |
| 1126 | uint32_t myValue; |
| 1127 | int32_t length; |
| 1128 | const uint8_t *p; |
| 1129 | /* |
| 1130 | * TODO(markus): Use and require new, faster MBCS conversion table structures. |
| 1131 | * Use internal version of ucnv_open() that verifies that the new structures are available, |
| 1132 | * else U_INTERNAL_PROGRAM_ERROR. |
| 1133 | */ |
| 1134 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| 1135 | if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| 1136 | table=sharedData->mbcs.fromUnicodeTable; |
| 1137 | stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
| 1138 | /* get the bytes and the length for the output */ |
| 1139 | if(outputType==MBCS_OUTPUT_2){ |
| 1140 | myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| 1141 | if(myValue<=0xff) { |
| 1142 | length=1; |
| 1143 | } else { |
| 1144 | length=2; |
| 1145 | } |
| 1146 | } else /* outputType==MBCS_OUTPUT_3 */ { |
| 1147 | p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
| 1148 | myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
| 1149 | if(myValue<=0xff) { |
| 1150 | length=1; |
| 1151 | } else if(myValue<=0xffff) { |
| 1152 | length=2; |
| 1153 | } else { |
| 1154 | length=3; |
| 1155 | } |
| 1156 | } |
| 1157 | /* is this code point assigned, or do we use fallbacks? */ |
| 1158 | if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
| 1159 | /* assigned */ |
| 1160 | *value=myValue; |
| 1161 | return length; |
| 1162 | } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
| 1163 | /* |
| 1164 | * We allow a 0 byte output if the "assigned" bit is set for this entry. |
| 1165 | * There is no way with this data structure for fallback output |
| 1166 | * to be a zero byte. |
| 1167 | */ |
| 1168 | *value=myValue; |
| 1169 | return -length; |
| 1170 | } |
| 1171 | } |
| 1172 | |
| 1173 | cx=sharedData->mbcs.extIndexes; |
| 1174 | if(cx!=NULL) { |
| 1175 | return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
| 1176 | } |
| 1177 | |
| 1178 | /* unassigned */ |
| 1179 | return 0; |
| 1180 | } |
| 1181 | |
| 1182 | /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c |
| 1183 | * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
| 1184 | * @param retval pointer to output byte |
| 1185 | * @return 1 roundtrip byte 0 no mapping -1 fallback byte |
| 1186 | */ |
| 1187 | static inline int32_t |
| 1188 | MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
| 1189 | UChar32 c, |
| 1190 | uint32_t* retval, |
| 1191 | UBool useFallback) |
| 1192 | { |
| 1193 | const uint16_t *table; |
| 1194 | int32_t value; |
| 1195 | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
| 1196 | if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| 1197 | return 0; |
| 1198 | } |
| 1199 | /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
| 1200 | table=sharedData->mbcs.fromUnicodeTable; |
| 1201 | /* get the byte for the output */ |
| 1202 | value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
| 1203 | /* is this code point assigned, or do we use fallbacks? */ |
| 1204 | *retval=(uint32_t)(value&0xff); |
| 1205 | if(value>=0xf00) { |
| 1206 | return 1; /* roundtrip */ |
| 1207 | } else if(useFallback ? value>=0x800 : value>=0xc00) { |
| 1208 | return -1; /* fallback taken */ |
| 1209 | } else { |
| 1210 | return 0; /* no mapping */ |
| 1211 | } |
| 1212 | } |
| 1213 | |
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * The bytes are tested without truncating the input, so a value wider than
 * two bytes is rejected even when its low 16 bits look like a valid pair.
 *
 * @param value codepage bytes packed as (lead<<8)|trail
 * @return value-0x8080 if both bytes are in A1..FE, otherwise 0
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;     /* exceeds 0xff for inputs wider than two bytes */
    const uint32_t trail = value & 0xff;

    if( lead >= 0xa1 && lead <= 0xfe &&
        trail >= 0xa1 && trail <= 0xfe
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}
| 1230 | |
/* The definition below is compiled out by the surrounding #if 0 and is kept for reference only. */
| 1231 | #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ |
| 1232 | /* |
| 1233 | * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the |
| 1234 | * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point |
| 1235 | * unchanged. |
| 1236 | */ |
| 1237 | static inline uint32_t |
| 1238 | _2022ToGR94DBCS(uint32_t value) { |
| 1239 | uint32_t returnValue = value + 0x8080; |
| 1240 | if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && |
| 1241 | (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { |
| 1242 | return returnValue; |
| 1243 | } else { |
| 1244 | return value; |
| 1245 | } |
| 1246 | } |
| 1247 | #endif |
| 1248 | |
| 1249 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 1250 | |
| 1251 | /********************************************************************************** |
| 1252 | * ISO-2022 Converter |
| 1253 | * |
| 1254 | * |
| 1255 | */ |
| 1256 | |
/*
 * Generic ISO-2022 to-Unicode conversion.
 *
 * Alternates between two activities until the input in args is used up:
 *  - handing the bytes in front of the next escape sequence to
 *    myData->currentConverter via ucnv_toUnicode(), and
 *  - letting changeState_2022() consume the escape sequence itself.
 *
 * @param args  Conversion arguments; args->source and args->target are advanced,
 *              and args->converter is temporarily swapped to the sub-converter
 *              around the nested ucnv_toUnicode() call.
 * @param err   In/out error code. The function returns early on any failure,
 *              on target overflow, and whenever offsets have to be updated
 *              by the caller.
 */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
                                              UErrorCode* err){
    const char* mySourceLimit, *realSourceLimit;
    const char* sourceStart;
    const UChar* myTargetStart;
    UConverter* saveThis;
    UConverterDataISO2022* myData;
    int8_t length;

    /* remember the ISO-2022 converter itself; args->converter is swapped below */
    saveThis = args->converter;
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));

    realSourceLimit = args->sourceLimit;
    while (args->source < realSourceLimit) {
        if(myData->key == 0) { /* key==0: we are not in the middle of an escape sequence */
            /* Find the end of the convertible run, e.g. the next escape sequence or the end of the buffer */
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);

            if(args->source < mySourceLimit) {
                /* no sub-converter selected yet: open an ASCII converter for the leading bytes */
                if(myData->currentConverter==NULL) {
                    myData->currentConverter = ucnv_open("ASCII",err);
                    if(U_FAILURE(*err)){
                        return;
                    }

                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
                    saveThis->mode = UCNV_SO;
                }

                /* convert to before the ESC or until the end of the buffer */
                myData->isFirstBuffer=false;
                sourceStart = args->source;
                myTargetStart = args->target;
                args->converter = myData->currentConverter;
                /* flush the sub-converter only if this run reaches the real end of the input */
                ucnv_toUnicode(args->converter,
                    &args->target,
                    args->targetLimit,
                    &args->source,
                    mySourceLimit,
                    args->offsets,
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
                    err);
                args->converter = saveThis;

                if (*err == U_BUFFER_OVERFLOW_ERROR) {
                    /* move the overflow buffer from the sub-converter to this converter */
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
                    myData->currentConverter->UCharErrorBufferLength = 0;
                    if(length > 0) {
                        uprv_memcpy(saveThis->UCharErrorBuffer,
                                    myData->currentConverter->UCharErrorBuffer,
                                    length*U_SIZEOF_UCHAR);
                    }
                    return;
                }

                /*
                 * At least one of:
                 * -Error while converting
                 * -Done with entire buffer
                 * -Need to write offsets or update the current offset
                 *  (leave that up to the code in ucnv.c)
                 *
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
                 */
                /* note: && binds tighter than ||, so the offsets test and the toULength test are separate alternatives */
                if (U_FAILURE(*err) ||
                    (args->source == realSourceLimit) ||
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
                ) {
                    /* copy partial or error input for truncated detection and error handling */
                    if(U_FAILURE(*err)) {
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
                        }
                    } else {
                        length = saveThis->toULength = myData->currentConverter->toULength;
                        if(length > 0) {
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
                            if(args->source < mySourceLimit) {
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
                            }
                        }
                    }
                    return;
                }
            }
        }

        /* process (or continue processing) the escape sequence at args->source */
        sourceStart = args->source;
        changeState_2022(args->converter,
               &(args->source),
               realSourceLimit,
               ISO_2022,
               err);
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
            /* let the ucnv.c code update its current offset */
            return;
        }
    }
}
| 1360 | |
| 1361 | #endif |
| 1362 | |
| 1363 | /* |
| 1364 | * To Unicode Callback helper function |
| 1365 | */ |
| 1366 | static void |
| 1367 | toUnicodeCallback(UConverter *cnv, |
| 1368 | const uint32_t sourceChar, const uint32_t targetUniChar, |
| 1369 | UErrorCode* err){ |
| 1370 | if(sourceChar>0xff){ |
| 1371 | cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
| 1372 | cnv->toUBytes[1] = (uint8_t)sourceChar; |
| 1373 | cnv->toULength = 2; |
| 1374 | } |
| 1375 | else{ |
| 1376 | cnv->toUBytes[0] =(char) sourceChar; |
| 1377 | cnv->toULength = 1; |
| 1378 | } |
| 1379 | |
| 1380 | if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
| 1381 | *err = U_INVALID_CHAR_FOUND; |
| 1382 | } |
| 1383 | else{ |
| 1384 | *err = U_ILLEGAL_CHAR_FOUND; |
| 1385 | } |
| 1386 | } |
| 1387 | |
| 1388 | /**************************************ISO-2022-JP*************************************************/ |
| 1389 | |
| 1390 | /************************************** IMPORTANT ************************************************** |
| 1391 | * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and |
| 1392 | * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
| 1393 | * The converter iterates over each Unicode codepoint |
| 1394 | * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is |
| 1395 | * processed one char at a time it would make sense to reduce the extra processing a canned converter |
| 1396 | * would do as far as possible. |
| 1397 | * |
| 1398 | * If the implementation of these macros or structure of sharedData struct change in the future, make |
| 1399 | * sure that ISO-2022 is also changed. |
| 1400 | *************************************************************************************************** |
| 1401 | */ |
| 1402 | |
/***************************************************************************************************
* Rules for ISO-2022-jp encoding
* (i)   Escape sequences must be fully contained within a line; they should not
*       span new lines or CRs
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
*       JIS-Roman character escape sequence should follow before the line terminates
* (iii) If the first character on the line is represented by two bytes then a two
*       byte character escape sequence should precede it
* (iv)  If no escape sequence is encountered then the characters are ASCII
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
*       and invoked with SS2 (ESC N).
* (vi)  If there is any G0 designation in text, there must be a switch to
*       ASCII or to JIS X 0201-Roman before a space character (but not
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
*       characters such as tab or CRLF.
* (vii) Supported encodings:
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
*
*       source : RFC-1554
*
*          JISX201, JISX208,JISX212 : new .cnv data files created
*          KSC5601 : alias to ibm-949 mapping table
*          GB2312 : alias to ibm-1386 mapping table
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
*          ISO-8859-7 : alias to ibm-9409 mapping table
*/
| 1429 | |
| 1430 | /* preference order of JP charsets */ |
/*
 * Charsets in the order in which the from-Unicode code tries them after the
 * currently designated G0/G2 charsets (see the loop that fills choices[]).
 * Entries not enabled for the converter version are filtered out there via the
 * charset mask.
 */
static const StateEnum jpCharsetPref[]={
    ASCII,
    JISX201,
    ISO8859_1,
    JISX208,
    ISO8859_7,
    JISX212,
    GB2312,
    KSC5601,
    HWKANA_7BIT
};
| 1442 | |
| 1443 | /* |
| 1444 | * The escape sequences must be in order of the enum constants like JISX201 = 3, |
| 1445 | * not in order of jpCharsetPref[]! |
| 1446 | */ |
/*
 * Designation escape sequence for each charset, indexed by the charset's
 * enum constant (the from-Unicode code reads escSeqChars[cs]).
 * Each row is NUL-padded to 6 bytes; the real length is in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */

};
/*
 * Byte length of each escape sequence in escSeqChars[], in the same order
 * (the from-Unicode code reads escSeqCharsLen[cs] next to escSeqChars[cs]).
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
| 1470 | |
/*
* The iteration over various code pages works this way:
* i)   Get the currentState from myConverterData->currentState
* ii)  Check if the character is mapped to a valid character in the currentState
*      Yes ->  a) set the initIterState to currentState
*       b) remain in this state until an invalid character is found
*      No  ->  a) go to the next code page and find the character
* iii) Before changing the state, increment the current state and check if it
*      is equal to the initIterState
*      Yes ->  A character that cannot be represented in any of the supported encodings
*       break and return a U_INVALID_CHAR_FOUND error
*      No  ->  Continue and find the character in next code page
*
*
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
*/
| 1487 | |
| 1488 | /* Map 00..7F to Unicode according to JIS X 0201. */ |
/*
 * JIS X 0201 (7-bit) to Unicode.
 * Only two bytes differ from the identity mapping:
 * 0x5c becomes U+00A5 and 0x7e becomes U+203E. Every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
| 1501 | |
| 1502 | /* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ |
/*
 * Unicode to JIS X 0201 (7-bit).
 * U+00A5 maps to 0x5c and U+203E maps to 0x7e. Code points up to 0x7f map to
 * themselves except 0x5c and 0x7e, which have no mapping.
 * Returns 0xfffe for anything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two byte values are taken by the characters above */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
| 1516 | |
| 1517 | /* |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 1518 | * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) |
| 1519 | * Katakana. |
| 1520 | * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks |
| 1521 | * because Shift-JIS roundtrips half-width Katakana to single bytes. |
| 1522 | * These were the only fallbacks in ICU's jisx-208.ucm file. |
| 1523 | */ |
/*
 * Fallback double-byte values for half-width Katakana, indexed by
 * (code point - HWKANA_START). Each entry is a two-byte value with the lead
 * byte in the upper 8 bits; it is used as targetValue for the JISX208 charset.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    0x2123,  /* U+FF61 */
    0x2156,
    0x2157,
    0x2122,
    0x2126,
    0x2572,
    0x2521,
    0x2523,
    0x2525,
    0x2527,
    0x2529,
    0x2563,
    0x2565,
    0x2567,
    0x2543,
    0x213C,  /* U+FF70 */
    0x2522,
    0x2524,
    0x2526,
    0x2528,
    0x252A,
    0x252B,
    0x252D,
    0x252F,
    0x2531,
    0x2533,
    0x2535,
    0x2537,
    0x2539,
    0x253B,
    0x253D,
    0x253F,  /* U+FF80 */
    0x2541,
    0x2544,
    0x2546,
    0x2548,
    0x254A,
    0x254B,
    0x254C,
    0x254D,
    0x254E,
    0x254F,
    0x2552,
    0x2555,
    0x2558,
    0x255B,
    0x255E,
    0x255F,  /* U+FF90 */
    0x2560,
    0x2561,
    0x2562,
    0x2564,
    0x2566,
    0x2568,
    0x2569,
    0x256A,
    0x256B,
    0x256C,
    0x256D,
    0x256F,
    0x2573,
    0x212B,
    0x212C   /* U+FF9F */
};
| 1589 | |
/*
 * Unicode to ISO-2022-JP (all variants) conversion with optional offsets.
 *
 * For each code point the function tries a list of candidate charsets
 * (choices[]), preferring the currently designated ones, and writes any needed
 * SI/SO, designation escape sequence and single-shift before the character bytes.
 *
 * @param args  Conversion arguments. args->source and args->target are updated
 *              on return; args->offsets, if not NULL, receives one source index
 *              per output byte.
 * @param err   In/out error code. Set to U_ILLEGAL_CHAR_FOUND for unpaired
 *              surrogates and SO/SI/ESC input, U_INVALID_CHAR_FOUND for
 *              unmappable code points, and U_BUFFER_OVERFLOW_ERROR when the
 *              target is full. On these errors and for a pending lead surrogate,
 *              the code point is kept in cnv->fromUChar32.
 */
static void U_CALLCONV
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
    UConverter *cnv = args->converter;
    UConverterDataISO2022 *converterData;
    ISO2022State *pFromU2022State;
    uint8_t *target = (uint8_t *) args->target;
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
    const UChar* source = args->source;
    const UChar* sourceLimit = args->sourceLimit;
    int32_t* offsets = args->offsets;
    UChar32 sourceChar;
    char buffer[8];         /* staging area for SI/escape/shift bytes plus the character bytes */
    int32_t len, outLen;
    int8_t choices[10];     /* candidate charsets, in the order they are tried */
    int32_t choiceCount;
    uint32_t targetValue = 0;
    UBool useFallback;

    int32_t i;
    int8_t cs, g;

    /* set up the state */
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
    pFromU2022State   = &converterData->fromU2022State;

    /* choiceCount==0 means choices[] must be (re)built before use */
    choiceCount = 0;

    /* check if the last codepoint of previous buffer was a lead surrogate*/
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
        goto getTrail;
    }

    while(source < sourceLimit) {
        if(target < targetLimit) {

            sourceChar  = *(source++);
            /*check if the char is a First surrogate*/
            if(U16_IS_SURROGATE(sourceChar)) {
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
                    /*look ahead to find the trail surrogate*/
                    if(source < sourceLimit) {
                        /* test the following code unit */
                        UChar trail=(UChar) *source;
                        if(U16_IS_TRAIL(trail)) {
                            source++;
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
                            cnv->fromUChar32=0x00;
                            /* convert this supplementary code point */
                            /* exit this condition tree */
                        } else {
                            /* this is an unmatched lead code unit (1st surrogate) */
                            /* callback(illegal) */
                            *err=U_ILLEGAL_CHAR_FOUND;
                            cnv->fromUChar32=sourceChar;
                            break;
                        }
                    } else {
                        /* no more input: keep the lead surrogate for the next call */
                        cnv->fromUChar32=sourceChar;
                        break;
                    }
                } else {
                    /* this is an unmatched trail code unit (2nd surrogate) */
                    /* callback(illegal) */
                    *err=U_ILLEGAL_CHAR_FOUND;
                    cnv->fromUChar32=sourceChar;
                    break;
                }
            }

            /* do not convert SO/SI/ESC */
            if(IS_2022_CONTROL(sourceChar)) {
                /* callback(illegal) */
                *err=U_ILLEGAL_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            /* do the conversion */

            /* build the list of candidate charsets if it is not valid */
            if(choiceCount == 0) {
                uint16_t csm;

                /*
                 * The csm variable keeps track of which charsets are allowed
                 * and not used yet while building the choices[].
                 */
                csm = jpCharsetMasks[converterData->version];
                choiceCount = 0;

                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
                if(converterData->version == 3 || converterData->version == 4) {
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
                }
                /* Do not try single-byte half-width Katakana for other versions. */
                csm &= ~CSM(HWKANA_7BIT);

                /* try the current G0 charset */
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
                csm &= ~CSM(cs);

                /* try the current G2 charset */
                if((cs = pFromU2022State->cs[2]) != 0) {
                    choices[choiceCount++] = cs;
                    csm &= ~CSM(cs);
                }

                /* try all the other possible charsets */
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
                    cs = (int8_t)jpCharsetPref[i];
                    if(CSM(cs) & csm) {
                        choices[choiceCount++] = cs;
                        csm &= ~CSM(cs);
                    }
                }
            }

            cs = g = 0;
            /*
             * len==0: no mapping found yet
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
             * len>0: found a roundtrip result, done
             */
            len = 0;
            /*
             * We will turn off useFallback after finding a fallback,
             * but we still get fallbacks from PUA code points as usual.
             * Therefore, we will also need to check that we don't overwrite
             * an early fallback with a later one.
             */
            useFallback = cnv->useFallback;

            /* try the candidates in order until a roundtrip mapping (len>0) is found */
            for(i = 0; i < choiceCount && len <= 0; ++i) {
                uint32_t value;
                int32_t len2;
                int8_t cs0 = choices[i];
                switch(cs0) {
                case ASCII:
                    if(sourceChar <= 0x7f) {
                        targetValue = (uint32_t)sourceChar;
                        len = 1;
                        cs = cs0;
                        g = 0;
                    }
                    break;
                case ISO8859_1:
                    /* G2 charset: output the 7-bit form (value - 0x80) */
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
                        targetValue = (uint32_t)sourceChar - 0x80;
                        len = 1;
                        cs = cs0;
                        g = 2;
                    }
                    break;
                case HWKANA_7BIT:
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        if(converterData->version==3) {
                            /* JIS7: use G1 (SO) */
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
                            len = 1;
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
                            g = 1;
                        } else if(converterData->version==4) {
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
                            len = 1;

                            cs = pFromU2022State->cs[0];
                            if(IS_JP_DBCS(cs)) {
                                /* switch from a DBCS charset to JISX201 */
                                cs = (int8_t)JISX201;
                            }
                            /* else stay in the current G0 charset */
                            g = 0;
                        }
                        /* else do not use HWKANA_7BIT with other versions */
                    }
                    break;
                case JISX201:
                    /* G0 SBCS */
                    value = jisx201FromU(sourceChar);
                    if(value <= 0x7f) {
                        targetValue = value;
                        len = 1;
                        cs = cs0;
                        g = 0;
                        useFallback = false;
                    }
                    break;
                case JISX208:
                    /* G0 DBCS from Shift-JIS table */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    // Only accept DBCS char (abs(len2) == 2).
                    // With EUC-JP table for JIS X 208, half-width Kana
                    // represented with DBCS starting with 0x8E has to be
                    // filtered out so that they can be converted with
                    // hwkana_fb table.
                    if((len2 == 2 && ((value & 0xFF00) != 0x8E00)) || (len2 == -2 && len == 0)) {
                        /* clear the high bit of each byte to get the 7-bit form */
                        value &= 0x7F7F;
                        if(value != 0) {
                            targetValue = value;
                            len = len2;
                            cs = cs0;
                            g = 0;
                            useFallback = false;
                        }
                    } else if(len == 0 && useFallback &&
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded fallback from half-width to full-width Katakana */
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
                        len = -2;
                        cs = cs0;
                        g = 0;
                        useFallback = false;
                    }
                    break;
                case ISO8859_7:
                    /* G0 SBCS forced to 7-bit output */
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback);
                    /* accept a result unless it is a fallback and an earlier fallback was already found */
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
                        targetValue = value - 0x80;
                        len = len2;
                        cs = cs0;
                        g = 2;
                        useFallback = false;
                    }
                    break;
                default:
                    /* G0 DBCS */
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
                                converterData->myConverterArray[cs0],
                                sourceChar, &value,
                                useFallback, MBCS_OUTPUT_2);
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
                        if(cs0 == KSC5601) {
                            /*
                             * Check for valid bytes for the encoding scheme.
                             * This is necessary because the sub-converter (windows-949)
                             * has a broader encoding scheme than is valid for 2022.
                             */
                            value = _2022FromGR94DBCS(value);
                            if(value == 0) {
                                break;
                            }
                        }
                        targetValue = value;
                        len = len2;
                        cs = cs0;
                        g = 0;
                        useFallback = false;
                    }
                    break;
                }
            }

            if(len != 0) {
                if(len < 0) {
                    len = -len; /* fallback */
                }
                outLen = 0; /* count output bytes */

                /* write SI if necessary (only for JIS7) */
                if(pFromU2022State->g == 1 && g == 0) {
                    buffer[outLen++] = UCNV_SI;
                    pFromU2022State->g = 0;
                }

                /* write the designation sequence if necessary */
                if(cs != pFromU2022State->cs[g]) {
                    int32_t escLen = escSeqCharsLen[cs];
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
                    outLen += escLen;
                    pFromU2022State->cs[g] = cs;

                    /* invalidate the choices[] */
                    choiceCount = 0;
                }

                /* write the shift sequence if necessary */
                if(g != pFromU2022State->g) {
                    switch(g) {
                    /* case 0 handled before writing escapes */
                    case 1:
                        buffer[outLen++] = UCNV_SO;
                        pFromU2022State->g = 1;
                        break;
                    default: /* case 2 */
                        /* ESC N; the stored g is left unchanged here */
                        buffer[outLen++] = 0x1b;
                        buffer[outLen++] = 0x4e;
                        break;
                    /* no case 3: no SS3 in ISO-2022-JP-x */
                    }
                }

                /* write the output bytes */
                if(len == 1) {
                    buffer[outLen++] = (char)targetValue;
                } else /* len == 2 */ {
                    buffer[outLen++] = (char)(targetValue >> 8);
                    buffer[outLen++] = (char)targetValue;
                }
            } else {
                /*
                 * if we cannot find the character after checking all codepages
                 * then this is an error
                 */
                *err = U_INVALID_CHAR_FOUND;
                cnv->fromUChar32=sourceChar;
                break;
            }

            if(sourceChar == CR || sourceChar == LF) {
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
                pFromU2022State->cs[2] = 0;
                choiceCount = 0;
            }

            /* output outLen>0 bytes in buffer[] */
            if(outLen == 1) {
                /* fast path: room for one byte was checked at the top of the loop */
                *target++ = buffer[0];
                if(offsets) {
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
                }
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
                *target++ = buffer[0];
                *target++ = buffer[1];
                if(offsets) {
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
                    *offsets++ = sourceIndex;
                    *offsets++ = sourceIndex;
                }
            } else {
                /* general case: the helper is given the target limit and the error code */
                fromUWriteUInt8(
                    cnv,
                    buffer, outLen,
                    &target, (const char *)targetLimit,
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
                    err);
                if(U_FAILURE(*err)) {
                    break;
                }
            }
        } /* end if(target < targetLimit) */
        else{
            *err =U_BUFFER_OVERFLOW_ERROR;
            break;
        }

    }/* end while(source < sourceLimit) */

    /*
     * the end of the input stream and detection of truncated input
     * are handled by the framework, but for ISO-2022-JP conversion
     * we need to be in ASCII mode at the very end
     *
     * conditions:
     *   successful
     *   in SO mode or not in ASCII mode
     *   end of input and no truncated input
     */
    if( U_SUCCESS(*err) &&
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
    ) {
        int32_t sourceIndex;

        outLen = 0;

        /* leave G1 with SI */
        if(pFromU2022State->g != 0) {
            buffer[outLen++] = UCNV_SI;
            pFromU2022State->g = 0;
        }

        /* designate ASCII to G0 */
        if(pFromU2022State->cs[0] != ASCII) {
            int32_t escLen = escSeqCharsLen[ASCII];
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
            outLen += escLen;
            pFromU2022State->cs[0] = (int8_t)ASCII;
        }

        /* get the source index of the last input character */
        /*
         * TODO this would be simpler and more reliable if we used a pair
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
         * so that we could simply use the prevSourceIndex here;
         * this code gives an incorrect result for the rare case of an unmatched
         * trail surrogate that is alone in the last buffer of the text stream
         */
        sourceIndex=(int32_t)(source-args->source);
        if(sourceIndex>0) {
            --sourceIndex;
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
            ) {
                --sourceIndex;
            }
        } else {
            sourceIndex=-1;
        }

        fromUWriteUInt8(
            cnv,
            buffer, outLen,
            &target, (const char *)targetLimit,
            &offsets, sourceIndex,
            err);
    }

    /*save the state and return */
    args->source = source;
    args->target = (char*)target;
}
| 2009 | |
| 2010 | /*************** to unicode *******************/ |
| 2011 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2012 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2013 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 2014 | UErrorCode* err){ |
| 2015 | char tempBuf[2]; |
| 2016 | const char *mySource = (char *) args->source; |
| 2017 | UChar *myTarget = args->target; |
| 2018 | const char *mySourceLimit = args->sourceLimit; |
| 2019 | uint32_t targetUniChar = 0x0000; |
| 2020 | uint32_t mySourceChar = 0x0000; |
| 2021 | uint32_t tmpSourceChar = 0x0000; |
| 2022 | UConverterDataISO2022* myData; |
| 2023 | ISO2022State *pToU2022State; |
| 2024 | StateEnum cs; |
| 2025 | |
| 2026 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 2027 | pToU2022State = &myData->toU2022State; |
| 2028 | |
| 2029 | if(myData->key != 0) { |
| 2030 | /* continue with a partial escape sequence */ |
| 2031 | goto escape; |
| 2032 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
| 2033 | /* continue with a partial double-byte character */ |
| 2034 | mySourceChar = args->converter->toUBytes[0]; |
| 2035 | args->converter->toULength = 0; |
| 2036 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 2037 | targetUniChar = missingCharMarker; |
| 2038 | goto getTrailByte; |
| 2039 | } |
| 2040 | |
| 2041 | while(mySource < mySourceLimit){ |
| 2042 | |
| 2043 | targetUniChar =missingCharMarker; |
| 2044 | |
| 2045 | if(myTarget < args->targetLimit){ |
| 2046 | |
| 2047 | mySourceChar= (unsigned char) *mySource++; |
| 2048 | |
| 2049 | switch(mySourceChar) { |
| 2050 | case UCNV_SI: |
| 2051 | if(myData->version==3) { |
| 2052 | pToU2022State->g=0; |
| 2053 | continue; |
| 2054 | } else { |
| 2055 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2056 | myData->isEmptySegment = false; /* reset this, we have a different error */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2057 | break; |
| 2058 | } |
| 2059 | |
| 2060 | case UCNV_SO: |
| 2061 | if(myData->version==3) { |
| 2062 | /* JIS7: switch to G1 half-width Katakana */ |
| 2063 | pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
| 2064 | pToU2022State->g=1; |
| 2065 | continue; |
| 2066 | } else { |
| 2067 | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2068 | myData->isEmptySegment = false; /* reset this, we have a different error */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2069 | break; |
| 2070 | } |
| 2071 | |
| 2072 | case ESC_2022: |
| 2073 | mySource--; |
| 2074 | escape: |
| 2075 | { |
| 2076 | const char * mySourceBefore = mySource; |
| 2077 | int8_t toULengthBefore = args->converter->toULength; |
| 2078 | |
| 2079 | changeState_2022(args->converter,&(mySource), |
| 2080 | mySourceLimit, ISO_2022_JP,err); |
| 2081 | |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 2082 | /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2083 | if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
| 2084 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 2085 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 2086 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
| 2087 | } |
| 2088 | } |
| 2089 | |
| 2090 | /* invalid or illegal escape sequence */ |
| 2091 | if(U_FAILURE(*err)){ |
| 2092 | args->target = myTarget; |
| 2093 | args->source = mySource; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2094 | myData->isEmptySegment = false; /* Reset to avoid future spurious errors */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2095 | return; |
| 2096 | } |
| 2097 | /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
| 2098 | if(myData->key==0) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2099 | myData->isEmptySegment = true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2100 | } |
| 2101 | continue; |
| 2102 | |
| 2103 | /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
| 2104 | |
| 2105 | case CR: |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2106 | case LF: |
| 2107 | /* automatically reset to single-byte mode */ |
| 2108 | if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { |
| 2109 | pToU2022State->cs[0] = (int8_t)ASCII; |
| 2110 | } |
| 2111 | pToU2022State->cs[2] = 0; |
| 2112 | pToU2022State->g = 0; |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2113 | U_FALLTHROUGH; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2114 | default: |
| 2115 | /* convert one or two bytes */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2116 | myData->isEmptySegment = false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2117 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 2118 | if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && |
| 2119 | !IS_JP_DBCS(cs) |
| 2120 | ) { |
| 2121 | /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ |
| 2122 | targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
| 2123 | |
| 2124 | /* return from a single-shift state to the previous one */ |
| 2125 | if(pToU2022State->g >= 2) { |
| 2126 | pToU2022State->g=pToU2022State->prevG; |
| 2127 | } |
| 2128 | } else switch(cs) { |
| 2129 | case ASCII: |
| 2130 | if(mySourceChar <= 0x7f) { |
| 2131 | targetUniChar = mySourceChar; |
| 2132 | } |
| 2133 | break; |
| 2134 | case ISO8859_1: |
| 2135 | if(mySourceChar <= 0x7f) { |
| 2136 | targetUniChar = mySourceChar + 0x80; |
| 2137 | } |
| 2138 | /* return from a single-shift state to the previous one */ |
| 2139 | pToU2022State->g=pToU2022State->prevG; |
| 2140 | break; |
| 2141 | case ISO8859_7: |
| 2142 | if(mySourceChar <= 0x7f) { |
| 2143 | /* convert mySourceChar+0x80 to use a normal 8-bit table */ |
| 2144 | targetUniChar = |
| 2145 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
| 2146 | myData->myConverterArray[cs], |
| 2147 | mySourceChar + 0x80); |
| 2148 | } |
| 2149 | /* return from a single-shift state to the previous one */ |
| 2150 | pToU2022State->g=pToU2022State->prevG; |
| 2151 | break; |
| 2152 | case JISX201: |
| 2153 | if(mySourceChar <= 0x7f) { |
| 2154 | targetUniChar = jisx201ToU(mySourceChar); |
| 2155 | } |
| 2156 | break; |
| 2157 | case HWKANA_7BIT: |
| 2158 | if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
| 2159 | /* 7-bit halfwidth Katakana */ |
| 2160 | targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
| 2161 | } |
| 2162 | break; |
| 2163 | default: |
| 2164 | /* G0 DBCS */ |
| 2165 | if(mySource < mySourceLimit) { |
| 2166 | int leadIsOk, trailIsOk; |
| 2167 | uint8_t trailByte; |
| 2168 | getTrailByte: |
| 2169 | trailByte = (uint8_t)*mySource; |
| 2170 | /* |
| 2171 | * Ticket 5691: consistent illegal sequences: |
| 2172 | * - We include at least the first byte in the illegal sequence. |
| 2173 | * - If any of the non-initial bytes could be the start of a character, |
| 2174 | * we stop the illegal sequence before the first one of those. |
| 2175 | * |
| 2176 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
| 2177 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
| 2178 | * Otherwise we convert or report the pair of bytes. |
| 2179 | */ |
| 2180 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| 2181 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| 2182 | if (leadIsOk && trailIsOk) { |
| 2183 | ++mySource; |
| 2184 | tmpSourceChar = (mySourceChar << 8) | trailByte; |
Jungshik Shin | 834113a | 2018-11-02 01:24:07 -0700 | [diff] [blame] | 2185 | /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ |
| 2186 | mySourceChar = tmpSourceChar; |
| 2187 | if (cs == JISX208 || cs == KSC5601) { |
| 2188 | tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2189 | } |
Jungshik Shin | 834113a | 2018-11-02 01:24:07 -0700 | [diff] [blame] | 2190 | tempBuf[0] = (char)(tmpSourceChar >> 8); |
| 2191 | tempBuf[1] = (char)(tmpSourceChar); |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2192 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2193 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 2194 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
| 2195 | ++mySource; |
| 2196 | /* add another bit so that the code below writes 2 bytes in case of error */ |
| 2197 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
| 2198 | } |
| 2199 | } else { |
| 2200 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2201 | args->converter->toULength = 1; |
| 2202 | goto endloop; |
| 2203 | } |
| 2204 | } /* End of inner switch */ |
| 2205 | break; |
| 2206 | } /* End of outer switch */ |
| 2207 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
| 2208 | if(args->offsets){ |
| 2209 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2210 | } |
| 2211 | *(myTarget++)=(UChar)targetUniChar; |
| 2212 | } |
| 2213 | else if(targetUniChar > missingCharMarker){ |
| 2214 | /* disassemble the surrogate pair and write to output*/ |
| 2215 | targetUniChar-=0x0010000; |
| 2216 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
| 2217 | if(args->offsets){ |
| 2218 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2219 | } |
| 2220 | ++myTarget; |
| 2221 | if(myTarget< args->targetLimit){ |
| 2222 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 2223 | if(args->offsets){ |
| 2224 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2225 | } |
| 2226 | ++myTarget; |
| 2227 | }else{ |
| 2228 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
| 2229 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 2230 | } |
| 2231 | |
| 2232 | } |
| 2233 | else{ |
| 2234 | /* Call the callback function*/ |
| 2235 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
| 2236 | break; |
| 2237 | } |
| 2238 | } |
| 2239 | else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
| 2240 | *err =U_BUFFER_OVERFLOW_ERROR; |
| 2241 | break; |
| 2242 | } |
| 2243 | } |
| 2244 | endloop: |
| 2245 | args->target = myTarget; |
| 2246 | args->source = mySource; |
| 2247 | } |
| 2248 | |
| 2249 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 2250 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2251 | /*************************************************************** |
| 2252 | * Rules for ISO-2022-KR encoding |
| 2253 | * i) The KSC5601 designator sequence should appear only once in a file, |
Frank Tang | 7e7574b | 2021-04-13 21:19:13 -0700 | [diff] [blame] | 2254 | * at the beginning of a line before any KSC5601 characters. This usually |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2255 | * means that it appears by itself on the first line of the file |
| 2256 | * ii) There are only 2 shifting sequences SO to shift into double byte mode |
| 2257 | * and SI to shift into single byte mode |
| 2258 | */ |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2259 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2260 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
| 2261 | |
| 2262 | UConverter* saveConv = args->converter; |
| 2263 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; |
| 2264 | args->converter=myConverterData->currentConverter; |
| 2265 | |
| 2266 | myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; |
| 2267 | ucnv_MBCSFromUnicodeWithOffsets(args,err); |
| 2268 | saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
| 2269 | |
| 2270 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 2271 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
| 2272 | uprv_memcpy( |
| 2273 | saveConv->charErrorBuffer, |
| 2274 | myConverterData->currentConverter->charErrorBuffer, |
| 2275 | myConverterData->currentConverter->charErrorBufferLength); |
| 2276 | } |
| 2277 | saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
| 2278 | myConverterData->currentConverter->charErrorBufferLength = 0; |
| 2279 | } |
| 2280 | args->converter=saveConv; |
| 2281 | } |
| 2282 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2283 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2284 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
| 2285 | |
| 2286 | const UChar *source = args->source; |
| 2287 | const UChar *sourceLimit = args->sourceLimit; |
| 2288 | unsigned char *target = (unsigned char *) args->target; |
| 2289 | unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
| 2290 | int32_t* offsets = args->offsets; |
| 2291 | uint32_t targetByteUnit = 0x0000; |
| 2292 | UChar32 sourceChar = 0x0000; |
| 2293 | UBool isTargetByteDBCS; |
| 2294 | UBool oldIsTargetByteDBCS; |
| 2295 | UConverterDataISO2022 *converterData; |
| 2296 | UConverterSharedData* sharedData; |
| 2297 | UBool useFallback; |
| 2298 | int32_t length =0; |
| 2299 | |
| 2300 | converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
| 2301 | /* if the version is 1 then the user is requesting |
| 2302 | * conversion with ibm-25546 pass the arguments to |
| 2303 | * MBCS converter and return |
| 2304 | */ |
| 2305 | if(converterData->version==1){ |
| 2306 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| 2307 | return; |
| 2308 | } |
| 2309 | |
| 2310 | /* initialize data */ |
| 2311 | sharedData = converterData->currentConverter->sharedData; |
| 2312 | useFallback = args->converter->useFallback; |
| 2313 | isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; |
| 2314 | oldIsTargetByteDBCS = isTargetByteDBCS; |
| 2315 | |
| 2316 | isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
| 2317 | if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
| 2318 | goto getTrail; |
| 2319 | } |
| 2320 | while(source < sourceLimit){ |
| 2321 | |
| 2322 | targetByteUnit = missingCharMarker; |
| 2323 | |
| 2324 | if(target < (unsigned char*) args->targetLimit){ |
| 2325 | sourceChar = *source++; |
| 2326 | |
| 2327 | /* do not convert SO/SI/ESC */ |
| 2328 | if(IS_2022_CONTROL(sourceChar)) { |
| 2329 | /* callback(illegal) */ |
| 2330 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2331 | args->converter->fromUChar32=sourceChar; |
| 2332 | break; |
| 2333 | } |
| 2334 | |
| 2335 | length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
| 2336 | if(length < 0) { |
| 2337 | length = -length; /* fallback */ |
| 2338 | } |
| 2339 | /* only DBCS or SBCS characters are expected*/ |
| 2340 | /* DB characters with high bit set to 1 are expected */ |
| 2341 | if( length > 2 || length==0 || |
| 2342 | (length == 1 && targetByteUnit > 0x7f) || |
| 2343 | (length == 2 && |
| 2344 | ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
| 2345 | (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
| 2346 | ) { |
| 2347 | targetByteUnit=missingCharMarker; |
| 2348 | } |
| 2349 | if (targetByteUnit != missingCharMarker){ |
| 2350 | |
| 2351 | oldIsTargetByteDBCS = isTargetByteDBCS; |
| 2352 | isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
| 2353 | /* append the shift sequence */ |
| 2354 | if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
| 2355 | |
| 2356 | if (isTargetByteDBCS) |
| 2357 | *target++ = UCNV_SO; |
| 2358 | else |
| 2359 | *target++ = UCNV_SI; |
| 2360 | if(offsets) |
| 2361 | *(offsets++) = (int32_t)(source - args->source-1); |
| 2362 | } |
| 2363 | /* write the targetUniChar to target */ |
| 2364 | if(targetByteUnit <= 0x00FF){ |
| 2365 | if( target < targetLimit){ |
| 2366 | *(target++) = (unsigned char) targetByteUnit; |
| 2367 | if(offsets){ |
| 2368 | *(offsets++) = (int32_t)(source - args->source-1); |
| 2369 | } |
| 2370 | |
| 2371 | }else{ |
| 2372 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); |
| 2373 | *err = U_BUFFER_OVERFLOW_ERROR; |
| 2374 | } |
| 2375 | }else{ |
| 2376 | if(target < targetLimit){ |
| 2377 | *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); |
| 2378 | if(offsets){ |
| 2379 | *(offsets++) = (int32_t)(source - args->source-1); |
| 2380 | } |
| 2381 | if(target < targetLimit){ |
| 2382 | *(target++) =(unsigned char) (targetByteUnit -0x80); |
| 2383 | if(offsets){ |
| 2384 | *(offsets++) = (int32_t)(source - args->source-1); |
| 2385 | } |
| 2386 | }else{ |
| 2387 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
| 2388 | *err = U_BUFFER_OVERFLOW_ERROR; |
| 2389 | } |
| 2390 | }else{ |
| 2391 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
| 2392 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
| 2393 | *err = U_BUFFER_OVERFLOW_ERROR; |
| 2394 | } |
| 2395 | } |
| 2396 | |
| 2397 | } |
| 2398 | else{ |
| 2399 | /* oops.. the code point is unassingned |
| 2400 | * set the error and reason |
| 2401 | */ |
| 2402 | |
| 2403 | /*check if the char is a First surrogate*/ |
| 2404 | if(U16_IS_SURROGATE(sourceChar)) { |
| 2405 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
| 2406 | getTrail: |
| 2407 | /*look ahead to find the trail surrogate*/ |
| 2408 | if(source < sourceLimit) { |
| 2409 | /* test the following code unit */ |
| 2410 | UChar trail=(UChar) *source; |
| 2411 | if(U16_IS_TRAIL(trail)) { |
| 2412 | source++; |
| 2413 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
| 2414 | *err = U_INVALID_CHAR_FOUND; |
| 2415 | /* convert this surrogate code point */ |
| 2416 | /* exit this condition tree */ |
| 2417 | } else { |
| 2418 | /* this is an unmatched lead code unit (1st surrogate) */ |
| 2419 | /* callback(illegal) */ |
| 2420 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2421 | } |
| 2422 | } else { |
| 2423 | /* no more input */ |
| 2424 | *err = U_ZERO_ERROR; |
| 2425 | } |
| 2426 | } else { |
| 2427 | /* this is an unmatched trail code unit (2nd surrogate) */ |
| 2428 | /* callback(illegal) */ |
| 2429 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2430 | } |
| 2431 | } else { |
| 2432 | /* callback(unassigned) for a BMP code point */ |
| 2433 | *err = U_INVALID_CHAR_FOUND; |
| 2434 | } |
| 2435 | |
| 2436 | args->converter->fromUChar32=sourceChar; |
| 2437 | break; |
| 2438 | } |
| 2439 | } /* end if(myTargetIndex<myTargetLength) */ |
| 2440 | else{ |
| 2441 | *err =U_BUFFER_OVERFLOW_ERROR; |
| 2442 | break; |
| 2443 | } |
| 2444 | |
| 2445 | }/* end while(mySourceIndex<mySourceLength) */ |
| 2446 | |
| 2447 | /* |
| 2448 | * the end of the input stream and detection of truncated input |
| 2449 | * are handled by the framework, but for ISO-2022-KR conversion |
| 2450 | * we need to be in ASCII mode at the very end |
| 2451 | * |
| 2452 | * conditions: |
| 2453 | * successful |
| 2454 | * not in ASCII mode |
| 2455 | * end of input and no truncated input |
| 2456 | */ |
| 2457 | if( U_SUCCESS(*err) && |
| 2458 | isTargetByteDBCS && |
| 2459 | args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
| 2460 | ) { |
| 2461 | int32_t sourceIndex; |
| 2462 | |
| 2463 | /* we are switching to ASCII */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2464 | isTargetByteDBCS=false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2465 | |
| 2466 | /* get the source index of the last input character */ |
| 2467 | /* |
| 2468 | * TODO this would be simpler and more reliable if we used a pair |
| 2469 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| 2470 | * so that we could simply use the prevSourceIndex here; |
| 2471 | * this code gives an incorrect result for the rare case of an unmatched |
| 2472 | * trail surrogate that is alone in the last buffer of the text stream |
| 2473 | */ |
| 2474 | sourceIndex=(int32_t)(source-args->source); |
| 2475 | if(sourceIndex>0) { |
| 2476 | --sourceIndex; |
| 2477 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| 2478 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| 2479 | ) { |
| 2480 | --sourceIndex; |
| 2481 | } |
| 2482 | } else { |
| 2483 | sourceIndex=-1; |
| 2484 | } |
| 2485 | |
| 2486 | fromUWriteUInt8( |
| 2487 | args->converter, |
| 2488 | SHIFT_IN_STR, 1, |
| 2489 | &target, (const char *)targetLimit, |
| 2490 | &offsets, sourceIndex, |
| 2491 | err); |
| 2492 | } |
| 2493 | |
| 2494 | /*save the state and return */ |
| 2495 | args->source = source; |
| 2496 | args->target = (char*)target; |
| 2497 | args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
| 2498 | } |
| 2499 | |
| 2500 | /************************ To Unicode ***************************************/ |
| 2501 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2502 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2503 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
| 2504 | UErrorCode* err){ |
| 2505 | char const* sourceStart; |
| 2506 | UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 2507 | |
| 2508 | UConverterToUnicodeArgs subArgs; |
| 2509 | int32_t minArgsSize; |
| 2510 | |
| 2511 | /* set up the subconverter arguments */ |
| 2512 | if(args->size<sizeof(UConverterToUnicodeArgs)) { |
| 2513 | minArgsSize = args->size; |
| 2514 | } else { |
| 2515 | minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
| 2516 | } |
| 2517 | |
| 2518 | uprv_memcpy(&subArgs, args, minArgsSize); |
| 2519 | subArgs.size = (uint16_t)minArgsSize; |
| 2520 | subArgs.converter = myData->currentConverter; |
| 2521 | |
| 2522 | /* remember the original start of the input for offsets */ |
| 2523 | sourceStart = args->source; |
| 2524 | |
| 2525 | if(myData->key != 0) { |
| 2526 | /* continue with a partial escape sequence */ |
| 2527 | goto escape; |
| 2528 | } |
| 2529 | |
| 2530 | while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
| 2531 | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
| 2532 | subArgs.source = args->source; |
| 2533 | subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); |
| 2534 | if(subArgs.source != subArgs.sourceLimit) { |
| 2535 | /* |
| 2536 | * get the current partial byte sequence |
| 2537 | * |
| 2538 | * it needs to be moved between the public and the subconverter |
| 2539 | * so that the conversion framework, which only sees the public |
| 2540 | * converter, can handle truncated and illegal input etc. |
| 2541 | */ |
| 2542 | if(args->converter->toULength > 0) { |
| 2543 | uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); |
| 2544 | } |
| 2545 | subArgs.converter->toULength = args->converter->toULength; |
| 2546 | |
| 2547 | /* |
| 2548 | * Convert up to the end of the input, or to before the next escape character. |
| 2549 | * Does not handle conversion extensions because the preToU[] state etc. |
| 2550 | * is not copied. |
| 2551 | */ |
| 2552 | ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
| 2553 | |
| 2554 | if(args->offsets != NULL && sourceStart != args->source) { |
| 2555 | /* update offsets to base them on the actual start of the input */ |
| 2556 | int32_t *offsets = args->offsets; |
| 2557 | UChar *target = args->target; |
| 2558 | int32_t delta = (int32_t)(args->source - sourceStart); |
| 2559 | while(target < subArgs.target) { |
| 2560 | if(*offsets >= 0) { |
| 2561 | *offsets += delta; |
| 2562 | } |
| 2563 | ++offsets; |
| 2564 | ++target; |
| 2565 | } |
| 2566 | } |
| 2567 | args->source = subArgs.source; |
| 2568 | args->target = subArgs.target; |
| 2569 | args->offsets = subArgs.offsets; |
| 2570 | |
| 2571 | /* copy input/error/overflow buffers */ |
| 2572 | if(subArgs.converter->toULength > 0) { |
| 2573 | uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); |
| 2574 | } |
| 2575 | args->converter->toULength = subArgs.converter->toULength; |
| 2576 | |
| 2577 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 2578 | if(subArgs.converter->UCharErrorBufferLength > 0) { |
| 2579 | uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, |
| 2580 | subArgs.converter->UCharErrorBufferLength); |
| 2581 | } |
| 2582 | args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; |
| 2583 | subArgs.converter->UCharErrorBufferLength = 0; |
| 2584 | } |
| 2585 | } |
| 2586 | |
| 2587 | if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
| 2588 | return; |
| 2589 | } |
| 2590 | |
| 2591 | escape: |
| 2592 | changeState_2022(args->converter, |
| 2593 | &(args->source), |
| 2594 | args->sourceLimit, |
| 2595 | ISO_2022_KR, |
| 2596 | err); |
| 2597 | } |
| 2598 | } |
| 2599 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2600 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2601 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 2602 | UErrorCode* err){ |
| 2603 | char tempBuf[2]; |
| 2604 | const char *mySource = ( char *) args->source; |
| 2605 | UChar *myTarget = args->target; |
| 2606 | const char *mySourceLimit = args->sourceLimit; |
| 2607 | UChar32 targetUniChar = 0x0000; |
| 2608 | UChar mySourceChar = 0x0000; |
| 2609 | UConverterDataISO2022* myData; |
| 2610 | UConverterSharedData* sharedData ; |
| 2611 | UBool useFallback; |
| 2612 | |
| 2613 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 2614 | if(myData->version==1){ |
| 2615 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| 2616 | return; |
| 2617 | } |
| 2618 | |
| 2619 | /* initialize state */ |
| 2620 | sharedData = myData->currentConverter->sharedData; |
| 2621 | useFallback = args->converter->useFallback; |
| 2622 | |
| 2623 | if(myData->key != 0) { |
| 2624 | /* continue with a partial escape sequence */ |
| 2625 | goto escape; |
| 2626 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
| 2627 | /* continue with a partial double-byte character */ |
| 2628 | mySourceChar = args->converter->toUBytes[0]; |
| 2629 | args->converter->toULength = 0; |
| 2630 | goto getTrailByte; |
| 2631 | } |
| 2632 | |
| 2633 | while(mySource< mySourceLimit){ |
| 2634 | |
| 2635 | if(myTarget < args->targetLimit){ |
| 2636 | |
| 2637 | mySourceChar= (unsigned char) *mySource++; |
| 2638 | |
| 2639 | if(mySourceChar==UCNV_SI){ |
| 2640 | myData->toU2022State.g = 0; |
| 2641 | if (myData->isEmptySegment) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2642 | myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2643 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 2644 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 2645 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2646 | args->converter->toULength = 1; |
| 2647 | args->target = myTarget; |
| 2648 | args->source = mySource; |
| 2649 | return; |
| 2650 | } |
| 2651 | /*consume the source */ |
| 2652 | continue; |
| 2653 | }else if(mySourceChar==UCNV_SO){ |
| 2654 | myData->toU2022State.g = 1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2655 | myData->isEmptySegment = true; /* Begin a new segment, empty so far */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2656 | /*consume the source */ |
| 2657 | continue; |
| 2658 | }else if(mySourceChar==ESC_2022){ |
| 2659 | mySource--; |
| 2660 | escape: |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2661 | myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2662 | changeState_2022(args->converter,&(mySource), |
| 2663 | mySourceLimit, ISO_2022_KR, err); |
| 2664 | if(U_FAILURE(*err)){ |
| 2665 | args->target = myTarget; |
| 2666 | args->source = mySource; |
| 2667 | return; |
| 2668 | } |
| 2669 | continue; |
| 2670 | } |
| 2671 | |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 2672 | myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2673 | if(myData->toU2022State.g == 1) { |
| 2674 | if(mySource < mySourceLimit) { |
| 2675 | int leadIsOk, trailIsOk; |
| 2676 | uint8_t trailByte; |
| 2677 | getTrailByte: |
| 2678 | targetUniChar = missingCharMarker; |
| 2679 | trailByte = (uint8_t)*mySource; |
| 2680 | /* |
| 2681 | * Ticket 5691: consistent illegal sequences: |
| 2682 | * - We include at least the first byte in the illegal sequence. |
| 2683 | * - If any of the non-initial bytes could be the start of a character, |
| 2684 | * we stop the illegal sequence before the first one of those. |
| 2685 | * |
| 2686 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
| 2687 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
| 2688 | * Otherwise we convert or report the pair of bytes. |
| 2689 | */ |
| 2690 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| 2691 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| 2692 | if (leadIsOk && trailIsOk) { |
| 2693 | ++mySource; |
| 2694 | tempBuf[0] = (char)(mySourceChar + 0x80); |
| 2695 | tempBuf[1] = (char)(trailByte + 0x80); |
| 2696 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); |
| 2697 | mySourceChar = (mySourceChar << 8) | trailByte; |
| 2698 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 2699 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
| 2700 | ++mySource; |
| 2701 | /* add another bit so that the code below writes 2 bytes in case of error */ |
Jungshik Shin | 42d5027 | 2018-10-24 01:22:09 -0700 | [diff] [blame] | 2702 | mySourceChar = static_cast<UChar>(0x10000 | (mySourceChar << 8) | trailByte); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2703 | } |
| 2704 | } else { |
| 2705 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2706 | args->converter->toULength = 1; |
| 2707 | break; |
| 2708 | } |
| 2709 | } |
| 2710 | else if(mySourceChar <= 0x7f) { |
| 2711 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
| 2712 | } else { |
| 2713 | targetUniChar = 0xffff; |
| 2714 | } |
| 2715 | if(targetUniChar < 0xfffe){ |
| 2716 | if(args->offsets) { |
| 2717 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2718 | } |
| 2719 | *(myTarget++)=(UChar)targetUniChar; |
| 2720 | } |
| 2721 | else { |
| 2722 | /* Call the callback function*/ |
| 2723 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
| 2724 | break; |
| 2725 | } |
| 2726 | } |
| 2727 | else{ |
| 2728 | *err =U_BUFFER_OVERFLOW_ERROR; |
| 2729 | break; |
| 2730 | } |
| 2731 | } |
| 2732 | args->target = myTarget; |
| 2733 | args->source = mySource; |
| 2734 | } |
| 2735 | |
| 2736 | /*************************** END ISO2022-KR *********************************/ |
| 2737 | |
| 2738 | /*************************** ISO-2022-CN ********************************* |
| 2739 | * |
| 2740 | * Rules for ISO-2022-CN Encoding: |
| 2741 | * i) The designator sequence must appear once on a line before any instance |
| 2742 | * of character set it designates. |
| 2743 | * ii) If two lines contain characters from the same character set, both lines |
| 2744 | * must include the designator sequence. |
| 2745 | * iii) Once the designator sequence is known, a shifting sequence has to be found |
| 2746 | * to invoke the shifting |
| 2747 | * iv) All lines start in ASCII and end in ASCII. |
| 2748 | * v) Four shifting sequences are employed for this purpose: |
| 2749 | * |
| 2750 | * Sequence ASCII Eq Charsets |
| 2751 | * ---------- ------- --------- |
| 2752 | * SI <SI> US-ASCII |
| 2753 | * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
| 2754 | * SS2 <ESC>N CNS-11643-1992 Plane 2 |
| 2755 | * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
| 2756 | * |
| 2757 | * vi) |
| 2758 | * SOdesignator : ESC "$" ")" finalchar_for_SO |
| 2759 | * SS2designator : ESC "$" "*" finalchar_for_SS2 |
| 2760 | * SS3designator : ESC "$" "+" finalchar_for_SS3 |
| 2761 | * |
| 2762 | * ESC $ ) A Indicates the bytes following SO are Chinese |
| 2763 | * characters as defined in GB 2312-80, until |
| 2764 | * another SOdesignation appears |
| 2765 | * |
| 2766 | * |
| 2767 | * ESC $ ) E Indicates the bytes following SO are as defined |
| 2768 | * in ISO-IR-165 (for details, see section 2.1), |
| 2769 | * until another SOdesignation appears |
| 2770 | * |
| 2771 | * ESC $ ) G Indicates the bytes following SO are as defined |
| 2772 | * in CNS 11643-plane-1, until another |
| 2773 | * SOdesignation appears |
| 2774 | * |
| 2775 | * ESC $ * H Indicates the two bytes immediately following |
| 2776 | * SS2 is a Chinese character as defined in CNS |
| 2777 | * 11643-plane-2, until another SS2designation |
| 2778 | * appears |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 2779 | * (Meaning <ESC>N must precede every 2 byte |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2780 | * sequence.) |
| 2781 | * |
| 2782 | * ESC $ + I Indicates the immediate two bytes following SS3 |
| 2783 | * is a Chinese character as defined in CNS |
| 2784 | * 11643-plane-3, until another SS3designation |
| 2785 | * appears |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 2786 | * (Meaning <ESC>O must precede every 2 byte |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2787 | * sequence.) |
| 2788 | * |
| 2789 | * ESC $ + J Indicates the immediate two bytes following SS3 |
| 2790 | * is a Chinese character as defined in CNS |
| 2791 | * 11643-plane-4, until another SS3designation |
| 2792 | * appears |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 2793 | * (In English: <ESC>O must precede every 2 byte |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2794 | * sequence.) |
| 2795 | * |
| 2796 | * ESC $ + K Indicates the immediate two bytes following SS3 |
| 2797 | * is a Chinese character as defined in CNS |
| 2798 | * 11643-plane-5, until another SS3designation |
| 2799 | * appears |
| 2800 | * |
| 2801 | * ESC $ + L Indicates the immediate two bytes following SS3 |
| 2802 | * is a Chinese character as defined in CNS |
| 2803 | * 11643-plane-6, until another SS3designation |
| 2804 | * appears |
| 2805 | * |
| 2806 | * ESC $ + M Indicates the immediate two bytes following SS3 |
| 2807 | * is a Chinese character as defined in CNS |
| 2808 | * 11643-plane-7, until another SS3designation |
| 2809 | * appears |
| 2810 | * |
| 2811 | * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
| 2812 | * has its own designation information before any Chinese characters |
| 2813 | * appear |
| 2814 | * |
| 2815 | */ |
| 2816 | |
/*
 * ISO-2022-CN / ISO-2022-CN-EXT designation escape sequences.
 *
 * Each sequence is ESC (0x1B) followed by three final bytes. They are declared
 * as static const arrays (rather than pointers to literals) so that the
 * strings are truly read-only, and they are spelled with hex escapes so the
 * byte values do not depend on the compiler's execution character set.
 */
static const char GB_2312_80_STR[]             = "\x1B\x24\x29\x41"; /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = "\x1B\x24\x29\x45"; /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
| 2827 | |
| 2828 | /********************** ISO2022-CN Data **************************/ |
| 2829 | static const char* const escSeqCharsCN[10] ={ |
| 2830 | SHIFT_IN_STR, /* 0 ASCII */ |
| 2831 | GB_2312_80_STR, /* 1 GB2312_1 */ |
| 2832 | ISO_IR_165_STR, /* 2 ISO_IR_165 */ |
| 2833 | CNS_11643_1992_Plane_1_STR, |
| 2834 | CNS_11643_1992_Plane_2_STR, |
| 2835 | CNS_11643_1992_Plane_3_STR, |
| 2836 | CNS_11643_1992_Plane_4_STR, |
| 2837 | CNS_11643_1992_Plane_5_STR, |
| 2838 | CNS_11643_1992_Plane_6_STR, |
| 2839 | CNS_11643_1992_Plane_7_STR |
| 2840 | }; |
| 2841 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2842 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 2843 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
| 2844 | UConverter *cnv = args->converter; |
| 2845 | UConverterDataISO2022 *converterData; |
| 2846 | ISO2022State *pFromU2022State; |
| 2847 | uint8_t *target = (uint8_t *) args->target; |
| 2848 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
| 2849 | const UChar* source = args->source; |
| 2850 | const UChar* sourceLimit = args->sourceLimit; |
| 2851 | int32_t* offsets = args->offsets; |
| 2852 | UChar32 sourceChar; |
| 2853 | char buffer[8]; |
| 2854 | int32_t len; |
| 2855 | int8_t choices[3]; |
| 2856 | int32_t choiceCount; |
| 2857 | uint32_t targetValue = 0; |
| 2858 | UBool useFallback; |
| 2859 | |
| 2860 | /* set up the state */ |
| 2861 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
| 2862 | pFromU2022State = &converterData->fromU2022State; |
| 2863 | |
| 2864 | choiceCount = 0; |
| 2865 | |
| 2866 | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
| 2867 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
| 2868 | goto getTrail; |
| 2869 | } |
| 2870 | |
| 2871 | while( source < sourceLimit){ |
| 2872 | if(target < targetLimit){ |
| 2873 | |
| 2874 | sourceChar = *(source++); |
| 2875 | /*check if the char is a First surrogate*/ |
| 2876 | if(U16_IS_SURROGATE(sourceChar)) { |
| 2877 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
| 2878 | getTrail: |
| 2879 | /*look ahead to find the trail surrogate*/ |
| 2880 | if(source < sourceLimit) { |
| 2881 | /* test the following code unit */ |
| 2882 | UChar trail=(UChar) *source; |
| 2883 | if(U16_IS_TRAIL(trail)) { |
| 2884 | source++; |
| 2885 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
| 2886 | cnv->fromUChar32=0x00; |
| 2887 | /* convert this supplementary code point */ |
| 2888 | /* exit this condition tree */ |
| 2889 | } else { |
| 2890 | /* this is an unmatched lead code unit (1st surrogate) */ |
| 2891 | /* callback(illegal) */ |
| 2892 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2893 | cnv->fromUChar32=sourceChar; |
| 2894 | break; |
| 2895 | } |
| 2896 | } else { |
| 2897 | /* no more input */ |
| 2898 | cnv->fromUChar32=sourceChar; |
| 2899 | break; |
| 2900 | } |
| 2901 | } else { |
| 2902 | /* this is an unmatched trail code unit (2nd surrogate) */ |
| 2903 | /* callback(illegal) */ |
| 2904 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2905 | cnv->fromUChar32=sourceChar; |
| 2906 | break; |
| 2907 | } |
| 2908 | } |
| 2909 | |
| 2910 | /* do the conversion */ |
| 2911 | if(sourceChar <= 0x007f ){ |
| 2912 | /* do not convert SO/SI/ESC */ |
| 2913 | if(IS_2022_CONTROL(sourceChar)) { |
| 2914 | /* callback(illegal) */ |
| 2915 | *err=U_ILLEGAL_CHAR_FOUND; |
| 2916 | cnv->fromUChar32=sourceChar; |
| 2917 | break; |
| 2918 | } |
| 2919 | |
| 2920 | /* US-ASCII */ |
| 2921 | if(pFromU2022State->g == 0) { |
| 2922 | buffer[0] = (char)sourceChar; |
| 2923 | len = 1; |
| 2924 | } else { |
| 2925 | buffer[0] = UCNV_SI; |
| 2926 | buffer[1] = (char)sourceChar; |
| 2927 | len = 2; |
| 2928 | pFromU2022State->g = 0; |
| 2929 | choiceCount = 0; |
| 2930 | } |
| 2931 | if(sourceChar == CR || sourceChar == LF) { |
| 2932 | /* reset the state at the end of a line */ |
| 2933 | uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); |
| 2934 | choiceCount = 0; |
| 2935 | } |
| 2936 | } |
| 2937 | else{ |
| 2938 | /* convert U+0080..U+10ffff */ |
| 2939 | int32_t i; |
| 2940 | int8_t cs, g; |
| 2941 | |
| 2942 | if(choiceCount == 0) { |
| 2943 | /* try the current SO/G1 converter first */ |
| 2944 | choices[0] = pFromU2022State->cs[1]; |
| 2945 | |
| 2946 | /* default to GB2312_1 if none is designated yet */ |
| 2947 | if(choices[0] == 0) { |
| 2948 | choices[0] = GB2312_1; |
| 2949 | } |
| 2950 | |
| 2951 | if(converterData->version == 0) { |
| 2952 | /* ISO-2022-CN */ |
| 2953 | |
| 2954 | /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ |
| 2955 | if(choices[0] == GB2312_1) { |
| 2956 | choices[1] = (int8_t)CNS_11643_1; |
| 2957 | } else { |
| 2958 | choices[1] = (int8_t)GB2312_1; |
| 2959 | } |
| 2960 | |
| 2961 | choiceCount = 2; |
| 2962 | } else if (converterData->version == 1) { |
| 2963 | /* ISO-2022-CN-EXT */ |
| 2964 | |
| 2965 | /* try one of the other converters */ |
| 2966 | switch(choices[0]) { |
| 2967 | case GB2312_1: |
| 2968 | choices[1] = (int8_t)CNS_11643_1; |
| 2969 | choices[2] = (int8_t)ISO_IR_165; |
| 2970 | break; |
| 2971 | case ISO_IR_165: |
| 2972 | choices[1] = (int8_t)GB2312_1; |
| 2973 | choices[2] = (int8_t)CNS_11643_1; |
| 2974 | break; |
| 2975 | default: /* CNS_11643_x */ |
| 2976 | choices[1] = (int8_t)GB2312_1; |
| 2977 | choices[2] = (int8_t)ISO_IR_165; |
| 2978 | break; |
| 2979 | } |
| 2980 | |
| 2981 | choiceCount = 3; |
| 2982 | } else { |
| 2983 | choices[0] = (int8_t)CNS_11643_1; |
| 2984 | choices[1] = (int8_t)GB2312_1; |
| 2985 | } |
| 2986 | } |
| 2987 | |
| 2988 | cs = g = 0; |
| 2989 | /* |
| 2990 | * len==0: no mapping found yet |
| 2991 | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
| 2992 | * len>0: found a roundtrip result, done |
| 2993 | */ |
| 2994 | len = 0; |
| 2995 | /* |
| 2996 | * We will turn off useFallback after finding a fallback, |
| 2997 | * but we still get fallbacks from PUA code points as usual. |
| 2998 | * Therefore, we will also need to check that we don't overwrite |
| 2999 | * an early fallback with a later one. |
| 3000 | */ |
| 3001 | useFallback = cnv->useFallback; |
| 3002 | |
| 3003 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
| 3004 | int8_t cs0 = choices[i]; |
| 3005 | if(cs0 > 0) { |
| 3006 | uint32_t value; |
| 3007 | int32_t len2; |
| 3008 | if(cs0 >= CNS_11643_0) { |
| 3009 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 3010 | converterData->myConverterArray[CNS_11643], |
| 3011 | sourceChar, |
| 3012 | &value, |
| 3013 | useFallback, |
| 3014 | MBCS_OUTPUT_3); |
| 3015 | if(len2 == 3 || (len2 == -3 && len == 0)) { |
| 3016 | targetValue = value; |
| 3017 | cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); |
| 3018 | if(len2 >= 0) { |
| 3019 | len = 2; |
| 3020 | } else { |
| 3021 | len = -2; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3022 | useFallback = false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3023 | } |
| 3024 | if(cs == CNS_11643_1) { |
| 3025 | g = 1; |
| 3026 | } else if(cs == CNS_11643_2) { |
| 3027 | g = 2; |
| 3028 | } else /* plane 3..7 */ if(converterData->version == 1) { |
| 3029 | g = 3; |
| 3030 | } else { |
| 3031 | /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ |
| 3032 | len = 0; |
| 3033 | } |
| 3034 | } |
| 3035 | } else { |
| 3036 | /* GB2312_1 or ISO-IR-165 */ |
| 3037 | U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); |
| 3038 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 3039 | converterData->myConverterArray[cs0], |
| 3040 | sourceChar, |
| 3041 | &value, |
| 3042 | useFallback, |
| 3043 | MBCS_OUTPUT_2); |
| 3044 | if(len2 == 2 || (len2 == -2 && len == 0)) { |
| 3045 | targetValue = value; |
| 3046 | len = len2; |
| 3047 | cs = cs0; |
| 3048 | g = 1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3049 | useFallback = false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3050 | } |
| 3051 | } |
| 3052 | } |
| 3053 | } |
| 3054 | |
| 3055 | if(len != 0) { |
| 3056 | len = 0; /* count output bytes; it must have been abs(len) == 2 */ |
| 3057 | |
| 3058 | /* write the designation sequence if necessary */ |
| 3059 | if(cs != pFromU2022State->cs[g]) { |
| 3060 | if(cs < CNS_11643) { |
| 3061 | uprv_memcpy(buffer, escSeqCharsCN[cs], 4); |
| 3062 | } else { |
| 3063 | U_ASSERT(cs >= CNS_11643_1); |
| 3064 | uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); |
| 3065 | } |
| 3066 | len = 4; |
| 3067 | pFromU2022State->cs[g] = cs; |
| 3068 | if(g == 1) { |
| 3069 | /* changing the SO/G1 charset invalidates the choices[] */ |
| 3070 | choiceCount = 0; |
| 3071 | } |
| 3072 | } |
| 3073 | |
| 3074 | /* write the shift sequence if necessary */ |
| 3075 | if(g != pFromU2022State->g) { |
| 3076 | switch(g) { |
| 3077 | case 1: |
| 3078 | buffer[len++] = UCNV_SO; |
| 3079 | |
| 3080 | /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ |
| 3081 | pFromU2022State->g = 1; |
| 3082 | break; |
| 3083 | case 2: |
| 3084 | buffer[len++] = 0x1b; |
| 3085 | buffer[len++] = 0x4e; |
| 3086 | break; |
| 3087 | default: /* case 3 */ |
| 3088 | buffer[len++] = 0x1b; |
| 3089 | buffer[len++] = 0x4f; |
| 3090 | break; |
| 3091 | } |
| 3092 | } |
| 3093 | |
| 3094 | /* write the two output bytes */ |
| 3095 | buffer[len++] = (char)(targetValue >> 8); |
| 3096 | buffer[len++] = (char)targetValue; |
| 3097 | } else { |
| 3098 | /* if we cannot find the character after checking all codepages |
| 3099 | * then this is an error |
| 3100 | */ |
| 3101 | *err = U_INVALID_CHAR_FOUND; |
| 3102 | cnv->fromUChar32=sourceChar; |
| 3103 | break; |
| 3104 | } |
| 3105 | } |
| 3106 | |
| 3107 | /* output len>0 bytes in buffer[] */ |
| 3108 | if(len == 1) { |
| 3109 | *target++ = buffer[0]; |
| 3110 | if(offsets) { |
| 3111 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
| 3112 | } |
| 3113 | } else if(len == 2 && (target + 2) <= targetLimit) { |
| 3114 | *target++ = buffer[0]; |
| 3115 | *target++ = buffer[1]; |
| 3116 | if(offsets) { |
| 3117 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
| 3118 | *offsets++ = sourceIndex; |
| 3119 | *offsets++ = sourceIndex; |
| 3120 | } |
| 3121 | } else { |
| 3122 | fromUWriteUInt8( |
| 3123 | cnv, |
| 3124 | buffer, len, |
| 3125 | &target, (const char *)targetLimit, |
| 3126 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
| 3127 | err); |
| 3128 | if(U_FAILURE(*err)) { |
| 3129 | break; |
| 3130 | } |
| 3131 | } |
| 3132 | } /* end if(myTargetIndex<myTargetLength) */ |
| 3133 | else{ |
| 3134 | *err =U_BUFFER_OVERFLOW_ERROR; |
| 3135 | break; |
| 3136 | } |
| 3137 | |
| 3138 | }/* end while(mySourceIndex<mySourceLength) */ |
| 3139 | |
| 3140 | /* |
| 3141 | * the end of the input stream and detection of truncated input |
| 3142 | * are handled by the framework, but for ISO-2022-CN conversion |
| 3143 | * we need to be in ASCII mode at the very end |
| 3144 | * |
| 3145 | * conditions: |
| 3146 | * successful |
| 3147 | * not in ASCII mode |
| 3148 | * end of input and no truncated input |
| 3149 | */ |
| 3150 | if( U_SUCCESS(*err) && |
| 3151 | pFromU2022State->g!=0 && |
| 3152 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
| 3153 | ) { |
| 3154 | int32_t sourceIndex; |
| 3155 | |
| 3156 | /* we are switching to ASCII */ |
| 3157 | pFromU2022State->g=0; |
| 3158 | |
| 3159 | /* get the source index of the last input character */ |
| 3160 | /* |
| 3161 | * TODO this would be simpler and more reliable if we used a pair |
| 3162 | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| 3163 | * so that we could simply use the prevSourceIndex here; |
| 3164 | * this code gives an incorrect result for the rare case of an unmatched |
| 3165 | * trail surrogate that is alone in the last buffer of the text stream |
| 3166 | */ |
| 3167 | sourceIndex=(int32_t)(source-args->source); |
| 3168 | if(sourceIndex>0) { |
| 3169 | --sourceIndex; |
| 3170 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| 3171 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| 3172 | ) { |
| 3173 | --sourceIndex; |
| 3174 | } |
| 3175 | } else { |
| 3176 | sourceIndex=-1; |
| 3177 | } |
| 3178 | |
| 3179 | fromUWriteUInt8( |
| 3180 | cnv, |
| 3181 | SHIFT_IN_STR, 1, |
| 3182 | &target, (const char *)targetLimit, |
| 3183 | &offsets, sourceIndex, |
| 3184 | err); |
| 3185 | } |
| 3186 | |
| 3187 | /*save the state and return */ |
| 3188 | args->source = source; |
| 3189 | args->target = (char*)target; |
| 3190 | } |
| 3191 | |
| 3192 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3193 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3194 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 3195 | UErrorCode* err){ |
| 3196 | char tempBuf[3]; |
| 3197 | const char *mySource = (char *) args->source; |
| 3198 | UChar *myTarget = args->target; |
| 3199 | const char *mySourceLimit = args->sourceLimit; |
| 3200 | uint32_t targetUniChar = 0x0000; |
| 3201 | uint32_t mySourceChar = 0x0000; |
| 3202 | UConverterDataISO2022* myData; |
| 3203 | ISO2022State *pToU2022State; |
| 3204 | |
| 3205 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 3206 | pToU2022State = &myData->toU2022State; |
| 3207 | |
| 3208 | if(myData->key != 0) { |
| 3209 | /* continue with a partial escape sequence */ |
| 3210 | goto escape; |
| 3211 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
| 3212 | /* continue with a partial double-byte character */ |
| 3213 | mySourceChar = args->converter->toUBytes[0]; |
| 3214 | args->converter->toULength = 0; |
| 3215 | targetUniChar = missingCharMarker; |
| 3216 | goto getTrailByte; |
| 3217 | } |
| 3218 | |
| 3219 | while(mySource < mySourceLimit){ |
| 3220 | |
| 3221 | targetUniChar =missingCharMarker; |
| 3222 | |
| 3223 | if(myTarget < args->targetLimit){ |
| 3224 | |
| 3225 | mySourceChar= (unsigned char) *mySource++; |
| 3226 | |
| 3227 | switch(mySourceChar){ |
| 3228 | case UCNV_SI: |
| 3229 | pToU2022State->g=0; |
| 3230 | if (myData->isEmptySegment) { |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3231 | myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3232 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 3233 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
Jungshik Shin | 42d5027 | 2018-10-24 01:22:09 -0700 | [diff] [blame] | 3234 | args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3235 | args->converter->toULength = 1; |
| 3236 | args->target = myTarget; |
| 3237 | args->source = mySource; |
| 3238 | return; |
| 3239 | } |
| 3240 | continue; |
| 3241 | |
| 3242 | case UCNV_SO: |
| 3243 | if(pToU2022State->cs[1] != 0) { |
| 3244 | pToU2022State->g=1; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3245 | myData->isEmptySegment = true; /* Begin a new segment, empty so far */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3246 | continue; |
| 3247 | } else { |
| 3248 | /* illegal to have SO before a matching designator */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3249 | myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3250 | break; |
| 3251 | } |
| 3252 | |
| 3253 | case ESC_2022: |
| 3254 | mySource--; |
| 3255 | escape: |
| 3256 | { |
| 3257 | const char * mySourceBefore = mySource; |
| 3258 | int8_t toULengthBefore = args->converter->toULength; |
| 3259 | |
| 3260 | changeState_2022(args->converter,&(mySource), |
| 3261 | mySourceLimit, ISO_2022_CN,err); |
| 3262 | |
| 3263 | /* After SO there must be at least one character before a designator (designator error handled separately) */ |
| 3264 | if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
| 3265 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 3266 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 3267 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
| 3268 | } |
| 3269 | } |
| 3270 | |
| 3271 | /* invalid or illegal escape sequence */ |
| 3272 | if(U_FAILURE(*err)){ |
| 3273 | args->target = myTarget; |
| 3274 | args->source = mySource; |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3275 | myData->isEmptySegment = false; /* Reset to avoid future spurious errors */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3276 | return; |
| 3277 | } |
| 3278 | continue; |
| 3279 | |
| 3280 | /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ |
| 3281 | |
| 3282 | case CR: |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3283 | case LF: |
| 3284 | uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3285 | U_FALLTHROUGH; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3286 | default: |
| 3287 | /* convert one or two bytes */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3288 | myData->isEmptySegment = false; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3289 | if(pToU2022State->g != 0) { |
| 3290 | if(mySource < mySourceLimit) { |
| 3291 | UConverterSharedData *cnv; |
| 3292 | StateEnum tempState; |
| 3293 | int32_t tempBufLen; |
| 3294 | int leadIsOk, trailIsOk; |
| 3295 | uint8_t trailByte; |
| 3296 | getTrailByte: |
| 3297 | trailByte = (uint8_t)*mySource; |
| 3298 | /* |
| 3299 | * Ticket 5691: consistent illegal sequences: |
| 3300 | * - We include at least the first byte in the illegal sequence. |
| 3301 | * - If any of the non-initial bytes could be the start of a character, |
| 3302 | * we stop the illegal sequence before the first one of those. |
| 3303 | * |
| 3304 | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
| 3305 | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
| 3306 | * Otherwise we convert or report the pair of bytes. |
| 3307 | */ |
| 3308 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| 3309 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| 3310 | if (leadIsOk && trailIsOk) { |
| 3311 | ++mySource; |
| 3312 | tempState = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 3313 | if(tempState >= CNS_11643_0) { |
| 3314 | cnv = myData->myConverterArray[CNS_11643]; |
| 3315 | tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); |
| 3316 | tempBuf[1] = (char) (mySourceChar); |
| 3317 | tempBuf[2] = (char) trailByte; |
| 3318 | tempBufLen = 3; |
| 3319 | |
| 3320 | }else{ |
| 3321 | U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS); |
| 3322 | cnv = myData->myConverterArray[tempState]; |
| 3323 | tempBuf[0] = (char) (mySourceChar); |
| 3324 | tempBuf[1] = (char) trailByte; |
| 3325 | tempBufLen = 2; |
| 3326 | } |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3327 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3328 | mySourceChar = (mySourceChar << 8) | trailByte; |
| 3329 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 3330 | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
| 3331 | ++mySource; |
| 3332 | /* add another bit so that the code below writes 2 bytes in case of error */ |
| 3333 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
| 3334 | } |
| 3335 | if(pToU2022State->g>=2) { |
| 3336 | /* return from a single-shift state to the previous one */ |
| 3337 | pToU2022State->g=pToU2022State->prevG; |
| 3338 | } |
| 3339 | } else { |
| 3340 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 3341 | args->converter->toULength = 1; |
| 3342 | goto endloop; |
| 3343 | } |
| 3344 | } |
| 3345 | else{ |
| 3346 | if(mySourceChar <= 0x7f) { |
| 3347 | targetUniChar = (UChar) mySourceChar; |
| 3348 | } |
| 3349 | } |
| 3350 | break; |
| 3351 | } |
| 3352 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
| 3353 | if(args->offsets){ |
| 3354 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3355 | } |
| 3356 | *(myTarget++)=(UChar)targetUniChar; |
| 3357 | } |
| 3358 | else if(targetUniChar > missingCharMarker){ |
| 3359 | /* disassemble the surrogate pair and write to output*/ |
| 3360 | targetUniChar-=0x0010000; |
| 3361 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
| 3362 | if(args->offsets){ |
| 3363 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3364 | } |
| 3365 | ++myTarget; |
| 3366 | if(myTarget< args->targetLimit){ |
| 3367 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 3368 | if(args->offsets){ |
| 3369 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3370 | } |
| 3371 | ++myTarget; |
| 3372 | }else{ |
| 3373 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
| 3374 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 3375 | } |
| 3376 | |
| 3377 | } |
| 3378 | else{ |
| 3379 | /* Call the callback function*/ |
| 3380 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
| 3381 | break; |
| 3382 | } |
| 3383 | } |
| 3384 | else{ |
| 3385 | *err =U_BUFFER_OVERFLOW_ERROR; |
| 3386 | break; |
| 3387 | } |
| 3388 | } |
| 3389 | endloop: |
| 3390 | args->target = myTarget; |
| 3391 | args->source = mySource; |
| 3392 | } |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 3393 | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3394 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3395 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3396 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) { |
| 3397 | UConverter *cnv = args->converter; |
| 3398 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
| 3399 | ISO2022State *pFromU2022State=&myConverterData->fromU2022State; |
| 3400 | char *p, *subchar; |
| 3401 | char buffer[8]; |
| 3402 | int32_t length; |
| 3403 | |
| 3404 | subchar=(char *)cnv->subChars; |
| 3405 | length=cnv->subCharLen; /* assume length==1 for most variants */ |
| 3406 | |
| 3407 | p = buffer; |
| 3408 | switch(myConverterData->locale[0]){ |
| 3409 | case 'j': |
| 3410 | { |
| 3411 | int8_t cs; |
| 3412 | |
| 3413 | if(pFromU2022State->g == 1) { |
| 3414 | /* JIS7: switch from G1 to G0 */ |
| 3415 | pFromU2022State->g = 0; |
| 3416 | *p++ = UCNV_SI; |
| 3417 | } |
| 3418 | |
| 3419 | cs = pFromU2022State->cs[0]; |
| 3420 | if(cs != ASCII && cs != JISX201) { |
| 3421 | /* not in ASCII or JIS X 0201: switch to ASCII */ |
| 3422 | pFromU2022State->cs[0] = (int8_t)ASCII; |
| 3423 | *p++ = '\x1b'; |
| 3424 | *p++ = '\x28'; |
| 3425 | *p++ = '\x42'; |
| 3426 | } |
| 3427 | |
| 3428 | *p++ = subchar[0]; |
| 3429 | break; |
| 3430 | } |
| 3431 | case 'c': |
| 3432 | if(pFromU2022State->g != 0) { |
| 3433 | /* not in ASCII mode: switch to ASCII */ |
| 3434 | pFromU2022State->g = 0; |
| 3435 | *p++ = UCNV_SI; |
| 3436 | } |
| 3437 | *p++ = subchar[0]; |
| 3438 | break; |
| 3439 | case 'k': |
| 3440 | if(myConverterData->version == 0) { |
| 3441 | if(length == 1) { |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 3442 | if(args->converter->fromUnicodeStatus) { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3443 | /* in DBCS mode: switch to SBCS */ |
| 3444 | args->converter->fromUnicodeStatus = 0; |
| 3445 | *p++ = UCNV_SI; |
| 3446 | } |
| 3447 | *p++ = subchar[0]; |
| 3448 | } else /* length == 2*/ { |
Jungshik Shin | f61e46d | 2018-05-04 13:00:45 -0700 | [diff] [blame] | 3449 | if(!args->converter->fromUnicodeStatus) { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3450 | /* in SBCS mode: switch to DBCS */ |
| 3451 | args->converter->fromUnicodeStatus = 1; |
| 3452 | *p++ = UCNV_SO; |
| 3453 | } |
| 3454 | *p++ = subchar[0]; |
| 3455 | *p++ = subchar[1]; |
| 3456 | } |
| 3457 | break; |
| 3458 | } else { |
| 3459 | /* save the subconverter's substitution string */ |
| 3460 | uint8_t *currentSubChars = myConverterData->currentConverter->subChars; |
| 3461 | int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen; |
| 3462 | |
| 3463 | /* set our substitution string into the subconverter */ |
| 3464 | myConverterData->currentConverter->subChars = (uint8_t *)subchar; |
| 3465 | myConverterData->currentConverter->subCharLen = (int8_t)length; |
| 3466 | |
| 3467 | /* let the subconverter write the subchar, set/retrieve fromUChar32 state */ |
| 3468 | args->converter = myConverterData->currentConverter; |
| 3469 | myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; |
| 3470 | ucnv_cbFromUWriteSub(args, 0, err); |
| 3471 | cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
| 3472 | args->converter = cnv; |
| 3473 | |
| 3474 | /* restore the subconverter's substitution string */ |
| 3475 | myConverterData->currentConverter->subChars = currentSubChars; |
| 3476 | myConverterData->currentConverter->subCharLen = currentSubCharLen; |
| 3477 | |
| 3478 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 3479 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
| 3480 | uprv_memcpy( |
| 3481 | cnv->charErrorBuffer, |
| 3482 | myConverterData->currentConverter->charErrorBuffer, |
| 3483 | myConverterData->currentConverter->charErrorBufferLength); |
| 3484 | } |
| 3485 | cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
| 3486 | myConverterData->currentConverter->charErrorBufferLength = 0; |
| 3487 | } |
| 3488 | return; |
| 3489 | } |
| 3490 | default: |
| 3491 | /* not expected */ |
| 3492 | break; |
| 3493 | } |
| 3494 | ucnv_cbFromUWriteBytes(args, |
| 3495 | buffer, (int32_t)(p - buffer), |
| 3496 | offsetIndex, err); |
| 3497 | } |
| 3498 | |
| 3499 | /* |
| 3500 | * Structure for cloning an ISO 2022 converter into a single memory block. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3501 | */ |
| 3502 | struct cloneStruct |
| 3503 | { |
| 3504 | UConverter cnv; |
| 3505 | UConverter currentConverter; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3506 | UConverterDataISO2022 mydata; |
| 3507 | }; |
| 3508 | |
| 3509 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3510 | U_CDECL_BEGIN |
| 3511 | |
| 3512 | static UConverter * U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3513 | _ISO_2022_SafeClone( |
| 3514 | const UConverter *cnv, |
| 3515 | void *stackBuffer, |
| 3516 | int32_t *pBufferSize, |
| 3517 | UErrorCode *status) |
| 3518 | { |
| 3519 | struct cloneStruct * localClone; |
| 3520 | UConverterDataISO2022 *cnvData; |
| 3521 | int32_t i, size; |
| 3522 | |
Frank Tang | b869661 | 2019-10-25 14:58:21 -0700 | [diff] [blame] | 3523 | if (U_FAILURE(*status)){ |
| 3524 | return nullptr; |
| 3525 | } |
| 3526 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3527 | if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */ |
| 3528 | *pBufferSize = (int32_t)sizeof(struct cloneStruct); |
| 3529 | return NULL; |
| 3530 | } |
| 3531 | |
| 3532 | cnvData = (UConverterDataISO2022 *)cnv->extraInfo; |
| 3533 | localClone = (struct cloneStruct *)stackBuffer; |
| 3534 | |
| 3535 | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
| 3536 | |
| 3537 | uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); |
| 3538 | localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */ |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3539 | localClone->cnv.isExtraLocal = true; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3540 | |
| 3541 | /* share the subconverters */ |
| 3542 | |
| 3543 | if(cnvData->currentConverter != NULL) { |
Frank Tang | b869661 | 2019-10-25 14:58:21 -0700 | [diff] [blame] | 3544 | size = (int32_t)sizeof(UConverter); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3545 | localClone->mydata.currentConverter = |
| 3546 | ucnv_safeClone(cnvData->currentConverter, |
| 3547 | &localClone->currentConverter, |
| 3548 | &size, status); |
| 3549 | if(U_FAILURE(*status)) { |
| 3550 | return NULL; |
| 3551 | } |
| 3552 | } |
| 3553 | |
| 3554 | for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { |
| 3555 | if(cnvData->myConverterArray[i] != NULL) { |
| 3556 | ucnv_incrementRefCount(cnvData->myConverterArray[i]); |
| 3557 | } |
| 3558 | } |
| 3559 | |
| 3560 | return &localClone->cnv; |
| 3561 | } |
| 3562 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3563 | U_CDECL_END |
| 3564 | |
| 3565 | static void U_CALLCONV |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3566 | _ISO_2022_GetUnicodeSet(const UConverter *cnv, |
| 3567 | const USetAdder *sa, |
| 3568 | UConverterUnicodeSet which, |
| 3569 | UErrorCode *pErrorCode) |
| 3570 | { |
| 3571 | int32_t i; |
| 3572 | UConverterDataISO2022* cnvData; |
| 3573 | |
| 3574 | if (U_FAILURE(*pErrorCode)) { |
| 3575 | return; |
| 3576 | } |
| 3577 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 3578 | if (cnv->sharedData == &_ISO2022Data) { |
| 3579 | /* We use UTF-8 in this case */ |
| 3580 | sa->addRange(sa->set, 0, 0xd7FF); |
| 3581 | sa->addRange(sa->set, 0xE000, 0x10FFFF); |
| 3582 | return; |
| 3583 | } |
| 3584 | #endif |
| 3585 | |
| 3586 | cnvData = (UConverterDataISO2022*)cnv->extraInfo; |
| 3587 | |
| 3588 | /* open a set and initialize it with code points that are algorithmically round-tripped */ |
| 3589 | switch(cnvData->locale[0]){ |
| 3590 | case 'j': |
| 3591 | /* include JIS X 0201 which is hardcoded */ |
| 3592 | sa->add(sa->set, 0xa5); |
| 3593 | sa->add(sa->set, 0x203e); |
| 3594 | if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { |
| 3595 | /* include Latin-1 for some variants of JP */ |
| 3596 | sa->addRange(sa->set, 0, 0xff); |
| 3597 | } else { |
| 3598 | /* include ASCII for JP */ |
| 3599 | sa->addRange(sa->set, 0, 0x7f); |
| 3600 | } |
| 3601 | if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { |
| 3602 | /* |
| 3603 | * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0 |
| 3604 | * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8) |
| 3605 | * use half-width Katakana. |
| 3606 | * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode) |
| 3607 | * half-width Katakana via the ESC ( I sequence. |
| 3608 | * However, we only emit (fromUnicode) half-width Katakana according to the |
| 3609 | * definition of each variant. |
| 3610 | * |
| 3611 | * When including fallbacks, |
| 3612 | * we need to include half-width Katakana Unicode code points for all JP variants because |
| 3613 | * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana). |
| 3614 | */ |
| 3615 | /* include half-width Katakana for JP */ |
| 3616 | sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
| 3617 | } |
| 3618 | break; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 3619 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3620 | case 'c': |
| 3621 | case 'z': |
| 3622 | /* include ASCII for CN */ |
| 3623 | sa->addRange(sa->set, 0, 0x7f); |
| 3624 | break; |
| 3625 | case 'k': |
| 3626 | /* there is only one converter for KR, and it is not in the myConverterArray[] */ |
| 3627 | cnvData->currentConverter->sharedData->impl->getUnicodeSet( |
| 3628 | cnvData->currentConverter, sa, which, pErrorCode); |
| 3629 | /* the loop over myConverterArray[] will simply not find another converter */ |
| 3630 | break; |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 3631 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3632 | default: |
| 3633 | break; |
| 3634 | } |
| 3635 | |
| 3636 | #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ |
| 3637 | if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
| 3638 | cnvData->version==0 && i==CNS_11643 |
| 3639 | ) { |
| 3640 | /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */ |
| 3641 | ucnv_MBCSGetUnicodeSetForBytes( |
| 3642 | cnvData->myConverterArray[i], |
| 3643 | sa, UCNV_ROUNDTRIP_SET, |
| 3644 | 0, 0x81, 0x82, |
| 3645 | pErrorCode); |
| 3646 | } |
| 3647 | #endif |
| 3648 | |
| 3649 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
| 3650 | UConverterSetFilter filter; |
| 3651 | if(cnvData->myConverterArray[i]!=NULL) { |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 3652 | if(cnvData->locale[0]=='j' && i==JISX208) { |
| 3653 | /* |
| 3654 | * Only add code points that map to Shift-JIS codes |
| 3655 | * corresponding to JIS X 0208. |
| 3656 | */ |
| 3657 | filter=UCNV_SET_FILTER_SJIS; |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 3658 | #if !UCONFIG_ONLY_HTML_CONVERSION |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 3659 | } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
| 3660 | cnvData->version==0 && i==CNS_11643) { |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3661 | /* |
| 3662 | * Version-specific for CN: |
| 3663 | * CN version 0 does not map CNS planes 3..7 although |
| 3664 | * they are all available in the CNS conversion table; |
| 3665 | * CN version 1 (-EXT) does map them all. |
| 3666 | * The two versions create different Unicode sets. |
| 3667 | */ |
| 3668 | filter=UCNV_SET_FILTER_2022_CN; |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3669 | } else if(i==KSC5601) { |
| 3670 | /* |
| 3671 | * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables) |
| 3672 | * are broader than GR94. |
| 3673 | */ |
| 3674 | filter=UCNV_SET_FILTER_GR94DBCS; |
Jungshik Shin (jungshik at google) | afd723b | 2015-01-21 13:24:04 -0800 | [diff] [blame] | 3675 | #endif |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3676 | } else { |
| 3677 | filter=UCNV_SET_FILTER_NONE; |
| 3678 | } |
| 3679 | ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); |
| 3680 | } |
| 3681 | } |
| 3682 | |
| 3683 | /* |
| 3684 | * ISO 2022 converters must not convert SO/SI/ESC despite what |
| 3685 | * sub-converters do by themselves. |
| 3686 | * Remove these characters from the set. |
| 3687 | */ |
| 3688 | sa->remove(sa->set, 0x0e); |
| 3689 | sa->remove(sa->set, 0x0f); |
| 3690 | sa->remove(sa->set, 0x1b); |
| 3691 | |
| 3692 | /* ISO 2022 converters do not convert C1 controls either */ |
| 3693 | sa->removeRange(sa->set, 0x80, 0x9f); |
| 3694 | } |
| 3695 | |
| 3696 | static const UConverterImpl _ISO2022Impl={ |
| 3697 | UCNV_ISO_2022, |
| 3698 | |
| 3699 | NULL, |
| 3700 | NULL, |
| 3701 | |
| 3702 | _ISO2022Open, |
| 3703 | _ISO2022Close, |
| 3704 | _ISO2022Reset, |
| 3705 | |
| 3706 | #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 3707 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
| 3708 | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
| 3709 | ucnv_fromUnicode_UTF8, |
| 3710 | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
| 3711 | #else |
| 3712 | NULL, |
| 3713 | NULL, |
| 3714 | NULL, |
| 3715 | NULL, |
| 3716 | #endif |
| 3717 | NULL, |
| 3718 | |
| 3719 | NULL, |
| 3720 | _ISO2022getName, |
| 3721 | _ISO_2022_WriteSub, |
| 3722 | _ISO_2022_SafeClone, |
| 3723 | _ISO_2022_GetUnicodeSet, |
| 3724 | |
| 3725 | NULL, |
| 3726 | NULL |
| 3727 | }; |
| 3728 | static const UConverterStaticData _ISO2022StaticData={ |
| 3729 | sizeof(UConverterStaticData), |
| 3730 | "ISO_2022", |
| 3731 | 2022, |
| 3732 | UCNV_IBM, |
| 3733 | UCNV_ISO_2022, |
| 3734 | 1, |
| 3735 | 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
| 3736 | { 0x1a, 0, 0, 0 }, |
| 3737 | 1, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3738 | false, |
| 3739 | false, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3740 | 0, |
| 3741 | 0, |
| 3742 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3743 | }; |
Jungshik Shin | a05f412 | 2015-06-09 15:33:54 -0700 | [diff] [blame] | 3744 | const UConverterSharedData _ISO2022Data= |
| 3745 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3746 | |
| 3747 | /*************JP****************/ |
| 3748 | static const UConverterImpl _ISO2022JPImpl={ |
| 3749 | UCNV_ISO_2022, |
| 3750 | |
| 3751 | NULL, |
| 3752 | NULL, |
| 3753 | |
| 3754 | _ISO2022Open, |
| 3755 | _ISO2022Close, |
| 3756 | _ISO2022Reset, |
| 3757 | |
| 3758 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3759 | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3760 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3761 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3762 | NULL, |
| 3763 | |
| 3764 | NULL, |
| 3765 | _ISO2022getName, |
| 3766 | _ISO_2022_WriteSub, |
| 3767 | _ISO_2022_SafeClone, |
| 3768 | _ISO_2022_GetUnicodeSet, |
| 3769 | |
| 3770 | NULL, |
| 3771 | NULL |
| 3772 | }; |
| 3773 | static const UConverterStaticData _ISO2022JPStaticData={ |
| 3774 | sizeof(UConverterStaticData), |
| 3775 | "ISO_2022_JP", |
| 3776 | 0, |
| 3777 | UCNV_IBM, |
| 3778 | UCNV_ISO_2022, |
| 3779 | 1, |
| 3780 | 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ |
| 3781 | { 0x1a, 0, 0, 0 }, |
| 3782 | 1, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3783 | false, |
| 3784 | false, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3785 | 0, |
| 3786 | 0, |
| 3787 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3788 | }; |
| 3789 | |
| 3790 | namespace { |
| 3791 | |
Jungshik Shin | a05f412 | 2015-06-09 15:33:54 -0700 | [diff] [blame] | 3792 | const UConverterSharedData _ISO2022JPData= |
| 3793 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3794 | |
| 3795 | } // namespace |
| 3796 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 3797 | #if !UCONFIG_ONLY_HTML_CONVERSION |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3798 | /************* KR ***************/ |
| 3799 | static const UConverterImpl _ISO2022KRImpl={ |
| 3800 | UCNV_ISO_2022, |
| 3801 | |
| 3802 | NULL, |
| 3803 | NULL, |
| 3804 | |
| 3805 | _ISO2022Open, |
| 3806 | _ISO2022Close, |
| 3807 | _ISO2022Reset, |
| 3808 | |
| 3809 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3810 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3811 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3812 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3813 | NULL, |
| 3814 | |
| 3815 | NULL, |
| 3816 | _ISO2022getName, |
| 3817 | _ISO_2022_WriteSub, |
| 3818 | _ISO_2022_SafeClone, |
| 3819 | _ISO_2022_GetUnicodeSet, |
| 3820 | |
| 3821 | NULL, |
| 3822 | NULL |
| 3823 | }; |
| 3824 | static const UConverterStaticData _ISO2022KRStaticData={ |
| 3825 | sizeof(UConverterStaticData), |
| 3826 | "ISO_2022_KR", |
| 3827 | 0, |
| 3828 | UCNV_IBM, |
| 3829 | UCNV_ISO_2022, |
| 3830 | 1, |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 3831 | 8, /* max 8 bytes per UChar */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3832 | { 0x1a, 0, 0, 0 }, |
| 3833 | 1, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3834 | false, |
| 3835 | false, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3836 | 0, |
| 3837 | 0, |
| 3838 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3839 | }; |
| 3840 | |
| 3841 | namespace { |
| 3842 | |
Jungshik Shin | a05f412 | 2015-06-09 15:33:54 -0700 | [diff] [blame] | 3843 | const UConverterSharedData _ISO2022KRData= |
| 3844 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3845 | |
| 3846 | } // namespace |
| 3847 | |
| 3848 | /*************** CN ***************/ |
| 3849 | static const UConverterImpl _ISO2022CNImpl={ |
| 3850 | |
| 3851 | UCNV_ISO_2022, |
| 3852 | |
| 3853 | NULL, |
| 3854 | NULL, |
| 3855 | |
| 3856 | _ISO2022Open, |
| 3857 | _ISO2022Close, |
| 3858 | _ISO2022Reset, |
| 3859 | |
| 3860 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3861 | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3862 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3863 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3864 | NULL, |
| 3865 | |
| 3866 | NULL, |
| 3867 | _ISO2022getName, |
| 3868 | _ISO_2022_WriteSub, |
| 3869 | _ISO_2022_SafeClone, |
| 3870 | _ISO_2022_GetUnicodeSet, |
| 3871 | |
| 3872 | NULL, |
| 3873 | NULL |
| 3874 | }; |
| 3875 | static const UConverterStaticData _ISO2022CNStaticData={ |
| 3876 | sizeof(UConverterStaticData), |
| 3877 | "ISO_2022_CN", |
| 3878 | 0, |
| 3879 | UCNV_IBM, |
| 3880 | UCNV_ISO_2022, |
| 3881 | 1, |
| 3882 | 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ |
| 3883 | { 0x1a, 0, 0, 0 }, |
| 3884 | 1, |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 3885 | false, |
| 3886 | false, |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3887 | 0, |
| 3888 | 0, |
| 3889 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3890 | }; |
| 3891 | |
| 3892 | namespace { |
| 3893 | |
Jungshik Shin | a05f412 | 2015-06-09 15:33:54 -0700 | [diff] [blame] | 3894 | const UConverterSharedData _ISO2022CNData= |
| 3895 | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl); |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3896 | |
| 3897 | } // namespace |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 3898 | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3899 | |
| 3900 | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |