Jungshik Shin | 87232d8 | 2017-05-13 21:10:13 -0700 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 3 | /* |
| 4 | ******************************************************************************* |
| 5 | * |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 6 | * Copyright (C) 1998-2016, International Business Machines |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 7 | * Corporation and others. All Rights Reserved. |
| 8 | * |
| 9 | ******************************************************************************* |
| 10 | * |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 11 | * File ucbuf.h |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 12 | * |
| 13 | * Modification History: |
| 14 | * |
| 15 | * Date Name Description |
| 16 | * 05/10/01 Ram Creation. |
| 17 | * |
| 18 | * This API reads in files and returns UChars |
| 19 | ******************************************************************************* |
| 20 | */ |
| 21 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 22 | #include "unicode/localpointer.h" |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 23 | #include "unicode/ucnv.h" |
| 24 | #include "filestrm.h" |
| 25 | |
| 26 | #if !UCONFIG_NO_CONVERSION |
| 27 | |
| 28 | #ifndef UCBUF_H |
| 29 | #define UCBUF_H 1 |
| 30 | |
| 31 | typedef struct UCHARBUF UCHARBUF; |
| 32 | /** |
| 33 | * End of file value |
| 34 | */ |
Frank Tang | b869661 | 2019-10-25 14:58:21 -0700 | [diff] [blame] | 35 | #define U_EOF ((int32_t)0xFFFFFFFF) |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 36 | /** |
| 37 | * Error value if a sequence cannot be unescaped |
| 38 | */ |
Frank Tang | b869661 | 2019-10-25 14:58:21 -0700 | [diff] [blame] | 39 | #define U_ERR ((int32_t)0xFFFFFFFE) |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 40 | |
| 41 | typedef struct ULine ULine; |
| 42 | |
| 43 | struct ULine { /* Pairs a UChar pointer with an int32_t length. */ |
| 44 | UChar *name; /* NOTE(review): presumably points at the line's text - confirm against users of ULine */ |
| 45 | int32_t len; /* NOTE(review): presumably the length of name in UChars - confirm */ |
| 46 | }; |
| 47 | |
| 48 | /** |
| 49 | * Opens a UCHARBUF on the named file, using the given code page for conversion |
| 50 | * @param fileName Name of the file to open. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 51 | * @param codepage The encoding of the file stream to convert to Unicode. |
Frank Tang | 7e7574b | 2021-04-13 21:19:13 -0700 | [diff] [blame] | 52 | * If *codepage is NULL on input the API will try to autodetect |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 53 | * popular Unicode encodings |
| 54 | * @param showWarning Flag to print out warnings to STDOUT |
Frank Tang | 1f164ee | 2022-11-08 12:31:27 -0800 | [diff] [blame^] | 55 | * @param buffered If true performs a buffered read of the input file. If false reads |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 56 | * the whole file into memory and converts it. |
| 57 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 58 | * indicates a failure on entry, the function will immediately return. |
| 59 | * On exit the value will indicate the success of the operation. |
| 60 | * @return pointer to the newly opened UCHARBUF |
| 61 | */ |
| 62 | U_CAPI UCHARBUF* U_EXPORT2 |
| 63 | ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err); |
| 64 | |
| 65 | /** |
| 66 | * Gets a UTF-16 code unit at the current position from the converted buffer |
| 67 | * and increments the current position |
| 68 | * @param buf Pointer to UCHARBUF structure |
| 69 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 70 | * indicates a failure on entry, the function will immediately return. |
| 71 | * On exit the value will indicate the success of the operation. |
| 72 | */ |
| 73 | U_CAPI int32_t U_EXPORT2 |
| 74 | ucbuf_getc(UCHARBUF* buf,UErrorCode* err); |
| 75 | |
| 76 | /** |
| 77 | * Gets a UTF-32 code point at the current position from the converted buffer |
| 78 | * and increments the current position |
| 79 | * @param buf Pointer to UCHARBUF structure |
| 80 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 81 | * indicates a failure on entry, the function will immediately return. |
| 82 | * On exit the value will indicate the success of the operation. |
| 83 | */ |
| 84 | U_CAPI int32_t U_EXPORT2 |
| 85 | ucbuf_getc32(UCHARBUF* buf,UErrorCode* err); |
| 86 | |
| 87 | /** |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 88 | * Gets a UTF-16 code unit at the current position from the converted buffer after |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 89 | * unescaping and increments the current position. If the escape sequence is for UTF-32 |
| 90 | * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned |
| 91 | * @param buf Pointer to UCHARBUF structure |
| 92 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 93 | * indicates a failure on entry, the function will immediately return. |
| 94 | * On exit the value will indicate the success of the operation. |
| 95 | */ |
| 96 | U_CAPI int32_t U_EXPORT2 |
| 97 | ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err); |
| 98 | |
| 99 | /** |
| 100 | * Gets a pointer to the current position in the internal buffer and length of the line. |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 101 | * It is imperative to make a copy of the returned buffer before performing operations on it. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 102 | * @param buf Pointer to UCHARBUF structure |
| 103 | * @param len Output param to receive the length of the returned buffer up to the end of the line |
| 104 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 105 | * indicates a failure on entry, the function will immediately return. |
| 106 | * On exit the value will indicate the success of the operation. |
| 107 | * Error: U_TRUNCATED_CHAR_FOUND |
| 108 | * @return Pointer to the internal buffer, NULL if EOF |
| 109 | */ |
| 110 | U_CAPI const UChar* U_EXPORT2 |
| 111 | ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err); |
| 112 | |
| 113 | |
| 114 | /** |
| 115 | * Resets the buffers and the underlying file stream. |
| 116 | * @param buf Pointer to UCHARBUF structure |
| 117 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 118 | * indicates a failure on entry, the function will immediately return. |
| 119 | * On exit the value will indicate the success of the operation. |
| 120 | */ |
| 121 | U_CAPI void U_EXPORT2 |
| 122 | ucbuf_rewind(UCHARBUF* buf,UErrorCode* err); |
| 123 | |
| 124 | /** |
| 125 | * Returns a pointer to the internal converted buffer |
| 126 | * @param buf Pointer to UCHARBUF structure |
Frank Tang | 3e05d9d | 2021-11-08 14:04:04 -0800 | [diff] [blame] | 127 | * @param len Pointer to int32_t to receive the length of buffer |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 128 | * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 129 | * indicates a failure on entry, the function will immediately return. |
| 130 | * On exit the value will indicate the success of the operation. |
| 131 | * @return Pointer to internal UChar buffer |
| 132 | */ |
| 133 | U_CAPI const UChar* U_EXPORT2 |
| 134 | ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err); |
| 135 | |
| 136 | /** |
| 137 | * Closes the UCHARBUF structure members and cleans up the malloc'ed memory |
| 138 | * @param buf Pointer to UCHARBUF structure |
| 139 | */ |
| 140 | U_CAPI void U_EXPORT2 |
| 141 | ucbuf_close(UCHARBUF* buf); |
| 142 | |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 143 | #if U_SHOW_CPLUSPLUS_API |
| 144 | |
| 145 | U_NAMESPACE_BEGIN |
| 146 | |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 147 | /** |
| 148 | * \class LocalUCHARBUFPointer |
| 149 | * "Smart pointer" class, closes a UCHARBUF via ucbuf_close(). |
| 150 | * For most methods see the LocalPointerBase base class. |
| 151 | * |
| 152 | * @see LocalPointerBase |
| 153 | * @see LocalPointer |
| 154 | */ |
Jungshik Shin | 70f8250 | 2016-01-29 00:32:36 -0800 | [diff] [blame] | 155 | U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close); |
| 156 | |
| 157 | U_NAMESPACE_END |
| 158 | |
| 159 | #endif |
| 160 | |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 161 | /** |
| 162 | * Rewinds the buffer by one codepoint. Does not rewind over escaped characters. |
| 163 | */ |
| 164 | U_CAPI void U_EXPORT2 |
| 165 | ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf); |
| 166 | |
| 167 | |
| 168 | /** |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 169 | * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 170 | * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring |
| 171 | * the converter to correct state for converting the rest of the stream. So the UConverter parameter |
| 172 | * is necessary. |
| 173 | * If the charset was autodetected, the caller must close both the input FileStream |
| 174 | * and the converter. |
| 175 | * |
| 176 | * @param fileName The file name to be opened and encoding autodetected |
| 177 | * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. |
| 178 | * @param cp Output param to receive the detected encoding |
| 179 | * @param status is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 180 | * indicates a failure on entry, the function will immediately return. |
| 181 | * On exit the value will indicate the success of the operation. |
| 182 | * @return The input FileStream if its charset was autodetected; NULL otherwise. |
| 183 | */ |
| 184 | U_CAPI FileStream * U_EXPORT2 |
| 185 | ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, |
| 186 | int32_t* signatureLength, UErrorCode* status); |
| 187 | |
| 188 | /** |
Jungshik Shin | 5feb9ad | 2016-10-21 12:52:48 -0700 | [diff] [blame] | 189 | * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. |
jshin@chromium.org | 6f31ac3 | 2014-03-26 22:15:14 +0000 | [diff] [blame] | 190 | * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring |
| 191 | * the converter to correct state for converting the rest of the stream. So the UConverter parameter |
| 192 | * is necessary. |
| 193 | * If the charset was autodetected, the caller must close the converter. |
| 194 | * |
| 195 | * @param in The file stream whose encoding is to be detected |
| 196 | * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. |
| 197 | * @param cp Output param to receive the detected encoding |
| 198 | * @param status is a pointer to a valid <code>UErrorCode</code> value. If this value |
| 199 | * indicates a failure on entry, the function will immediately return. |
| 200 | * On exit the value will indicate the success of the operation. |
| 201 | * @return Boolean whether the Unicode charset was autodetected. |
| 202 | */ |
| 203 | |
| 204 | U_CAPI UBool U_EXPORT2 |
| 205 | ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* status); |
| 206 | |
| 207 | /** |
| 208 | * Returns the approximate size in UChars required for converting the file to UChars |
| 209 | */ |
| 210 | U_CAPI int32_t U_EXPORT2 |
| 211 | ucbuf_size(UCHARBUF* buf); |
| 212 | |
| 213 | U_CAPI const char* U_EXPORT2 |
| 214 | ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status); |
| 215 | |
| 216 | #endif |
| 217 | #endif |
| 218 | |