blob: 88024b2e621f60014fc20255cb3cd66a61a7976b [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ************************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation
 * and others. All Rights Reserved.
 ************************************************************************************
 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_BREAK_ITERATION
13
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000014#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
Jungshik Shinb3189662017-11-07 11:18:34 -080024
25#include "brkeng.h"
26#include "cmemory.h"
27#include "dictbe.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000028#include "charstr.h"
29#include "dictionarydata.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080030#include "mutex.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000031#include "uvector.h"
32#include "umutex.h"
33#include "uresimp.h"
34#include "ubrkimpl.h"
35
36U_NAMESPACE_BEGIN
37
38/*
39 ******************************************************************
40 */
41
42LanguageBreakEngine::LanguageBreakEngine() {
43}
44
45LanguageBreakEngine::~LanguageBreakEngine() {
46}
47
48/*
49 ******************************************************************
50 */
51
52LanguageBreakFactory::LanguageBreakFactory() {
53}
54
55LanguageBreakFactory::~LanguageBreakFactory() {
56}
57
58/*
59 ******************************************************************
60 */
61
62UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -070063 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000064 fHandled[i] = 0;
65 }
66}
67
68UnhandledEngine::~UnhandledEngine() {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -070069 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000070 if (fHandled[i] != 0) {
71 delete fHandled[i];
72 }
73 }
74}
75
76UBool
77UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -070078 return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000079 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
80}
81
82int32_t
83UnhandledEngine::findBreaks( UText *text,
Jungshik Shinb3189662017-11-07 11:18:34 -080084 int32_t /* startPos */,
85 int32_t endPos,
86 int32_t breakType,
87 UVector32 &/*foundBreaks*/ ) const {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -070088 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000089 UChar32 c = utext_current32(text);
Jungshik Shinb3189662017-11-07 11:18:34 -080090 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
91 utext_next32(text); // TODO: recast loop to work with post-increment operations.
92 c = utext_current32(text);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000093 }
94 }
95 return 0;
96}
97
98void
99UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
Jungshik Shin5feb9ad2016-10-21 12:52:48 -0700100 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000101 if (fHandled[breakType] == 0) {
102 fHandled[breakType] = new UnicodeSet();
103 if (fHandled[breakType] == 0) {
104 return;
105 }
106 }
107 if (!fHandled[breakType]->contains(c)) {
108 UErrorCode status = U_ZERO_ERROR;
109 // Apply the entire script of the character.
110 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
111 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
112 }
113 }
114}
115
116/*
117 ******************************************************************
118 */
119
120ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
121 fEngines = 0;
122}
123
124ICULanguageBreakFactory::~ICULanguageBreakFactory() {
125 if (fEngines != 0) {
126 delete fEngines;
127 }
128}
129
130U_NAMESPACE_END
131U_CDECL_BEGIN
132static void U_CALLCONV _deleteEngine(void *obj) {
133 delete (const icu::LanguageBreakEngine *) obj;
134}
135U_CDECL_END
136U_NAMESPACE_BEGIN
137
Jungshik Shin70f82502016-01-29 00:32:36 -0800138static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
139
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000140const LanguageBreakEngine *
141ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000142 const LanguageBreakEngine *lbe = NULL;
143 UErrorCode status = U_ZERO_ERROR;
144
Jungshik Shin70f82502016-01-29 00:32:36 -0800145 Mutex m(&gBreakEngineMutex);
146
147 if (fEngines == NULL) {
148 UStack *engines = new UStack(_deleteEngine, NULL, status);
149 if (U_FAILURE(status) || engines == NULL) {
150 // Note: no way to return error code to caller.
151 delete engines;
152 return NULL;
153 }
154 fEngines = engines;
155 } else {
156 int32_t i = fEngines->size();
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000157 while (--i >= 0) {
158 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
159 if (lbe != NULL && lbe->handles(c, breakType)) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800160 return lbe;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000161 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000162 }
163 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000164
Jungshik Shin70f82502016-01-29 00:32:36 -0800165 // We didn't find an engine. Create one.
166 lbe = loadEngineFor(c, breakType);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000167 if (lbe != NULL) {
Jungshik Shin70f82502016-01-29 00:32:36 -0800168 fEngines->push((void *)lbe, status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000169 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000170 return lbe;
171}
172
173const LanguageBreakEngine *
174ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
175 UErrorCode status = U_ZERO_ERROR;
176 UScriptCode code = uscript_getScript(c, &status);
177 if (U_SUCCESS(status)) {
178 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
179 if (m != NULL) {
180 const LanguageBreakEngine *engine = NULL;
181 switch(code) {
182 case USCRIPT_THAI:
183 engine = new ThaiBreakEngine(m, status);
184 break;
185 case USCRIPT_LAO:
186 engine = new LaoBreakEngine(m, status);
187 break;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800188 case USCRIPT_MYANMAR:
189 engine = new BurmeseBreakEngine(m, status);
190 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000191 case USCRIPT_KHMER:
192 engine = new KhmerBreakEngine(m, status);
193 break;
194
195#if !UCONFIG_NO_NORMALIZATION
196 // CJK not available w/o normalization
197 case USCRIPT_HANGUL:
198 engine = new CjkBreakEngine(m, kKorean, status);
199 break;
200
201 // use same BreakEngine and dictionary for both Chinese and Japanese
202 case USCRIPT_HIRAGANA:
203 case USCRIPT_KATAKANA:
204 case USCRIPT_HAN:
205 engine = new CjkBreakEngine(m, kChineseJapanese, status);
206 break;
207#if 0
208 // TODO: Have to get some characters with script=common handled
209 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
210 // them to CjkBreakEngine does not work. The engine has to
211 // special-case them.
212 case USCRIPT_COMMON:
213 {
214 UBlockCode block = ublock_getCode(code);
215 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
216 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
217 break;
218 }
219#endif
220#endif
221
222 default:
223 break;
224 }
225 if (engine == NULL) {
226 delete m;
227 }
228 else if (U_FAILURE(status)) {
229 delete engine;
230 engine = NULL;
231 }
232 return engine;
233 }
234 }
235 return NULL;
236}
237
238DictionaryMatcher *
239ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
240 UErrorCode status = U_ZERO_ERROR;
241 // open root from brkitr tree.
242 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
243 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
244 int32_t dictnlength = 0;
245 const UChar *dictfname =
246 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
247 if (U_FAILURE(status)) {
248 ures_close(b);
249 return NULL;
250 }
251 CharString dictnbuf;
252 CharString ext;
253 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
254 if (extStart != NULL) {
255 int32_t len = (int32_t)(extStart - dictfname);
256 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
257 dictnlength = len;
258 }
259 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
260 ures_close(b);
261
262 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
263 if (U_SUCCESS(status)) {
264 // build trie
265 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
266 const int32_t *indexes = (const int32_t *)data;
267 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
268 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
269 DictionaryMatcher *m = NULL;
270 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
271 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
272 const char *characters = (const char *)(data + offset);
273 m = new BytesDictionaryMatcher(characters, transform, file);
274 }
275 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
276 const UChar *characters = (const UChar *)(data + offset);
277 m = new UCharsDictionaryMatcher(characters, file);
278 }
279 if (m == NULL) {
280 // no matcher exists to take ownership - either we are an invalid
281 // type or memory allocation failed
282 udata_close(file);
283 }
284 return m;
285 } else if (dictfname != NULL) {
286 // we don't have a dictionary matcher.
287 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
288 status = U_ZERO_ERROR;
289 return NULL;
290 }
291 return NULL;
292}
293
294U_NAMESPACE_END
295
296#endif /* #if !UCONFIG_NO_BREAK_ITERATION */