blob: 2398fe9c0ca67418d797737da5228c7b53ced701 [file] [log] [blame]
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001/*
2 ************************************************************************************
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08003 * Copyright (C) 2006-2014, International Business Machines Corporation
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00004 * and others. All Rights Reserved.
5 ************************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uchar.h"
15#include "unicode/uniset.h"
16#include "unicode/chariter.h"
17#include "unicode/ures.h"
18#include "unicode/udata.h"
19#include "unicode/putil.h"
20#include "unicode/ustring.h"
21#include "unicode/uscript.h"
22#include "unicode/ucharstrie.h"
23#include "unicode/bytestrie.h"
24#include "charstr.h"
25#include "dictionarydata.h"
26#include "uvector.h"
27#include "umutex.h"
28#include "uresimp.h"
29#include "ubrkimpl.h"
30
31U_NAMESPACE_BEGIN
32
33/*
34 ******************************************************************
35 */
36
37LanguageBreakEngine::LanguageBreakEngine() {
38}
39
40LanguageBreakEngine::~LanguageBreakEngine() {
41}
42
43/*
44 ******************************************************************
45 */
46
47LanguageBreakFactory::LanguageBreakFactory() {
48}
49
50LanguageBreakFactory::~LanguageBreakFactory() {
51}
52
53/*
54 ******************************************************************
55 */
56
57UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
59 fHandled[i] = 0;
60 }
61}
62
63UnhandledEngine::~UnhandledEngine() {
64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
65 if (fHandled[i] != 0) {
66 delete fHandled[i];
67 }
68 }
69}
70
71UBool
72UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
75}
76
77int32_t
78UnhandledEngine::findBreaks( UText *text,
79 int32_t startPos,
80 int32_t endPos,
81 UBool reverse,
82 int32_t breakType,
83 UStack &/*foundBreaks*/ ) const {
84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
85 UChar32 c = utext_current32(text);
86 if (reverse) {
87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
88 c = utext_previous32(text);
89 }
90 }
91 else {
92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
93 utext_next32(text); // TODO: recast loop to work with post-increment operations.
94 c = utext_current32(text);
95 }
96 }
97 }
98 return 0;
99}
100
101void
102UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
104 if (fHandled[breakType] == 0) {
105 fHandled[breakType] = new UnicodeSet();
106 if (fHandled[breakType] == 0) {
107 return;
108 }
109 }
110 if (!fHandled[breakType]->contains(c)) {
111 UErrorCode status = U_ZERO_ERROR;
112 // Apply the entire script of the character.
113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
115 }
116 }
117}
118
119/*
120 ******************************************************************
121 */
122
123ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
124 fEngines = 0;
125}
126
127ICULanguageBreakFactory::~ICULanguageBreakFactory() {
128 if (fEngines != 0) {
129 delete fEngines;
130 }
131}
132
133U_NAMESPACE_END
134U_CDECL_BEGIN
135static void U_CALLCONV _deleteEngine(void *obj) {
136 delete (const icu::LanguageBreakEngine *) obj;
137}
138U_CDECL_END
139U_NAMESPACE_BEGIN
140
141const LanguageBreakEngine *
142ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
143 UBool needsInit;
144 int32_t i;
145 const LanguageBreakEngine *lbe = NULL;
146 UErrorCode status = U_ZERO_ERROR;
147
148 // TODO: The global mutex should not be used.
149 // The global mutex should only be used for short periods.
150 // A ICULanguageBreakFactory specific mutex should be used.
151 umtx_lock(NULL);
152 needsInit = (UBool)(fEngines == NULL);
153 if (!needsInit) {
154 i = fEngines->size();
155 while (--i >= 0) {
156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
157 if (lbe != NULL && lbe->handles(c, breakType)) {
158 break;
159 }
160 lbe = NULL;
161 }
162 }
163 umtx_unlock(NULL);
164
165 if (lbe != NULL) {
166 return lbe;
167 }
168
169 if (needsInit) {
170 UStack *engines = new UStack(_deleteEngine, NULL, status);
171 if (U_SUCCESS(status) && engines == NULL) {
172 status = U_MEMORY_ALLOCATION_ERROR;
173 }
174 else if (U_FAILURE(status)) {
175 delete engines;
176 engines = NULL;
177 }
178 else {
179 umtx_lock(NULL);
180 if (fEngines == NULL) {
181 fEngines = engines;
182 engines = NULL;
183 }
184 umtx_unlock(NULL);
185 delete engines;
186 }
187 }
188
189 if (fEngines == NULL) {
190 return NULL;
191 }
192
193 // We didn't find an engine the first time through, or there was no
194 // stack. Create an engine.
195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
196
197 // Now get the lock, and see if someone else has created it in the
198 // meantime
199 umtx_lock(NULL);
200 i = fEngines->size();
201 while (--i >= 0) {
202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
203 if (lbe != NULL && lbe->handles(c, breakType)) {
204 break;
205 }
206 lbe = NULL;
207 }
208 if (lbe == NULL && newlbe != NULL) {
209 fEngines->push((void *)newlbe, status);
210 lbe = newlbe;
211 newlbe = NULL;
212 }
213 umtx_unlock(NULL);
214
215 delete newlbe;
216
217 return lbe;
218}
219
220const LanguageBreakEngine *
221ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
222 UErrorCode status = U_ZERO_ERROR;
223 UScriptCode code = uscript_getScript(c, &status);
224 if (U_SUCCESS(status)) {
225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
226 if (m != NULL) {
227 const LanguageBreakEngine *engine = NULL;
228 switch(code) {
229 case USCRIPT_THAI:
230 engine = new ThaiBreakEngine(m, status);
231 break;
232 case USCRIPT_LAO:
233 engine = new LaoBreakEngine(m, status);
234 break;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800235 case USCRIPT_MYANMAR:
236 engine = new BurmeseBreakEngine(m, status);
237 break;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000238 case USCRIPT_KHMER:
239 engine = new KhmerBreakEngine(m, status);
240 break;
241
242#if !UCONFIG_NO_NORMALIZATION
243 // CJK not available w/o normalization
244 case USCRIPT_HANGUL:
245 engine = new CjkBreakEngine(m, kKorean, status);
246 break;
247
248 // use same BreakEngine and dictionary for both Chinese and Japanese
249 case USCRIPT_HIRAGANA:
250 case USCRIPT_KATAKANA:
251 case USCRIPT_HAN:
252 engine = new CjkBreakEngine(m, kChineseJapanese, status);
253 break;
254#if 0
255 // TODO: Have to get some characters with script=common handled
256 // by CjkBreakEngine (e.g. U+309B). Simply subjecting
257 // them to CjkBreakEngine does not work. The engine has to
258 // special-case them.
259 case USCRIPT_COMMON:
260 {
261 UBlockCode block = ublock_getCode(code);
262 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
263 engine = new CjkBreakEngine(dict, kChineseJapanese, status);
264 break;
265 }
266#endif
267#endif
268
269 default:
270 break;
271 }
272 if (engine == NULL) {
273 delete m;
274 }
275 else if (U_FAILURE(status)) {
276 delete engine;
277 engine = NULL;
278 }
279 return engine;
280 }
281 }
282 return NULL;
283}
284
285DictionaryMatcher *
286ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
287 UErrorCode status = U_ZERO_ERROR;
288 // open root from brkitr tree.
289 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
290 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
291 int32_t dictnlength = 0;
292 const UChar *dictfname =
293 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
294 if (U_FAILURE(status)) {
295 ures_close(b);
296 return NULL;
297 }
298 CharString dictnbuf;
299 CharString ext;
300 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
301 if (extStart != NULL) {
302 int32_t len = (int32_t)(extStart - dictfname);
303 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
304 dictnlength = len;
305 }
306 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
307 ures_close(b);
308
309 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
310 if (U_SUCCESS(status)) {
311 // build trie
312 const uint8_t *data = (const uint8_t *)udata_getMemory(file);
313 const int32_t *indexes = (const int32_t *)data;
314 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
315 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
316 DictionaryMatcher *m = NULL;
317 if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
318 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
319 const char *characters = (const char *)(data + offset);
320 m = new BytesDictionaryMatcher(characters, transform, file);
321 }
322 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
323 const UChar *characters = (const UChar *)(data + offset);
324 m = new UCharsDictionaryMatcher(characters, file);
325 }
326 if (m == NULL) {
327 // no matcher exists to take ownership - either we are an invalid
328 // type or memory allocation failed
329 udata_close(file);
330 }
331 return m;
332 } else if (dictfname != NULL) {
333 // we don't have a dictionary matcher.
334 // returning NULL here will cause us to fail to find a dictionary break engine, as expected
335 status = U_ZERO_ERROR;
336 return NULL;
337 }
338 return NULL;
339}
340
341U_NAMESPACE_END
342
343#endif /* #if !UCONFIG_NO_BREAK_ITERATION */