Blame - source/common/brkeng.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 2398fe9c0ca67418d797737da5228c7b53ced701 [file] [log] [blame]

jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	1	/*
				2	************************************************************************************
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	3	* Copyright (C) 2006-2014, International Business Machines Corporation
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	4	* and others. All Rights Reserved.
				5	************************************************************************************
				6	*/
				7
				8	#include "unicode/utypes.h"
				9
				10	#if !UCONFIG_NO_BREAK_ITERATION
				11
				12	#include "brkeng.h"
				13	#include "dictbe.h"
				14	#include "unicode/uchar.h"
				15	#include "unicode/uniset.h"
				16	#include "unicode/chariter.h"
				17	#include "unicode/ures.h"
				18	#include "unicode/udata.h"
				19	#include "unicode/putil.h"
				20	#include "unicode/ustring.h"
				21	#include "unicode/uscript.h"
				22	#include "unicode/ucharstrie.h"
				23	#include "unicode/bytestrie.h"
				24	#include "charstr.h"
				25	#include "dictionarydata.h"
				26	#include "uvector.h"
				27	#include "umutex.h"
				28	#include "uresimp.h"
				29	#include "ubrkimpl.h"
				30
				31	U_NAMESPACE_BEGIN
				32
				33	/*
				34	******************************************************************
				35	*/
				36
				37	LanguageBreakEngine::LanguageBreakEngine() {
				38	}
				39
				40	LanguageBreakEngine::~LanguageBreakEngine() {
				41	}
				42
				43	/*
				44	******************************************************************
				45	*/
				46
				47	LanguageBreakFactory::LanguageBreakFactory() {
				48	}
				49
				50	LanguageBreakFactory::~LanguageBreakFactory() {
				51	}
				52
				53	/*
				54	******************************************************************
				55	*/
				56
				57	UnhandledEngine::UnhandledEngine(UErrorCode &/status/) {
				58	for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
				59	fHandled[i] = 0;
				60	}
				61	}
				62
				63	UnhandledEngine::~UnhandledEngine() {
				64	for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) {
				65	if (fHandled[i] != 0) {
				66	delete fHandled[i];
				67	}
				68	}
				69	}
				70
				71	UBool
				72	UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
				73	return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))
				74	&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
				75	}
				76
				77	int32_t
				78	UnhandledEngine::findBreaks( UText *text,
				79	int32_t startPos,
				80	int32_t endPos,
				81	UBool reverse,
				82	int32_t breakType,
				83	UStack &/foundBreaks/ ) const {
				84	if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
				85	UChar32 c = utext_current32(text);
				86	if (reverse) {
				87	while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) {
				88	c = utext_previous32(text);
				89	}
				90	}
				91	else {
				92	while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
				93	utext_next32(text); // TODO: recast loop to work with post-increment operations.
				94	c = utext_current32(text);
				95	}
				96	}
				97	}
				98	return 0;
				99	}
				100
				101	void
				102	UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
				103	if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) {
				104	if (fHandled[breakType] == 0) {
				105	fHandled[breakType] = new UnicodeSet();
				106	if (fHandled[breakType] == 0) {
				107	return;
				108	}
				109	}
				110	if (!fHandled[breakType]->contains(c)) {
				111	UErrorCode status = U_ZERO_ERROR;
				112	// Apply the entire script of the character.
				113	int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
				114	fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
				115	}
				116	}
				117	}
				118
				119	/*
				120	******************************************************************
				121	*/
				122
				123	ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/status/) {
				124	fEngines = 0;
				125	}
				126
				127	ICULanguageBreakFactory::~ICULanguageBreakFactory() {
				128	if (fEngines != 0) {
				129	delete fEngines;
				130	}
				131	}
				132
				133	U_NAMESPACE_END
				134	U_CDECL_BEGIN
				135	static void U_CALLCONV _deleteEngine(void *obj) {
				136	delete (const icu::LanguageBreakEngine *) obj;
				137	}
				138	U_CDECL_END
				139	U_NAMESPACE_BEGIN
				140
				141	const LanguageBreakEngine *
				142	ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
				143	UBool needsInit;
				144	int32_t i;
				145	const LanguageBreakEngine *lbe = NULL;
				146	UErrorCode status = U_ZERO_ERROR;
				147
				148	// TODO: The global mutex should not be used.
				149	// The global mutex should only be used for short periods.
				150	// A ICULanguageBreakFactory specific mutex should be used.
				151	umtx_lock(NULL);
				152	needsInit = (UBool)(fEngines == NULL);
				153	if (!needsInit) {
				154	i = fEngines->size();
				155	while (--i >= 0) {
				156	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
				157	if (lbe != NULL && lbe->handles(c, breakType)) {
				158	break;
				159	}
				160	lbe = NULL;
				161	}
				162	}
				163	umtx_unlock(NULL);
				164
				165	if (lbe != NULL) {
				166	return lbe;
				167	}
				168
				169	if (needsInit) {
				170	UStack *engines = new UStack(_deleteEngine, NULL, status);
				171	if (U_SUCCESS(status) && engines == NULL) {
				172	status = U_MEMORY_ALLOCATION_ERROR;
				173	}
				174	else if (U_FAILURE(status)) {
				175	delete engines;
				176	engines = NULL;
				177	}
				178	else {
				179	umtx_lock(NULL);
				180	if (fEngines == NULL) {
				181	fEngines = engines;
				182	engines = NULL;
				183	}
				184	umtx_unlock(NULL);
				185	delete engines;
				186	}
				187	}
				188
				189	if (fEngines == NULL) {
				190	return NULL;
				191	}
				192
				193	// We didn't find an engine the first time through, or there was no
				194	// stack. Create an engine.
				195	const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType);
				196
				197	// Now get the lock, and see if someone else has created it in the
				198	// meantime
				199	umtx_lock(NULL);
				200	i = fEngines->size();
				201	while (--i >= 0) {
				202	lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
				203	if (lbe != NULL && lbe->handles(c, breakType)) {
				204	break;
				205	}
				206	lbe = NULL;
				207	}
				208	if (lbe == NULL && newlbe != NULL) {
				209	fEngines->push((void *)newlbe, status);
				210	lbe = newlbe;
				211	newlbe = NULL;
				212	}
				213	umtx_unlock(NULL);
				214
				215	delete newlbe;
				216
				217	return lbe;
				218	}
				219
				220	const LanguageBreakEngine *
				221	ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
				222	UErrorCode status = U_ZERO_ERROR;
				223	UScriptCode code = uscript_getScript(c, &status);
				224	if (U_SUCCESS(status)) {
				225	DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
				226	if (m != NULL) {
				227	const LanguageBreakEngine *engine = NULL;
				228	switch(code) {
				229	case USCRIPT_THAI:
				230	engine = new ThaiBreakEngine(m, status);
				231	break;
				232	case USCRIPT_LAO:
				233	engine = new LaoBreakEngine(m, status);
				234	break;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame^]	235	case USCRIPT_MYANMAR:
				236	engine = new BurmeseBreakEngine(m, status);
				237	break;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	238	case USCRIPT_KHMER:
				239	engine = new KhmerBreakEngine(m, status);
				240	break;
				241
				242	#if !UCONFIG_NO_NORMALIZATION
				243	// CJK not available w/o normalization
				244	case USCRIPT_HANGUL:
				245	engine = new CjkBreakEngine(m, kKorean, status);
				246	break;
				247
				248	// use same BreakEngine and dictionary for both Chinese and Japanese
				249	case USCRIPT_HIRAGANA:
				250	case USCRIPT_KATAKANA:
				251	case USCRIPT_HAN:
				252	engine = new CjkBreakEngine(m, kChineseJapanese, status);
				253	break;
				254	#if 0
				255	// TODO: Have to get some characters with script=common handled
				256	// by CjkBreakEngine (e.g. U+309B). Simply subjecting
				257	// them to CjkBreakEngine does not work. The engine has to
				258	// special-case them.
				259	case USCRIPT_COMMON:
				260	{
				261	UBlockCode block = ublock_getCode(code);
				262	if (block == UBLOCK_HIRAGANA \|\| block == UBLOCK_KATAKANA)
				263	engine = new CjkBreakEngine(dict, kChineseJapanese, status);
				264	break;
				265	}
				266	#endif
				267	#endif
				268
				269	default:
				270	break;
				271	}
				272	if (engine == NULL) {
				273	delete m;
				274	}
				275	else if (U_FAILURE(status)) {
				276	delete engine;
				277	engine = NULL;
				278	}
				279	return engine;
				280	}
				281	}
				282	return NULL;
				283	}
				284
				285	DictionaryMatcher *
				286	ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
				287	UErrorCode status = U_ZERO_ERROR;
				288	// open root from brkitr tree.
				289	UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
				290	b = ures_getByKeyWithFallback(b, "dictionaries", b, &status);
				291	int32_t dictnlength = 0;
				292	const UChar *dictfname =
				293	ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status);
				294	if (U_FAILURE(status)) {
				295	ures_close(b);
				296	return NULL;
				297	}
				298	CharString dictnbuf;
				299	CharString ext;
				300	const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot
				301	if (extStart != NULL) {
				302	int32_t len = (int32_t)(extStart - dictfname);
				303	ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status);
				304	dictnlength = len;
				305	}
				306	dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status);
				307	ures_close(b);
				308
				309	UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status);
				310	if (U_SUCCESS(status)) {
				311	// build trie
				312	const uint8_t data = (const uint8_t )udata_getMemory(file);
				313	const int32_t indexes = (const int32_t )data;
				314	const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
				315	const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
				316	DictionaryMatcher *m = NULL;
				317	if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
				318	const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
				319	const char characters = (const char )(data + offset);
				320	m = new BytesDictionaryMatcher(characters, transform, file);
				321	}
				322	else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
				323	const UChar characters = (const UChar )(data + offset);
				324	m = new UCharsDictionaryMatcher(characters, file);
				325	}
				326	if (m == NULL) {
				327	// no matcher exists to take ownership - either we are an invalid
				328	// type or memory allocation failed
				329	udata_close(file);
				330	}
				331	return m;
				332	} else if (dictfname != NULL) {
				333	// we don't have a dictionary matcher.
				334	// returning NULL here will cause us to fail to find a dictionary break engine, as expected
				335	status = U_ZERO_ERROR;
				336	return NULL;
				337	}
				338	return NULL;
				339	}
				340
				341	U_NAMESPACE_END
				342
				343	#endif /* #if !UCONFIG_NO_BREAK_ITERATION */