Blame - source/i18n/utf8collationiterator.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 345b1994ef0e77f1d9ffaab13dcb0a1bdfbd59a8 [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	3	/*
				4	*******************************************************************************
				5	* Copyright (C) 2012-2014, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	*******************************************************************************
				8	* utf8collationiterator.cpp
				9	*
				10	* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
				11	* created by: Markus W. Scherer
				12	*/
				13
				14	#include "unicode/utypes.h"
				15
				16	#if !UCONFIG_NO_COLLATION
				17
				18	#include "unicode/utf8.h"
				19	#include "charstr.h"
				20	#include "cmemory.h"
				21	#include "collation.h"
				22	#include "collationdata.h"
				23	#include "collationfcd.h"
				24	#include "collationiterator.h"
				25	#include "normalizer2impl.h"
				26	#include "uassert.h"
				27	#include "utf8collationiterator.h"
				28
				29	U_NAMESPACE_BEGIN
				30
				31	UTF8CollationIterator::~UTF8CollationIterator() {}
				32
				33	void
				34	UTF8CollationIterator::resetToOffset(int32_t newOffset) {
				35	reset();
				36	pos = newOffset;
				37	}
				38
				39	int32_t
				40	UTF8CollationIterator::getOffset() const {
				41	return pos;
				42	}
				43
				44	uint32_t
				45	UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /errorCode/) {
				46	if(pos == length) {
				47	c = U_SENTINEL;
				48	return Collation::FALLBACK_CE32;
				49	}
				50	// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
				51	c = u8[pos++];
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	52	if(U8_IS_SINGLE(c)) {
				53	// ASCII 00..7F
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	54	return trie->data32[c];
				55	}
				56	uint8_t t1, t2;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	57	if(0xe0 <= c && c < 0xf0 &&
				58	((pos + 1) < length \|\| length < 0) &&
				59	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
				60	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
				61	// U+0800..U+FFFF except surrogates
				62	c = (((c & 0xf) << 12) \| ((t1 & 0x3f) << 6) \| t2);
				63	pos += 2;
				64	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
				65	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
				66	// U+0080..U+07FF
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	67	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
				68	c = ((c & 0x1f) << 6) \| t1;
				69	++pos;
				70	return ce32;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	71	} else {
				72	// Function call for supplementary code points and error cases.
				73	// Illegal byte sequences yield U+FFFD.
				74	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
				75	return data->getCE32(c);
				76	}
				77	}
				78
				79	UBool
				80	UTF8CollationIterator::foundNULTerminator() {
				81	if(length < 0) {
				82	length = --pos;
				83	return TRUE;
				84	} else {
				85	return FALSE;
				86	}
				87	}
				88
				89	UBool
				90	UTF8CollationIterator::forbidSurrogateCodePoints() const {
				91	return TRUE;
				92	}
				93
				94	UChar32
				95	UTF8CollationIterator::nextCodePoint(UErrorCode & /errorCode/) {
				96	if(pos == length) {
				97	return U_SENTINEL;
				98	}
				99	if(u8[pos] == 0 && length < 0) {
				100	length = pos;
				101	return U_SENTINEL;
				102	}
				103	UChar32 c;
				104	U8_NEXT_OR_FFFD(u8, pos, length, c);
				105	return c;
				106	}
				107
				108	UChar32
				109	UTF8CollationIterator::previousCodePoint(UErrorCode & /errorCode/) {
				110	if(pos == 0) {
				111	return U_SENTINEL;
				112	}
				113	UChar32 c;
				114	U8_PREV_OR_FFFD(u8, 0, pos, c);
				115	return c;
				116	}
				117
				118	void
				119	UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
				120	U8_FWD_N(u8, pos, length, num);
				121	}
				122
				123	void
				124	UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /errorCode/) {
				125	U8_BACK_N(u8, 0, pos, num);
				126	}
				127
				128	// FCDUTF8CollationIterator ------------------------------------------------ ***
				129
				130	FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
				131
				132	void
				133	FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
				134	reset();
				135	start = pos = newOffset;
				136	state = CHECK_FWD;
				137	}
				138
				139	int32_t
				140	FCDUTF8CollationIterator::getOffset() const {
				141	if(state != IN_NORMALIZED) {
				142	return pos;
				143	} else if(pos == 0) {
				144	return start;
				145	} else {
				146	return limit;
				147	}
				148	}
				149
				150	uint32_t
				151	FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
				152	for(;;) {
				153	if(state == CHECK_FWD) {
				154	// Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
				155	if(pos == length) {
				156	c = U_SENTINEL;
				157	return Collation::FALLBACK_CE32;
				158	}
				159	c = u8[pos++];
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	160	if(U8_IS_SINGLE(c)) {
				161	// ASCII 00..7F
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	162	return trie->data32[c];
				163	}
				164	uint8_t t1, t2;
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	165	if(0xe0 <= c && c < 0xf0 &&
				166	((pos + 1) < length \|\| length < 0) &&
				167	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
				168	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
				169	// U+0800..U+FFFF except surrogates
				170	c = (((c & 0xf) << 12) \| ((t1 & 0x3f) << 6) \| t2);
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	171	pos += 2;
				172	if(CollationFCD::hasTccc(c) &&
				173	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
				174	(pos != length && nextHasLccc()))) {
				175	pos -= 3;
				176	} else {
				177	break; // return CE32(BMP)
				178	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	179	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
				180	// U+0080..U+07FF
				181	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
				182	c = ((c & 0x1f) << 6) \| t1;
				183	++pos;
				184	if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
				185	pos -= 2;
				186	} else {
				187	return ce32;
				188	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	189	} else {
				190	// Function call for supplementary code points and error cases.
				191	// Illegal byte sequences yield U+FFFD.
				192	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
				193	if(c == 0xfffd) {
				194	return Collation::FFFD_CE32;
				195	} else {
				196	U_ASSERT(c > 0xffff);
				197	if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
				198	pos -= 4;
				199	} else {
				200	return data->getCE32FromSupplementary(c);
				201	}
				202	}
				203	}
				204	if(!nextSegment(errorCode)) {
				205	c = U_SENTINEL;
				206	return Collation::FALLBACK_CE32;
				207	}
				208	continue;
				209	} else if(state == IN_FCD_SEGMENT && pos != limit) {
				210	return UTF8CollationIterator::handleNextCE32(c, errorCode);
				211	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
				212	c = normalized[pos++];
				213	break;
				214	} else {
				215	switchToForward();
				216	}
				217	}
				218	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
				219	}
				220
				221	UBool
				222	FCDUTF8CollationIterator::nextHasLccc() const {
				223	U_ASSERT(state == CHECK_FWD && pos != length);
				224	// The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
				225	// CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
				226	UChar32 c = u8[pos];
				227	if(c < 0xcc \|\| (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }
				228	int32_t i = pos;
				229	U8_NEXT_OR_FFFD(u8, i, length, c);
				230	if(c > 0xffff) { c = U16_LEAD(c); }
				231	return CollationFCD::hasLccc(c);
				232	}
				233
				234	UBool
				235	FCDUTF8CollationIterator::previousHasTccc() const {
				236	U_ASSERT(state == CHECK_BWD && pos != 0);
				237	UChar32 c = u8[pos - 1];
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	238	if(U8_IS_SINGLE(c)) { return FALSE; }
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	239	int32_t i = pos;
				240	U8_PREV_OR_FFFD(u8, 0, i, c);
				241	if(c > 0xffff) { c = U16_LEAD(c); }
				242	return CollationFCD::hasTccc(c);
				243	}
				244
				245	UChar
				246	FCDUTF8CollationIterator::handleGetTrailSurrogate() {
				247	if(state != IN_NORMALIZED) { return 0; }
				248	U_ASSERT(pos < normalized.length());
				249	UChar trail;
				250	if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
				251	return trail;
				252	}
				253
				254	UBool
				255	FCDUTF8CollationIterator::foundNULTerminator() {
				256	if(state == CHECK_FWD && length < 0) {
				257	length = --pos;
				258	return TRUE;
				259	} else {
				260	return FALSE;
				261	}
				262	}
				263
				264	UChar32
				265	FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
				266	UChar32 c;
				267	for(;;) {
				268	if(state == CHECK_FWD) {
				269	if(pos == length \|\| ((c = u8[pos]) == 0 && length < 0)) {
				270	return U_SENTINEL;
				271	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	272	if(U8_IS_SINGLE(c)) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	273	++pos;
				274	return c;
				275	}
				276	U8_NEXT_OR_FFFD(u8, pos, length, c);
				277	if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
				278	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
				279	(pos != length && nextHasLccc()))) {
				280	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
				281	// and we can use U8_LENGTH() rather than a previous-position variable.
				282	pos -= U8_LENGTH(c);
				283	if(!nextSegment(errorCode)) {
				284	return U_SENTINEL;
				285	}
				286	continue;
				287	}
				288	return c;
				289	} else if(state == IN_FCD_SEGMENT && pos != limit) {
				290	U8_NEXT_OR_FFFD(u8, pos, length, c);
				291	return c;
				292	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
				293	c = normalized.char32At(pos);
				294	pos += U16_LENGTH(c);
				295	return c;
				296	} else {
				297	switchToForward();
				298	}
				299	}
				300	}
				301
				302	UChar32
				303	FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
				304	UChar32 c;
				305	for(;;) {
				306	if(state == CHECK_BWD) {
				307	if(pos == 0) {
				308	return U_SENTINEL;
				309	}
Jungshik Shin	b318966	2017-11-07 11:18:34 -0800	[diff] [blame^]	310	if(U8_IS_SINGLE(c = u8[pos - 1])) {
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	311	--pos;
				312	return c;
				313	}
				314	U8_PREV_OR_FFFD(u8, 0, pos, c);
				315	if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
				316	(CollationFCD::maybeTibetanCompositeVowel(c) \|\|
				317	(pos != 0 && previousHasTccc()))) {
				318	// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
				319	// and we can use U8_LENGTH() rather than a previous-position variable.
				320	pos += U8_LENGTH(c);
				321	if(!previousSegment(errorCode)) {
				322	return U_SENTINEL;
				323	}
				324	continue;
				325	}
				326	return c;
				327	} else if(state == IN_FCD_SEGMENT && pos != start) {
				328	U8_PREV_OR_FFFD(u8, 0, pos, c);
				329	return c;
				330	} else if(state >= IN_NORMALIZED && pos != 0) {
				331	c = normalized.char32At(pos - 1);
				332	pos -= U16_LENGTH(c);
				333	return c;
				334	} else {
				335	switchToBackward();
				336	}
				337	}
				338	}
				339
				340	void
				341	FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
				342	// Specify the class to avoid a virtual-function indirection.
				343	// In Java, we would declare this class final.
				344	while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
				345	--num;
				346	}
				347	}
				348
				349	void
				350	FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
				351	// Specify the class to avoid a virtual-function indirection.
				352	// In Java, we would declare this class final.
				353	while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
				354	--num;
				355	}
				356	}
				357
				358	void
				359	FCDUTF8CollationIterator::switchToForward() {
				360	U_ASSERT(state == CHECK_BWD \|\|
				361	(state == IN_FCD_SEGMENT && pos == limit) \|\|
				362	(state == IN_NORMALIZED && pos == normalized.length()));
				363	if(state == CHECK_BWD) {
				364	// Turn around from backward checking.
				365	start = pos;
				366	if(pos == limit) {
				367	state = CHECK_FWD; // Check forward.
				368	} else { // pos < limit
				369	state = IN_FCD_SEGMENT; // Stay in FCD segment.
				370	}
				371	} else {
				372	// Reached the end of the FCD segment.
				373	if(state == IN_FCD_SEGMENT) {
				374	// The input text segment is FCD, extend it forward.
				375	} else {
				376	// The input text segment needed to be normalized.
				377	// Switch to checking forward from it.
				378	start = pos = limit;
				379	}
				380	state = CHECK_FWD;
				381	}
				382	}
				383
				384	UBool
				385	FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
				386	if(U_FAILURE(errorCode)) { return FALSE; }
				387	U_ASSERT(state == CHECK_FWD && pos != length);
				388	// The input text [start..pos[ passes the FCD check.
				389	int32_t segmentStart = pos;
				390	// Collect the characters being checked, in case they need to be normalized.
				391	UnicodeString s;
				392	uint8_t prevCC = 0;
				393	for(;;) {
				394	// Fetch the next character and its fcd16 value.
				395	int32_t cpStart = pos;
				396	UChar32 c;
				397	U8_NEXT_OR_FFFD(u8, pos, length, c);
				398	uint16_t fcd16 = nfcImpl.getFCD16(c);
				399	uint8_t leadCC = (uint8_t)(fcd16 >> 8);
				400	if(leadCC == 0 && cpStart != segmentStart) {
				401	// FCD boundary before this character.
				402	pos = cpStart;
				403	break;
				404	}
				405	s.append(c);
				406	if(leadCC != 0 && (prevCC > leadCC \|\| CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
				407	// Fails FCD check. Find the next FCD boundary and normalize.
				408	while(pos != length) {
				409	cpStart = pos;
				410	U8_NEXT_OR_FFFD(u8, pos, length, c);
				411	if(nfcImpl.getFCD16(c) <= 0xff) {
				412	pos = cpStart;
				413	break;
				414	}
				415	s.append(c);
				416	}
				417	if(!normalize(s, errorCode)) { return FALSE; }
				418	start = segmentStart;
				419	limit = pos;
				420	state = IN_NORMALIZED;
				421	pos = 0;
				422	return TRUE;
				423	}
				424	prevCC = (uint8_t)fcd16;
				425	if(pos == length \|\| prevCC == 0) {
				426	// FCD boundary after the last character.
				427	break;
				428	}
				429	}
				430	limit = pos;
				431	pos = segmentStart;
				432	U_ASSERT(pos != limit);
				433	state = IN_FCD_SEGMENT;
				434	return TRUE;
				435	}
				436
				437	void
				438	FCDUTF8CollationIterator::switchToBackward() {
				439	U_ASSERT(state == CHECK_FWD \|\|
				440	(state == IN_FCD_SEGMENT && pos == start) \|\|
				441	(state >= IN_NORMALIZED && pos == 0));
				442	if(state == CHECK_FWD) {
				443	// Turn around from forward checking.
				444	limit = pos;
				445	if(pos == start) {
				446	state = CHECK_BWD; // Check backward.
				447	} else { // pos > start
				448	state = IN_FCD_SEGMENT; // Stay in FCD segment.
				449	}
				450	} else {
				451	// Reached the start of the FCD segment.
				452	if(state == IN_FCD_SEGMENT) {
				453	// The input text segment is FCD, extend it backward.
				454	} else {
				455	// The input text segment needed to be normalized.
				456	// Switch to checking backward from it.
				457	limit = pos = start;
				458	}
				459	state = CHECK_BWD;
				460	}
				461	}
				462
				463	UBool
				464	FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
				465	if(U_FAILURE(errorCode)) { return FALSE; }
				466	U_ASSERT(state == CHECK_BWD && pos != 0);
				467	// The input text [pos..limit[ passes the FCD check.
				468	int32_t segmentLimit = pos;
				469	// Collect the characters being checked, in case they need to be normalized.
				470	UnicodeString s;
				471	uint8_t nextCC = 0;
				472	for(;;) {
				473	// Fetch the previous character and its fcd16 value.
				474	int32_t cpLimit = pos;
				475	UChar32 c;
				476	U8_PREV_OR_FFFD(u8, 0, pos, c);
				477	uint16_t fcd16 = nfcImpl.getFCD16(c);
				478	uint8_t trailCC = (uint8_t)fcd16;
				479	if(trailCC == 0 && cpLimit != segmentLimit) {
				480	// FCD boundary after this character.
				481	pos = cpLimit;
				482	break;
				483	}
				484	s.append(c);
				485	if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) \|\|
				486	CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
				487	// Fails FCD check. Find the previous FCD boundary and normalize.
				488	while(fcd16 > 0xff && pos != 0) {
				489	cpLimit = pos;
				490	U8_PREV_OR_FFFD(u8, 0, pos, c);
				491	fcd16 = nfcImpl.getFCD16(c);
				492	if(fcd16 == 0) {
				493	pos = cpLimit;
				494	break;
				495	}
				496	s.append(c);
				497	}
				498	s.reverse();
				499	if(!normalize(s, errorCode)) { return FALSE; }
				500	limit = segmentLimit;
				501	start = pos;
				502	state = IN_NORMALIZED;
				503	pos = normalized.length();
				504	return TRUE;
				505	}
				506	nextCC = (uint8_t)(fcd16 >> 8);
				507	if(pos == 0 \|\| nextCC == 0) {
				508	// FCD boundary before the following character.
				509	break;
				510	}
				511	}
				512	start = pos;
				513	pos = segmentLimit;
				514	U_ASSERT(pos != start);
				515	state = IN_FCD_SEGMENT;
				516	return TRUE;
				517	}
				518
				519	UBool
				520	FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
				521	// NFD without argument checking.
				522	U_ASSERT(U_SUCCESS(errorCode));
				523	nfcImpl.decompose(s, normalized, errorCode);
				524	return U_SUCCESS(errorCode);
				525	}
				526
				527	U_NAMESPACE_END
				528
				529	#endif // !UCONFIG_NO_COLLATION