Blame - source/common/normlzr.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 58de61591f8a24ef3919190ecf265a29fae0abea [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	*************************************************************************
				5	* COPYRIGHT:
				6	* Copyright (c) 1996-2012, International Business Machines Corporation and
				7	* others. All Rights Reserved.
				8	*************************************************************************
				9	*/
				10
				11	#include "unicode/utypes.h"
				12
				13	#if !UCONFIG_NO_NORMALIZATION
				14
				15	#include "unicode/uniset.h"
				16	#include "unicode/unistr.h"
				17	#include "unicode/chariter.h"
				18	#include "unicode/schriter.h"
				19	#include "unicode/uchriter.h"
				20	#include "unicode/normlzr.h"
				21	#include "unicode/utf16.h"
				22	#include "cmemory.h"
				23	#include "normalizer2impl.h"
				24	#include "uprops.h" // for uniset_getUnicode32Instance()
				25
Frank Tang	69c72a6	2019-04-03 21:41:21 -0700	[diff] [blame]	26	#if defined(move32)
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	27	// System can define move32 intrinsics, but the char iters define move32 method
				28	// using same undef trick in headers, so undef here to re-enable the method.
				29	#undef move32
				30	#endif
				31
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	32	U_NAMESPACE_BEGIN
				33
				34	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
				35
				36	//-------------------------------------------------------------------------
				37	// Constructors and other boilerplate
				38	//-------------------------------------------------------------------------
				39
				40	Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
				41	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
				42	text(new StringCharacterIterator(str)),
				43	currentIndex(0), nextIndex(0),
				44	buffer(), bufferPos(0)
				45	{
				46	init();
				47	}
				48
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	49	Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	50	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
				51	text(new UCharCharacterIterator(str, length)),
				52	currentIndex(0), nextIndex(0),
				53	buffer(), bufferPos(0)
				54	{
				55	init();
				56	}
				57
				58	Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
				59	UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
				60	text(iter.clone()),
				61	currentIndex(0), nextIndex(0),
				62	buffer(), bufferPos(0)
				63	{
				64	init();
				65	}
				66
				67	Normalizer::Normalizer(const Normalizer &copy) :
				68	UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
				69	text(copy.text->clone()),
				70	currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
				71	buffer(copy.buffer), bufferPos(copy.bufferPos)
				72	{
				73	init();
				74	}
				75
				76	void
				77	Normalizer::init() {
				78	UErrorCode errorCode=U_ZERO_ERROR;
				79	fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
				80	if(fOptions&UNORM_UNICODE_3_2) {
				81	delete fFilteredNorm2;
				82	fNorm2=fFilteredNorm2=
				83	new FilteredNormalizer2(fNorm2, uniset_getUnicode32Instance(errorCode));
				84	}
				85	if(U_FAILURE(errorCode)) {
				86	errorCode=U_ZERO_ERROR;
				87	fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
				88	}
				89	}
				90
				91	Normalizer::~Normalizer()
				92	{
				93	delete fFilteredNorm2;
				94	delete text;
				95	}
				96
				97	Normalizer*
				98	Normalizer::clone() const
				99	{
				100	return new Normalizer(*this);
				101	}
				102
				103	/**
				104	* Generates a hash code for this iterator.
				105	*/
				106	int32_t Normalizer::hashCode() const
				107	{
				108	return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
				109	}
				110
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	111	bool Normalizer::operator==(const Normalizer& that) const
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	112	{
				113	return
				114	this==&that \|\|
				115	(fUMode==that.fUMode &&
				116	fOptions==that.fOptions &&
				117	text==that.text &&
				118	buffer==that.buffer &&
				119	bufferPos==that.bufferPos &&
				120	nextIndex==that.nextIndex);
				121	}
				122
				123	//-------------------------------------------------------------------------
				124	// Static utility methods
				125	//-------------------------------------------------------------------------
				126
				127	void U_EXPORT2
				128	Normalizer::normalize(const UnicodeString& source,
				129	UNormalizationMode mode, int32_t options,
				130	UnicodeString& result,
				131	UErrorCode &status) {
				132	if(source.isBogus() \|\| U_FAILURE(status)) {
				133	result.setToBogus();
				134	if(U_SUCCESS(status)) {
				135	status=U_ILLEGAL_ARGUMENT_ERROR;
				136	}
				137	} else {
				138	UnicodeString localDest;
				139	UnicodeString *dest;
				140
				141	if(&source!=&result) {
				142	dest=&result;
				143	} else {
				144	// the source and result strings are the same object, use a temporary one
				145	dest=&localDest;
				146	}
				147	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
				148	if(U_SUCCESS(status)) {
				149	if(options&UNORM_UNICODE_3_2) {
				150	FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
				151	normalize(source, *dest, status);
				152	} else {
				153	n2->normalize(source, *dest, status);
				154	}
				155	}
				156	if(dest==&localDest && U_SUCCESS(status)) {
				157	result=*dest;
				158	}
				159	}
				160	}
				161
				162	void U_EXPORT2
				163	Normalizer::compose(const UnicodeString& source,
				164	UBool compat, int32_t options,
				165	UnicodeString& result,
				166	UErrorCode &status) {
				167	normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
				168	}
				169
				170	void U_EXPORT2
				171	Normalizer::decompose(const UnicodeString& source,
				172	UBool compat, int32_t options,
				173	UnicodeString& result,
				174	UErrorCode &status) {
				175	normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
				176	}
				177
				178	UNormalizationCheckResult
				179	Normalizer::quickCheck(const UnicodeString& source,
				180	UNormalizationMode mode, int32_t options,
				181	UErrorCode &status) {
				182	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
				183	if(U_SUCCESS(status)) {
				184	if(options&UNORM_UNICODE_3_2) {
				185	return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
				186	quickCheck(source, status);
				187	} else {
				188	return n2->quickCheck(source, status);
				189	}
				190	} else {
				191	return UNORM_MAYBE;
				192	}
				193	}
				194
				195	UBool
				196	Normalizer::isNormalized(const UnicodeString& source,
				197	UNormalizationMode mode, int32_t options,
				198	UErrorCode &status) {
				199	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
				200	if(U_SUCCESS(status)) {
				201	if(options&UNORM_UNICODE_3_2) {
				202	return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).
				203	isNormalized(source, status);
				204	} else {
				205	return n2->isNormalized(source, status);
				206	}
				207	} else {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	208	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	209	}
				210	}
				211
				212	UnicodeString & U_EXPORT2
				213	Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
				214	UnicodeString &result,
				215	UNormalizationMode mode, int32_t options,
				216	UErrorCode &errorCode) {
				217	if(left.isBogus() \|\| right.isBogus() \|\| U_FAILURE(errorCode)) {
				218	result.setToBogus();
				219	if(U_SUCCESS(errorCode)) {
				220	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
				221	}
				222	} else {
				223	UnicodeString localDest;
				224	UnicodeString *dest;
				225
				226	if(&right!=&result) {
				227	dest=&result;
				228	} else {
				229	// the right and result strings are the same object, use a temporary one
				230	dest=&localDest;
				231	}
				232	*dest=left;
				233	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
				234	if(U_SUCCESS(errorCode)) {
				235	if(options&UNORM_UNICODE_3_2) {
				236	FilteredNormalizer2(n2, uniset_getUnicode32Instance(errorCode)).
				237	append(*dest, right, errorCode);
				238	} else {
				239	n2->append(*dest, right, errorCode);
				240	}
				241	}
				242	if(dest==&localDest && U_SUCCESS(errorCode)) {
				243	result=*dest;
				244	}
				245	}
				246	return result;
				247	}
				248
				249	//-------------------------------------------------------------------------
				250	// Iteration API
				251	//-------------------------------------------------------------------------
				252
				253	/**
				254	* Return the current character in the normalized text.
				255	*/
				256	UChar32 Normalizer::current() {
				257	if(bufferPos<buffer.length() \|\| nextNormalize()) {
				258	return buffer.char32At(bufferPos);
				259	} else {
				260	return DONE;
				261	}
				262	}
				263
				264	/**
				265	* Return the next character in the normalized text and advance
				266	* the iteration position by one. If the end
				267	* of the text has already been reached, {@link #DONE} is returned.
				268	*/
				269	UChar32 Normalizer::next() {
				270	if(bufferPos<buffer.length() \|\| nextNormalize()) {
				271	UChar32 c=buffer.char32At(bufferPos);
				272	bufferPos+=U16_LENGTH(c);
				273	return c;
				274	} else {
				275	return DONE;
				276	}
				277	}
				278
				279	/**
				280	* Return the previous character in the normalized text and decrement
				281	* the iteration position by one. If the beginning
				282	* of the text has already been reached, {@link #DONE} is returned.
				283	*/
				284	UChar32 Normalizer::previous() {
				285	if(bufferPos>0 \|\| previousNormalize()) {
				286	UChar32 c=buffer.char32At(bufferPos-1);
				287	bufferPos-=U16_LENGTH(c);
				288	return c;
				289	} else {
				290	return DONE;
				291	}
				292	}
				293
				294	void Normalizer::reset() {
				295	currentIndex=nextIndex=text->setToStart();
				296	clearBuffer();
				297	}
				298
				299	void
				300	Normalizer::setIndexOnly(int32_t index) {
				301	text->setIndex(index); // pins index
				302	currentIndex=nextIndex=text->getIndex();
				303	clearBuffer();
				304	}
				305
				306	/**
				307	* Return the first character in the normalized text. This resets
				308	* the <tt>Normalizer's</tt> position to the beginning of the text.
				309	*/
				310	UChar32 Normalizer::first() {
				311	reset();
				312	return next();
				313	}
				314
				315	/**
				316	* Return the last character in the normalized text. This resets
				317	* the <tt>Normalizer's</tt> position to be just before the
				318	* the input text corresponding to that normalized character.
				319	*/
				320	UChar32 Normalizer::last() {
				321	currentIndex=nextIndex=text->setToEnd();
				322	clearBuffer();
				323	return previous();
				324	}
				325
				326	/**
				327	* Retrieve the current iteration position in the input text that is
				328	* being normalized. This method is useful in applications such as
				329	* searching, where you need to be able to determine the position in
				330	* the input text that corresponds to a given normalized output character.
				331	* <p>
				332	* <b>Note:</b> This method sets the position in the <em>input</em>, while
				333	* {@link #next} and {@link #previous} iterate through characters in the
				334	* <em>output</em>. This means that there is not necessarily a one-to-one
				335	* correspondence between characters returned by <tt>next</tt> and
				336	* <tt>previous</tt> and the indices passed to and returned from
				337	* <tt>setIndex</tt> and {@link #getIndex}.
				338	*
				339	*/
				340	int32_t Normalizer::getIndex() const {
				341	if(bufferPos<buffer.length()) {
				342	return currentIndex;
				343	} else {
				344	return nextIndex;
				345	}
				346	}
				347
				348	/**
				349	* Retrieve the index of the start of the input text. This is the begin index
				350	* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
				351	* over which this <tt>Normalizer</tt> is iterating
				352	*/
				353	int32_t Normalizer::startIndex() const {
				354	return text->startIndex();
				355	}
				356
				357	/**
				358	* Retrieve the index of the end of the input text. This is the end index
				359	* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
				360	* over which this <tt>Normalizer</tt> is iterating
				361	*/
				362	int32_t Normalizer::endIndex() const {
				363	return text->endIndex();
				364	}
				365
				366	//-------------------------------------------------------------------------
				367	// Property access methods
				368	//-------------------------------------------------------------------------
				369
				370	void
				371	Normalizer::setMode(UNormalizationMode newMode)
				372	{
				373	fUMode = newMode;
				374	init();
				375	}
				376
				377	UNormalizationMode
				378	Normalizer::getUMode() const
				379	{
				380	return fUMode;
				381	}
				382
				383	void
				384	Normalizer::setOption(int32_t option,
				385	UBool value)
				386	{
				387	if (value) {
				388	fOptions \|= option;
				389	} else {
				390	fOptions &= (~option);
				391	}
				392	init();
				393	}
				394
				395	UBool
				396	Normalizer::getOption(int32_t option) const
				397	{
				398	return (fOptions & option) != 0;
				399	}
				400
				401	/**
				402	* Set the input text over which this <tt>Normalizer</tt> will iterate.
				403	* The iteration position is set to the beginning of the input text.
				404	*/
				405	void
				406	Normalizer::setText(const UnicodeString& newText,
				407	UErrorCode &status)
				408	{
				409	if (U_FAILURE(status)) {
				410	return;
				411	}
				412	CharacterIterator *newIter = new StringCharacterIterator(newText);
				413	if (newIter == NULL) {
				414	status = U_MEMORY_ALLOCATION_ERROR;
				415	return;
				416	}
				417	delete text;
				418	text = newIter;
				419	reset();
				420	}
				421
				422	/**
				423	* Set the input text over which this <tt>Normalizer</tt> will iterate.
				424	* The iteration position is set to the beginning of the string.
				425	*/
				426	void
				427	Normalizer::setText(const CharacterIterator& newText,
				428	UErrorCode &status)
				429	{
				430	if (U_FAILURE(status)) {
				431	return;
				432	}
				433	CharacterIterator *newIter = newText.clone();
				434	if (newIter == NULL) {
				435	status = U_MEMORY_ALLOCATION_ERROR;
				436	return;
				437	}
				438	delete text;
				439	text = newIter;
				440	reset();
				441	}
				442
				443	void
Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	444	Normalizer::setText(ConstChar16Ptr newText,
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	445	int32_t length,
				446	UErrorCode &status)
				447	{
				448	if (U_FAILURE(status)) {
				449	return;
				450	}
				451	CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
				452	if (newIter == NULL) {
				453	status = U_MEMORY_ALLOCATION_ERROR;
				454	return;
				455	}
				456	delete text;
				457	text = newIter;
				458	reset();
				459	}
				460
				461	/**
				462	* Copies the text under iteration into the UnicodeString referred to by "result".
				463	* @param result Receives a copy of the text under iteration.
				464	*/
				465	void
				466	Normalizer::getText(UnicodeString& result)
				467	{
				468	text->getText(result);
				469	}
				470
				471	//-------------------------------------------------------------------------
				472	// Private utility methods
				473	//-------------------------------------------------------------------------
				474
				475	void Normalizer::clearBuffer() {
				476	buffer.remove();
				477	bufferPos=0;
				478	}
				479
				480	UBool
				481	Normalizer::nextNormalize() {
				482	clearBuffer();
				483	currentIndex=nextIndex;
				484	text->setIndex(nextIndex);
				485	if(!text->hasNext()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	486	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	487	}
				488	// Skip at least one character so we make progress.
				489	UnicodeString segment(text->next32PostInc());
				490	while(text->hasNext()) {
				491	UChar32 c;
				492	if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
				493	text->move32(-1, CharacterIterator::kCurrent);
				494	break;
				495	}
				496	segment.append(c);
				497	}
				498	nextIndex=text->getIndex();
				499	UErrorCode errorCode=U_ZERO_ERROR;
				500	fNorm2->normalize(segment, buffer, errorCode);
				501	return U_SUCCESS(errorCode) && !buffer.isEmpty();
				502	}
				503
				504	UBool
				505	Normalizer::previousNormalize() {
				506	clearBuffer();
				507	nextIndex=currentIndex;
				508	text->setIndex(currentIndex);
				509	if(!text->hasPrevious()) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	510	return false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	511	}
				512	UnicodeString segment;
				513	while(text->hasPrevious()) {
				514	UChar32 c=text->previous32();
				515	segment.insert(0, c);
				516	if(fNorm2->hasBoundaryBefore(c)) {
				517	break;
				518	}
				519	}
				520	currentIndex=text->getIndex();
				521	UErrorCode errorCode=U_ZERO_ERROR;
				522	fNorm2->normalize(segment, buffer, errorCode);
				523	bufferPos=buffer.length();
				524	return U_SUCCESS(errorCode) && !buffer.isEmpty();
				525	}
				526
				527	U_NAMESPACE_END
				528
				529	#endif /* #if !UCONFIG_NO_NORMALIZATION */