Blame - source/i18n/strrepl.cpp - chromium.googlesource.com/chromium/deps/icu

blob: 23dab55430ba947cc77a2e53af2178c712cba23e [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
				5	* Copyright (c) 2002-2012, International Business Machines Corporation
				6	* and others. All Rights Reserved.
				7	**********************************************************************
				8	* Date Name Description
				9	* 01/21/2002 aliu Creation.
				10	**********************************************************************
				11	*/
				12
				13	#include "unicode/utypes.h"
				14
				15	#if !UCONFIG_NO_TRANSLITERATION
				16
				17	#include "unicode/uniset.h"
				18	#include "unicode/utf16.h"
				19	#include "strrepl.h"
				20	#include "rbt_data.h"
				21	#include "util.h"
				22
				23	U_NAMESPACE_BEGIN
				24
				25	UnicodeReplacer::~UnicodeReplacer() {}
				26	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
				27
				28	/**
				29	* Construct a StringReplacer that sets the emits the given output
				30	* text and sets the cursor to the given position.
				31	* @param theOutput text that will replace input text when the
				32	* replace() method is called. May contain stand-in characters
				33	* that represent nested replacers.
				34	* @param theCursorPos cursor position that will be returned by
				35	* the replace() method
				36	* @param theData transliterator context object that translates
				37	* stand-in characters to UnicodeReplacer objects
				38	*/
				39	StringReplacer::StringReplacer(const UnicodeString& theOutput,
				40	int32_t theCursorPos,
				41	const TransliterationRuleData* theData) {
				42	output = theOutput;
				43	cursorPos = theCursorPos;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	44	hasCursor = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	45	data = theData;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	46	isComplex = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	47	}
				48
				49	/**
				50	* Construct a StringReplacer that sets the emits the given output
				51	* text and does not modify the cursor.
				52	* @param theOutput text that will replace input text when the
				53	* replace() method is called. May contain stand-in characters
				54	* that represent nested replacers.
				55	* @param theData transliterator context object that translates
				56	* stand-in characters to UnicodeReplacer objects
				57	*/
				58	StringReplacer::StringReplacer(const UnicodeString& theOutput,
				59	const TransliterationRuleData* theData) {
				60	output = theOutput;
				61	cursorPos = 0;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	62	hasCursor = false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	63	data = theData;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	64	isComplex = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	65	}
				66
				67	/**
				68	* Copy constructor.
				69	*/
				70	StringReplacer::StringReplacer(const StringReplacer& other) :
				71	UnicodeFunctor(other),
				72	UnicodeReplacer(other)
				73	{
				74	output = other.output;
				75	cursorPos = other.cursorPos;
				76	hasCursor = other.hasCursor;
				77	data = other.data;
				78	isComplex = other.isComplex;
				79	}
				80
				81	/**
				82	* Destructor
				83	*/
				84	StringReplacer::~StringReplacer() {
				85	}
				86
				87	/**
				88	* Implement UnicodeFunctor
				89	*/
Frank Tang	b869661	2019-10-25 14:58:21 -0700	[diff] [blame]	90	StringReplacer* StringReplacer::clone() const {
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	91	return new StringReplacer(*this);
				92	}
				93
				94	/**
				95	* Implement UnicodeFunctor
				96	*/
				97	UnicodeReplacer* StringReplacer::toReplacer() const {
				98	return const_cast<StringReplacer *>(this);
				99	}
				100
				101	/**
				102	* UnicodeReplacer API
				103	*/
				104	int32_t StringReplacer::replace(Replaceable& text,
				105	int32_t start,
				106	int32_t limit,
				107	int32_t& cursor) {
				108	int32_t outLen;
				109	int32_t newStart = 0;
				110
				111	// NOTE: It should be possible to _always_ run the complex
				112	// processing code; just slower. If not, then there is a bug
				113	// in the complex processing code.
				114
				115	// Simple (no nested replacers) Processing Code :
				116	if (!isComplex) {
				117	text.handleReplaceBetween(start, limit, output);
				118	outLen = output.length();
				119
				120	// Setup default cursor position (for cursorPos within output)
				121	newStart = cursorPos;
				122	}
				123
				124	// Complex (nested replacers) Processing Code :
				125	else {
				126	/* When there are segments to be copied, use the Replaceable.copy()
				127	* API in order to retain out-of-band data. Copy everything to the
				128	* end of the string, then copy them back over the key. This preserves
				129	* the integrity of indices into the key and surrounding context while
				130	* generating the output text.
				131	*/
				132	UnicodeString buf;
				133	int32_t oOutput; // offset into 'output'
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	134	isComplex = false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	135
				136	// The temporary buffer starts at tempStart, and extends
				137	// to destLimit. The start of the buffer has a single
				138	// character from before the key. This provides style
				139	// data when addition characters are filled into the
				140	// temporary buffer. If there is nothing to the left, use
				141	// the non-character U+FFFF, which Replaceable subclasses
				142	// should treat specially as a "no-style character."
				143	// destStart points to the point after the style context
				144	// character, so it is tempStart+1 or tempStart+2.
				145	int32_t tempStart = text.length(); // start of temp buffer
				146	int32_t destStart = tempStart; // copy new text to here
				147	if (start > 0) {
				148	int32_t len = U16_LENGTH(text.char32At(start-1));
				149	text.copy(start-len, start, tempStart);
				150	destStart += len;
				151	} else {
				152	UnicodeString str((UChar) 0xFFFF);
				153	text.handleReplaceBetween(tempStart, tempStart, str);
				154	destStart++;
				155	}
				156	int32_t destLimit = destStart;
				157
				158	for (oOutput=0; oOutput<output.length(); ) {
				159	if (oOutput == cursorPos) {
				160	// Record the position of the cursor
				161	newStart = destLimit - destStart; // relative to start
				162	}
				163	UChar32 c = output.char32At(oOutput);
				164	UnicodeReplacer* r = data->lookupReplacer(c);
				165	if (r == NULL) {
				166	// Accumulate straight (non-segment) text.
				167	buf.append(c);
				168	} else {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	169	isComplex = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	170
				171	// Insert any accumulated straight text.
				172	if (buf.length() > 0) {
				173	text.handleReplaceBetween(destLimit, destLimit, buf);
				174	destLimit += buf.length();
				175	buf.truncate(0);
				176	}
				177
				178	// Delegate output generation to replacer object
				179	int32_t len = r->replace(text, destLimit, destLimit, cursor);
				180	destLimit += len;
				181	}
				182	oOutput += U16_LENGTH(c);
				183	}
				184	// Insert any accumulated straight text.
				185	if (buf.length() > 0) {
				186	text.handleReplaceBetween(destLimit, destLimit, buf);
				187	destLimit += buf.length();
				188	}
				189	if (oOutput == cursorPos) {
				190	// Record the position of the cursor
				191	newStart = destLimit - destStart; // relative to start
				192	}
				193
				194	outLen = destLimit - destStart;
				195
				196	// Copy new text to start, and delete it
				197	text.copy(destStart, destLimit, start);
				198	text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
				199
				200	// Delete the old text (the key)
				201	text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
				202	}
				203
				204	if (hasCursor) {
				205	// Adjust the cursor for positions outside the key. These
				206	// refer to code points rather than code units. If cursorPos
				207	// is within the output string, then use newStart, which has
				208	// already been set above.
				209	if (cursorPos < 0) {
				210	newStart = start;
				211	int32_t n = cursorPos;
				212	// Outside the output string, cursorPos counts code points
				213	while (n < 0 && newStart > 0) {
				214	newStart -= U16_LENGTH(text.char32At(newStart-1));
				215	++n;
				216	}
				217	newStart += n;
				218	} else if (cursorPos > output.length()) {
				219	newStart = start + outLen;
				220	int32_t n = cursorPos - output.length();
				221	// Outside the output string, cursorPos counts code points
				222	while (n > 0 && newStart < text.length()) {
				223	newStart += U16_LENGTH(text.char32At(newStart));
				224	--n;
				225	}
				226	newStart += n;
				227	} else {
				228	// Cursor is within output string. It has been set up above
				229	// to be relative to start.
				230	newStart += start;
				231	}
				232
				233	cursor = newStart;
				234	}
				235
				236	return outLen;
				237	}
				238
				239	/**
				240	* UnicodeReplacer API
				241	*/
				242	UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
				243	UBool escapeUnprintable) const {
				244	rule.truncate(0);
				245	UnicodeString quoteBuf;
				246
				247	int32_t cursor = cursorPos;
				248
				249	// Handle a cursor preceding the output
				250	if (hasCursor && cursor < 0) {
				251	while (cursor++ < 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	252	ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	253	}
				254	// Fall through and append '\|' below
				255	}
				256
				257	for (int32_t i=0; i<output.length(); ++i) {
				258	if (hasCursor && i == cursor) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	259	ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	260	}
				261	UChar c = output.charAt(i); // Ok to use 16-bits here
				262
				263	UnicodeReplacer* r = data->lookupReplacer(c);
				264	if (r == NULL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	265	ICU_Utility::appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	266	} else {
				267	UnicodeString buf;
				268	r->toReplacerPattern(buf, escapeUnprintable);
				269	buf.insert(0, (UChar)0x20);
				270	buf.append((UChar)0x20);
				271	ICU_Utility::appendToRule(rule, buf,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	272	true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	273	}
				274	}
				275
				276	// Handle a cursor after the output. Use > rather than >= because
				277	// if cursor == output.length() it is at the end of the output,
				278	// which is the default position, so we need not emit it.
				279	if (hasCursor && cursor > output.length()) {
				280	cursor -= output.length();
				281	while (cursor-- > 0) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	282	ICU_Utility::appendToRule(rule, (UChar)0x0040 /@/, true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	283	}
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	284	ICU_Utility::appendToRule(rule, (UChar)0x007C /\|/, true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	285	}
				286	// Flush quoteBuf out to result
				287	ICU_Utility::appendToRule(rule, -1,
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	288	true, escapeUnprintable, quoteBuf);
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	289
				290	return rule;
				291	}
				292
				293	/**
				294	* Implement UnicodeReplacer
				295	*/
				296	void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
				297	UChar32 ch;
				298	for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
				299	ch = output.char32At(i);
				300	UnicodeReplacer* r = data->lookupReplacer(ch);
				301	if (r == NULL) {
				302	toUnionTo.add(ch);
				303	} else {
				304	r->addReplacementSetTo(toUnionTo);
				305	}
				306	}
				307	}
				308
				309	/**
				310	* UnicodeFunctor API
				311	*/
				312	void StringReplacer::setData(const TransliterationRuleData* d) {
				313	data = d;
				314	int32_t i = 0;
				315	while (i<output.length()) {
				316	UChar32 c = output.char32At(i);
				317	UnicodeFunctor* f = data->lookup(c);
				318	if (f != NULL) {
				319	f->setData(data);
				320	}
				321	i += U16_LENGTH(c);
				322	}
				323	}
				324
				325	U_NAMESPACE_END
				326
				327	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
				328
				329	//eof