Blame - source/common/dictbe.cpp - chromium.googlesource.com/chromium/deps/icu

2017-05-13 21:10:13 -0700

[diff] [blame]

1

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

2

// License & terms of use: http://www.unicode.org/copyright.html

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

3

/**

4

*******************************************************************************

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

5

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

6

7

*******************************************************************************

8

*/

9

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

10

#include <utility>

11

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

12

#include "unicode/utypes.h"

13

14

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"

#include "dictbe.h"

#include "unicode/uniset.h"

19

#include "unicode/chariter.h"

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

20

#include "unicode/resbund.h"

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

21

#include "unicode/ubrk.h"

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

22

#include "unicode/usetiter.h"

23

#include "ubrkimpl.h"

Frank Tang

2020-03-16 13:49:12 -0700

[diff] [blame]

24

#include "utracimp.h"

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

25

#include "uvectr32.h"

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

26

#include "uvector.h"

27

#include "uassert.h"

28

#include "unicode/normlzr.h"

29

#include "cmemory.h"

30

#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*

******************************************************************

36

*/

37

Jungshik Shin

2018-05-04 13:00:45 -0700

[diff] [blame]

38

// Default constructor. The body is intentionally empty; the character set
// handled by the engine is installed later through setCharacters().
DictionaryBreakEngine::DictionaryBreakEngine() {
}

40

41

// Destructor. Nothing is released explicitly here.
DictionaryBreakEngine::~DictionaryBreakEngine() {
}

43

44

// Reports whether this engine handles the given code point.
//
// @param c  The code point to test.
// @return   The result of fSet.contains(c), i.e. whether c is a member of the
//           set installed via setCharacters().
UBool
DictionaryBreakEngine::handles(UChar32 c) const {
    return fSet.contains(c);
}

// Scans forward from the current position of `text` over the run of
// characters contained in fSet, then asks divideUpDictionaryRange() to find
// the breaks inside that run.
//
// @param text             Text being analyzed; the scan starts at its current
//                         native index.
// @param startPos         Unused.
// @param endPos           Native index at which the scan stops even if the
//                         characters are still in fSet.
// @param foundBreaks      Passed through to divideUpDictionaryRange().
// @param isPhraseBreaking Passed through to divideUpDictionaryRange().
// @param status           ICU error code; if it already indicates failure the
//                         function returns 0 without touching the text.
// @return                 Whatever divideUpDictionaryRange() returns.
//
// On return the text index is set to the end of the scanned run.
int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UVector32 &foundBreaks,
                                 UBool isPhraseBreaking,
                                 UErrorCode& status) const {
    if (U_FAILURE(status)) return 0;
    (void)startPos;            // TODO: remove this param?

    // Find the span of characters included in the set.
    //   The span begins at the current position in the text and extends
    //   forward until endPos or the first character not in the set.
    const int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    UChar32 c = utext_current32(text);
    while ((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
        utext_next32(text);         // TODO:  recast loop for postincrement
        c = utext_current32(text);
    }

    const int32_t result =
        divideUpDictionaryRange(text, start, current, foundBreaks, isPhraseBreaking, status);

    // divideUpDictionaryRange() moves the iterator; put it back at the end of the span.
    utext_setNativeIndex(text, current);

    return result;
}

void

DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {

83

fSet = set;

84

// Compact for caching

fSet.compact();

}

/*

******************************************************************

* PossibleWord

*/

// Helper class for improving readability of the Thai/Lao/Khmer word break

94

// algorithm. The implementation is completely inline.

95

96

// List size, limited by the maximum number of words in the dictionary

97

// that form a nested sequence.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

98

static const int32_t POSSIBLE_WORD_LIST_MAX = 20;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

class PossibleWord {

private:

// list of word candidate lengths, in increasing length order

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

103

// TODO: bytes would be sufficient for word lengths.

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

104

int32_t count; // Count of candidates

105

int32_t prefix; // The longest match with a dictionary word

106

int32_t offset; // Offset in the text of these candidates

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

107

int32_t mark; // The preferred candidate's offset

108

int32_t current; // The candidate we're currently looking at

109

int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units.

110

int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points.

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

111

112

public:

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

113

PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}

114

~PossibleWord() {}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

115

116

// Fill the list of candidates if needed, select the longest, and return the number found

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

117

int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

118

119

// Select the currently marked candidate, point after it in the text, and invalidate self

120

int32_t acceptMarked( UText *text );

121

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

122

// Back up from the current candidate to the next shorter one; return true if that exists

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

123

// and point the text after it

124

UBool backUp( UText *text );

125

126

// Return the longest prefix this candidate location shares with a dictionary word

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

127

// Return value is in code points.

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

128

int32_t longestPrefix() { return prefix; }

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

129

130

// Mark the current candidate as the one we like

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

131

void markCurrent() { mark = current; }

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

132

133

// Get length in code points of the marked word.

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

134

int32_t markedCPLength() { return cpLengths[mark]; }

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

135

};

136

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

137

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

138

// Collects the dictionary words that start at the current text position.
//
// The dictionary is only consulted when the text position differs from the
// offset cached by the previous call; otherwise the cached candidate list is
// reused. In both cases the longest candidate becomes both the current and
// the marked candidate, and, when at least one candidate exists, the text is
// positioned just after it.
//
// @param text      Text to match against; its current native index is the
//                  start of the candidate words.
// @param dict      Dictionary matcher to query.
// @param rangeEnd  Native index bounding the match; rangeEnd - start is passed
//                  to the matcher as the maximum length.
// @return          Number of candidates found at this position.
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
    int32_t start = (int32_t)utext_getNativeIndex(text);
    if (start != offset) {
        offset = start;
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        if (count <= 0) {
            utext_setNativeIndex(text, start);
        }
    }
    if (count > 0) {
        // cuLengths is in increasing length order, so the last entry is the longest word.
        utext_setNativeIndex(text, start+cuLengths[count-1]);
    }
    current = count-1;
    mark = current;
    return count;
}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

157

// Accepts the marked candidate: moves the text to just after it.
//
// @param text  Text iterator to reposition to offset + cuLengths[mark].
// @return      Length of the marked candidate in code units.
//
// NOTE(review): indexes cuLengths with `mark` unchecked; callers in this file
// only call it after candidates() reported at least one match.
int32_t
PossibleWord::acceptMarked( UText *text ) {
    utext_setNativeIndex(text, offset + cuLengths[mark]);
    return cuLengths[mark];
}

162

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

163

164

// Steps from the current candidate to the next shorter one, if there is one.
//
// @param text  Text iterator; on success it is moved to just after the
//              newly-current (shorter) candidate.
// @return      true if a shorter candidate existed, false if the current
//              candidate was already the shortest (text is left untouched).
UBool
PossibleWord::backUp( UText *text ) {
    if (current > 0) {
        utext_setNativeIndex(text, offset + cuLengths[--current]);
        return true;
    }
    return false;
}

172

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

173

/*

174

******************************************************************

* ThaiBreakEngine

*/

// How many words in a row are "good enough"?
static const int32_t THAI_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;

// Elision character
static const int32_t THAI_PAIYANNOI = 0x0E2F;

// Repeat character
static const int32_t THAI_MAIYAMOK = 0x0E46;

// Minimum word size
static const int32_t THAI_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

199

200

// Builds the Thai dictionary break engine.
//
// @param adoptDictionary  Dictionary matcher stored in fDictionary; the
//                         destructor deletes it, so ownership passes to this
//                         object.
// @param status           ICU error code; set by the UnicodeSet pattern
//                         parsing below and reported through the trace exit.
//
// Sets up the character sets used by divideUpDictionaryRange():
//   - the handled set (via setCharacters) : Thai characters with LineBreak=SA
//   - fMarkSet      : Thai LineBreak=SA marks, plus U+0020
//   - fEndWordSet   : the handled set minus characters that cannot end a word
//   - fBeginWordSet : characters that can begin a word
//   - fSuffixSet    : PAIYANNOI and MAIYAMOK
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
{
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
    UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(thaiWordSet);
    }
    fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = thaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    fSuffixSet.compact();
    UTRACE_EXIT_STATUS(status);
}

227

228

// Destructor: deletes the dictionary matcher held in fDictionary (the pointer
// handed to the constructor as adoptDictionary).
ThaiBreakEngine::~ThaiBreakEngine() {

delete fDictionary;

}

int32_t

ThaiBreakEngine::divideUpDictionaryRange( UText *text,

234

int32_t rangeStart,

235

int32_t rangeEnd,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

236

UVector32 &foundBreaks,

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

237

UBool /* isPhraseBreaking */,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

238

UErrorCode& status) const {

239

if (U_FAILURE(status)) return 0;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

240

utext_setNativeIndex(text, rangeStart);

241

utext_moveIndex32(text, THAI_MIN_WORD_SPAN);

242

if (utext_getNativeIndex(text) >= rangeEnd) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

243

return 0; // Not enough characters for two words

244

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

245

utext_setNativeIndex(text, rangeStart);

246

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

247

248

uint32_t wordsFound = 0;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

249

int32_t cpWordLength = 0; // Word Length in Code Points.

250

int32_t cuWordLength = 0; // Word length in code units (UText native indexing)

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

251

int32_t current;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

252

PossibleWord words[THAI_LOOKAHEAD];

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

253

254

utext_setNativeIndex(text, rangeStart);

255

256

while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

257

cpWordLength = 0;

258

cuWordLength = 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

259

260

// Look for candidate words at the current position

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

261

int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

262

263

// If we found exactly one, use that

264

if (candidates == 1) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

265

cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);

266

cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

267

wordsFound += 1;

268

}

269

// If there was more than one, see which one can take us forward the most words

270

else if (candidates > 1) {

271

// If we're already at the end of the range, we're done

272

if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

273

goto foundBest;

274

}

275

do {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

276

if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {

Frank Tang

2021-04-13 21:19:13 -0700

[diff] [blame]

277

// Followed by another dictionary word; mark first word as a good candidate

278

words[wordsFound%THAI_LOOKAHEAD].markCurrent();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

279

280

// If we're already at the end of the range, we're done

281

if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

goto foundBest;

}

// See if any of the possible second words is followed by a third word

286

do {

287

// If we find a third word, stop right away

288

if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {

289

words[wordsFound % THAI_LOOKAHEAD].markCurrent();

goto foundBest;

}

}

while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));

294

}

295

}

296

while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));

297

foundBest:

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

298

// Set UText position to after the accepted word.

299

cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);

300

cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

wordsFound += 1;

}

// We come here after having either found a word or not. We look ahead to the

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

305

// next word. If it's not a dictionary word, we will combine it with the word we

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

306

// just found (if there is one), but only if the preceding word does not exceed

307

// the threshold.

308

// The text iterator should now be positioned at the end of the word we found.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

309

310

UChar32 uc = 0;

311

if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

312

// if it is a dictionary word, do nothing. If it isn't, then if there is

313

// no preceding word, or the non-word shares less than the minimum threshold

314

// of characters with a dictionary word, then scan to resynchronize

315

if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

316

&& (cuWordLength == 0

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

317

|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {

318

// Look for a plausible word boundary

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

319

int32_t remaining = rangeEnd - (current+cuWordLength);

320

UChar32 pc;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

321

int32_t chars = 0;

322

for (;;) {

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

323

int32_t pcIndex = (int32_t)utext_getNativeIndex(text);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

324

pc = utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

325

int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

326

chars += pcSize;

327

remaining -= pcSize;

328

if (remaining <= 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

329

break;

330

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

331

uc = utext_current32(text);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

332

if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

333

// Maybe. See if it's in the dictionary.

334

// NOTE: In the original Apple code, checked that the next

335

// two characters after uc were not 0x0E4C THANTHAKHAT before

336

// checking the dictionary. That is just a performance filter,

337

// but it's not clear it's faster than checking the trie.

Jungshik Shin

2018-10-24 01:22:09 -0700

[diff] [blame]

338

int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

339

utext_setNativeIndex(text, current + cuWordLength + chars);

Jungshik Shin

2018-10-24 01:22:09 -0700

[diff] [blame]

340

if (num_candidates > 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

341

break;

342

}

343

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

344

}

345

346

// Bump the word count if there wasn't already one

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

347

if (cuWordLength <= 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

wordsFound += 1;

}

// Update the length with the passed-over characters

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

352

cuWordLength += chars;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

353

}

354

else {

355

// Back up to where we were for next iteration

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

356

utext_setNativeIndex(text, current+cuWordLength);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

// Never stop before a combining mark.

361

int32_t currPos;

362

while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {

363

utext_next32(text);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

364

cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

365

}

366

367

// Look ahead for possible suffixes if a dictionary word does not follow.

368

// We do this in code rather than using a rule so that the heuristic

369

// resynch continues to function. For example, one of the suffix characters

370

// could be a typo in the middle of a word.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

371

if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

372

if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0

373

&& fSuffixSet.contains(uc = utext_current32(text))) {

374

if (uc == THAI_PAIYANNOI) {

375

if (!fSuffixSet.contains(utext_previous32(text))) {

376

// Skip over previous end and PAIYANNOI

377

utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

378

int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

379

utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

380

cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

381

uc = utext_current32(text); // Fetch next character

382

}

383

else {

384

// Restore prior position

utext_next32(text);

}

}

if (uc == THAI_MAIYAMOK) {

389

if (utext_previous32(text) != THAI_MAIYAMOK) {

390

// Skip over previous end and MAIYAMOK

391

utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

392

int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

393

utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

394

cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

395

}

396

else {

397

// Restore prior position

utext_next32(text);

}

}

}

else {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

403

utext_setNativeIndex(text, current+cuWordLength);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

// Did we find a word on this iteration? If so, push it on the break stack

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

408

if (cuWordLength > 0) {

409

foundBreaks.push((current+cuWordLength), status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

// Don't return a break for the end of the dictionary range if there is one there.

414

if (foundBreaks.peeki() >= rangeEnd) {

415

(void) foundBreaks.popi();

wordsFound -= 1;

}

return wordsFound;

}

/*

******************************************************************

* LaoBreakEngine

*/

// How many words in a row are "good enough"?
static const int32_t LAO_LOOKAHEAD = 3;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size
static const int32_t LAO_MIN_WORD = 2;

// Minimum number of characters for two words
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

442

443

// Builds the Lao dictionary break engine.
//
// @param adoptDictionary  Dictionary matcher stored in fDictionary; the
//                         destructor deletes it, so ownership passes to this
//                         object.
// @param status           ICU error code; set by the UnicodeSet pattern
//                         parsing below and reported through the trace exit.
//
// Sets up the character sets used when dividing a range:
//   - the handled set (via setCharacters) : Lao characters with LineBreak=SA
//   - fMarkSet      : Lao LineBreak=SA marks, plus U+0020
//   - fEndWordSet   : the handled set minus the prefix vowels
//   - fBeginWordSet : consonants and prefix vowels
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
{
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
    UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
    if (U_SUCCESS(status)) {
        setCharacters(laoWordSet);
    }
    fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
    fMarkSet.add(0x0020);
    fEndWordSet = laoWordSet;
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    UTRACE_EXIT_STATUS(status);
}

467

468

// Destructor: deletes the dictionary matcher held in fDictionary (the pointer
// handed to the constructor as adoptDictionary).
LaoBreakEngine::~LaoBreakEngine() {

delete fDictionary;

}

int32_t

LaoBreakEngine::divideUpDictionaryRange( UText *text,

474

int32_t rangeStart,

475

int32_t rangeEnd,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

476

UVector32 &foundBreaks,

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

477

UBool /* isPhraseBreaking */,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

478

UErrorCode& status) const {

479

if (U_FAILURE(status)) return 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

480

if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {

481

return 0; // Not enough characters for two words

482

}

483

484

uint32_t wordsFound = 0;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

485

int32_t cpWordLength = 0;

486

int32_t cuWordLength = 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

487

int32_t current;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

488

PossibleWord words[LAO_LOOKAHEAD];

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

489

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

490

utext_setNativeIndex(text, rangeStart);

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

491

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

492

while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

493

cuWordLength = 0;

494

cpWordLength = 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

495

496

// Look for candidate words at the current position

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

497

int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

498

499

// If we found exactly one, use that

500

if (candidates == 1) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

501

cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);

502

cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

503

wordsFound += 1;

504

}

505

// If there was more than one, see which one can take us forward the most words

506

else if (candidates > 1) {

507

// If we're already at the end of the range, we're done

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

508

if (utext_getNativeIndex(text) >= rangeEnd) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

509

goto foundBest;

510

}

511

do {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

512

if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {

Frank Tang

2021-04-13 21:19:13 -0700

[diff] [blame]

513

// Followed by another dictionary word; mark first word as a good candidate

514

words[wordsFound%LAO_LOOKAHEAD].markCurrent();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

515

516

// If we're already at the end of the range, we're done

517

if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {

goto foundBest;

}

// See if any of the possible second words is followed by a third word

522

do {

523

// If we find a third word, stop right away

524

if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {

525

words[wordsFound % LAO_LOOKAHEAD].markCurrent();

goto foundBest;

}

}

while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));

530

}

531

}

532

while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));

533

foundBest:

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

534

cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);

535

cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

wordsFound += 1;

}

// We come here after having either found a word or not. We look ahead to the

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

540

// next word. If it's not a dictionary word, we will combine it with the word we

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

541

// just found (if there is one), but only if the preceding word does not exceed

542

// the threshold.

543

// The text iterator should now be positioned at the end of the word we found.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

544

if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

545

// if it is a dictionary word, do nothing. If it isn't, then if there is

546

// no preceding word, or the non-word shares less than the minimum threshold

547

// of characters with a dictionary word, then scan to resynchronize

548

if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

549

&& (cuWordLength == 0

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

550

|| words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {

551

// Look for a plausible word boundary

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

552

int32_t remaining = rangeEnd - (current + cuWordLength);

553

UChar32 pc;

554

UChar32 uc;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

555

int32_t chars = 0;

556

for (;;) {

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

557

int32_t pcIndex = (int32_t)utext_getNativeIndex(text);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

558

pc = utext_next32(text);

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

559

int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

560

chars += pcSize;

561

remaining -= pcSize;

562

if (remaining <= 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

563

break;

564

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

565

uc = utext_current32(text);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

566

if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {

567

// Maybe. See if it's in the dictionary.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

568

// TODO: this looks iffy; compare with old code.

Jungshik Shin

2018-10-24 01:22:09 -0700

[diff] [blame]

569

int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

570

utext_setNativeIndex(text, current + cuWordLength + chars);

Jungshik Shin

2018-10-24 01:22:09 -0700

[diff] [blame]

571

if (num_candidates > 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

572

break;

573

}

574

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

575

}

576

577

// Bump the word count if there wasn't already one

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

578

if (cuWordLength <= 0) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

wordsFound += 1;

}

// Update the length with the passed-over characters

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

583

cuWordLength += chars;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

584

}

585

else {

586

// Back up to where we were for next iteration

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

587

utext_setNativeIndex(text, current + cuWordLength);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

// Never stop before a combining mark.

592

int32_t currPos;

593

while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {

594

utext_next32(text);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

595

cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

596

}

597

598

// Look ahead for possible suffixes if a dictionary word does not follow.

599

// We do this in code rather than using a rule so that the heuristic

600

// resynch continues to function. For example, one of the suffix characters

601

// could be a typo in the middle of a word.

602

// NOT CURRENTLY APPLICABLE TO LAO

603

604

// Did we find a word on this iteration? If so, push it on the break stack

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

605

if (cuWordLength > 0) {

606

foundBreaks.push((current+cuWordLength), status);

}

}

// Don't return a break for the end of the dictionary range if there is one there.

611

if (foundBreaks.peeki() >= rangeEnd) {

612

(void) foundBreaks.popi();

wordsFound -= 1;

}

return wordsFound;

}

/*

******************************************************************

* BurmeseBreakEngine

*/

// Number of PossibleWord slots kept while looking ahead, i.e. how many
// dictionary words in a row are considered "good enough".
static constexpr int32_t BURMESE_LOOKAHEAD = 3;

// A non-word is only combined with the preceding dictionary word when that
// word's length is below this value.
static constexpr int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;

// A non-word that shares at least this much prefix with a dictionary word
// is not combined with the preceding word.
static constexpr int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;

// Minimum word size.
static constexpr int32_t BURMESE_MIN_WORD = 2;

// Minimum number of characters for two words.
static constexpr int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;

639

640

/**
 * Builds the Burmese (Myanmar script) dictionary break engine.
 *
 * @param adoptDictionary  Dictionary matcher stored in fDictionary; it is
 *                         deleted by the destructor.
 * @param status           ICU error code; set by the UnicodeSet pattern calls
 *                         on failure.
 */
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
{
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");

    const UnicodeString endWordPattern(u"[[:Mymr:]&[:LineBreak=SA:]]");
    const UnicodeString markPattern(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");

    // Characters that may end a word, and the marks a break must not precede.
    fEndWordSet.applyPattern(endWordPattern, status);
    fMarkSet.applyPattern(markPattern, status);
    fMarkSet.add(0x0020);

    // Characters that may begin a word.
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels

    // The engine handles exactly the end-word set, provided the patterns parsed.
    if (!U_FAILURE(status)) {
        setCharacters(fEndWordSet);
    }

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
    UTRACE_EXIT_STATUS(status);
}

660

661

// Destructor: frees the dictionary matcher that the constructor stored in
// fDictionary (passed in as adoptDictionary).
BurmeseBreakEngine::~BurmeseBreakEngine() {

delete fDictionary;

}

/**
 * Divides the text between rangeStart and rangeEnd into words using the
 * Burmese dictionary and pushes one break offset per word onto foundBreaks.
 *
 * @param text         The text being segmented; its iteration position is moved.
 * @param rangeStart   Native index at which segmentation starts.
 * @param rangeEnd     Native index at which segmentation stops.
 * @param foundBreaks  Receives the break offsets (word start + word length).
 * @param status       ICU error code; the main loop stops once it reports failure.
 * @return The number of words counted. A break sitting at or beyond rangeEnd
 *         is popped again and not counted. Returns 0 when status already
 *         reports failure or the range is shorter than BURMESE_MIN_WORD_SPAN.
 */
int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordCount = 0;                     // also selects the slot in words[]
    PossibleWord words[BURMESE_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status)) {
        const int32_t wordStart = static_cast<int32_t>(utext_getNativeIndex(text));
        if (wordStart >= rangeEnd) {
            break;
        }

        int32_t wordLen = 0;        // length of the accepted word, in native index units
        int32_t wordCpLen = 0;      // value reported by markedCPLength() for that word

        // Look for candidate words at the current position
        const int32_t numCandidates =
            words[wordCount % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        if (numCandidates == 1) {
            // Exactly one candidate: take it.
            PossibleWord &only = words[wordCount % BURMESE_LOOKAHEAD];
            wordLen = only.acceptMarked(text);
            wordCpLen = only.markedCPLength();
            ++wordCount;
        }
        else if (numCandidates > 1) {
            // Several candidates: prefer the one that lets the most words follow.
            PossibleWord &first  = words[wordCount % BURMESE_LOOKAHEAD];
            PossibleWord &second = words[(wordCount + 1) % BURMESE_LOOKAHEAD];
            PossibleWord &third  = words[(wordCount + 2) % BURMESE_LOOKAHEAD];

            // Nothing to look ahead at if we are already at the end of the range.
            if (utext_getNativeIndex(text) < rangeEnd) {
                do {
                    if (second.candidates(text, fDictionary, rangeEnd) > 0) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        first.markCurrent();

                        // If we're already at the end of the range, we're done
                        if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                            goto foundBest;
                        }

                        // See if any of the possible second words is followed by a third word
                        do {
                            // If we find a third word, stop right away
                            if (third.candidates(text, fDictionary, rangeEnd) != 0) {
                                first.markCurrent();
                                goto foundBest;
                            }
                        } while (second.backUp(text));
                    }
                } while (first.backUp(text));
            }
foundBest:
            wordLen = first.acceptMarked(text);
            wordCpLen = first.markedCPLength();
            ++wordCount;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd
                && wordCpLen < BURMESE_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize.
            PossibleWord &next = words[wordCount % BURMESE_LOOKAHEAD];
            const bool nextIsWord = next.candidates(text, fDictionary, rangeEnd) > 0;
            if (!nextIsWord
                    && (wordLen == 0
                        || next.longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                PossibleWord &probe = words[(wordCount + 1) % BURMESE_LOOKAHEAD];
                int32_t unitsLeft = rangeEnd - (wordStart + wordLen);
                int32_t skipped = 0;
                while (true) {
                    const int32_t before = static_cast<int32_t>(utext_getNativeIndex(text));
                    const UChar32 prevChar = utext_next32(text);
                    const int32_t step = static_cast<int32_t>(utext_getNativeIndex(text)) - before;
                    skipped += step;
                    unitsLeft -= step;
                    if (unitsLeft <= 0) {
                        break;
                    }
                    const UChar32 nextChar = utext_current32(text);
                    if (fEndWordSet.contains(prevChar) && fBeginWordSet.contains(nextChar)) {
                        // Maybe. See if it's in the dictionary.
                        // TODO: this looks iffy; compare with old code.
                        const int32_t found = probe.candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, wordStart + wordLen + skipped);
                        if (found > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (wordLen <= 0) {
                    ++wordCount;
                }

                // Update the length with the passed-over characters
                wordLen += skipped;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, wordStart + wordLen);
            }
        }

        // Never stop before a combining mark.
        for (;;) {
            const int32_t markStart = static_cast<int32_t>(utext_getNativeIndex(text));
            if (markStart >= rangeEnd || !fMarkSet.contains(utext_current32(text))) {
                break;
            }
            utext_next32(text);
            wordLen += static_cast<int32_t>(utext_getNativeIndex(text)) - markStart;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLen > 0) {
            foundBreaks.push(wordStart + wordLen, status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        --wordCount;
    }

    return wordCount;
}

/*

******************************************************************

* KhmerBreakEngine

*/

// Number of PossibleWord slots kept while looking ahead, i.e. how many
// dictionary words in a row are considered "good enough".
static constexpr int32_t KHMER_LOOKAHEAD = 3;

// A non-word is only combined with the preceding dictionary word when that
// word's length is below this value.
static constexpr int32_t KHMER_ROOT_COMBINE_THRESHOLD = 10;

// A non-word that shares at least this much prefix with a dictionary word
// is not combined with the preceding word.
static constexpr int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 5;

// Minimum word size.
static constexpr int32_t KHMER_MIN_WORD = 2;

// Minimum number of characters for two words.
static constexpr int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

832

833

/**
 * Builds the Khmer dictionary break engine.
 *
 * @param adoptDictionary  Dictionary matcher stored in fDictionary; it is
 *                         deleted by the destructor.
 * @param status           ICU error code; set by the UnicodeSet pattern calls
 *                         on failure.
 */
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
{
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");

    const UnicodeString wordPattern(u"[[:Khmr:]&[:LineBreak=SA:]]");
    const UnicodeString markPattern(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]");

    // The engine handles every character of the Khmer word set, provided the
    // pattern parsed.
    UnicodeSet khmerWordSet(wordPattern, status);
    if (!U_FAILURE(status)) {
        setCharacters(khmerWordSet);
    }

    // Marks that a break must never directly precede.
    fMarkSet.applyPattern(markPattern, status);
    fMarkSet.add(0x0020);

    // Characters that may end a word: the whole word set minus COENG.
    fEndWordSet = khmerWordSet;
    fEndWordSet.remove(0x17D2);     // KHMER SIGN COENG that combines some following characters

    // Characters that may begin a word.
    fBeginWordSet.add(0x1780, 0x17B3);

    // Adjustments that are currently disabled:
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
//    fSuffixSet.add(THAI_PAIYANNOI);
//    fSuffixSet.add(THAI_MAIYAMOK);

    // Compact for caching.
    fMarkSet.compact();
    fEndWordSet.compact();
    fBeginWordSet.compact();
//    fSuffixSet.compact();
    UTRACE_EXIT_STATUS(status);
}

866

867

// Destructor: frees the dictionary matcher that the constructor stored in
// fDictionary (passed in as adoptDictionary).
KhmerBreakEngine::~KhmerBreakEngine() {

delete fDictionary;

}

/**
 * Divides the text between rangeStart and rangeEnd into words using the
 * Khmer dictionary and pushes one break offset per word onto foundBreaks.
 *
 * @param text         The text being segmented; its iteration position is moved.
 * @param rangeStart   Native index at which segmentation starts.
 * @param rangeEnd     Native index at which segmentation stops.
 * @param foundBreaks  Receives the break offsets (word start + word length).
 * @param status       ICU error code; the main loop stops once it reports failure.
 * @return The number of words counted. A break sitting at or beyond rangeEnd
 *         is popped again and not counted. Returns 0 when status already
 *         reports failure or the range is shorter than KHMER_MIN_WORD_SPAN.
 */
int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
        return 0;       // Not enough characters for two words
    }

    uint32_t wordCount = 0;                     // also selects the slot in words[]
    PossibleWord words[KHMER_LOOKAHEAD];

    utext_setNativeIndex(text, rangeStart);

    while (U_SUCCESS(status)) {
        const int32_t wordStart = static_cast<int32_t>(utext_getNativeIndex(text));
        if (wordStart >= rangeEnd) {
            break;
        }

        int32_t wordLen = 0;        // length of the accepted word, in native index units
        int32_t wordCpLen = 0;      // value reported by markedCPLength() for that word

        // Look for candidate words at the current position
        const int32_t numCandidates =
            words[wordCount % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);

        if (numCandidates == 1) {
            // Exactly one candidate: take it.
            PossibleWord &only = words[wordCount % KHMER_LOOKAHEAD];
            wordLen = only.acceptMarked(text);
            wordCpLen = only.markedCPLength();
            ++wordCount;
        }
        else if (numCandidates > 1) {
            // Several candidates: prefer the one that lets the most words follow.
            PossibleWord &first  = words[wordCount % KHMER_LOOKAHEAD];
            PossibleWord &second = words[(wordCount + 1) % KHMER_LOOKAHEAD];
            PossibleWord &third  = words[(wordCount + 2) % KHMER_LOOKAHEAD];

            // Nothing to look ahead at if we are already at the end of the range.
            if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd) {
                do {
                    if (second.candidates(text, fDictionary, rangeEnd) > 0) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        first.markCurrent();

                        // If we're already at the end of the range, we're done
                        if (static_cast<int32_t>(utext_getNativeIndex(text)) >= rangeEnd) {
                            goto foundBest;
                        }

                        // See if any of the possible second words is followed by a third word
                        do {
                            // If we find a third word, stop right away
                            if (third.candidates(text, fDictionary, rangeEnd) != 0) {
                                first.markCurrent();
                                goto foundBest;
                            }
                        } while (second.backUp(text));
                    }
                } while (first.backUp(text));
            }
foundBest:
            wordLen = first.acceptMarked(text);
            wordCpLen = first.markedCPLength();
            ++wordCount;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (static_cast<int32_t>(utext_getNativeIndex(text)) < rangeEnd
                && wordCpLen < KHMER_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize.
            PossibleWord &next = words[wordCount % KHMER_LOOKAHEAD];
            const bool nextIsWord = next.candidates(text, fDictionary, rangeEnd) > 0;
            if (!nextIsWord
                    && (wordLen == 0
                        || next.longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                PossibleWord &probe = words[(wordCount + 1) % KHMER_LOOKAHEAD];
                int32_t unitsLeft = rangeEnd - (wordStart + wordLen);
                int32_t skipped = 0;
                while (true) {
                    const int32_t before = static_cast<int32_t>(utext_getNativeIndex(text));
                    const UChar32 prevChar = utext_next32(text);
                    const int32_t step = static_cast<int32_t>(utext_getNativeIndex(text)) - before;
                    skipped += step;
                    unitsLeft -= step;
                    if (unitsLeft <= 0) {
                        break;
                    }
                    const UChar32 nextChar = utext_current32(text);
                    if (fEndWordSet.contains(prevChar) && fBeginWordSet.contains(nextChar)) {
                        // Maybe. See if it's in the dictionary.
                        const int32_t found = probe.candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, wordStart + wordLen + skipped);
                        if (found > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (wordLen <= 0) {
                    ++wordCount;
                }

                // Update the length with the passed-over characters
                wordLen += skipped;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, wordStart + wordLen);
            }
        }

        // Never stop before a combining mark.
        for (;;) {
            const int32_t markStart = static_cast<int32_t>(utext_getNativeIndex(text));
            if (markStart >= rangeEnd || !fMarkSet.contains(utext_current32(text))) {
                break;
            }
            utext_next32(text);
            wordLen += static_cast<int32_t>(utext_getNativeIndex(text)) - markStart;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // (Disabled. The code below is kept as it was written and therefore still
        // uses the earlier variable names wordLength / wordsFound / current.)
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLen > 0) {
            foundBreaks.push(wordStart + wordLen, status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (foundBreaks.peeki() >= rangeEnd) {
        (void) foundBreaks.popi();
        --wordCount;
    }

    return wordCount;
}

#if !UCONFIG_NO_NORMALIZATION

1051

/*

1052

******************************************************************

1053

* CjkBreakEngine

1054

*/

1055

// All 32 bits set: the largest value a uint32_t can hold.
static const uint32_t kuint32max = 0xFFFFFFFF;

1056

/**
 * Builds the CJK dictionary break engine.
 *
 * @param adoptDictionary  Dictionary matcher stored in fDictionary; it is
 *                         deleted by the destructor.
 * @param type             kKorean selects the Hangul character set; any other
 *                         value selects the Chinese/Japanese set and also
 *                         initializes the Japanese phrase parameters.
 * @param status           ICU error code; set by the calls below on failure.
 */
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
    : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);

    // Korean dictionary only includes Hangul syllables
    const UnicodeString hangulPattern(u"[\\uac00-\\ud7a3]");
    fHangulWordSet.applyPattern(hangulPattern, status);
    fHangulWordSet.compact();

    // Digits, open punctuation and Alphabetic characters.
    const UnicodeString digitOpenPunctAlphaPattern(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]");
    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(digitOpenPunctAlphaPattern, status);
    fDigitOrOpenPunctuationOrAlphabetSet.compact();

    // Connector, dash, close, final-quote and other punctuation.
    const UnicodeString closePunctPattern(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]");
    fClosePunctuationSet.applyPattern(closePunctPattern, status);
    fClosePunctuationSet.compact();

    // Korean and Japanese/Chinese are handled with different character sets.
    if (type != kKorean) {
        // Chinese and Japanese
        const UnicodeString cjPattern(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]");
        UnicodeSet cjSet(cjPattern, status);
        if (U_SUCCESS(status)) {
            setCharacters(cjSet);
            initJapanesePhraseParameter(status);
        }
    } else if (U_SUCCESS(status)) {
        setCharacters(fHangulWordSet);
    }
    UTRACE_EXIT_STATUS(status);
}

1085

1086

// Destructor: frees the dictionary matcher that the constructor stored in
// fDictionary (passed in as adoptDictionary).
CjkBreakEngine::~CjkBreakEngine(){

delete fDictionary;

}

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary

// Longest katakana run that has its own entry in the cost table.
static const int32_t kMaxKatakanaLength = 8;
// NOTE(review): presumably the longest katakana run the CJK segmenter scans
// as one group; its use is outside this section - confirm at the call site.
static const int32_t kMaxKatakanaGroupLength = 20;
// NOTE(review): looks like an upper bound for a per-word cost value; its use
// is outside this section - confirm at the call site.
static const uint32_t maxSnlp = 255;

/**
 * Returns the cost assigned to a katakana run of the given length.
 *
 * @param wordLength  Length of the katakana run.
 * @return The table entry for lengths 0..kMaxKatakanaLength, and 8192 (the
 *         same value as the table's entry for length 0) for any length
 *         outside that range. Negative lengths are treated as out of range
 *         instead of indexing in front of the table.
 */
static inline uint32_t getKatakanaCost(int32_t wordLength){
    //TODO: fill array with actual values from dictionary!
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    static const uint32_t kOutOfRangeCost = 8192;
    if (wordLength < 0 || wordLength > kMaxKatakanaLength) {
        return kOutOfRangeCost;
    }
    return katakanaCost[wordLength];
}

1102

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

1103

// Returns true when value lies in U+30A1..U+30FE (U+30FB excluded) or in
// U+FF66..U+FF9F; false for every other code point.
static inline bool isKatakana(UChar32 value) {
    if (value == 0x30FB) {
        // Explicitly carved out of the first range.
        return false;
    }
    const bool inFirstRange = 0x30A1 <= value && value <= 0x30FE;
    const bool inSecondRange = 0xFF66 <= value && value <= 0xFF9F;
    return inFirstRange || inSecondRange;
}

1107

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1108

// Builds the single-bit mask used to test a flag in a UText's 32-bit flag word.
// Replicates an internal UText function.
static inline int32_t utext_i32_flag(int32_t bitIndex) {
    const int32_t lowestBit = 1;
    return lowestBit << bitIndex;
}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1114

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1115

/*

1116

* @param text A UText representing the text

1117

* @param rangeStart The start of the range of dictionary characters

1118

* @param rangeEnd The end of the range of dictionary characters

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

1119

* @param foundBreaks vector<int32> to receive the break positions

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1120

* @return The number of breaks found

1121

*/

1122

int32_t

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1123

CjkBreakEngine::divideUpDictionaryRange( UText *inText,

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1124

int32_t rangeStart,

1125

int32_t rangeEnd,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

1126

UVector32 &foundBreaks,

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1127

UBool isPhraseBreaking,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

1128

UErrorCode& status) const {

1129

if (U_FAILURE(status)) return 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1130

if (rangeStart >= rangeEnd) {

return 0;

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1134

// UnicodeString version of input UText, NFKC normalized if necessary.

1135

UnicodeString inString;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1136

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1137

// inputMap[inStringIndex] = corresponding native index from UText inText.

1138

// If NULL then mapping is 1:1

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1139

LocalPointer<UVector32> inputMap;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1140

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1141

// if UText has the input string as one contiguous UTF-16 chunk

1142

if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&

1143

inText->chunkNativeStart <= rangeStart &&

1144

inText->chunkNativeLimit >= rangeEnd &&

1145

inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1146

1147

// Input UText is in one contiguous UTF-16 chunk.

1148

// Use Read-only aliasing UnicodeString.

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1149

inString.setTo(false,

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1150

inText->chunkContents + rangeStart - inText->chunkNativeStart,

1151

rangeEnd - rangeStart);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1152

} else {

1153

// Copy the text from the original inText (UText) to inString (UnicodeString).

1154

// Create a map from UnicodeString indices -> UText offsets.

1155

utext_setNativeIndex(inText, rangeStart);

1156

int32_t limit = rangeEnd;

1157

U_ASSERT(limit <= utext_nativeLength(inText));

1158

if (limit > utext_nativeLength(inText)) {

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

1159

limit = (int32_t)utext_nativeLength(inText);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1160

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1161

inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);

1162

if (U_FAILURE(status)) {

1163

return 0;

1164

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1165

while (utext_getNativeIndex(inText) < limit) {

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

1166

int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1167

UChar32 c = utext_next32(inText);

1168

U_ASSERT(c != U_SENTINEL);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1169

inString.append(c);

1170

while (inputMap->size() < inString.length()) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1171

inputMap->addElement(nativePosition, status);

1172

}

1173

}

1174

inputMap->addElement(limit, status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1175

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1176

1177

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1178

if (!nfkcNorm2->isNormalized(inString, status)) {

1179

UnicodeString normalizedInput;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1180

// normalizedMap[normalizedInput position] == original UText position.

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1181

LocalPointer<UVector32> normalizedMap(new UVector32(status), status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1182

if (U_FAILURE(status)) {

1183

return 0;

1184

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1185

1186

UnicodeString fragment;

1187

UnicodeString normalizedFragment;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1188

for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1189

fragment.remove();

1190

int32_t fragmentStartI = srcI;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1191

UChar32 c = inString.char32At(srcI);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1192

for (;;) {

1193

fragment.append(c);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1194

srcI = inString.moveIndex32(srcI, 1);

1195

if (srcI == inString.length()) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1196

break;

1197

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1198

c = inString.char32At(srcI);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1199

if (nfkcNorm2->hasBoundaryBefore(c)) {

break;

}

}

nfkcNorm2->normalize(fragment, normalizedFragment, status);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1204

normalizedInput.append(normalizedFragment);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1205

1206

// Map every position in the normalized chunk to the start of the chunk

1207

// in the original input.

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1208

int32_t fragmentOriginalStart = inputMap.isValid() ?

1209

inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;

1210

while (normalizedMap->size() < normalizedInput.length()) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1211

normalizedMap->addElement(fragmentOriginalStart, status);

1212

if (U_FAILURE(status)) {

1213

break;

1214

}

1215

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1216

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1217

U_ASSERT(normalizedMap->size() == normalizedInput.length());

1218

int32_t nativeEnd = inputMap.isValid() ?

1219

inputMap->elementAti(inString.length()) : inString.length()+rangeStart;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1220

normalizedMap->addElement(nativeEnd, status);

1221

Frank Tang

2019-04-03 21:41:21 -0700

[diff] [blame]

1222

inputMap = std::move(normalizedMap);

1223

inString = std::move(normalizedInput);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1224

}

1225

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1226

int32_t numCodePts = inString.countChar32();

1227

if (numCodePts != inString.length()) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1228

// There are supplementary characters in the input.

1229

// The dictionary will produce boundary positions in terms of code point indexes,

1230

// not in terms of code unit string indexes.

1231

// Use the inputMap mechanism to take care of this in addition to indexing differences

1232

// from normalization and/or UTF-8 input.

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1233

UBool hadExistingMap = inputMap.isValid();

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1234

if (!hadExistingMap) {

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1235

inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);

1236

if (U_FAILURE(status)) {

1237

return 0;

1238

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1239

}

1240

int32_t cpIdx = 0;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1241

for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1242

U_ASSERT(cuIdx >= cpIdx);

1243

if (hadExistingMap) {

1244

inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);

1245

} else {

1246

inputMap->addElement(cuIdx+rangeStart, status);

1247

}

1248

cpIdx++;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1249

if (cuIdx == inString.length()) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1250

break;

1251

}

1252

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1253

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1254

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1255

// bestSnlp[i] is the snlp of the best segmentation of the first i

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1256

// code points in the range to be matched.

1257

UVector32 bestSnlp(numCodePts + 1, status);

1258

bestSnlp.addElement(0, status);

1259

for(int32_t i = 1; i <= numCodePts; i++) {

1260

bestSnlp.addElement(kuint32max, status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1261

}

1262

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1263

1264

// prev[i] is the index of the last CJK code point in the previous word in

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1265

// the best segmentation of the first i characters.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1266

UVector32 prev(numCodePts + 1, status);

1267

for(int32_t i = 0; i <= numCodePts; i++){

1268

prev.addElement(-1, status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1269

}

1270

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1271

const int32_t maxWordSize = 20;

1272

UVector32 values(numCodePts, status);

1273

values.setSize(numCodePts);

1274

UVector32 lengths(numCodePts, status);

1275

lengths.setSize(numCodePts);

1276

1277

UText fu = UTEXT_INITIALIZER;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1278

utext_openUnicodeString(&fu, &inString, &status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1279

1280

// Dynamic programming to find the best segmentation.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1281

1282

// In outer loop, i is the code point index,

1283

// ix is the corresponding string (code unit) index.

1284

// They differ when the string contains supplementary characters.

1285

int32_t ix = 0;

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

1286

bool is_prev_katakana = false;

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1287

for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1288

if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1289

continue;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1290

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1291

1292

int32_t count;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1293

utext_setNativeIndex(&fu, ix);

1294

count = fDictionary->matches(&fu, maxWordSize, numCodePts,

1295

NULL, lengths.getBuffer(), values.getBuffer(), NULL);

1296

// Note: lengths is filled with code point lengths

1297

// The NULL parameter is the ignored code unit lengths.

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1298

1299

// if there are no single character matches found in the dictionary

Jungshik Shin

2016-10-21 12:52:48 -0700

[diff] [blame]

1300

// starting with this character, treat character as a 1-character word

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1301

// with the highest value possible, i.e. the least likely to occur.

1302

// Exclude Korean characters from this treatment, as they should be left

1303

// together by default.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1304

if ((count == 0 || lengths.elementAti(0) != 1) &&

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1305

!fHangulWordSet.contains(inString.char32At(ix))) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1306

values.setElementAt(maxSnlp, count); // 255

1307

lengths.setElementAt(1, count++);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1308

}

1309

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1310

for (int32_t j = 0; j < count; j++) {

1311

uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);

1312

int32_t ln_j_i = lengths.elementAti(j) + i;

1313

if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {

1314

bestSnlp.setElementAt(newSnlp, ln_j_i);

1315

prev.setElementAt(i, ln_j_i);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

// In Japanese,

// Katakana word in single character is pretty rare. So we apply

1321

// the following heuristic to Katakana: any continuous run of Katakana

1322

// characters is considered a candidate word with a default cost

1323

// specified in the katakanaCost table according to its length.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1324

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1325

bool is_katakana = isKatakana(inString.char32At(ix));

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1326

int32_t katakanaRunLength = 1;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1327

if (!is_prev_katakana && is_katakana) {

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1328

int32_t j = inString.moveIndex32(ix, 1);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1329

// Find the end of the continuous run of Katakana characters

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1330

while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&

1331

isKatakana(inString.char32At(j))) {

1332

j = inString.moveIndex32(j, 1);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1333

katakanaRunLength++;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1334

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1335

if (katakanaRunLength < kMaxKatakanaGroupLength) {

1336

uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);

Jungshik Shin

aff99f5

2018-04-11 17:29:08 -0700

[diff] [blame]

1337

if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {

1338

bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1339

prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

}

}

}

is_prev_katakana = is_katakana;

1344

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1345

utext_close(&fu);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1346

1347

// Start pushing the optimal offset index into t_boundary (t for tentative).

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1348

// prev[numCodePts] is guaranteed to be meaningful.

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1349

// We'll first push in the reverse order, i.e.,

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1350

// t_boundary[0] = numCodePts, and afterwards do a swap.

1351

UVector32 t_boundary(numCodePts+1, status);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1352

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1353

int32_t numBreaks = 0;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1354

// No segmentation found, set boundary to end of range

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1355

if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {

1356

t_boundary.addElement(numCodePts, status);

1357

numBreaks++;

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1358

} else if (isPhraseBreaking) {

1359

t_boundary.addElement(numCodePts, status);

1360

if(U_SUCCESS(status)) {

1361

numBreaks++;

1362

int32_t prevIdx = numCodePts;

1363

1364

int32_t codeUnitIdx = -1;

1365

int32_t prevCodeUnitIdx = -1;

1366

int32_t length = -1;

1367

for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {

1368

codeUnitIdx = inString.moveIndex32(0, i);

1369

prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);

1370

// Calculate the length by using the code unit.

1371

length = prevCodeUnitIdx - codeUnitIdx;

1372

prevIdx = i;

1373

// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana

1374

// characters don't occur.

1375

if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))

1376

&& (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))

1377

|| !isKatakana(inString.char32At(codeUnitIdx)))) {

1378

t_boundary.addElement(i, status);

numBreaks++;

}

}

}

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1383

} else {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1384

for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {

1385

t_boundary.addElement(i, status);

1386

numBreaks++;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1387

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1388

U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1389

}

1390

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1391

// Add a break for the start of the dictionary range if there is not one

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1392

// there already.

1393

if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1394

t_boundary.addElement(0, status);

1395

numBreaks++;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1396

}

1397

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1398

// Now that we're done, convert positions in t_boundary[] (indices in

1399

// the normalized input string) back to indices in the original input UText

1400

// while reversing t_boundary and pushing values to foundBreaks.

Jungshik Shin

2017-05-13 21:10:13 -0700

[diff] [blame]

1401

int32_t prevCPPos = -1;

1402

int32_t prevUTextPos = -1;

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1403

int32_t correctedNumBreaks = 0;

1404

for (int32_t i = numBreaks - 1; i >= 0; i--) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1405

int32_t cpPos = t_boundary.elementAti(i);

Jungshik Shin

2017-05-13 21:10:13 -0700

[diff] [blame]

1406

U_ASSERT(cpPos > prevCPPos);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1407

int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;

Jungshik Shin

2017-05-13 21:10:13 -0700

[diff] [blame]

1408

U_ASSERT(utextPos >= prevUTextPos);

1409

if (utextPos > prevUTextPos) {

1410

// Boundaries are added to foundBreaks output in ascending order.

1411

U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1412

// In phrase breaking, there has to be a breakpoint between Cj character and close

1413

// punctuation.

1414

// E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正

1415

if (utextPos != rangeStart

1416

|| (isPhraseBreaking && utextPos > 0

1417

&& fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {

1418

foundBreaks.push(utextPos, status);

1419

correctedNumBreaks++;

1420

}

Jungshik Shin

2017-05-13 21:10:13 -0700

[diff] [blame]

1421

} else {

1422

// Normalization expanded the input text, the dictionary found a boundary

1423

// within the expansion, giving two boundaries with the same index in the

1424

// original text. Ignore the second. See ticket #12918.

--numBreaks;

}

prevCPPos = cpPos;

prevUTextPos = utextPos;

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1429

}

Jungshik Shin

2017-11-07 11:18:34 -0800

[diff] [blame]

1430

(void)prevCPPos; // suppress compiler warnings about unused variable

jshin@chromium.org

2014-03-26 22:15:14 +0000

[diff] [blame]

1431

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1432

UChar32 nextChar = utext_char32At(inText, rangeEnd);

1433

if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {

1434

// In phrase breaking, there has to be a breakpoint between Cj character and

1435

// the number/open punctuation.

1436

// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「

1437

// E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９

1438

// E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ

1439

if (isPhraseBreaking) {

1440

if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {

1441

foundBreaks.popi();

1442

correctedNumBreaks--;

}

} else {

foundBreaks.popi();

correctedNumBreaks--;

}

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1450

// inString goes out of scope

1451

// inputMap goes out of scope

Frank Tang

2022-04-08 20:34:12 -0700

[diff] [blame]

1452

return correctedNumBreaks;

1453

}

1454

1455

// Prepares the data used for phrase breaking by running the two loaders in
// order: loadJapaneseExtensions() first, then loadHiragana(). Both receive the
// same error code, and the second is called regardless of what the first
// reported.
void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
    loadJapaneseExtensions(error);
    loadHiragana(error);
}

// Adds every string from the "extensions" resource of the "ja" break-iterator
// bundle to fSkipSet, each with value 1. Stops at the first failure reported
// through error; nothing is read if the bundle itself cannot be opened.
void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
    ResourceBundle jaBundle(U_ICUDATA_BRKITR, "ja", error);
    if (U_FAILURE(error)) {
        return;
    }

    const char* extensionsKey = "extensions";
    ResourceBundle extensions = jaBundle.get(extensionsKey, error);
    for (;;) {
        if (U_FAILURE(error) || !extensions.hasNext()) {
            break;
        }
        fSkipSet.puti(extensions.getNextString(error), 1, error);
    }
}

void CjkBreakEngine::loadHiragana(UErrorCode& error) {

1472

UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);

1473

hiraganaWordSet.compact();

1474

UnicodeSetIterator iterator(hiraganaWordSet);

1475

while (iterator.next()) {

1476

fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);

1477

}

jshin@chromium.org