Blame - source/i18n/collationbuilder.cpp - chromium.googlesource.com/chromium/deps/icu

2015-01-08 15:46:45 -0800

[diff] [blame]

3

/*

4

*******************************************************************************

5

6

7

*******************************************************************************

8

* collationbuilder.cpp

9

*

10

* (replaced the former ucol_bld.cpp)

11

*

12

* created on: 2013may06

13

* created by: Markus W. Scherer

14

*/

15

16

#ifdef DEBUG_COLLATION_BUILDER

#include <stdio.h>

#endif

#include "unicode/utypes.h"

21

22

#if !UCONFIG_NO_COLLATION

23

24

#include "unicode/caniter.h"

25

#include "unicode/normalizer2.h"

26

#include "unicode/tblcoll.h"

27

#include "unicode/parseerr.h"

28

#include "unicode/uchar.h"

29

#include "unicode/ucol.h"

30

#include "unicode/unistr.h"

31

#include "unicode/usetiter.h"

32

#include "unicode/utf16.h"

33

#include "unicode/uversion.h"

34

#include "cmemory.h"

35

#include "collation.h"

36

#include "collationbuilder.h"

37

#include "collationdata.h"

38

#include "collationdatabuilder.h"

39

#include "collationfastlatin.h"

40

#include "collationroot.h"

41

#include "collationrootelements.h"

42

#include "collationruleparser.h"

43

#include "collationsettings.h"

44

#include "collationtailoring.h"

45

#include "collationweights.h"

46

#include "normalizer2impl.h"

47

#include "uassert.h"

48

#include "ucol_imp.h"

49

#include "utf16collationiterator.h"

U_NAMESPACE_BEGIN

namespace {

class BundleImporter : public CollationRuleParser::Importer {

56

public:

57

BundleImporter() {}

58

virtual ~BundleImporter();

59

virtual void getRules(

60

const char *localeID, const char *collationType,

61

UnicodeString &rules,

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

62

const char *&errorReason, UErrorCode &errorCode) override;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

63

};

64

65

BundleImporter::~BundleImporter() {}

66

67

void

68

BundleImporter::getRules(

69

const char *localeID, const char *collationType,

70

UnicodeString &rules,

71

const char *& /*errorReason*/, UErrorCode &errorCode) {

72

CollationLoader::loadRules(localeID, collationType, rules, errorCode);

}

} // namespace

// RuleBasedCollator implementation ---------------------------------------- ***

78

79

// These methods are here, rather than in rulebasedcollator.cpp,

80

// for modularization:

81

// Most code using Collator does not need to build a Collator from rules.

82

// By moving these constructors and helper methods to a separate file,

83

// most code will not have a static dependency on the builder code.

84

85

// Default constructor: data, settings, tailoring and cacheEntry start out null,
// validLocale is the empty locale ID, and no tailoring is built.
RuleBasedCollator::RuleBasedCollator()
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {}

94

95

// Builds a collator from a rule string. UCOL_DEFAULT is passed for both
// strength and decomposition mode, and no parse-error details are requested.
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT,
                           nullptr, nullptr, errorCode);
}

105

106

// Builds a collator from a rule string with the given strength.
// The decomposition mode is UCOL_DEFAULT; no parse-error details are requested.
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, strength, UCOL_DEFAULT,
                           nullptr, nullptr, errorCode);
}

117

118

// Builds a collator from a rule string with the given decomposition mode.
// The strength is UCOL_DEFAULT; no parse-error details are requested.
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     UColAttributeValue decompositionMode,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode,
                           nullptr, nullptr, errorCode);
}

130

131

// Builds a collator from a rule string with both an explicit strength and an
// explicit decomposition mode; no parse-error details are requested.
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     ECollationStrength strength,
                                     UColAttributeValue decompositionMode,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, strength, decompositionMode,
                           nullptr, nullptr, errorCode);
}

144

145

// Builds a collator from a rule string with default strength and decomposition
// mode. parseError and reason are passed on to internalBuildTailoring() so that
// it can report details of a failure.
RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
                                     UParseError &parseError, UnicodeString &reason,
                                     UErrorCode &errorCode)
        : data(nullptr),
          settings(nullptr),
          tailoring(nullptr),
          cacheEntry(nullptr),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(false) {
    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT,
                           &parseError, &reason, errorCode);
}

void

RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,

160

int32_t strength,

161

UColAttributeValue decompositionMode,

162

UParseError *outParseError, UnicodeString *outReason,

163

UErrorCode &errorCode) {

164

const CollationTailoring *base = CollationRoot::getRoot(errorCode);

165

if(U_FAILURE(errorCode)) { return; }

166

if(outReason != NULL) { outReason->remove(); }

167

CollationBuilder builder(base, errorCode);

168

UVersionInfo noVersion = { 0, 0, 0, 0 };

169

BundleImporter importer;

170

LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,

171

&importer,

172

outParseError, errorCode));

173

if(U_FAILURE(errorCode)) {

174

const char *reason = builder.getErrorReason();

175

if(reason != NULL && outReason != NULL) {

176

*outReason = UnicodeString(reason, -1, US_INV);

}

return;

}

t->actualLocale.setToBogus();

181

adoptTailoring(t.orphan(), errorCode);

182

// Set attributes after building the collator,

183

// to keep the default settings consistent with the rule string.

184

if(strength != UCOL_DEFAULT) {

185

setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);

186

}

187

if(decompositionMode != UCOL_DEFAULT) {

188

setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);

}

}

// CollationBuilder implementation ----------------------------------------- ***

193

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

194

/**
 * Sets up a builder on top of the base tailoring b.
 *
 * @param b         base tailoring; its data and root elements are referenced
 * @param icu4xMode stored in the builder and passed to the new CollationDataBuilder
 * @param errorCode in/out ICU error code; on failure errorReason may be set
 */
CollationBuilder::CollationBuilder(const CollationTailoring *b, UBool icu4xMode, UErrorCode &errorCode)
        : nfd(*Normalizer2::getNFDInstance(errorCode)),
          fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
          nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
          base(b),
          baseData(b->data),
          rootElements(b->data->rootElements, b->data->rootElementsLength),
          variableTop(0),
          dataBuilder(new CollationDataBuilder(icu4xMode, errorCode)),
          fastLatinEnabled(true),
          icu4xMode(icu4xMode),
          errorReason(nullptr),
          cesLength(0),
          rootPrimaryIndexes(errorCode),
          nodes(errorCode) {
    nfcImpl.ensureCanonIterData(errorCode);
    if(U_FAILURE(errorCode)) {
        // Something in the initializer list or the canonical-iterator data failed.
        errorReason = "CollationBuilder fields initialization failed";
    } else if(dataBuilder == nullptr) {
        // The data builder could not be allocated.
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    } else {
        dataBuilder->initForTailoring(baseData, errorCode);
        if(U_FAILURE(errorCode)) {
            errorReason = "CollationBuilder initialization failed";
        }
    }
}

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

222

// Convenience constructor: delegates to the three-argument constructor
// with icu4xMode set to false.
CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
        : CollationBuilder(b, false, errorCode) {
}

225

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

226

// Frees the data builder if this object still holds it.
// parseAndBuild() sets dataBuilder to null after handing it to a tailoring,
// in which case this delete does nothing.
CollationBuilder::~CollationBuilder() {
    delete dataBuilder;
}

/**
 * Parses ruleString on top of the base tailoring and assembles a new
 * CollationTailoring from the result.
 *
 * @param ruleString    the collation rules to parse
 * @param rulesVersion  passed to setVersion() together with the base version
 * @param importer      given to the rule parser via setImporter()
 * @param outParseError passed through to the rule parser
 * @param errorCode     in/out ICU error code; nothing is done if it already
 *                      indicates a failure
 * @return the new tailoring, released to the caller, or a null pointer on
 *         failure (errorReason may then hold an explanation)
 */
CollationTailoring *
CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
                                const UVersionInfo rulesVersion,
                                CollationRuleParser::Importer *importer,
                                UParseError *outParseError,
                                UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    if(baseData->rootElements == nullptr) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        errorReason = "missing root elements data, tailoring not supported";
        return nullptr;
    }

    LocalPointer<CollationTailoring> result(new CollationTailoring(base->settings));
    if(result.isNull() || result->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    CollationRuleParser parser(baseData, errorCode);
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }
    // Note: &[last variable] and &[first regular] are always based
    // on the root collator's maxVariable/variableTop.
    // To make them follow a [maxVariable x] in the rules, we would have to keep
    // the tailoring's settings pointer here and read its variableTop on demand.
    // See http://unicode.org/cldr/trac/ticket/6070
    variableTop = base->settings->variableTop;
    parser.setSink(this);
    parser.setImporter(importer);
    CollationSettings &settingsCopy = *SharedObject::copyOnWrite(result->settings);
    parser.parse(ruleString, settingsCopy, outParseError, errorCode);
    errorReason = parser.getErrorReason();
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }

    if(!dataBuilder->hasMappings()) {
        // The rules added no mappings: the tailoring uses the base data.
        result->data = baseData;
    } else {
        makeTailoredCEs(errorCode);
        if(!icu4xMode) {
            closeOverComposites(errorCode);
        }
        finalizeCEs(errorCode);
        if(!icu4xMode) {
            // Every tailoring gets a copy of all of ASCII and of the Latin-1 letters.
            optimizeSet.add(0, 0x7f);
            optimizeSet.add(0xc0, 0xff);
            // Hangul is decomposed on the fly during collation,
            // and the tailoring data is always built with HANGUL_TAG specials.
            optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
            dataBuilder->optimize(optimizeSet, errorCode);
        }
        result->ensureOwnedData(errorCode);
        if(U_FAILURE(errorCode)) {
            return nullptr;
        }
        if(fastLatinEnabled) {
            dataBuilder->enableFastLatin();
        }
        dataBuilder->build(*result->ownedData, errorCode);
        // The tailoring now holds the data builder; this object lets go of it.
        result->builder = dataBuilder;
        dataBuilder = nullptr;
    }
    if(U_FAILURE(errorCode)) {
        return nullptr;
    }

    settingsCopy.fastLatinOptions = CollationFastLatin::getOptions(
        result->data, settingsCopy,
        settingsCopy.fastLatinPrimaries, UPRV_LENGTHOF(settingsCopy.fastLatinPrimaries));
    result->rules = ruleString;
    result->rules.getTerminatedBuffer();  // ensure NUL-termination
    result->setVersion(base->version, rulesVersion);
    return result.orphan();
}

/**
 * Handles a reset rule: fills ces/cesLength with the collation elements of the
 * reset position and, for &[before n], replaces the last CE with a temporary CE
 * that points at the node just before the position.
 *
 * @param strength          UCOL_IDENTICAL for a plain reset, otherwise the
 *                          &[before n] strength (primary to tertiary)
 * @param str               reset position: a string, or POS_LEAD plus a
 *                          special-position code unit
 * @param parserErrorReason set to a static explanation when an error is raised
 * @param errorCode         in/out ICU error code
 */
void
CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
                           const char *&parserErrorReason, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    U_ASSERT(!str.isEmpty());
    if(str.charAt(0) != CollationRuleParser::POS_LEAD) {
        // Ordinary reset to a character or string.
        UnicodeString decomposed = nfd.normalize(str, errorCode);
        if(U_FAILURE(errorCode)) {
            parserErrorReason = "normalizing the reset position";
            return;
        }
        cesLength = dataBuilder->getCEs(decomposed, ces, 0);
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            parserErrorReason = "reset position maps to too many collation elements (more than 31)";
            return;
        }
    } else {
        // Special position such as [first variable].
        ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
        cesLength = 1;
        if(U_FAILURE(errorCode)) {
            return;
        }
        U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
    }
    if(strength == UCOL_IDENTICAL) {
        // Simple reset-at-position: nothing more to do.
        return;
    }

    // From here on: &[before strength]position
    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
    int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }

    // If the index is for a "weaker" node,
    // walk backwards past it and any further "weaker" nodes.
    int64_t node = nodes.elementAti(index);
    for(; strengthFromNode(node) > strength; node = nodes.elementAti(index)) {
        index = previousIndexFromNode(node);
    }

    // Find or insert the node whose index goes into the temporary CE.
    if(strengthFromNode(node) == strength && isTailoredNode(node)) {
        // Reset to just before this same-strength tailored node.
        index = previousIndexFromNode(node);
    } else if(strength == UCOL_PRIMARY) {
        // Root primary node (it has no previous index).
        uint32_t primary = weight32FromNode(node);
        if(primary == 0) {
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before ignorable not possible";
            return;
        }
        if(primary <= rootElements.getFirstPrimary()) {
            // There is no primary gap between ignorables and the space-first-primary.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before first non-ignorable not supported";
            return;
        }
        if(primary == Collation::FIRST_TRAILING_PRIMARY) {
            // We do not support tailoring to an unassigned-implicit CE.
            errorCode = U_UNSUPPORTED_ERROR;
            parserErrorReason = "reset primary-before [first trailing] not supported";
            return;
        }
        primary = rootElements.getPrimaryBefore(primary, baseData->isCompressiblePrimary(primary));
        index = findOrInsertNodeForPrimary(primary, errorCode);
        // Advance to the last node of this list:
        // we tailor after the last node between adjacent root nodes.
        node = nodes.elementAti(index);
        for(int32_t following = nextIndexFromNode(node);
                following != 0;
                following = nextIndexFromNode(node)) {
            index = following;
            node = nodes.elementAti(index);
        }
    } else {
        // &[before 2] or &[before 3]
        index = findCommonNode(index, UCOL_SECONDARY);
        if(strength >= UCOL_TERTIARY) {
            index = findCommonNode(index, UCOL_TERTIARY);
        }
        // findCommonNode() either stayed on the stronger node or moved to
        // an explicit common-weight node of the reset-before strength.
        node = nodes.elementAti(index);
        if(strengthFromNode(node) != strength) {
            // A stronger node with an implied strength-common weight.
            uint32_t beforeWeight = getWeight16Before(index, node, strength);
            index = findOrInsertWeakNode(index, beforeWeight, strength, errorCode);
        } else {
            // A same-strength node with an explicit weight.
            const uint32_t ownWeight = weight16FromNode(node);
            if(ownWeight == 0) {
                errorCode = U_UNSUPPORTED_ERROR;
                parserErrorReason = (strength == UCOL_SECONDARY) ?
                    "reset secondary-before secondary ignorable not possible" :
                    "reset tertiary-before completely ignorable not possible";
                return;
            }
            U_ASSERT(ownWeight > Collation::BEFORE_WEIGHT16);
            // Reset to just before this node, inserting the preceding
            // same-level explicit weight if it is not there yet.
            // First: which explicit weight immediately precedes this one?
            const uint32_t beforeWeight = getWeight16Before(index, node, strength);
            // Second: is there already a node for that preceding weight?
            uint32_t foundWeight;
            const int32_t previousIndex = previousIndexFromNode(node);
            int32_t scan = previousIndex;
            for(;;) {
                node = nodes.elementAti(scan);
                const int32_t scanStrength = strengthFromNode(node);
                if(scanStrength < strength) {
                    U_ASSERT(beforeWeight >= Collation::COMMON_WEIGHT16 || scan == previousIndex);
                    // Either the reset element has an above-common weight and
                    // the parent node provides the implied common weight,
                    // or the reset element has a weight<=common in the node
                    // right after the parent, and the preceding weight must be inserted.
                    foundWeight = Collation::COMMON_WEIGHT16;
                    break;
                }
                if(scanStrength == strength && !isTailoredNode(node)) {
                    foundWeight = weight16FromNode(node);
                    break;
                }
                // Weaker nodes and same-level tailored nodes are skipped.
                scan = previousIndexFromNode(node);
            }
            if(foundWeight == beforeWeight) {
                // The preceding weight has a node,
                // possibly followed by weaker or tailored nodes.
                // Reset to the last of them.
                index = previousIndex;
            } else {
                // Insert a node carrying the preceding weight and reset to it.
                node = nodeFromWeight16(beforeWeight) | nodeFromStrength(strength);
                index = insertNodeBetween(previousIndex, index, node, errorCode);
            }
        }
        // Strength of the temporary CE = strength of its reset position.
        // The code above raises an error if the before-strength is stronger.
        strength = ceStrength(ces[cesLength - 1]);
    }
    if(U_FAILURE(errorCode)) {
        parserErrorReason = "inserting reset position for &[before n]";
        return;
    }
    ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
}

439

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

440

/**
 * Returns the 16-bit weight that precedes the weight of the given node on the
 * requested level.
 *
 * Walks back from the node to its secondary and primary ancestors to collect
 * the [p, s, t] weights. If a tailored node is met on the way, the result is
 * Collation::BEFORE_WEIGHT16; otherwise the root elements are asked for the
 * preceding secondary or tertiary weight.
 *
 * @param index index of node in the nodes list
 * @param node  the node value at that index
 * @param level UCOL_SECONDARY for a secondary weight; any other value takes
 *              the tertiary path
 */
uint32_t
CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));

    // Tertiary weight: explicit on a tertiary node,
    // otherwise the common weight implied by a stronger node.
    uint32_t tertiary;
    if(strengthFromNode(node) != UCOL_TERTIARY) {
        tertiary = Collation::COMMON_WEIGHT16;
    } else {
        tertiary = weight16FromNode(node);
    }

    // Move back to the governing secondary (or stronger) node.
    for(; strengthFromNode(node) > UCOL_SECONDARY; node = nodes.elementAti(index)) {
        index = previousIndexFromNode(node);
    }
    if(isTailoredNode(node)) {
        // Not a root CE: return the low non-primary boundary for a tailored CE.
        return Collation::BEFORE_WEIGHT16;
    }

    // Secondary weight: explicit on a secondary node,
    // otherwise the common weight implied by a stronger node.
    uint32_t secondary;
    if(strengthFromNode(node) != UCOL_SECONDARY) {
        secondary = Collation::COMMON_WEIGHT16;
    } else {
        secondary = weight16FromNode(node);
    }

    // Move back to the governing primary node.
    for(; strengthFromNode(node) > UCOL_PRIMARY; node = nodes.elementAti(index)) {
        index = previousIndexFromNode(node);
    }
    if(isTailoredNode(node)) {
        return Collation::BEFORE_WEIGHT16;
    }

    // [primary, secondary, tertiary] is a root CE.
    // Look up the preceding weight for the requested level.
    const uint32_t primary = weight32FromNode(node);
    if(level == UCOL_SECONDARY) {
        return rootElements.getSecondaryBefore(primary, secondary);
    }
    const uint32_t tertiaryBefore = rootElements.getTertiaryBefore(primary, secondary, tertiary);
    U_ASSERT((tertiaryBefore & ~Collation::ONLY_TERTIARY_MASK) == 0);
    return tertiaryBefore;
}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

483

/**
 * Maps a special reset position ([first tertiary ignorable] ... [last trailing])
 * to a collation element.
 *
 * @param str               two code units; the second one, minus POS_BASE,
 *                          selects the position
 * @param parserErrorReason set to a static explanation for unsupported positions
 * @param errorCode         in/out ICU error code
 * @return the CE for the position: a root CE, or a temporary CE that refers to
 *         a tailored node; 0 for the tertiary-ignorable positions and on error
 */
int64_t
CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
                                          const char *&parserErrorReason, UErrorCode &errorCode) {
    U_ASSERT(str.length() == 2);
    int64_t resetCE;
    int32_t strength = UCOL_PRIMARY;
    UBool atGroupBoundary = false;
    UChar32 position = str.charAt(1) - CollationRuleParser::POS_BASE;
    U_ASSERT(0 <= position && position <= CollationRuleParser::LAST_TRAILING);
    switch(position) {
    case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
        // Quaternary CEs are not supported.
        // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
    case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
        return 0;
    case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
        // Look for a tailored tertiary node right after [0, 0, 0].
        int32_t cursor = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        int64_t current = nodes.elementAti(cursor);
        cursor = nextIndexFromNode(current);
        if(cursor != 0) {
            current = nodes.elementAti(cursor);
            U_ASSERT(strengthFromNode(current) <= UCOL_TERTIARY);
            if(isTailoredNode(current) && strengthFromNode(current) == UCOL_TERTIARY) {
                return tempCEFromIndexAndStrength(cursor, UCOL_TERTIARY);
            }
        }
        // No need to look for nodeHasAnyBefore() on a tertiary node.
        return rootElements.getFirstTertiaryCE();
    }
    case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
        resetCE = rootElements.getLastTertiaryCE();
        strength = UCOL_TERTIARY;
        break;
    case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
        // Look for a tailored secondary node after [0, 0, *].
        int32_t cursor = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        int64_t current = nodes.elementAti(cursor);
        while((cursor = nextIndexFromNode(current)) != 0) {
            current = nodes.elementAti(cursor);
            const int32_t currentStrength = strengthFromNode(current);
            if(currentStrength < UCOL_SECONDARY) {
                break;
            }
            if(currentStrength != UCOL_SECONDARY) {
                continue;  // weaker node: keep scanning
            }
            if(!isTailoredNode(current)) {
                break;
            }
            if(nodeHasBefore3(current)) {
                cursor = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(current)));
                U_ASSERT(isTailoredNode(nodes.elementAti(cursor)));
            }
            return tempCEFromIndexAndStrength(cursor, UCOL_SECONDARY);
        }
        resetCE = rootElements.getFirstSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    }
    case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
        resetCE = rootElements.getLastSecondaryCE();
        strength = UCOL_SECONDARY;
        break;
    case CollationRuleParser::FIRST_VARIABLE:
        resetCE = rootElements.getFirstPrimaryCE();
        atGroupBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
        break;
    case CollationRuleParser::LAST_VARIABLE:
        resetCE = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
        break;
    case CollationRuleParser::FIRST_REGULAR:
        resetCE = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
        atGroupBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
        break;
    case CollationRuleParser::LAST_REGULAR:
        // Use the Hani-first-primary rather than the actual last "regular" CE before it,
        // for backward compatibility with behavior before the introduction of
        // script-first-primary CEs in the root collator.
        resetCE = rootElements.firstCEWithPrimaryAtLeast(
            baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
        break;
    case CollationRuleParser::FIRST_IMPLICIT:
        resetCE = baseData->getSingleCE(0x4e00, errorCode);
        break;
    case CollationRuleParser::LAST_IMPLICIT:
        // We do not support tailoring to an unassigned-implicit CE.
        errorCode = U_UNSUPPORTED_ERROR;
        parserErrorReason = "reset to [last implicit] not supported";
        return 0;
    case CollationRuleParser::FIRST_TRAILING:
        resetCE = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
        atGroupBoundary = true;  // trailing first primary (there is no mapping for it)
        break;
    case CollationRuleParser::LAST_TRAILING:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        parserErrorReason = "LDML forbids tailoring to U+FFFF";
        return 0;
    default:
        UPRV_UNREACHABLE_EXIT;
    }

    int32_t index = findOrInsertNodeForRootCE(resetCE, strength, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    int64_t node = nodes.elementAti(index);
    if((position & 1) != 0) {
        // odd position = [last xyz]
        // Advance to the last node that was tailored after the [last xyz]
        // at a strength no greater than the position's strength.
        int32_t followingIndex;
        while((followingIndex = nextIndexFromNode(node)) != 0) {
            const int64_t followingNode = nodes.elementAti(followingIndex);
            if(strengthFromNode(followingNode) < strength) {
                break;
            }
            index = followingIndex;
            node = followingNode;
        }
        // Do not make a temporary CE for a root node.
        // This last node might be the node for the root CE itself,
        // or a node with a common secondary or tertiary weight.
        if(isTailoredNode(node)) {
            resetCE = tempCEFromIndexAndStrength(index, strength);
        }
        return resetCE;
    }

    // even position = [first xyz]
    if(!nodeHasAnyBefore(node) && atGroupBoundary) {
        // A <group> first primary boundary is artificially added to FractionalUCA.txt.
        // It is reachable via its special contraction, but is not normally used.
        // Find the first character tailored after the boundary CE,
        // or the first real root CE after it.
        index = nextIndexFromNode(node);
        if(index != 0) {
            // If there is a following node, then it must be tailored
            // because there are no root CEs with a boundary primary
            // and non-common secondary/tertiary weights.
            node = nodes.elementAti(index);
            U_ASSERT(isTailoredNode(node));
            resetCE = tempCEFromIndexAndStrength(index, strength);
        } else {
            U_ASSERT(strength == UCOL_PRIMARY);
            uint32_t primary = static_cast<uint32_t>(resetCE >> 32);
            const int32_t primaryIndex = rootElements.findPrimary(primary);
            const UBool compressible = baseData->isCompressiblePrimary(primary);
            primary = rootElements.getPrimaryAfter(primary, primaryIndex, compressible);
            resetCE = Collation::makeCE(primary);
            index = findOrInsertNodeForRootCE(resetCE, UCOL_PRIMARY, errorCode);
            if(U_FAILURE(errorCode)) {
                return 0;
            }
            node = nodes.elementAti(index);
        }
    }
    if(nodeHasAnyBefore(node)) {
        // Get the first node that was tailored before this one at a weaker strength.
        if(nodeHasBefore2(node)) {
            index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
            node = nodes.elementAti(index);
        }
        if(nodeHasBefore3(node)) {
            index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
        }
        U_ASSERT(isTailoredNode(nodes.elementAti(index)));
        resetCE = tempCEFromIndexAndStrength(index, strength);
    }
    return resetCE;
}

void

CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,

650

const UnicodeString &str, const UnicodeString &extension,

651

const char *&parserErrorReason, UErrorCode &errorCode) {

652

if(U_FAILURE(errorCode)) { return; }

653

UnicodeString nfdPrefix;

654

if(!prefix.isEmpty()) {

655

nfd.normalize(prefix, nfdPrefix, errorCode);

656

if(U_FAILURE(errorCode)) {

657

parserErrorReason = "normalizing the relation prefix";

return;

}

}

UnicodeString nfdString = nfd.normalize(str, errorCode);

662

if(U_FAILURE(errorCode)) {

663

parserErrorReason = "normalizing the relation string";

return;

}

// The runtime code decomposes Hangul syllables on the fly,

668

// with recursive processing but without making the Jamo pieces visible for matching.

669

// It does not work with certain types of contextual mappings.

670

int32_t nfdLength = nfdString.length();

671

if(nfdLength >= 2) {

672

UChar c = nfdString.charAt(0);

673

if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {

674

// While handling a Hangul syllable, contractions starting with Jamo L or V

675

// would not see the following Jamo of that syllable.

676

errorCode = U_UNSUPPORTED_ERROR;

677

parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";

678

return;

679

}

680

c = nfdString.charAt(nfdLength - 1);

681

if(Hangul::isJamoL(c) ||

682

(Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {

683

// A contraction ending with Jamo L or L+V would require

684

// generating Hangul syllables in addTailComposites() (588 for a Jamo L),

685

// or decomposing a following Hangul syllable on the fly, during contraction matching.

686

errorCode = U_UNSUPPORTED_ERROR;

687

parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";

688

return;

689

}

690

// A Hangul syllable completely inside a contraction is ok.

691

}

692

// Note: If there is a prefix, then the parser checked that

Frank Tang

7e7574b

2021-04-13 21:19:13 -0700

[diff] [blame]

693

// both the prefix and the string begin with NFC boundaries (not Jamo V or T).

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

694

// Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))

695

// (While handling a Hangul syllable, prefixes on Jamo V or T

696

// would not see the previous Jamo of that syllable.)

697

698

if(strength != UCOL_IDENTICAL) {

699

// Find the node index after which we insert the new tailored node.

700

int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);

701

U_ASSERT(cesLength > 0);

702

int64_t ce = ces[cesLength - 1];

703

if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {

704

// There is no primary gap between ignorables and the space-first-primary.

705

errorCode = U_UNSUPPORTED_ERROR;

706

parserErrorReason = "tailoring primary after ignorables not supported";

707

return;

708

}

709

if(strength == UCOL_QUATERNARY && ce == 0) {

710

// The CE data structure does not support non-zero quaternary weights

711

// on tertiary ignorables.

712

errorCode = U_UNSUPPORTED_ERROR;

713

parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";

714

return;

715

}

716

// Insert the new tailored node.

717

index = insertTailoredNodeAfter(index, strength, errorCode);

718

if(U_FAILURE(errorCode)) {

719

parserErrorReason = "modifying collation elements";

720

return;

721

}

722

// Strength of the temporary CE:

723

// The new relation may yield a stronger CE but not a weaker one.

724

int32_t tempStrength = ceStrength(ce);

725

if(strength < tempStrength) { tempStrength = strength; }

726

ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);

727

}

728

729

setCaseBits(nfdString, parserErrorReason, errorCode);

730

if(U_FAILURE(errorCode)) { return; }

731

732

int32_t cesLengthBeforeExtension = cesLength;

733

if(!extension.isEmpty()) {

734

UnicodeString nfdExtension = nfd.normalize(extension, errorCode);

735

if(U_FAILURE(errorCode)) {

736

parserErrorReason = "normalizing the relation extension";

737

return;

738

}

739

cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);

740

if(cesLength > Collation::MAX_EXPANSION_LENGTH) {

741

errorCode = U_ILLEGAL_ARGUMENT_ERROR;

742

parserErrorReason =

743

"extension string adds too many collation elements (more than 31 total)";

return;

}

}

uint32_t ce32 = Collation::UNASSIGNED_CE32;

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

748

if(!icu4xMode && (prefix != nfdPrefix || str != nfdString) &&

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

749

!ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {

750

// Map from the original input to the CEs.

751

// We do this in case the canonical closure is incomplete,

752

// so that it is possible to explicitly provide the missing mappings.

753

ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);

754

}

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

755

if (!icu4xMode) {

756

addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);

757

} else {

758

addIfDifferent(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);

759

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

760

if(U_FAILURE(errorCode)) {

761

parserErrorReason = "writing collation elements";

762

return;

763

}

764

cesLength = cesLengthBeforeExtension;

}

int32_t

CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,

769

UErrorCode &errorCode) {

770

if(U_FAILURE(errorCode)) { return 0; }

771

U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);

772

773

// Find the last CE that is at least as "strong" as the requested difference.

774

// Note: Stronger is smaller (UCOL_PRIMARY=0).

775

int64_t ce;

776

for(;; --cesLength) {

if(cesLength == 0) {

ce = ces[0] = 0;

cesLength = 1;

break;

} else {

ce = ces[cesLength - 1];

783

}

784

if(ceStrength(ce) <= strength) { break; }

}

if(isTempCE(ce)) {

// No need to findCommonNode() here for lower levels

789

// because insertTailoredNodeAfter() will do that anyway.

790

return indexFromTempCE(ce);

}

// root CE

if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {

795

errorCode = U_UNSUPPORTED_ERROR;

796

parserErrorReason = "tailoring relative to an unassigned code point not supported";

797

return 0;

798

}

799

return findOrInsertNodeForRootCE(ce, strength, errorCode);

}

int32_t

CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {

804

if(U_FAILURE(errorCode)) { return 0; }

805

U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);

806

807

// Find or insert the node for each of the root CE's weights,

808

// down to the requested level/strength.

809

// Root CEs must have common=zero quaternary weights (for which we never insert any nodes).

810

U_ASSERT((ce & 0xc0) == 0);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

811

int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

812

if(strength >= UCOL_SECONDARY) {

813

uint32_t lower32 = (uint32_t)ce;

814

index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);

815

if(strength >= UCOL_TERTIARY) {

816

index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,

817

UCOL_TERTIARY, errorCode);

}

}

return index;

}

namespace {

/**
 * Like Java Collections.binarySearch(List, key, Comparator).
 *
 * Searches rootPrimaryIndexes, whose entries are indexes into nodes[],
 * for the node whose upper 32 bits equal the primary weight p.
 * The entries must be sorted by ascending primary weight of the nodes they refer to.
 *
 * @param rootPrimaryIndexes sorted array of indexes into nodes[]
 * @param length number of entries in rootPrimaryIndexes
 * @param nodes the node array; the primary weight is read from bits 63..32
 * @param p the primary weight to look for
 * @return the index>=0 where the item was found,
 *         or the index<0 for inserting the primary at ~index in sorted order
 *         (index into rootPrimaryIndexes)
 */
int32_t
binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
                               const int64_t *nodes, uint32_t p) {
    if(length == 0) { return ~0; }
    int32_t start = 0;
    int32_t limit = length;
    for (;;) {
        // Midpoint computed without adding start+limit, which could overflow int32_t.
        int32_t i = start + (limit - start) / 2;
        int64_t node = nodes[rootPrimaryIndexes[i]];
        uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
        if (p == nodePrimary) {
            return i;
        } else if (p < nodePrimary) {
            if (i == start) {
                return ~start;  // insert p before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert p after i
            }
            start = i;
        }
    }
}

}  // namespace

int32_t

CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {

862

if(U_FAILURE(errorCode)) { return 0; }

863

864

int32_t rootIndex = binarySearchForRootPrimaryNode(

865

rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);

866

if(rootIndex >= 0) {

867

return rootPrimaryIndexes.elementAti(rootIndex);

868

} else {

869

// Start a new list of nodes with this primary.

870

int32_t index = nodes.size();

871

nodes.addElement(nodeFromWeight32(p), errorCode);

872

rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);

return index;

}

}

int32_t

CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {

879

if(U_FAILURE(errorCode)) { return 0; }

880

U_ASSERT(0 <= index && index < nodes.size());

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

881

U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

882

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

883

if(weight16 == Collation::COMMON_WEIGHT16) {

884

return findCommonNode(index, level);

885

}

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

886

887

// If this will be the first below-common weight for the parent node,

888

// then we will also need to insert a common weight after it.

889

int64_t node = nodes.elementAti(index);

890

U_ASSERT(strengthFromNode(node) < level); // parent node is stronger

891

if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {

892

int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;

893

if((node & hasThisLevelBefore) == 0) {

894

// The parent node has an implied level-common weight.

895

int64_t commonNode =

896

nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);

897

if(level == UCOL_SECONDARY) {

898

// Move the HAS_BEFORE3 flag from the parent node

899

// to the new secondary common node.

900

commonNode |= node & HAS_BEFORE3;

901

node &= ~(int64_t)HAS_BEFORE3;

902

}

903

nodes.setElementAt(node | hasThisLevelBefore, index);

904

// Insert below-common-weight node.

905

int32_t nextIndex = nextIndexFromNode(node);

906

node = nodeFromWeight16(weight16) | nodeFromStrength(level);

907

index = insertNodeBetween(index, nextIndex, node, errorCode);

908

// Insert common-weight node.

909

insertNodeBetween(index, nextIndex, commonNode, errorCode);

910

// Return index of below-common-weight node.

return index;

}

}

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

915

// Find the root CE's weight for this level.

916

// Postpone insertion if not found:

917

// Insert the new root node before the next stronger node,

918

// or before the next root node with the same strength and a larger weight.

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

919

int32_t nextIndex;

920

while((nextIndex = nextIndexFromNode(node)) != 0) {

921

node = nodes.elementAti(nextIndex);

922

int32_t nextStrength = strengthFromNode(node);

923

if(nextStrength <= level) {

924

// Insert before a stronger node.

925

if(nextStrength < level) { break; }

926

// nextStrength == level

927

if(!isTailoredNode(node)) {

928

uint32_t nextWeight16 = weight16FromNode(node);

929

if(nextWeight16 == weight16) {

930

// Found the node for the root CE up to this level.

931

return nextIndex;

932

}

933

// Insert before a node with a larger same-strength weight.

934

if(nextWeight16 > weight16) { break; }

935

}

936

}

937

// Skip the next node.

938

index = nextIndex;

939

}

940

node = nodeFromWeight16(weight16) | nodeFromStrength(level);

941

return insertNodeBetween(index, nextIndex, node, errorCode);

}

int32_t

CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {

946

if(U_FAILURE(errorCode)) { return 0; }

947

U_ASSERT(0 <= index && index < nodes.size());

948

if(strength >= UCOL_SECONDARY) {

949

index = findCommonNode(index, UCOL_SECONDARY);

950

if(strength >= UCOL_TERTIARY) {

951

index = findCommonNode(index, UCOL_TERTIARY);

952

}

953

}

954

// Postpone insertion:

955

// Insert the new node before the next one with a strength at least as strong.

956

int64_t node = nodes.elementAti(index);

957

int32_t nextIndex;

958

while((nextIndex = nextIndexFromNode(node)) != 0) {

959

node = nodes.elementAti(nextIndex);

960

if(strengthFromNode(node) <= strength) { break; }

961

// Skip the next node which has a weaker (larger) strength than the new one.

962

index = nextIndex;

963

}

964

node = IS_TAILORED | nodeFromStrength(strength);

965

return insertNodeBetween(index, nextIndex, node, errorCode);

}

int32_t

CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,

970

UErrorCode &errorCode) {

971

if(U_FAILURE(errorCode)) { return 0; }

972

U_ASSERT(previousIndexFromNode(node) == 0);

973

U_ASSERT(nextIndexFromNode(node) == 0);

974

U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);

975

// Append the new node and link it to the existing nodes.

976

int32_t newIndex = nodes.size();

977

node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);

978

nodes.addElement(node, errorCode);

979

if(U_FAILURE(errorCode)) { return 0; }

980

// nodes[index].nextIndex = newIndex

981

node = nodes.elementAti(index);

982

nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);

983

// nodes[nextIndex].previousIndex = newIndex

984

if(nextIndex != 0) {

985

node = nodes.elementAti(nextIndex);

986

nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);

}

return newIndex;

}

int32_t

CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {

993

U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);

994

int64_t node = nodes.elementAti(index);

995

if(strengthFromNode(node) >= strength) {

996

// The current node is no stronger.

997

return index;

998

}

999

if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {

1000

// The current node implies the strength-common weight.

1001

return index;

1002

}

1003

index = nextIndexFromNode(node);

1004

node = nodes.elementAti(index);

1005

U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1006

weight16FromNode(node) < Collation::COMMON_WEIGHT16);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1007

// Skip to the explicit common node.

1008

do {

1009

index = nextIndexFromNode(node);

1010

node = nodes.elementAti(index);

1011

U_ASSERT(strengthFromNode(node) >= strength);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1012

} while(isTailoredNode(node) || strengthFromNode(node) > strength ||

1013

weight16FromNode(node) < Collation::COMMON_WEIGHT16);

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1014

U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);

return index;

}

void

CollationBuilder::setCaseBits(const UnicodeString &nfdString,

1020

const char *&parserErrorReason, UErrorCode &errorCode) {

1021

if(U_FAILURE(errorCode)) { return; }

1022

int32_t numTailoredPrimaries = 0;

1023

for(int32_t i = 0; i < cesLength; ++i) {

1024

if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }

1025

}

1026

// We should not be able to get too many case bits because

1027

// cesLength<=31==MAX_EXPANSION_LENGTH.

1028

// 31 pairs of case bits fit into an int64_t without setting its sign bit.

1029

U_ASSERT(numTailoredPrimaries <= 31);

1030

1031

int64_t cases = 0;

1032

if(numTailoredPrimaries > 0) {

1033

const UChar *s = nfdString.getBuffer();

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1034

UTF16CollationIterator baseCEs(baseData, false, s, s, s + nfdString.length());

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1035

int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;

1036

if(U_FAILURE(errorCode)) {

1037

parserErrorReason = "fetching root CEs for tailored string";

1038

return;

1039

}

1040

U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);

1041

1042

uint32_t lastCase = 0;

1043

int32_t numBasePrimaries = 0;

1044

for(int32_t i = 0; i < baseCEsLength; ++i) {

1045

int64_t ce = baseCEs.getCE(i);

1046

if((ce >> 32) != 0) {

1047

++numBasePrimaries;

1048

uint32_t c = ((uint32_t)ce >> 14) & 3;

1049

U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE

1050

if(numBasePrimaries < numTailoredPrimaries) {

1051

cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);

1052

} else if(numBasePrimaries == numTailoredPrimaries) {

1053

lastCase = c;

1054

} else if(c != lastCase) {

1055

// There are more base primary CEs than tailored primaries.

1056

// Set mixed case if the case bits of the remainder differ.

1057

lastCase = 1;

1058

// Nothing more can change.

break;

}

}

}

if(numBasePrimaries >= numTailoredPrimaries) {

1064

cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);

}

}

for(int32_t i = 0; i < cesLength; ++i) {

1069

int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits

1070

int32_t strength = ceStrength(ce);

1071

if(strength == UCOL_PRIMARY) {

1072

ce |= (cases & 3) << 14;

1073

cases >>= 2;

1074

} else if(strength == UCOL_TERTIARY) {

1075

// Tertiary CEs must have uppercase bits.

1076

// See the LDML spec, and comments in class CollationCompare.

1077

ce |= 0x8000;

1078

}

1079

// Tertiary ignorable CEs must have 0 case bits.

1080

// We set 0 case bits for secondary CEs too

1081

// since currently only U+0345 is cased and maps to a secondary CE,

1082

// and it is lowercase. Other secondaries are uncased.

1083

// See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.

ces[i] = ce;

}

}

void

CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,

1090

UErrorCode &errorCode) {

1091

if(U_FAILURE(errorCode)) { return; }

1092

dataBuilder->suppressContractions(set, errorCode);

1093

if(U_FAILURE(errorCode)) {

1094

parserErrorReason = "application of [suppressContractions [set]] failed";

}

}

void

CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,

1100

UErrorCode &errorCode) {

1101

if(U_FAILURE(errorCode)) { return; }

1102

optimizeSet.addAll(set);

}

uint32_t

CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

1107

const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

1108

UErrorCode &errorCode) {

1109

// Map from the NFD input to the CEs.

1110

ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);

1111

ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);

1112

addTailComposites(nfdPrefix, nfdString, errorCode);

return ce32;

}

uint32_t

CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

1118

const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

1119

UErrorCode &errorCode) {

1120

if(U_FAILURE(errorCode)) { return ce32; }

1121

1122

// Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)

1123

if(nfdPrefix.isEmpty()) {

1124

CanonicalIterator stringIter(nfdString, errorCode);

1125

if(U_FAILURE(errorCode)) { return ce32; }

1126

UnicodeString prefix;

1127

for(;;) {

1128

UnicodeString str = stringIter.next();

1129

if(str.isBogus()) { break; }

1130

if(ignoreString(str, errorCode) || str == nfdString) { continue; }

1131

ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);

1132

if(U_FAILURE(errorCode)) { return ce32; }

1133

}

1134

} else {

1135

CanonicalIterator prefixIter(nfdPrefix, errorCode);

1136

CanonicalIterator stringIter(nfdString, errorCode);

1137

if(U_FAILURE(errorCode)) { return ce32; }

1138

for(;;) {

1139

UnicodeString prefix = prefixIter.next();

1140

if(prefix.isBogus()) { break; }

1141

if(ignorePrefix(prefix, errorCode)) { continue; }

1142

UBool samePrefix = prefix == nfdPrefix;

1143

for(;;) {

1144

UnicodeString str = stringIter.next();

1145

if(str.isBogus()) { break; }

1146

if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }

1147

ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);

1148

if(U_FAILURE(errorCode)) { return ce32; }

}

stringIter.reset();

}

}

return ce32;

}

void

CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,

1158

UErrorCode &errorCode) {

1159

if(U_FAILURE(errorCode)) { return; }

1160

1161

// Look for the last starter in the NFD string.

1162

UChar32 lastStarter;

1163

int32_t indexAfterLastStarter = nfdString.length();

1164

for(;;) {

1165

if(indexAfterLastStarter == 0) { return; } // no starter at all

1166

lastStarter = nfdString.char32At(indexAfterLastStarter - 1);

1167

if(nfd.getCombiningClass(lastStarter) == 0) { break; }

1168

indexAfterLastStarter -= U16_LENGTH(lastStarter);

1169

}

1170

// No closure to Hangul syllables since we decompose them on the fly.

1171

if(Hangul::isJamoL(lastStarter)) { return; }

1172

1173

// Are there any composites whose decomposition starts with the lastStarter?

1174

// Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.

1175

// We might find some more equivalent mappings here if it did.

1176

UnicodeSet composites;

1177

if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }

1178

1179

UnicodeString decomp;

1180

UnicodeString newNFDString, newString;

1181

int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];

1182

UnicodeSetIterator iter(composites);

1183

while(iter.next()) {

1184

U_ASSERT(!iter.isString());

1185

UChar32 composite = iter.getCodepoint();

1186

nfd.getDecomposition(composite, decomp);

1187

if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,

1188

newNFDString, newString, errorCode)) {

1189

continue;

1190

}

1191

int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);

1192

if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {

1193

// Ignore mappings that we cannot store.

1194

continue;

1195

}

1196

// Note: It is possible that the newCEs do not make use of the mapping

1197

// for which we are adding the tail composites, in which case we might be adding

1198

// unnecessary mappings.

1199

// For example, when we add tail composites for ae^ (^=combining circumflex),

1200

// UCA discontiguous-contraction matching does not find any matches

1201

// for ae_^ (_=any combining diacritic below) *unless* there is also

1202

// a contraction mapping for ae.

1203

// Thus, if there is no ae contraction, then the ae^ mapping is ignored

1204

// while fetching the newCEs for ae_^.

1205

// TODO: Try to detect this effectively.

1206

// (Alternatively, print a warning when prefix contractions are missing.)

1207

1208

// We do not need an explicit mapping for the NFD strings.

1209

// It is fine if the NFD input collates like this via a sequence of mappings.

1210

// It also saves a little bit of space, and may reduce the set of characters with contractions.

1211

uint32_t ce32 = addIfDifferent(nfdPrefix, newString,

1212

newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);

1213

if(ce32 != Collation::UNASSIGNED_CE32) {

1214

// was different, was added

1215

addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);

}

}

}

/**
 * Tries to merge a composite (via its decomposition) into the tail of nfdString,
 * i.e. the last starter and the combining marks following it.
 *
 * On success, newNFDString holds the merged all-NFD string and newString holds
 * the same text with the composite replacing the last starter and the merged marks.
 *
 * @param nfdString the original string, in NFD
 * @param indexAfterLastStarter index in nfdString just after its last starter
 * @param composite the composite code point
 * @param decomp the decomposition of composite; its first code point must equal
 *        the last starter of nfdString
 * @param newNFDString output: merged NFD string (only meaningful when true is returned)
 * @param newString output: merged string with the composite
 *        (only meaningful when true is returned)
 * @param errorCode in/out error code; false is returned if it already indicates a failure
 * @return true if the merge succeeded; false for singleton decompositions,
 *         when nothing new would result, or when the merge would not be
 *         canonically equivalent or not FCD
 */
UBool
CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
                                           int32_t indexAfterLastStarter,
                                           UChar32 composite, const UnicodeString &decomp,
                                           UnicodeString &newNFDString, UnicodeString &newString,
                                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return false; }
    U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
    int32_t lastStarterLength = decomp.moveIndex32(0, 1);
    if(lastStarterLength == decomp.length()) {
        // Singleton decompositions should be found by addWithClosure()
        // and the CanonicalIterator, so we can ignore them here.
        return false;
    }
    if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
                         decomp, lastStarterLength, 0x7fffffff) == 0) {
        // same strings, nothing new to be found here
        return false;
    }

    // Make new FCD strings that combine a composite, or its decomposition,
    // into the nfdString's last starter and the combining marks following it.
    // Make an NFD version, and a version with the composite.
    newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
    newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);

    // The following is related to discontiguous contraction matching,
    // but builds only FCD strings (or else returns false).
    int32_t sourceIndex = indexAfterLastStarter;
    int32_t decompIndex = lastStarterLength;
    // Small optimization: We keep the source character across loop iterations
    // because we do not always consume it,
    // and then need not fetch it again nor look up its combining class again.
    UChar32 sourceChar = U_SENTINEL;
    // The cc variables need to be declared before the loop so that at the end
    // they are set to the last combining classes seen.
    uint8_t sourceCC = 0;
    uint8_t decompCC = 0;
    for(;;) {
        if(sourceChar < 0) {
            if(sourceIndex >= nfdString.length()) { break; }
            sourceChar = nfdString.char32At(sourceIndex);
            sourceCC = nfd.getCombiningClass(sourceChar);
            U_ASSERT(sourceCC != 0);
        }
        // We consume a decomposition character in each iteration.
        if(decompIndex >= decomp.length()) { break; }
        UChar32 decompChar = decomp.char32At(decompIndex);
        decompCC = nfd.getCombiningClass(decompChar);
        // Compare the two characters and their combining classes.
        if(decompCC == 0) {
            // Unable to merge because the source contains a non-zero combining mark
            // but the composite's decomposition contains another starter.
            // The strings would not be equivalent.
            return false;
        } else if(sourceCC < decompCC) {
            // Composite + sourceChar would not be FCD.
            return false;
        } else if(decompCC < sourceCC) {
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
        } else if(decompChar != sourceChar) {
            // Blocked because same combining class.
            return false;
        } else {  // match: decompChar == sourceChar
            newNFDString.append(decompChar);
            decompIndex += U16_LENGTH(decompChar);
            sourceIndex += U16_LENGTH(decompChar);
            sourceChar = U_SENTINEL;
        }
    }
    // We are at the end of at least one of the two inputs.
    if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
        if(sourceCC < decompCC) {
            // Appending the next source character to the composite would not be FCD.
            return false;
        }
        newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
        newString.append(nfdString, sourceIndex, 0x7fffffff);
    } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
        newNFDString.append(decomp, decompIndex, 0x7fffffff);
    }
    U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
    U_ASSERT(fcd.isNormalized(newString, errorCode));
    U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
    return true;
}

/**
 * Tells whether a prefix should be skipped when adding mappings.
 *
 * @param s the prefix to check
 * @param errorCode in/out error code, passed on to isFCD()
 * @return true if isFCD() returns false for s
 */
UBool
CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
    // Do not map non-FCD prefixes.
    return !isFCD(s, errorCode);
}

/**
 * Tells whether a string should be skipped when adding mappings.
 *
 * @param s the string to check
 * @param errorCode in/out error code, passed on to isFCD()
 * @return true if isFCD() returns false for s, or if s starts with a Hangul syllable
 */
UBool
CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
    // Do not map non-FCD strings.
    // Do not map strings that start with Hangul syllables: We decompose those on the fly.
    return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
}

/**
 * Checks the string with the fcd normalizer member.
 *
 * @param s the string to check
 * @param errorCode in/out error code
 * @return true if errorCode indicates success and fcd.isNormalized() accepts s;
 *         false otherwise, including when errorCode already indicates a failure
 */
UBool
CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
    return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
}

/**
 * For every code point in [:NFD_QC=N:] except the Hangul syllable range,
 * fetches the CEs of its NFD decomposition from the data builder and adds a
 * mapping from the composite to those CEs if they differ from what the
 * composite currently maps to.
 *
 * Overwrites the ces[] / cesLength members as scratch space.
 *
 * @param errorCode in/out error code
 */
void
CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
    if(U_FAILURE(errorCode)) { return; }
    // Hangul is decomposed on the fly during collation.
    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
    UnicodeString prefix;  // empty
    UnicodeString nfdString;
    UnicodeSetIterator iter(composites);
    while(iter.next()) {
        U_ASSERT(!iter.isString());
        nfd.getDecomposition(iter.getCodepoint(), nfdString);
        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
            // Too many CEs from the decomposition (unusual), ignore this composite.
            // We could add a capacity parameter to getCEs() and reallocate if necessary.
            // However, this can only really happen in contrived cases.
            continue;
        }
        const UnicodeString &composite(iter.getString());
        addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
    }
}

uint32_t

CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,

1352

const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,

1353

UErrorCode &errorCode) {

1354

if(U_FAILURE(errorCode)) { return ce32; }

1355

int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];

1356

int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);

1357

if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {

1358

if(ce32 == Collation::UNASSIGNED_CE32) {

1359

ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);

1360

}

1361

dataBuilder->addCE32(prefix, str, ce32, errorCode);

}

return ce32;

}

/**
 * Compares two CE arrays element by element.
 *
 * @param ces1 first CE array
 * @param ces1Length number of CEs in ces1
 * @param ces2 second CE array
 * @param ces2Length number of CEs in ces2
 * @return true if both arrays have the same length and the same contents
 */
UBool
CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
                          const int64_t ces2[], int32_t ces2Length) {
    if(ces1Length != ces2Length) {
        return false;
    }
    U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
    for(int32_t i = 0; i < ces1Length; ++i) {
        if(ces1[i] != ces2[i]) { return false; }
    }
    return true;
}

1378

1379

#ifdef DEBUG_COLLATION_BUILDER

1380

1381

/**
 * Debug helper: shifts a weight right by whole bytes until its lowest byte
 * is non-zero, so that trailing zero bytes are dropped.
 *
 * @param w the weight
 * @return w without its trailing zero bytes; 0 stays 0
 */
uint32_t
alignWeightRight(uint32_t w) {
    if(w != 0) {
        while((w & 0xff) == 0) { w >>= 8; }
    }
    return w;
}

#endif

void

CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {

1393

if(U_FAILURE(errorCode)) { return; }

1394

1395

CollationWeights primaries, secondaries, tertiaries;

1396

int64_t *nodesArray = nodes.getBuffer();

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1397

#ifdef DEBUG_COLLATION_BUILDER

1398

puts("\nCollationBuilder::makeTailoredCEs()");

1399

#endif

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1400

1401

for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {

1402

int32_t i = rootPrimaryIndexes.elementAti(rpi);

1403

int64_t node = nodesArray[i];

1404

uint32_t p = weight32FromNode(node);

1405

uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;

1406

uint32_t t = s;

1407

uint32_t q = 0;

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1408

UBool pIsTailored = false;

1409

UBool sIsTailored = false;

1410

UBool tIsTailored = false;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1411

#ifdef DEBUG_COLLATION_BUILDER

1412

printf("\nprimary %lx\n", (long)alignWeightRight(p));

1413

#endif

1414

int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);

1415

int32_t nextIndex = nextIndexFromNode(node);

1416

while(nextIndex != 0) {

1417

i = nextIndex;

1418

node = nodesArray[i];

1419

nextIndex = nextIndexFromNode(node);

1420

int32_t strength = strengthFromNode(node);

1421

if(strength == UCOL_QUATERNARY) {

1422

U_ASSERT(isTailoredNode(node));

1423

#ifdef DEBUG_COLLATION_BUILDER

printf(" quat+ ");

#endif

if(q == 3) {

errorCode = U_BUFFER_OVERFLOW_ERROR;

1428

errorReason = "quaternary tailoring gap too small";

return;

}

++q;

} else {

if(strength == UCOL_TERTIARY) {

1434

if(isTailoredNode(node)) {

1435

#ifdef DEBUG_COLLATION_BUILDER

printf(" ter+ ");

#endif

if(!tIsTailored) {

// First tailored tertiary node for [p, s].

1440

int32_t tCount = countTailoredNodes(nodesArray, nextIndex,

UCOL_TERTIARY) + 1;

uint32_t tLimit;

if(t == 0) {

// Gap at the beginning of the tertiary CE range.

1445

t = rootElements.getTertiaryBoundary() - 0x100;

1446

tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1447

} else if(!pIsTailored && !sIsTailored) {

1448

// p and s are root weights.

1449

tLimit = rootElements.getTertiaryAfter(pIndex, s, t);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1450

} else if(t == Collation::BEFORE_WEIGHT16) {

1451

tLimit = Collation::COMMON_WEIGHT16;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1452

} else {

1453

// [p, s] is tailored.

1454

U_ASSERT(t == Collation::COMMON_WEIGHT16);

1455

tLimit = rootElements.getTertiaryBoundary();

1456

}

1457

U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);

1458

tertiaries.initForTertiary();

1459

if(!tertiaries.allocWeights(t, tLimit, tCount)) {

1460

errorCode = U_BUFFER_OVERFLOW_ERROR;

1461

errorReason = "tertiary tailoring gap too small";

1462

return;

1463

}

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1464

tIsTailored = true;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1465

}

1466

t = tertiaries.nextWeight();

1467

U_ASSERT(t != 0xffffffff);

1468

} else {

1469

t = weight16FromNode(node);

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1470

tIsTailored = false;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1471

#ifdef DEBUG_COLLATION_BUILDER

1472

printf(" ter %lx\n", (long)alignWeightRight(t));

#endif

}

} else {

if(strength == UCOL_SECONDARY) {

1477

if(isTailoredNode(node)) {

1478

#ifdef DEBUG_COLLATION_BUILDER

printf(" sec+ ");

#endif

if(!sIsTailored) {

// First tailored secondary node for p.

1483

int32_t sCount = countTailoredNodes(nodesArray, nextIndex,

UCOL_SECONDARY) + 1;

uint32_t sLimit;

if(s == 0) {

// Gap at the beginning of the secondary CE range.

1488

s = rootElements.getSecondaryBoundary() - 0x100;

1489

sLimit = rootElements.getFirstSecondaryCE() >> 16;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1490

} else if(!pIsTailored) {

1491

// p is a root primary.

1492

sLimit = rootElements.getSecondaryAfter(pIndex, s);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1493

} else if(s == Collation::BEFORE_WEIGHT16) {

1494

sLimit = Collation::COMMON_WEIGHT16;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1495

} else {

1496

// p is a tailored primary.

1497

U_ASSERT(s == Collation::COMMON_WEIGHT16);

1498

sLimit = rootElements.getSecondaryBoundary();

1499

}

1500

if(s == Collation::COMMON_WEIGHT16) {

1501

// Do not tailor into the getSortKey() range of

1502

// compressed common secondaries.

1503

s = rootElements.getLastCommonSecondary();

1504

}

1505

secondaries.initForSecondary();

1506

if(!secondaries.allocWeights(s, sLimit, sCount)) {

1507

errorCode = U_BUFFER_OVERFLOW_ERROR;

1508

errorReason = "secondary tailoring gap too small";

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1509

#ifdef DEBUG_COLLATION_BUILDER

1510

printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",

1511

(long)alignWeightRight(s), (long)alignWeightRight(sLimit),

1512

(long)alignWeightRight(sCount));

1513

#endif

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1514

return;

1515

}

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1516

sIsTailored = true;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1517

}

1518

s = secondaries.nextWeight();

1519

U_ASSERT(s != 0xffffffff);

1520

} else {

1521

s = weight16FromNode(node);

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1522

sIsTailored = false;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1523

#ifdef DEBUG_COLLATION_BUILDER

1524

printf(" sec %lx\n", (long)alignWeightRight(s));

1525

#endif

1526

}

1527

} else /* UCOL_PRIMARY */ {

1528

U_ASSERT(isTailoredNode(node));

1529

#ifdef DEBUG_COLLATION_BUILDER

printf("pri+ ");

#endif

if(!pIsTailored) {

// First tailored primary node in this list.

1534

int32_t pCount = countTailoredNodes(nodesArray, nextIndex,

1535

UCOL_PRIMARY) + 1;

1536

UBool isCompressible = baseData->isCompressiblePrimary(p);

1537

uint32_t pLimit =

1538

rootElements.getPrimaryAfter(p, pIndex, isCompressible);

1539

primaries.initForPrimary(isCompressible);

1540

if(!primaries.allocWeights(p, pLimit, pCount)) {

1541

errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode?

1542

errorReason = "primary tailoring gap too small";

1543

return;

1544

}

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1545

pIsTailored = true;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1546

}

1547

p = primaries.nextWeight();

1548

U_ASSERT(p != 0xffffffff);

1549

s = Collation::COMMON_WEIGHT16;

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1550

sIsTailored = false;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1551

}

1552

t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1553

tIsTailored = false;

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

}

q = 0;

}

if(isTailoredNode(node)) {

1558

nodesArray[i] = Collation::makeCE(p, s, t, q);

1559

#ifdef DEBUG_COLLATION_BUILDER

1560

printf("%016llx\n", (long long)nodesArray[i]);

#endif

}

}

}

}

int32_t

CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {

1569

int32_t count = 0;

1570

for(;;) {

1571

if(i == 0) { break; }

1572

int64_t node = nodesArray[i];

1573

if(strengthFromNode(node) < strength) { break; }

1574

if(strengthFromNode(node) == strength) {

1575

if(isTailoredNode(node)) {

++count;

} else {

break;

}

}

i = nextIndexFromNode(node);

}

return count;

}

class CEFinalizer : public CollationDataBuilder::CEModifier {

1587

public:

1588

CEFinalizer(const int64_t *ces) : finalCEs(ces) {}

1589

virtual ~CEFinalizer();

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

1590

virtual int64_t modifyCE32(uint32_t ce32) const override {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1591

U_ASSERT(!Collation::isSpecialCE32(ce32));

1592

if(CollationBuilder::isTempCE32(ce32)) {

1593

// retain case bits

1594

return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);

1595

} else {

1596

return Collation::NO_CE;

1597

}

1598

}

Frank Tang

2021-11-08 14:04:04 -0800

[diff] [blame]

1599

virtual int64_t modifyCE(int64_t ce) const override {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1600

if(CollationBuilder::isTempCE(ce)) {

1601

// retain case bits

1602

return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);

1603

} else {

1604

return Collation::NO_CE;

}

}

private:

const int64_t *finalCEs;

1610

};

1611

1612

CEFinalizer::~CEFinalizer() {}

1613

1614

void

1615

CollationBuilder::finalizeCEs(UErrorCode &errorCode) {

1616

if(U_FAILURE(errorCode)) { return; }

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1617

LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(icu4xMode, errorCode), errorCode);

Jungshik Shin

2016-01-29 00:32:36 -0800

[diff] [blame]

1618

if(U_FAILURE(errorCode)) {

Jungshik Shin (jungshik at google)

2015-01-08 15:46:45 -0800

[diff] [blame]

1619

return;

1620

}

1621

newBuilder->initForTailoring(baseData, errorCode);

1622

CEFinalizer finalizer(nodes.getBuffer());

1623

newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);

1624

if(U_FAILURE(errorCode)) { return; }

1625

delete dataBuilder;

1626

dataBuilder = newBuilder.orphan();

}

int32_t

CollationBuilder::ceStrength(int64_t ce) {

1631

return

1632

isTempCE(ce) ? strengthFromTempCE(ce) :

1633

(ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :

1634

((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :

1635

ce != 0 ? UCOL_TERTIARY :

UCOL_IDENTICAL;

}

U_NAMESPACE_END

U_NAMESPACE_USE

/**
 * C API: creates a collator by building a tailoring from a rule string.
 *
 * @param rules the rule text; may be NULL only if rulesLength is 0
 * @param rulesLength length of rules; a negative value is passed on to
 *        UnicodeString as "the text is terminated"
 * @param normalizationMode passed through to internalBuildTailoring()
 * @param strength passed through to internalBuildTailoring()
 * @param parseError passed through to internalBuildTailoring()
 * @param pErrorCode in/out ICU error code; must not be NULL (it is
 *        dereferenced unconditionally). Set to U_ILLEGAL_ARGUMENT_ERROR for
 *        NULL rules with a non-zero length, and to U_MEMORY_ALLOCATION_ERROR
 *        if the collator object cannot be allocated.
 * @return the new collator, or NULL on failure
 */
U_CAPI UCollator * U_EXPORT2
ucol_openRules(const UChar *rules, int32_t rulesLength,
               UColAttributeValue normalizationMode, UCollationStrength strength,
               UParseError *parseError, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) { return nullptr; }
    if(rulesLength != 0 && rules == nullptr) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    RuleBasedCollator *collator = new RuleBasedCollator();
    if(collator == nullptr) {
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }

    // Wrap the caller's buffer; a negative length means the text is terminated.
    UnicodeString ruleString(static_cast<UBool>(rulesLength < 0), rules, rulesLength);
    collator->internalBuildTailoring(
            ruleString, strength, normalizationMode, parseError, nullptr, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        // The build failed: do not hand out a half-built collator.
        delete collator;
        return nullptr;
    }
    return collator->toUCollator();
}

1665

1666

// Number of UChars in the stack buffer declared by ucol_getUnsafeSet() below.
static const int32_t internalBufferSize = 512;

1667

1668

// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp

1669

// because it calls UnicodeSet "builder" code that depends on all Unicode properties,

1670

// and the rest of the collation "runtime" code only depends on normalization.

1671

// This function is not related to the collation builder,

1672

// but it did not seem worth moving it into its own .cpp file,

1673

// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.

1674

U_CAPI int32_t U_EXPORT2

1675

ucol_getUnsafeSet( const UCollator *coll,

USet *unsafe,

UErrorCode *status)

{

UChar buffer[internalBufferSize];

int32_t len = 0;

uset_clear(unsafe);

// cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant

1685

static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,

1686

0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };

1687

1688

// add chars that fail the fcd check

1689

uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);

1690

1691

// add lead/trail surrogates

1692

// (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,

1693

// not when testing code *points*)

1694

uset_addRange(unsafe, 0xd800, 0xdfff);

1695

1696

USet *contractions = uset_open(0,0);

1697

1698

int32_t i = 0, j = 0;

Frank Tang

2022-11-08 12:31:27 -0800

[diff] [blame^]

1699

ucol_getContractionsAndExpansions(coll, contractions, NULL, false, status);

Jungshik Shin (jungshik at google)