Check in the pristine copy of ICU 52.1

The following directories we don't use are removed:

as_is
packaging
source/layout
source/layoutex

Better plural formatting, updated locale data, the segmentation support
for Chinese, Japanese (our code was upstreamed) and Khmer/Lao are among
many improvements. EcmaScript I18N API also need this update to be compliant
to the spec.

Because codereview.chromium.org keeps giving me a 500 error when uploading
the CL, so this initial check-in is done directly with svn.

BUG=132145
TEST=None


git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/icu52@259715 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
diff --git a/source/common/stringtriebuilder.cpp b/source/common/stringtriebuilder.cpp
new file mode 100644
index 0000000..109fcdc
--- /dev/null
+++ b/source/common/stringtriebuilder.cpp
@@ -0,0 +1,616 @@
+/*
+*******************************************************************************
+*   Copyright (C) 2010-2012, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*******************************************************************************
+*   file name:  stringtriebuilder.cpp
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2010dec24
+*   created by: Markus W. Scherer
+*/
+
+#include "utypeinfo.h"  // for 'typeid' to work
+#include "unicode/utypes.h"
+#include "unicode/stringtriebuilder.h"
+#include "uassert.h"
+#include "uhash.h"
+
+U_CDECL_BEGIN
+
+static int32_t U_CALLCONV
+hashStringTrieNode(const UHashTok key) {
+    return icu::StringTrieBuilder::hashNode(key.pointer);
+}
+
+static UBool U_CALLCONV
+equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
+    return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
+}
+
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
+
+StringTrieBuilder::~StringTrieBuilder() {
+    deleteCompactBuilder();
+}
+
+void
+StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
+                         sizeGuess, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        if(nodes==NULL) {
+          errorCode=U_MEMORY_ALLOCATION_ERROR;
+        } else {
+          uhash_setKeyDeleter(nodes, uprv_deleteUObject);
+        }
+    }
+}
+
+void
+StringTrieBuilder::deleteCompactBuilder() {
+    uhash_close(nodes);
+    nodes=NULL;
+}
+
+void
+StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
+                       UErrorCode &errorCode) {
+    if(buildOption==USTRINGTRIE_BUILD_FAST) {
+        writeNode(0, elementsLength, 0);
+    } else /* USTRINGTRIE_BUILD_SMALL */ {
+        createCompactBuilder(2*elementsLength, errorCode);
+        Node *root=makeNode(0, elementsLength, 0, errorCode);
+        if(U_SUCCESS(errorCode)) {
+            root->markRightEdgesFirst(-1);
+            root->write(*this);
+        }
+        deleteCompactBuilder();
+    }
+}
+
+// Requires start<limit,
+// and all strings of the [start..limit[ elements must be sorted and
+// have a common prefix of length unitIndex.
+int32_t
+StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
+    UBool hasValue=FALSE;
+    int32_t value=0;
+    int32_t type;
+    if(unitIndex==getElementStringLength(start)) {
+        // An intermediate or final value.
+        value=getElementValue(start++);
+        if(start==limit) {
+            return writeValueAndFinal(value, TRUE);  // final-value node
+        }
+        hasValue=TRUE;
+    }
+    // Now all [start..limit[ strings are longer than unitIndex.
+    int32_t minUnit=getElementUnit(start, unitIndex);
+    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
+    if(minUnit==maxUnit) {
+        // Linear-match node: All strings have the same character at unitIndex.
+        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
+        writeNode(start, limit, lastUnitIndex);
+        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
+        int32_t length=lastUnitIndex-unitIndex;
+        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
+        while(length>maxLinearMatchLength) {
+            lastUnitIndex-=maxLinearMatchLength;
+            length-=maxLinearMatchLength;
+            writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
+            write(getMinLinearMatch()+maxLinearMatchLength-1);
+        }
+        writeElementUnits(start, unitIndex, length);
+        type=getMinLinearMatch()+length-1;
+    } else {
+        // Branch node.
+        int32_t length=countElementUnits(start, limit, unitIndex);
+        // length>=2 because minUnit!=maxUnit.
+        writeBranchSubNode(start, limit, unitIndex, length);
+        if(--length<getMinLinearMatch()) {
+            type=length;
+        } else {
+            write(length);
+            type=0;
+        }
+    }
+    return writeValueAndType(hasValue, value, type);
+}
+
+// start<limit && all strings longer than unitIndex &&
+// length different units at unitIndex
+int32_t
+StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
+    UChar middleUnits[kMaxSplitBranchLevels];
+    int32_t lessThan[kMaxSplitBranchLevels];
+    int32_t ltLength=0;
+    while(length>getMaxBranchLinearSubNodeLength()) {
+        // Branch on the middle unit.
+        // First, find the middle unit.
+        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
+        // Encode the less-than branch first.
+        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
+        lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
+        ++ltLength;
+        // Continue for the greater-or-equal branch.
+        start=i;
+        length=length-length/2;
+    }
+    // For each unit, find its elements array start and whether it has a final value.
+    int32_t starts[kMaxBranchLinearSubNodeLength];
+    UBool isFinal[kMaxBranchLinearSubNodeLength-1];
+    int32_t unitNumber=0;
+    do {
+        int32_t i=starts[unitNumber]=start;
+        UChar unit=getElementUnit(i++, unitIndex);
+        i=indexOfElementWithNextUnit(i, unitIndex, unit);
+        isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
+        start=i;
+    } while(++unitNumber<length-1);
+    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
+    starts[unitNumber]=start;
+
+    // Write the sub-nodes in reverse order: The jump lengths are deltas from
+    // after their own positions, so if we wrote the minUnit sub-node first,
+    // then its jump delta would be larger.
+    // Instead we write the minUnit sub-node last, for a shorter delta.
+    int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
+    do {
+        --unitNumber;
+        if(!isFinal[unitNumber]) {
+            jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
+        }
+    } while(unitNumber>0);
+    // The maxUnit sub-node is written as the very last one because we do
+    // not jump for it at all.
+    unitNumber=length-1;
+    writeNode(start, limit, unitIndex+1);
+    int32_t offset=write(getElementUnit(start, unitIndex));
+    // Write the rest of this node's unit-value pairs.
+    while(--unitNumber>=0) {
+        start=starts[unitNumber];
+        int32_t value;
+        if(isFinal[unitNumber]) {
+            // Write the final value for the one string ending with this unit.
+            value=getElementValue(start);
+        } else {
+            // Write the delta to the start position of the sub-node.
+            value=offset-jumpTargets[unitNumber];
+        }
+        writeValueAndFinal(value, isFinal[unitNumber]);
+        offset=write(getElementUnit(start, unitIndex));
+    }
+    // Write the split-branch nodes.
+    while(ltLength>0) {
+        --ltLength;
+        writeDeltaTo(lessThan[ltLength]);
+        offset=write(middleUnits[ltLength]);
+    }
+    return offset;
+}
+
+// Requires start<limit,
+// and all strings of the [start..limit[ elements must be sorted and
+// have a common prefix of length unitIndex.
+StringTrieBuilder::Node *
+StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    UBool hasValue=FALSE;
+    int32_t value=0;
+    if(unitIndex==getElementStringLength(start)) {
+        // An intermediate or final value.
+        value=getElementValue(start++);
+        if(start==limit) {
+            return registerFinalValue(value, errorCode);
+        }
+        hasValue=TRUE;
+    }
+    Node *node;
+    // Now all [start..limit[ strings are longer than unitIndex.
+    int32_t minUnit=getElementUnit(start, unitIndex);
+    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
+    if(minUnit==maxUnit) {
+        // Linear-match node: All strings have the same character at unitIndex.
+        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
+        Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
+        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
+        int32_t length=lastUnitIndex-unitIndex;
+        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
+        while(length>maxLinearMatchLength) {
+            lastUnitIndex-=maxLinearMatchLength;
+            length-=maxLinearMatchLength;
+            node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
+            nextNode=registerNode(node, errorCode);
+        }
+        node=createLinearMatchNode(start, unitIndex, length, nextNode);
+    } else {
+        // Branch node.
+        int32_t length=countElementUnits(start, limit, unitIndex);
+        // length>=2 because minUnit!=maxUnit.
+        Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
+        node=new BranchHeadNode(length, subNode);
+    }
+    if(hasValue && node!=NULL) {
+        if(matchNodesCanHaveValues()) {
+            ((ValueNode *)node)->setValue(value);
+        } else {
+            node=new IntermediateValueNode(value, registerNode(node, errorCode));
+        }
+    }
+    return registerNode(node, errorCode);
+}
+
+// start<limit && all strings longer than unitIndex &&
+// length different units at unitIndex
+StringTrieBuilder::Node *
+StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
+                                   int32_t length, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    UChar middleUnits[kMaxSplitBranchLevels];
+    Node *lessThan[kMaxSplitBranchLevels];
+    int32_t ltLength=0;
+    while(length>getMaxBranchLinearSubNodeLength()) {
+        // Branch on the middle unit.
+        // First, find the middle unit.
+        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
+        // Create the less-than branch.
+        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
+        lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
+        ++ltLength;
+        // Continue for the greater-or-equal branch.
+        start=i;
+        length=length-length/2;
+    }
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    ListBranchNode *listNode=new ListBranchNode();
+    if(listNode==NULL) {
+        errorCode=U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    // For each unit, find its elements array start and whether it has a final value.
+    int32_t unitNumber=0;
+    do {
+        int32_t i=start;
+        UChar unit=getElementUnit(i++, unitIndex);
+        i=indexOfElementWithNextUnit(i, unitIndex, unit);
+        if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
+            listNode->add(unit, getElementValue(start));
+        } else {
+            listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
+        }
+        start=i;
+    } while(++unitNumber<length-1);
+    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
+    UChar unit=getElementUnit(start, unitIndex);
+    if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
+        listNode->add(unit, getElementValue(start));
+    } else {
+        listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
+    }
+    Node *node=registerNode(listNode, errorCode);
+    // Create the split-branch nodes.
+    while(ltLength>0) {
+        --ltLength;
+        node=registerNode(
+            new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
+    }
+    return node;
+}
+
+StringTrieBuilder::Node *
+StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        delete newNode;
+        return NULL;
+    }
+    if(newNode==NULL) {
+        errorCode=U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    const UHashElement *old=uhash_find(nodes, newNode);
+    if(old!=NULL) {
+        delete newNode;
+        return (Node *)old->key.pointer;
+    }
+    // If uhash_puti() returns a non-zero value from an equivalent, previously
+    // registered node, then uhash_find() failed to find that and we will leak newNode.
+#if U_DEBUG
+    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
+#endif
+    uhash_puti(nodes, newNode, 1, &errorCode);
+    U_ASSERT(oldValue==0);
+    if(U_FAILURE(errorCode)) {
+        delete newNode;
+        return NULL;
+    }
+    return newNode;
+}
+
+StringTrieBuilder::Node *
+StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return NULL;
+    }
+    FinalValueNode key(value);
+    const UHashElement *old=uhash_find(nodes, &key);
+    if(old!=NULL) {
+        return (Node *)old->key.pointer;
+    }
+    Node *newNode=new FinalValueNode(value);
+    if(newNode==NULL) {
+        errorCode=U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+    // If uhash_puti() returns a non-zero value from an equivalent, previously
+    // registered node, then uhash_find() failed to find that and we will leak newNode.
+#if U_DEBUG
+    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
+#endif
+    uhash_puti(nodes, newNode, 1, &errorCode);
+    U_ASSERT(oldValue==0);
+    if(U_FAILURE(errorCode)) {
+        delete newNode;
+        return NULL;
+    }
+    return newNode;
+}
+
+UBool
+StringTrieBuilder::hashNode(const void *node) {
+    return ((const Node *)node)->hashCode();
+}
+
+UBool
+StringTrieBuilder::equalNodes(const void *left, const void *right) {
+    return *(const Node *)left==*(const Node *)right;
+}
+
+UBool
+StringTrieBuilder::Node::operator==(const Node &other) const {
+    return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
+}
+
+int32_t
+StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        offset=edgeNumber;
+    }
+    return edgeNumber;
+}
+
+UBool
+StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!Node::operator==(other)) {
+        return FALSE;
+    }
+    const FinalValueNode &o=(const FinalValueNode &)other;
+    return value==o.value;
+}
+
+void
+StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
+    offset=builder.writeValueAndFinal(value, TRUE);
+}
+
+UBool
+StringTrieBuilder::ValueNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!Node::operator==(other)) {
+        return FALSE;
+    }
+    const ValueNode &o=(const ValueNode &)other;
+    return hasValue==o.hasValue && (!hasValue || value==o.value);
+}
+
+UBool
+StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!ValueNode::operator==(other)) {
+        return FALSE;
+    }
+    const IntermediateValueNode &o=(const IntermediateValueNode &)other;
+    return next==o.next;
+}
+
+int32_t
+StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
+    }
+    return edgeNumber;
+}
+
+void
+StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
+    next->write(builder);
+    offset=builder.writeValueAndFinal(value, FALSE);
+}
+
+UBool
+StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!ValueNode::operator==(other)) {
+        return FALSE;
+    }
+    const LinearMatchNode &o=(const LinearMatchNode &)other;
+    return length==o.length && next==o.next;
+}
+
+int32_t
+StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
+    }
+    return edgeNumber;
+}
+
+UBool
+StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!Node::operator==(other)) {
+        return FALSE;
+    }
+    const ListBranchNode &o=(const ListBranchNode &)other;
+    for(int32_t i=0; i<length; ++i) {
+        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
+int32_t
+StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        firstEdgeNumber=edgeNumber;
+        int32_t step=0;
+        int32_t i=length;
+        do {
+            Node *edge=equal[--i];
+            if(edge!=NULL) {
+                edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
+            }
+            // For all but the rightmost edge, decrement the edge number.
+            step=1;
+        } while(i>0);
+        offset=edgeNumber;
+    }
+    return edgeNumber;
+}
+
+void
+StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
+    // Write the sub-nodes in reverse order: The jump lengths are deltas from
+    // after their own positions, so if we wrote the minUnit sub-node first,
+    // then its jump delta would be larger.
+    // Instead we write the minUnit sub-node last, for a shorter delta.
+    int32_t unitNumber=length-1;
+    Node *rightEdge=equal[unitNumber];
+    int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
+    do {
+        --unitNumber;
+        if(equal[unitNumber]!=NULL) {
+            equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
+        }
+    } while(unitNumber>0);
+    // The maxUnit sub-node is written as the very last one because we do
+    // not jump for it at all.
+    unitNumber=length-1;
+    if(rightEdge==NULL) {
+        builder.writeValueAndFinal(values[unitNumber], TRUE);
+    } else {
+        rightEdge->write(builder);
+    }
+    offset=builder.write(units[unitNumber]);
+    // Write the rest of this node's unit-value pairs.
+    while(--unitNumber>=0) {
+        int32_t value;
+        UBool isFinal;
+        if(equal[unitNumber]==NULL) {
+            // Write the final value for the one string ending with this unit.
+            value=values[unitNumber];
+            isFinal=TRUE;
+        } else {
+            // Write the delta to the start position of the sub-node.
+            U_ASSERT(equal[unitNumber]->getOffset()>0);
+            value=offset-equal[unitNumber]->getOffset();
+            isFinal=FALSE;
+        }
+        builder.writeValueAndFinal(value, isFinal);
+        offset=builder.write(units[unitNumber]);
+    }
+}
+
+UBool
+StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!Node::operator==(other)) {
+        return FALSE;
+    }
+    const SplitBranchNode &o=(const SplitBranchNode &)other;
+    return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
+}
+
+int32_t
+StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        firstEdgeNumber=edgeNumber;
+        edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
+        offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
+    }
+    return edgeNumber;
+}
+
+void
+StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
+    // Encode the less-than branch first.
+    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
+    // Encode the greater-or-equal branch last because we do not jump for it at all.
+    greaterOrEqual->write(builder);
+    // Write this node.
+    U_ASSERT(lessThan->getOffset()>0);
+    builder.writeDeltaTo(lessThan->getOffset());  // less-than
+    offset=builder.write(unit);
+}
+
+UBool
+StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
+    if(this==&other) {
+        return TRUE;
+    }
+    if(!ValueNode::operator==(other)) {
+        return FALSE;
+    }
+    const BranchHeadNode &o=(const BranchHeadNode &)other;
+    return length==o.length && next==o.next;
+}
+
+int32_t
+StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
+    if(offset==0) {
+        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
+    }
+    return edgeNumber;
+}
+
+void
+StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
+    next->write(builder);
+    if(length<=builder.getMinLinearMatch()) {
+        offset=builder.writeValueAndType(hasValue, value, length-1);
+    } else {
+        builder.write(length-1);
+        offset=builder.writeValueAndType(hasValue, value, 0);
+    }
+}
+
+U_NAMESPACE_END