source/common/rbbi.cpp - Issue 2440913002: Update ICU to 58.1

Unified Diff: source/common/rbbi.cpp

Issue 2440913002: Update ICU to 58.1

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: source/common/rbbi.cpp

diff --git a/source/common/rbbi.cpp b/source/common/rbbi.cpp

index 19494af26a564a38909aac4af915be6d459f8b9c..2680bf216c789e9f9ee761b7fc939c6b847b6933 100644

--- a/source/common/rbbi.cpp

+++ b/source/common/rbbi.cpp

@@ -1,6 +1,8 @@

+// License & terms of use: http://www.unicode.org/copyright.html

***************************************************************************

@@ -72,21 +74,6 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode

}

-/**

- * Same as above but does not adopt memory

- */

-RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)

-{

- init();

- fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor

- if (U_FAILURE(status)) {return;}

- if(fData == 0) {

- status = U_MEMORY_ALLOCATION_ERROR;

- return;

- }

-}

// Construct from precompiled binary rules (tables). This constructor is public API,

// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().

@@ -715,7 +702,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {

// Move requested offset to a code point start. It might be on a trail surrogate,

// or on a trail byte if the input is UTF-8.

utext_setNativeIndex(fText, offset);

- offset = utext_getNativeIndex(fText);

+ offset = (int32_t)utext_getNativeIndex(fText);

// if we have cached break positions and offset is in the range

// covered by them, use them

@@ -826,7 +813,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {

// Move requested offset to a code point start. It might be on a trail surrogate,

// or on a trail byte if the input is UTF-8.

utext_setNativeIndex(fText, offset);

- offset = utext_getNativeIndex(fText);

+ offset = (int32_t)utext_getNativeIndex(fText);

// if we have cached break positions and offset is in the range

// covered by them, use them

@@ -983,6 +970,54 @@ enum RBBIRunMode {

};

+// Map from look-ahead break states (corresponds to rules) to boundary positions.

+// Allows multiple lookahead break rules to be in flight at the same time.

+//

+// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers

+// in the state table be sequential, then we can just index an array. And the

+// table could also tell us in advance how big that array needs to be.

+//

+// Before ICU 57 there was just a single simple variable for a look-ahead match that

+// was in progress. Two rules at once did not work.

+static const int32_t kMaxLookaheads = 8;

+struct LookAheadResults {

+ int32_t fUsedSlotLimit;

+ int32_t fPositions[8];

+ int16_t fKeys[8];

+ LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};

+ int32_t getPosition(int16_t key) {

+ for (int32_t i=0; i<fUsedSlotLimit; ++i) {

+ if (fKeys[i] == key) {

+ return fPositions[i];

+ }

+ }

+ U_ASSERT(FALSE);

+ return -1;

+ }

+ void setPosition(int16_t key, int32_t position) {

+ int32_t i;

+ for (i=0; i<fUsedSlotLimit; ++i) {

+ if (fKeys[i] == key) {

+ fPositions[i] = position;

+ return;

+ }

+ }

+ if (i >= kMaxLookaheads) {

+ U_ASSERT(FALSE);

+ i = kMaxLookaheads - 1;

+ }

+ fKeys[i] = key;

+ fPositions[i] = position;

+ U_ASSERT(fUsedSlotLimit == i);

+ fUsedSlotLimit = i + 1;

+ }

+};

//-----------------------------------------------------------------------------------

// handleNext(stateTable)

@@ -1000,14 +1035,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {

RBBIStateTableRow *row;

UChar32 c;

- int32_t lookaheadStatus = 0;

- int32_t lookaheadTagIdx = 0;

- int32_t result = 0;

- int32_t initialPosition = 0;

- int32_t lookaheadResult = 0;

- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

- const char *tableData = statetable->fTableData;

- uint32_t tableRowLen = statetable->fRowLen;

+ LookAheadResults lookAheadMatches;

+ int32_t result = 0;

+ int32_t initialPosition = 0;

+ const char *tableData = statetable->fTableData;

+ uint32_t tableRowLen = statetable->fRowLen;

#ifdef RBBI_DEBUG

if (fTrace) {

@@ -1050,14 +1082,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {

// We have already run the loop one last time with the

// character set to the psueudo {eof} value. Now it is time

// to unconditionally bail out.

- if (lookaheadResult > result) {

- // We ran off the end of the string with a pending look-ahead match.

- // Treat this as if the look-ahead condition had been met, and return

- // the match at the / position from the look-ahead rule.

- result = lookaheadResult;

- fLastRuleStatusIndex = lookaheadTagIdx;

- lookaheadStatus = 0;

- }

break;

}

// Run the loop one last time with the fake end-of-input character category.

@@ -1123,38 +1147,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {

fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.

}

- if (row->fLookAhead != 0) {

- if (lookaheadStatus != 0

- && row->fAccepting == lookaheadStatus) {

- // Lookahead match is completed.

- result = lookaheadResult;

- fLastRuleStatusIndex = lookaheadTagIdx;

- lookaheadStatus = 0;

- // TODO: make a standalone hard break in a rule work.

- if (lookAheadHardBreak) {

- UTEXT_SETNATIVEINDEX(fText, result);

- return result;

- }

- // Look-ahead completed, but other rules may match further. Continue on

- // TODO: junk this feature? I don't think it's used anywhwere.

- goto continueOn;

+ int16_t completedRule = row->fAccepting;

+ if (completedRule > 0) {

+ // Lookahead match is completed.

+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);

+ if (lookaheadResult >= 0) {

+ fLastRuleStatusIndex = row->fTagIdx;

+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);

+ return lookaheadResult;

}

- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

- lookaheadResult = r;

- lookaheadStatus = row->fLookAhead;

- lookaheadTagIdx = row->fTagIdx;

- goto continueOn;

}

- if (row->fAccepting != 0) {

- // Because this is an accepting state, any in-progress look-ahead match

- // is no longer relavant. Clear out the pending lookahead status.

- lookaheadStatus = 0; // clear out any pending look-ahead match.

+ int16_t rule = row->fLookAhead;

+ if (rule != 0) {

+ // At the position of a '/' in a look-ahead match. Record it.

+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);

+ lookAheadMatches.setPosition(rule, pos);

}

-continueOn:

if (state == STOP_STATE) {

// This is the normal exit from the lookup state machine.

// We have advanced through the string until it is certain that no

@@ -1216,11 +1225,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)

RBBIRunMode mode;

RBBIStateTableRow *row;

UChar32 c;

- int32_t lookaheadStatus = 0;

+ LookAheadResults lookAheadMatches;

int32_t result = 0;

int32_t initialPosition = 0;

- int32_t lookaheadResult = 0;

- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;

#ifdef RBBI_DEBUG

if (fTrace) {

@@ -1266,13 +1273,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)

// We have already run the loop one last time with the

// character set to the psueudo {eof} value. Now it is time

// to unconditionally bail out.

- if (lookaheadResult < result) {

- // We ran off the end of the string with a pending look-ahead match.

- // Treat this as if the look-ahead condition had been met, and return

- // the match at the / position from the look-ahead rule.

- result = lookaheadResult;

- lookaheadStatus = 0;

- } else if (result == initialPosition) {

+ if (result == initialPosition) {

// Ran off start, no match found.

// move one index one (towards the start, since we are doing a previous())

UTEXT_SETNATIVEINDEX(fText, initialPosition);

@@ -1338,36 +1339,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)

result = (int32_t)UTEXT_GETNATIVEINDEX(fText);

}

- if (row->fLookAhead != 0) {

- if (lookaheadStatus != 0

- && row->fAccepting == lookaheadStatus) {

- // Lookahead match is completed.

- result = lookaheadResult;

- lookaheadStatus = 0;

- // TODO: make a standalone hard break in a rule work.

- if (lookAheadHardBreak) {

- UTEXT_SETNATIVEINDEX(fText, result);

- return result;

- }

- // Look-ahead completed, but other rules may match further. Continue on

- // TODO: junk this feature? I don't think it's used anywhwere.

- goto continueOn;

+ int16_t completedRule = row->fAccepting;

+ if (completedRule > 0) {

+ // Lookahead match is completed.

+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);

+ if (lookaheadResult >= 0) {

+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult);

+ return lookaheadResult;

}

- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);

- lookaheadResult = r;

- lookaheadStatus = row->fLookAhead;

- goto continueOn;

}

- if (row->fAccepting != 0) {

- // Because this is an accepting state, any in-progress look-ahead match

- // is no longer relavant. Clear out the pending lookahead status.

- lookaheadStatus = 0;

+ int16_t rule = row->fLookAhead;

+ if (rule != 0) {

+ // At the position of a '/' in a look-ahead match. Record it.

+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);

+ lookAheadMatches.setPosition(rule, pos);

}

-continueOn:

if (state == STOP_STATE) {

// This is the normal exit from the lookup state machine.

// We have advanced through the string until it is certain that no

« no previous file with comments | « source/common/putilimp.h ('k') | source/common/rbbicst.pl » ('j') | no next file with comments »