Index: source/common/rbbi.cpp |
diff --git a/source/common/rbbi.cpp b/source/common/rbbi.cpp |
index 19494af26a564a38909aac4af915be6d459f8b9c..2680bf216c789e9f9ee761b7fc939c6b847b6933 100644 |
--- a/source/common/rbbi.cpp |
+++ b/source/common/rbbi.cpp |
@@ -1,6 +1,8 @@ |
+// Copyright (C) 2016 and later: Unicode, Inc. and others. |
+// License & terms of use: http://www.unicode.org/copyright.html |
/* |
*************************************************************************** |
-* Copyright (C) 1999-2014 International Business Machines Corporation |
+* Copyright (C) 1999-2016 International Business Machines Corporation |
* and others. All rights reserved. |
*************************************************************************** |
*/ |
@@ -72,21 +74,6 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode |
} |
} |
-/** |
- * Same as above but does not adopt memory |
- */ |
-RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) |
-{ |
- init(); |
- fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor |
- if (U_FAILURE(status)) {return;} |
- if(fData == 0) { |
- status = U_MEMORY_ALLOCATION_ERROR; |
- return; |
- } |
-} |
- |
- |
// |
// Construct from precompiled binary rules (tables). This constructor is public API, |
// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). |
@@ -715,7 +702,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { |
// Move requested offset to a code point start. It might be on a trail surrogate, |
// or on a trail byte if the input is UTF-8. |
utext_setNativeIndex(fText, offset); |
- offset = utext_getNativeIndex(fText); |
+ offset = (int32_t)utext_getNativeIndex(fText); |
// if we have cached break positions and offset is in the range |
// covered by them, use them |
@@ -826,7 +813,7 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { |
// Move requested offset to a code point start. It might be on a trail surrogate, |
// or on a trail byte if the input is UTF-8. |
utext_setNativeIndex(fText, offset); |
- offset = utext_getNativeIndex(fText); |
+ offset = (int32_t)utext_getNativeIndex(fText); |
// if we have cached break positions and offset is in the range |
// covered by them, use them |
@@ -983,6 +970,54 @@ enum RBBIRunMode { |
}; |
+// Map from look-ahead break states (corresponds to rules) to boundary positions. |
+// Allows multiple lookahead break rules to be in flight at the same time. |
+// |
+// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers |
+// in the state table be sequential, then we can just index an array. And the |
+// table could also tell us in advance how big that array needs to be. |
+// |
+// Before ICU 57 there was just a single simple variable for a look-ahead match that |
+// was in progress. Two rules at once did not work. |
+ |
+static const int32_t kMaxLookaheads = 8; |
+struct LookAheadResults { |
+ int32_t fUsedSlotLimit; |
+ int32_t fPositions[8]; |
+ int16_t fKeys[8]; |
+ |
+ LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; |
+ |
+ int32_t getPosition(int16_t key) { |
+ for (int32_t i=0; i<fUsedSlotLimit; ++i) { |
+ if (fKeys[i] == key) { |
+ return fPositions[i]; |
+ } |
+ } |
+ U_ASSERT(FALSE); |
+ return -1; |
+ } |
+ |
+ void setPosition(int16_t key, int32_t position) { |
+ int32_t i; |
+ for (i=0; i<fUsedSlotLimit; ++i) { |
+ if (fKeys[i] == key) { |
+ fPositions[i] = position; |
+ return; |
+ } |
+ } |
+ if (i >= kMaxLookaheads) { |
+ U_ASSERT(FALSE); |
+ i = kMaxLookaheads - 1; |
+ } |
+ fKeys[i] = key; |
+ fPositions[i] = position; |
+ U_ASSERT(fUsedSlotLimit == i); |
+ fUsedSlotLimit = i + 1; |
+ } |
+}; |
+ |
+ |
//----------------------------------------------------------------------------------- |
// |
// handleNext(stateTable) |
@@ -1000,14 +1035,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { |
RBBIStateTableRow *row; |
UChar32 c; |
- int32_t lookaheadStatus = 0; |
- int32_t lookaheadTagIdx = 0; |
- int32_t result = 0; |
- int32_t initialPosition = 0; |
- int32_t lookaheadResult = 0; |
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; |
- const char *tableData = statetable->fTableData; |
- uint32_t tableRowLen = statetable->fRowLen; |
+ LookAheadResults lookAheadMatches; |
+ int32_t result = 0; |
+ int32_t initialPosition = 0; |
+ const char *tableData = statetable->fTableData; |
+ uint32_t tableRowLen = statetable->fRowLen; |
#ifdef RBBI_DEBUG |
if (fTrace) { |
@@ -1050,14 +1082,6 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { |
// We have already run the loop one last time with the |
// character set to the psueudo {eof} value. Now it is time |
// to unconditionally bail out. |
- if (lookaheadResult > result) { |
- // We ran off the end of the string with a pending look-ahead match. |
- // Treat this as if the look-ahead condition had been met, and return |
- // the match at the / position from the look-ahead rule. |
- result = lookaheadResult; |
- fLastRuleStatusIndex = lookaheadTagIdx; |
- lookaheadStatus = 0; |
- } |
break; |
} |
// Run the loop one last time with the fake end-of-input character category. |
@@ -1123,38 +1147,23 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { |
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. |
} |
- if (row->fLookAhead != 0) { |
- if (lookaheadStatus != 0 |
- && row->fAccepting == lookaheadStatus) { |
- // Lookahead match is completed. |
- result = lookaheadResult; |
- fLastRuleStatusIndex = lookaheadTagIdx; |
- lookaheadStatus = 0; |
- // TODO: make a standalone hard break in a rule work. |
- if (lookAheadHardBreak) { |
- UTEXT_SETNATIVEINDEX(fText, result); |
- return result; |
- } |
- // Look-ahead completed, but other rules may match further. Continue on |
- // TODO: junk this feature? I don't think it's used anywhwere. |
- goto continueOn; |
+ int16_t completedRule = row->fAccepting; |
+ if (completedRule > 0) { |
+ // Lookahead match is completed. |
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); |
+ if (lookaheadResult >= 0) { |
+ fLastRuleStatusIndex = row->fTagIdx; |
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult); |
+ return lookaheadResult; |
} |
- |
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
- lookaheadResult = r; |
- lookaheadStatus = row->fLookAhead; |
- lookaheadTagIdx = row->fTagIdx; |
- goto continueOn; |
} |
- |
- |
- if (row->fAccepting != 0) { |
- // Because this is an accepting state, any in-progress look-ahead match |
- // is no longer relavant. Clear out the pending lookahead status. |
- lookaheadStatus = 0; // clear out any pending look-ahead match. |
+ int16_t rule = row->fLookAhead; |
+ if (rule != 0) { |
+ // At the position of a '/' in a look-ahead match. Record it. |
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
+ lookAheadMatches.setPosition(rule, pos); |
} |
-continueOn: |
if (state == STOP_STATE) { |
// This is the normal exit from the lookup state machine. |
// We have advanced through the string until it is certain that no |
@@ -1216,11 +1225,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) |
RBBIRunMode mode; |
RBBIStateTableRow *row; |
UChar32 c; |
- int32_t lookaheadStatus = 0; |
+ LookAheadResults lookAheadMatches; |
int32_t result = 0; |
int32_t initialPosition = 0; |
- int32_t lookaheadResult = 0; |
- UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; |
#ifdef RBBI_DEBUG |
if (fTrace) { |
@@ -1266,13 +1273,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) |
// We have already run the loop one last time with the |
// character set to the psueudo {eof} value. Now it is time |
// to unconditionally bail out. |
- if (lookaheadResult < result) { |
- // We ran off the end of the string with a pending look-ahead match. |
- // Treat this as if the look-ahead condition had been met, and return |
- // the match at the / position from the look-ahead rule. |
- result = lookaheadResult; |
- lookaheadStatus = 0; |
- } else if (result == initialPosition) { |
+ if (result == initialPosition) { |
// Ran off start, no match found. |
// move one index one (towards the start, since we are doing a previous()) |
UTEXT_SETNATIVEINDEX(fText, initialPosition); |
@@ -1338,36 +1339,22 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) |
result = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
} |
- if (row->fLookAhead != 0) { |
- if (lookaheadStatus != 0 |
- && row->fAccepting == lookaheadStatus) { |
- // Lookahead match is completed. |
- result = lookaheadResult; |
- lookaheadStatus = 0; |
- // TODO: make a standalone hard break in a rule work. |
- if (lookAheadHardBreak) { |
- UTEXT_SETNATIVEINDEX(fText, result); |
- return result; |
- } |
- // Look-ahead completed, but other rules may match further. Continue on |
- // TODO: junk this feature? I don't think it's used anywhwere. |
- goto continueOn; |
+ int16_t completedRule = row->fAccepting; |
+ if (completedRule > 0) { |
+ // Lookahead match is completed. |
+ int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); |
+ if (lookaheadResult >= 0) { |
+ UTEXT_SETNATIVEINDEX(fText, lookaheadResult); |
+ return lookaheadResult; |
} |
- |
- int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
- lookaheadResult = r; |
- lookaheadStatus = row->fLookAhead; |
- goto continueOn; |
} |
- |
- |
- if (row->fAccepting != 0) { |
- // Because this is an accepting state, any in-progress look-ahead match |
- // is no longer relavant. Clear out the pending lookahead status. |
- lookaheadStatus = 0; |
+ int16_t rule = row->fLookAhead; |
+ if (rule != 0) { |
+ // At the position of a '/' in a look-ahead match. Record it. |
+ int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); |
+ lookAheadMatches.setPosition(rule, pos); |
} |
-continueOn: |
if (state == STOP_STATE) { |
// This is the normal exit from the lookup state machine. |
// We have advanced through the string until it is certain that no |