Index: source/i18n/rematch.cpp |
diff --git a/source/i18n/rematch.cpp b/source/i18n/rematch.cpp |
index c7aeac015ff3e99e9496f3a247f0e44ccbf419da..0e795f216c23d5ed19abcb04934dda1263204ab0 100644 |
--- a/source/i18n/rematch.cpp |
+++ b/source/i18n/rematch.cpp |
@@ -1,7 +1,9 @@ |
+// Copyright (C) 2016 and later: Unicode, Inc. and others. |
+// License & terms of use: http://www.unicode.org/copyright.html |
/* |
************************************************************************** |
-* Copyright (C) 2002-2015 International Business Machines Corporation * |
-* and others. All rights reserved. * |
+* Copyright (C) 2002-2016 International Business Machines Corporation |
+* and others. All rights reserved. |
************************************************************************** |
*/ |
// |
@@ -23,6 +25,7 @@ |
#include "unicode/utf16.h" |
#include "uassert.h" |
#include "cmemory.h" |
+#include "cstr.h" |
#include "uvector.h" |
#include "uvectr32.h" |
#include "uvectr64.h" |
@@ -33,6 +36,7 @@ |
// #include <malloc.h> // Needed for heapcheck testing |
+ |
U_NAMESPACE_BEGIN |
// Default limit for the size of the back track stack, to avoid system |
@@ -237,7 +241,7 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) { |
return; |
} |
- if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { |
+ if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) { |
fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); |
if (fData == NULL) { |
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
@@ -782,7 +786,7 @@ UBool RegexMatcher::find(UErrorCode &status) { |
if (fMatch) { |
return TRUE; |
} |
- UTEXT_SETNATIVEINDEX(fInputText, pos); |
+ UTEXT_SETNATIVEINDEX(fInputText, startPos); |
} |
if (startPos > testStartLimit) { |
fMatch = FALSE; |
@@ -2723,6 +2727,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId |
return (REStackFrame *)newFP; |
} |
+#if defined(REGEX_DEBUG) |
+namespace { |
+UnicodeString StringFromUText(UText *ut) { |
+ UnicodeString result; |
+ for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) { |
+ result.append(c); |
+ } |
+ return result; |
+} |
+} |
+#endif // REGEX_DEBUG |
+ |
//-------------------------------------------------------------------------------- |
// |
@@ -2742,32 +2758,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { |
int32_t opValue; // and the operand value. |
#ifdef REGEX_RUN_DEBUG |
- if (fTraceDebug) |
- { |
+ if (fTraceDebug) { |
printf("MatchAt(startIdx=%ld)\n", startIdx); |
- printf("Original Pattern: "); |
- UChar32 c = utext_next32From(fPattern->fPattern, 0); |
- while (c != U_SENTINEL) { |
- if (c<32 || c>256) { |
- c = '.'; |
- } |
- printf("%c", c); |
- |
- c = UTEXT_NEXT32(fPattern->fPattern); |
- } |
- printf("\n"); |
- printf("Input String: "); |
- c = utext_next32From(fInputText, 0); |
- while (c != U_SENTINEL) { |
- if (c<32 || c>256) { |
- c = '.'; |
- } |
- printf("%c", c); |
- |
- c = UTEXT_NEXT32(fInputText); |
- } |
- printf("\n"); |
- printf("\n"); |
+ printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); |
+ printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); |
} |
#endif |
@@ -3936,28 +3930,38 @@ GC_Done: |
// of this op in the pattern. |
int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
+ if (!UTEXT_USES_U16(fInputText)) { |
+ // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. |
+ // The max length need not be exact; it just needs to be >= actual maximum. |
+ maxML *= 3; |
+ } |
U_ASSERT(minML <= maxML); |
U_ASSERT(minML >= 0); |
// Fetch (from data) the last input index where a match was attempted. |
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
- int64_t *lbStartIdx = &fData[opValue+2]; |
- if (*lbStartIdx < 0) { |
+ int64_t &lbStartIdx = fData[opValue+2]; |
+ if (lbStartIdx < 0) { |
// First time through loop. |
- *lbStartIdx = fp->fInputIdx - minML; |
+ lbStartIdx = fp->fInputIdx - minML; |
+ if (lbStartIdx > 0) { |
+ // move index to a code point boundary, if it's not on one already. |
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); |
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ } |
} else { |
// 2nd through nth time through the loop. |
// Back up start position for match by one. |
- if (*lbStartIdx == 0) { |
- (*lbStartIdx)--; |
+ if (lbStartIdx == 0) { |
+ (lbStartIdx)--; |
} else { |
- UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); |
(void)UTEXT_PREVIOUS32(fInputText); |
- *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
} |
} |
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { |
// We have tried all potential match starting points without |
// getting a match. Backtrack out, and out of the |
// Look Behind altogether. |
@@ -3972,7 +3976,7 @@ GC_Done: |
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
// (successful match will fall off the end of the loop.) |
fp = StateSave(fp, fp->fPatIdx-3, status); |
- fp->fInputIdx = *lbStartIdx; |
+ fp->fInputIdx = lbStartIdx; |
} |
break; |
@@ -4009,6 +4013,11 @@ GC_Done: |
// Fetch the extra parameters of this op. |
int32_t minML = (int32_t)pat[fp->fPatIdx++]; |
int32_t maxML = (int32_t)pat[fp->fPatIdx++]; |
+ if (!UTEXT_USES_U16(fInputText)) { |
+ // utf-8 fix to maximum match length. The pattern compiler assumes utf-16. |
+ // The max length need not be exact; it just needs to be >= actual maximum. |
+ maxML *= 3; |
+ } |
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; |
continueLoc = URX_VAL(continueLoc); |
U_ASSERT(minML <= maxML); |
@@ -4017,23 +4026,28 @@ GC_Done: |
// Fetch (from data) the last input index where a match was attempted. |
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
- int64_t *lbStartIdx = &fData[opValue+2]; |
- if (*lbStartIdx < 0) { |
+ int64_t &lbStartIdx = fData[opValue+2]; |
+ if (lbStartIdx < 0) { |
// First time through loop. |
- *lbStartIdx = fp->fInputIdx - minML; |
+ lbStartIdx = fp->fInputIdx - minML; |
+ if (lbStartIdx > 0) { |
+ // move index to a code point boundary, if it's not on one already. |
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); |
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ } |
} else { |
// 2nd through nth time through the loop. |
// Back up start position for match by one. |
- if (*lbStartIdx == 0) { |
- (*lbStartIdx)--; |
+ if (lbStartIdx == 0) { |
+ (lbStartIdx)--; |
} else { |
- UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); |
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx); |
(void)UTEXT_PREVIOUS32(fInputText); |
- *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); |
} |
} |
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { |
// We have tried all potential match starting points without |
// getting a match, which means that the negative lookbehind as |
// a whole has succeeded. Jump forward to the continue location |
@@ -4048,7 +4062,7 @@ GC_Done: |
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
// (successful match will cause a FAIL out of the loop altogether.) |
fp = StateSave(fp, fp->fPatIdx-4, status); |
- fp->fInputIdx = *lbStartIdx; |
+ fp->fInputIdx = lbStartIdx; |
} |
break; |
@@ -4310,29 +4324,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu |
#ifdef REGEX_RUN_DEBUG |
if (fTraceDebug) { |
printf("MatchAt(startIdx=%d)\n", startIdx); |
- printf("Original Pattern: "); |
- UChar32 c = utext_next32From(fPattern->fPattern, 0); |
- while (c != U_SENTINEL) { |
- if (c<32 || c>256) { |
- c = '.'; |
- } |
- printf("%c", c); |
- |
- c = UTEXT_NEXT32(fPattern->fPattern); |
- } |
- printf("\n"); |
- printf("Input String: "); |
- c = utext_next32From(fInputText, 0); |
- while (c != U_SENTINEL) { |
- if (c<32 || c>256) { |
- c = '.'; |
- } |
- printf("%c", c); |
- |
- c = UTEXT_NEXT32(fInputText); |
- } |
- printf("\n"); |
- printf("\n"); |
+ printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))()); |
+ printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))()); |
} |
#endif |
@@ -5232,6 +5225,12 @@ GC_Done: |
break; |
} |
} |
+ if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) && |
+ inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) { |
+ // Capture group ended with an unpaired lead surrogate. |
+ // Back reference is not permitted to match lead only of a surrogate pair. |
+ success = FALSE; |
+ } |
if (success) { |
fp->fInputIdx = inputIndex; |
} else { |
@@ -5444,21 +5443,24 @@ GC_Done: |
// Fetch (from data) the last input index where a match was attempted. |
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
- int64_t *lbStartIdx = &fData[opValue+2]; |
- if (*lbStartIdx < 0) { |
+ int64_t &lbStartIdx = fData[opValue+2]; |
+ if (lbStartIdx < 0) { |
// First time through loop. |
- *lbStartIdx = fp->fInputIdx - minML; |
+ lbStartIdx = fp->fInputIdx - minML; |
+ if (lbStartIdx > 0) { |
+ U16_SET_CP_START(inputBuf, 0, lbStartIdx); |
+ } |
} else { |
// 2nd through nth time through the loop. |
// Back up start position for match by one. |
- if (*lbStartIdx == 0) { |
- (*lbStartIdx)--; |
+ if (lbStartIdx == 0) { |
+ lbStartIdx--; |
} else { |
- U16_BACK_1(inputBuf, 0, *lbStartIdx); |
+ U16_BACK_1(inputBuf, 0, lbStartIdx); |
} |
} |
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { |
// We have tried all potential match starting points without |
// getting a match. Backtrack out, and out of the |
// Look Behind altogether. |
@@ -5473,7 +5475,7 @@ GC_Done: |
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
// (successful match will fall off the end of the loop.) |
fp = StateSave(fp, fp->fPatIdx-3, status); |
- fp->fInputIdx = *lbStartIdx; |
+ fp->fInputIdx = lbStartIdx; |
} |
break; |
@@ -5518,21 +5520,24 @@ GC_Done: |
// Fetch (from data) the last input index where a match was attempted. |
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); |
- int64_t *lbStartIdx = &fData[opValue+2]; |
- if (*lbStartIdx < 0) { |
+ int64_t &lbStartIdx = fData[opValue+2]; |
+ if (lbStartIdx < 0) { |
// First time through loop. |
- *lbStartIdx = fp->fInputIdx - minML; |
+ lbStartIdx = fp->fInputIdx - minML; |
+ if (lbStartIdx > 0) { |
+ U16_SET_CP_START(inputBuf, 0, lbStartIdx); |
+ } |
} else { |
// 2nd through nth time through the loop. |
// Back up start position for match by one. |
- if (*lbStartIdx == 0) { |
- (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. |
+ if (lbStartIdx == 0) { |
+ lbStartIdx--; // Because U16_BACK is unsafe starting at 0. |
} else { |
- U16_BACK_1(inputBuf, 0, *lbStartIdx); |
+ U16_BACK_1(inputBuf, 0, lbStartIdx); |
} |
} |
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { |
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) { |
// We have tried all potential match starting points without |
// getting a match, which means that the negative lookbehind as |
// a whole has succeeded. Jump forward to the continue location |
@@ -5547,7 +5552,7 @@ GC_Done: |
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop. |
// (successful match will cause a FAIL out of the loop altogether.) |
fp = StateSave(fp, fp->fPatIdx-4, status); |
- fp->fInputIdx = *lbStartIdx; |
+ fp->fInputIdx = lbStartIdx; |
} |
break; |