Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Unified Diff: source/i18n/rematch.cpp

Issue 2440913002: Update ICU to 58.1
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/i18n/reldtfmt.cpp ('k') | source/i18n/remtrans.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/i18n/rematch.cpp
diff --git a/source/i18n/rematch.cpp b/source/i18n/rematch.cpp
index c7aeac015ff3e99e9496f3a247f0e44ccbf419da..0e795f216c23d5ed19abcb04934dda1263204ab0 100644
--- a/source/i18n/rematch.cpp
+++ b/source/i18n/rematch.cpp
@@ -1,7 +1,9 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
**************************************************************************
-* Copyright (C) 2002-2015 International Business Machines Corporation *
-* and others. All rights reserved. *
+* Copyright (C) 2002-2016 International Business Machines Corporation
+* and others. All rights reserved.
**************************************************************************
*/
//
@@ -23,6 +25,7 @@
#include "unicode/utf16.h"
#include "uassert.h"
#include "cmemory.h"
+#include "cstr.h"
#include "uvector.h"
#include "uvectr32.h"
#include "uvectr64.h"
@@ -33,6 +36,7 @@
// #include <malloc.h> // Needed for heapcheck testing
+
U_NAMESPACE_BEGIN
// Default limit for the size of the back track stack, to avoid system
@@ -237,7 +241,7 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
return;
}
- if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
+ if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
if (fData == NULL) {
status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
@@ -782,7 +786,7 @@ UBool RegexMatcher::find(UErrorCode &status) {
if (fMatch) {
return TRUE;
}
- UTEXT_SETNATIVEINDEX(fInputText, pos);
+ UTEXT_SETNATIVEINDEX(fInputText, startPos);
}
if (startPos > testStartLimit) {
fMatch = FALSE;
@@ -2723,6 +2727,18 @@ inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatId
return (REStackFrame *)newFP;
}
+#if defined(REGEX_DEBUG)
+namespace {
+UnicodeString StringFromUText(UText *ut) {
+ UnicodeString result;
+ for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
+ result.append(c);
+ }
+ return result;
+}
+}
+#endif // REGEX_DEBUG
+
//--------------------------------------------------------------------------------
//
@@ -2742,32 +2758,10 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
int32_t opValue; // and the operand value.
#ifdef REGEX_RUN_DEBUG
- if (fTraceDebug)
- {
+ if (fTraceDebug) {
printf("MatchAt(startIdx=%ld)\n", startIdx);
- printf("Original Pattern: ");
- UChar32 c = utext_next32From(fPattern->fPattern, 0);
- while (c != U_SENTINEL) {
- if (c<32 || c>256) {
- c = '.';
- }
- printf("%c", c);
-
- c = UTEXT_NEXT32(fPattern->fPattern);
- }
- printf("\n");
- printf("Input String: ");
- c = utext_next32From(fInputText, 0);
- while (c != U_SENTINEL) {
- if (c<32 || c>256) {
- c = '.';
- }
- printf("%c", c);
-
- c = UTEXT_NEXT32(fInputText);
- }
- printf("\n");
- printf("\n");
+ printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+ printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
@@ -3936,28 +3930,38 @@ GC_Done:
// of this op in the pattern.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
+ if (!UTEXT_USES_U16(fInputText)) {
+ // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+ // The max length need not be exact; it just needs to be >= actual maximum.
+ maxML *= 3;
+ }
U_ASSERT(minML <= maxML);
U_ASSERT(minML >= 0);
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- int64_t *lbStartIdx = &fData[opValue+2];
- if (*lbStartIdx < 0) {
+ int64_t &lbStartIdx = fData[opValue+2];
+ if (lbStartIdx < 0) {
// First time through loop.
- *lbStartIdx = fp->fInputIdx - minML;
+ lbStartIdx = fp->fInputIdx - minML;
+ if (lbStartIdx > 0) {
+ // move index to a code point boundary, if it's not on one already.
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ }
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
- if (*lbStartIdx == 0) {
- (*lbStartIdx)--;
+ if (lbStartIdx == 0) {
+ (lbStartIdx)--;
} else {
- UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
- *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
@@ -3972,7 +3976,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
- fp->fInputIdx = *lbStartIdx;
+ fp->fInputIdx = lbStartIdx;
}
break;
@@ -4009,6 +4013,11 @@ GC_Done:
// Fetch the extra parameters of this op.
int32_t minML = (int32_t)pat[fp->fPatIdx++];
int32_t maxML = (int32_t)pat[fp->fPatIdx++];
+ if (!UTEXT_USES_U16(fInputText)) {
+ // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
+ // The max length need not be exact; it just needs to be >= actual maximum.
+ maxML *= 3;
+ }
int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
continueLoc = URX_VAL(continueLoc);
U_ASSERT(minML <= maxML);
@@ -4017,23 +4026,28 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- int64_t *lbStartIdx = &fData[opValue+2];
- if (*lbStartIdx < 0) {
+ int64_t &lbStartIdx = fData[opValue+2];
+ if (lbStartIdx < 0) {
// First time through loop.
- *lbStartIdx = fp->fInputIdx - minML;
+ lbStartIdx = fp->fInputIdx - minML;
+ if (lbStartIdx > 0) {
+ // move index to a code point boundary, if it's not on one already.
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ }
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
- if (*lbStartIdx == 0) {
- (*lbStartIdx)--;
+ if (lbStartIdx == 0) {
+ (lbStartIdx)--;
} else {
- UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
+ UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
(void)UTEXT_PREVIOUS32(fInputText);
- *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
+ lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
}
}
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
@@ -4048,7 +4062,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
- fp->fInputIdx = *lbStartIdx;
+ fp->fInputIdx = lbStartIdx;
}
break;
@@ -4310,29 +4324,8 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
#ifdef REGEX_RUN_DEBUG
if (fTraceDebug) {
printf("MatchAt(startIdx=%d)\n", startIdx);
- printf("Original Pattern: ");
- UChar32 c = utext_next32From(fPattern->fPattern, 0);
- while (c != U_SENTINEL) {
- if (c<32 || c>256) {
- c = '.';
- }
- printf("%c", c);
-
- c = UTEXT_NEXT32(fPattern->fPattern);
- }
- printf("\n");
- printf("Input String: ");
- c = utext_next32From(fInputText, 0);
- while (c != U_SENTINEL) {
- if (c<32 || c>256) {
- c = '.';
- }
- printf("%c", c);
-
- c = UTEXT_NEXT32(fInputText);
- }
- printf("\n");
- printf("\n");
+ printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
+ printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
}
#endif
@@ -5232,6 +5225,12 @@ GC_Done:
break;
}
}
+ if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
+ inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
+ // Capture group ended with an unpaired lead surrogate.
+ // Back reference is not permitted to match lead only of a surrogate pair.
+ success = FALSE;
+ }
if (success) {
fp->fInputIdx = inputIndex;
} else {
@@ -5444,21 +5443,24 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- int64_t *lbStartIdx = &fData[opValue+2];
- if (*lbStartIdx < 0) {
+ int64_t &lbStartIdx = fData[opValue+2];
+ if (lbStartIdx < 0) {
// First time through loop.
- *lbStartIdx = fp->fInputIdx - minML;
+ lbStartIdx = fp->fInputIdx - minML;
+ if (lbStartIdx > 0) {
+ U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+ }
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
- if (*lbStartIdx == 0) {
- (*lbStartIdx)--;
+ if (lbStartIdx == 0) {
+ lbStartIdx--;
} else {
- U16_BACK_1(inputBuf, 0, *lbStartIdx);
+ U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match. Backtrack out, and out of the
// Look Behind altogether.
@@ -5473,7 +5475,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will fall off the end of the loop.)
fp = StateSave(fp, fp->fPatIdx-3, status);
- fp->fInputIdx = *lbStartIdx;
+ fp->fInputIdx = lbStartIdx;
}
break;
@@ -5518,21 +5520,24 @@ GC_Done:
// Fetch (from data) the last input index where a match was attempted.
U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
- int64_t *lbStartIdx = &fData[opValue+2];
- if (*lbStartIdx < 0) {
+ int64_t &lbStartIdx = fData[opValue+2];
+ if (lbStartIdx < 0) {
// First time through loop.
- *lbStartIdx = fp->fInputIdx - minML;
+ lbStartIdx = fp->fInputIdx - minML;
+ if (lbStartIdx > 0) {
+ U16_SET_CP_START(inputBuf, 0, lbStartIdx);
+ }
} else {
// 2nd through nth time through the loop.
// Back up start position for match by one.
- if (*lbStartIdx == 0) {
- (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
+ if (lbStartIdx == 0) {
+ lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
} else {
- U16_BACK_1(inputBuf, 0, *lbStartIdx);
+ U16_BACK_1(inputBuf, 0, lbStartIdx);
}
}
- if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
+ if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
// We have tried all potential match starting points without
// getting a match, which means that the negative lookbehind as
// a whole has succeeded. Jump forward to the continue location
@@ -5547,7 +5552,7 @@ GC_Done:
// Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
// (successful match will cause a FAIL out of the loop altogether.)
fp = StateSave(fp, fp->fPatIdx-4, status);
- fp->fInputIdx = *lbStartIdx;
+ fp->fInputIdx = lbStartIdx;
}
break;
« no previous file with comments | « source/i18n/reldtfmt.cpp ('k') | source/i18n/remtrans.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698