about summary refs log tree commit diff stats homepage
diff options
context:
space:
mode:
author: Serhiy Storchaka <storchaka@gmail.com> 2023-10-14 09:13:02 +0300
committer: GitHub <noreply@github.com> 2023-10-14 09:13:02 +0300
commit: e2b3d831fd2824d8a5713e3ed2a64aad0fb6b62d (patch)
tree: 3ab33a7a92325e48c5297ba9c3f4d8d9c38d3d00
parent: ca0f3d858d069231ce7c5b382790a774f385b467 (diff)
download: cpython-e2b3d831fd2824d8a5713e3ed2a64aad0fb6b62d.tar.gz
cpython-e2b3d831fd2824d8a5713e3ed2a64aad0fb6b62d.zip
gh-109747: Improve errors for unsupported look-behind patterns (GH-109859)
Now re.error is raised instead of OverflowError or RuntimeError when the width of a look-behind pattern is too large. The limit is increased to 2**32-1 (was 2**31-1).
-rw-r--r-- Lib/re/_compiler.py 4
-rw-r--r-- Lib/re/_parser.py 13
-rw-r--r-- Lib/test/test_re.py 23
-rw-r--r-- Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst 3
-rw-r--r-- Modules/_sre/sre.c 2
-rw-r--r-- Modules/_sre/sre_lib.h 14
6 files changed, 46 insertions, 13 deletions
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index d0a4c55caf6..f87712d6d6f 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -147,6 +147,8 @@ def _compile(code, pattern, flags):
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
+ if lo > MAXCODE:
+ raise error("looks too much behind")
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
@@ -547,7 +549,7 @@ def _compile_info(code, pattern, flags):
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
- emit(min(hi, MAXCODE))
+ emit(hi)
# add literal prefix
if prefix:
emit(len(prefix)) # length
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index d00b7e67d55..f3c779340fe 100644
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -67,6 +67,10 @@ FLAGS = {
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
GLOBAL_FLAGS = SRE_FLAG_DEBUG
+# Maximal value returned by SubPattern.getwidth().
+# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
+MAXWIDTH = 1 << 64
+
class State:
# keeps track of state for parsing
def __init__(self):
@@ -177,7 +181,7 @@ class SubPattern:
lo = hi = 0
for op, av in self.data:
if op is BRANCH:
- i = MAXREPEAT - 1
+ i = MAXWIDTH
j = 0
for av in av[1]:
l, h = av.getwidth()
@@ -196,7 +200,10 @@ class SubPattern:
elif op in _REPEATCODES:
i, j = av[2].getwidth()
lo = lo + i * av[0]
- hi = hi + j * av[1]
+ if av[1] == MAXREPEAT and j:
+ hi = MAXWIDTH
+ else:
+ hi = hi + j * av[1]
elif op in _UNITCODES:
lo = lo + 1
hi = hi + 1
@@ -216,7 +223,7 @@ class SubPattern:
hi = hi + j
elif op is SUCCESS:
break
- self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
+ self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
return self.width
class Tokenizer:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 301d4a51656..1eca22f4537 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1861,6 +1861,29 @@ class ReTests(unittest.TestCase):
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
+ def test_look_behind_overflow(self):
+ string = "x" * 2_500_000
+ p1 = r"(?<=((.{%d}){%d}){%d})"
+ p2 = r"(?<!((.{%d}){%d}){%d})"
+ # Test that the templates are valid and that look-behind patterns with
+ # width 2**21 (larger than sys.maxunicode) are supported.
+ self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
+ (2**21, 2**21))
+ self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
+ (0, 0))
+ # Test that 2**22 is accepted as a repetition number and look-behind
+ # width.
+ re.compile(p1 % (2**22, 1, 1))
+ re.compile(p1 % (1, 2**22, 1))
+ re.compile(p1 % (1, 1, 2**22))
+ re.compile(p2 % (2**22, 1, 1))
+ re.compile(p2 % (1, 2**22, 1))
+ re.compile(p2 % (1, 1, 2**22))
+ # But 2**66 (2**22 * 2**22 * 2**22) is too large for a look-behind width.
+ errmsg = "looks too much behind"
+ self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
+ self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
+
def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
self.checkPatternError('(?P=<foo>)',
diff --git a/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst
new file mode 100644
index 00000000000..b64ba627897
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst
@@ -0,0 +1,3 @@
+Improve errors for unsupported look-behind patterns. Now re.error is raised
+instead of OverflowError or RuntimeError when the width of a look-behind
+pattern is too large.
diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c
index 798732ccddc..0f134b194de 100644
--- a/Modules/_sre/sre.c
+++ b/Modules/_sre/sre.c
@@ -2070,8 +2070,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
GET_SKIP;
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
- if (arg & 0x80000000)
- FAIL; /* Width too large */
/* Stop 1 before the end; we check the SUCCESS below */
if (_validate_inner(code+1, code+skip-2, groups))
FAIL;
diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h
index 3c805aeeca0..92dd725c70f 100644
--- a/Modules/_sre/sre_lib.h
+++ b/Modules/_sre/sre_lib.h
@@ -591,8 +591,8 @@ entrance:
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
- TRACE(("reject (got %zd chars, need %zd)\n",
- end - ptr, (Py_ssize_t) pattern[3]));
+ TRACE(("reject (got %tu chars, need %zu)\n",
+ end - ptr, (size_t) pattern[3]));
RETURN_FAILURE;
}
pattern += pattern[1] + 1;
@@ -1509,7 +1509,7 @@ dispatch:
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", pattern,
ptr, pattern[1]));
- if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
+ if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
RETURN_FAILURE;
state->ptr = ptr - pattern[1];
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
@@ -1522,7 +1522,7 @@ dispatch:
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
ptr, pattern[1]));
- if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
+ if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
state->ptr = ptr - pattern[1];
LASTMARK_SAVE();
if (state->repeat)
@@ -1658,9 +1658,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
flags = pattern[2];
- if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
- TRACE(("reject (got %u chars, need %u)\n",
- (unsigned int)(end - ptr), pattern[3]));
+ if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
+ TRACE(("reject (got %tu chars, need %zu)\n",
+ end - ptr, (size_t) pattern[3]));
return 0;
}
if (pattern[3] > 1) {