about | summary | refs | log | tree | commit | diff | stats | homepage
path: root/Lib/test/test_re.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_re.py')
-rw-r--r-- Lib/test/test_re.py 300
1 files changed, 170 insertions, 130 deletions
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 70702e63483..0c8c676ed8b 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1,5 +1,5 @@
-from test.test_support import verbose, run_unittest, import_module
-from test.test_support import precisionbigmemtest, _2G
+from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G
+import io
import re
from re import Scanner
import sys
@@ -7,6 +7,8 @@ import string
import traceback
from weakref import proxy
+from test.test_bigmem import character_size
+
# Misc tests from Tim Peters' re.doc
@@ -18,6 +20,17 @@ import unittest
class ReTests(unittest.TestCase):
+ def test_keep_buffer(self):
+ # See bug 14212
+ b = bytearray(b'x')
+ it = re.finditer(b'a', b)
+ with self.assertRaises(BufferError):
+ b.extend(b'x'*400)
+ list(it)
+ del it
+ gc_collect()
+ b.extend(b'x'*400)
+
def test_weakref(self):
s = 'QabbbcR'
x = re.compile('ab+c')
@@ -84,31 +97,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
'abc\ndef\n')
- def test_bug_1140(self):
- # re.sub(x, y, u'') should return u'', not '', and
- # re.sub(x, y, '') should return '', not u''.
- # Also:
- # re.sub(x, y, unicode(x)) should return unicode(y), and
- # re.sub(x, y, str(x)) should return
- # str(y) if isinstance(y, str) else unicode(y).
- for x in 'x', u'x':
- for y in 'y', u'y':
- z = re.sub(x, y, u'')
- self.assertEqual(z, u'')
- self.assertEqual(type(z), unicode)
- #
- z = re.sub(x, y, '')
- self.assertEqual(z, '')
- self.assertEqual(type(z), str)
- #
- z = re.sub(x, y, unicode(x))
- self.assertEqual(z, y)
- self.assertEqual(type(z), unicode)
- #
- z = re.sub(x, y, str(x))
- self.assertEqual(z, y)
- self.assertEqual(type(z), type(y))
-
def test_bug_1661(self):
# Verify that flags do not get silently ignored with compiled patterns
pattern = re.compile('.')
@@ -359,6 +347,13 @@ class ReTests(unittest.TestCase):
self.assertNotEqual(re.match("^x{}$", "x{}"), None)
def test_getattr(self):
+ self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
+ self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
+ self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
+ self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
+ self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
+ {'first': 1, 'other': 2})
+
self.assertEqual(re.match("(a)", "a").pos, 0)
self.assertEqual(re.match("(a)", "a").endpos, 1)
self.assertEqual(re.match("(a)", "a").string, "a")
@@ -382,12 +377,12 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
self.assertEqual(re.search(r"\b(b.)\b",
- u"abcd abc bcd bx").group(1), "bx")
+ "abcd abc bcd bx").group(1), "bx")
self.assertEqual(re.search(r"\B(b.)\B",
- u"abc bcd bc abxd").group(1), "bx")
- self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
- self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
- self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
+ "abc bcd bc abxd").group(1), "bx")
+ self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
+ self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
+ self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
self.assertEqual(re.search(r"\d\D\w\W\s\S",
"1aa! a").group(0), "1aa! a")
self.assertEqual(re.search(r"\d\D\w\W\s\S",
@@ -422,10 +417,10 @@ class ReTests(unittest.TestCase):
self.assertEqual(len(re.findall(r"\B", " ")), 2)
def test_bigcharset(self):
- self.assertEqual(re.match(u"([\u2222\u2223])",
- u"\u2222").group(1), u"\u2222")
- self.assertEqual(re.match(u"([\u2222\u2223])",
- u"\u2222", re.UNICODE).group(1), u"\u2222")
+ self.assertEqual(re.match("([\u2222\u2223])",
+ "\u2222").group(1), "\u2222")
+ self.assertEqual(re.match("([\u2222\u2223])",
+ "\u2222", re.UNICODE).group(1), "\u2222")
def test_big_codesize(self):
# Issue #1160
@@ -455,7 +450,7 @@ class ReTests(unittest.TestCase):
def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
- self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
+ self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
@@ -475,7 +470,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
- self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
+ self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
def test_not_literal(self):
self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
@@ -501,24 +496,25 @@ class ReTests(unittest.TestCase):
def test_re_escape(self):
alnum_chars = string.ascii_letters + string.digits
- p = u''.join(unichr(i) for i in range(256))
+ p = ''.join(chr(i) for i in range(256))
for c in p:
if c in alnum_chars:
self.assertEqual(re.escape(c), c)
- elif c == u'\x00':
- self.assertEqual(re.escape(c), u'\\000')
+ elif c == '\x00':
+ self.assertEqual(re.escape(c), '\\000')
else:
- self.assertEqual(re.escape(c), u'\\' + c)
+ self.assertEqual(re.escape(c), '\\' + c)
self.assertMatch(re.escape(c), c)
self.assertMatch(re.escape(p), p)
def test_re_escape_byte(self):
alnum_chars = (string.ascii_letters + string.digits).encode('ascii')
- p = ''.join(chr(i) for i in range(256))
- for b in p:
+ p = bytes(range(256))
+ for i in p:
+ b = bytes([i])
if b in alnum_chars:
self.assertEqual(re.escape(b), b)
- elif b == b'\x00':
+ elif i == 0:
self.assertEqual(re.escape(b), b'\\000')
else:
self.assertEqual(re.escape(b), b'\\' + b)
@@ -526,30 +522,21 @@ class ReTests(unittest.TestCase):
self.assertMatch(re.escape(p), p)
def test_re_escape_non_ascii(self):
- s = u'xxx\u2620\u2620\u2620xxx'
+ s = 'xxx\u2620\u2620\u2620xxx'
s_escaped = re.escape(s)
- self.assertEqual(s_escaped, u'xxx\\\u2620\\\u2620\\\u2620xxx')
+ self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
self.assertMatch(s_escaped, s)
- self.assertMatch(u'.%s+.' % re.escape(u'\u2620'), s,
- u'x\u2620\u2620\u2620x', (2, 7), re.search)
+ self.assertMatch('.%s+.' % re.escape('\u2620'), s,
+ 'x\u2620\u2620\u2620x', (2, 7), re.search)
def test_re_escape_non_ascii_bytes(self):
- b = u'y\u2620y\u2620y'.encode('utf-8')
+ b = 'y\u2620y\u2620y'.encode('utf-8')
b_escaped = re.escape(b)
self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
self.assertMatch(b_escaped, b)
- res = re.findall(re.escape(u'\u2620'.encode('utf-8')), b)
+ res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
self.assertEqual(len(res), 2)
- def test_pickling(self):
- import pickle
- self.pickle_test(pickle)
- import cPickle
- self.pickle_test(cPickle)
- # old pickles expect the _compile() reconstructor in sre module
- import_module("sre", deprecated=True)
- from sre import _compile
-
def pickle_test(self, pickle):
oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
s = pickle.dumps(oldpat)
@@ -618,7 +605,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
def test_bug_612074(self):
- pat=u"["+re.escape(u"\u2039")+u"]"
+ pat="["+re.escape("\u2039")+"]"
self.assertEqual(re.compile(pat) and 1, 1)
def test_stack_overflow(self):
@@ -685,11 +672,7 @@ class ReTests(unittest.TestCase):
def test_bug_764548(self):
# bug 764548, re.compile() barfs on str/unicode subclasses
- try:
- unicode
- except NameError:
- return # no problem if we have no unicode
- class my_unicode(unicode): pass
+ class my_unicode(str): pass
pat = re.compile(my_unicode("abc"))
self.assertEqual(pat.match("xyz"), None)
@@ -699,26 +682,18 @@ class ReTests(unittest.TestCase):
[":", "::", ":::"])
def test_bug_926075(self):
- try:
- unicode
- except NameError:
- return # no problem if we have no unicode
self.assertTrue(re.compile('bug_926075') is not
- re.compile(eval("u'bug_926075'")))
+ re.compile(b'bug_926075'))
def test_bug_931848(self):
- try:
- unicode
- except NameError:
- pass
- pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"')
+ pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
self.assertEqual(re.compile(pattern).split("a.b.c"),
['a','b','c'])
def test_bug_581080(self):
iter = re.finditer(r"\s", "a b")
- self.assertEqual(iter.next().span(), (1,2))
- self.assertRaises(StopIteration, iter.next)
+ self.assertEqual(next(iter).span(), (1,2))
+ self.assertRaises(StopIteration, next, iter)
scanner = re.compile(r"\s").scanner("a b")
self.assertEqual(scanner.search().span(), (1, 2))
@@ -726,43 +701,43 @@ class ReTests(unittest.TestCase):
def test_bug_817234(self):
iter = re.finditer(r".*", "asdf")
- self.assertEqual(iter.next().span(), (0, 4))
- self.assertEqual(iter.next().span(), (4, 4))
- self.assertRaises(StopIteration, iter.next)
+ self.assertEqual(next(iter).span(), (0, 4))
+ self.assertEqual(next(iter).span(), (4, 4))
+ self.assertRaises(StopIteration, next, iter)
def test_bug_6561(self):
# '\d' should match characters in Unicode category 'Nd'
# (Number, Decimal Digit), but not those in 'Nl' (Number,
# Letter) or 'No' (Number, Other).
decimal_digits = [
- u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
- u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
- u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+ '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
+ '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
+ '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
]
for x in decimal_digits:
- self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
+ self.assertEqual(re.match('^\d$', x).group(0), x)
not_decimal_digits = [
- u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
- u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
- u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
- u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+ '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+ '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+ '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
+ '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
]
for x in not_decimal_digits:
- self.assertIsNone(re.match('^\d$', x, re.UNICODE))
+ self.assertIsNone(re.match('^\d$', x))
def test_empty_array(self):
# SF bug 1647541
import array
- for typecode in 'cbBuhHiIlLfd':
+ for typecode in 'bBuhHiIlLfd':
a = array.array(typecode)
- self.assertEqual(re.compile("bla").match(a), None)
- self.assertEqual(re.compile("").match(a).groups(), ())
+ self.assertEqual(re.compile(b"bla").match(a), None)
+ self.assertEqual(re.compile(b"").match(a).groups(), ())
def test_inline_flags(self):
# Bug #1700
- upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
- lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
+ upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
+ lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
p = re.compile(upper_char, re.I | re.U)
q = p.match(lower_char)
@@ -800,6 +775,66 @@ class ReTests(unittest.TestCase):
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
+ def test_bytes_str_mixing(self):
+ # Mixing str and bytes is disallowed
+ pat = re.compile('.')
+ bpat = re.compile(b'.')
+ self.assertRaises(TypeError, pat.match, b'b')
+ self.assertRaises(TypeError, bpat.match, 'b')
+ self.assertRaises(TypeError, pat.sub, b'b', 'c')
+ self.assertRaises(TypeError, pat.sub, 'b', b'c')
+ self.assertRaises(TypeError, pat.sub, b'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, b'b', 'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', b'c')
+ self.assertRaises(TypeError, bpat.sub, 'b', 'c')
+
+ def test_ascii_and_unicode_flag(self):
+ # String patterns
+ for flags in (0, re.UNICODE):
+ pat = re.compile('\xc0', flags | re.IGNORECASE)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', flags)
+ self.assertNotEqual(pat.match('\xe0'), None)
+ pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('(?a)\xc0', re.IGNORECASE)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('\w', re.ASCII)
+ self.assertEqual(pat.match('\xe0'), None)
+ pat = re.compile('(?a)\w')
+ self.assertEqual(pat.match('\xe0'), None)
+ # Bytes patterns
+ for flags in (0, re.ASCII):
+ pat = re.compile(b'\xc0', re.IGNORECASE)
+ self.assertEqual(pat.match(b'\xe0'), None)
+ pat = re.compile(b'\w')
+ self.assertEqual(pat.match(b'\xe0'), None)
+ # Incompatibilities
+ self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
+ self.assertRaises(ValueError, re.compile, b'(?u)\w')
+ self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
+ self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
+ self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
+ self.assertRaises(ValueError, re.compile, '(?au)\w')
+
+ def test_bug_6509(self):
+ # Replacement strings of both types must parse properly.
+ # all strings
+ pat = re.compile('a(\w)')
+ self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
+ pat = re.compile('a(.)')
+ self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
+ pat = re.compile('..')
+ self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
+
+ # all bytes
+ pat = re.compile(b'a(\w)')
+ self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
+ pat = re.compile(b'a(.)')
+ self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
+ pat = re.compile(b'..')
+ self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
+
def test_dealloc(self):
# issue 3299: check for segfault in debug build
import _sre
@@ -810,6 +845,7 @@ class ReTests(unittest.TestCase):
long_overflow = 2**128
self.assertRaises(TypeError, re.finditer, "a", {})
self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
+ self.assertRaises(TypeError, _sre.compile, {}, 0, [])
def test_compile(self):
# Test return value when given string and pattern as parameter
@@ -821,7 +857,7 @@ class ReTests(unittest.TestCase):
# Test behaviour when not given a string or pattern as parameter
self.assertRaises(TypeError, re.compile, 0)
- @precisionbigmemtest(size=_2G, memuse=1)
+ @bigmemtest(size=_2G, memuse=character_size)
def test_large_search(self, size):
# Issue #10182: indices were 32-bit-truncated.
s = 'a' * size
@@ -832,7 +868,7 @@ class ReTests(unittest.TestCase):
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
- @precisionbigmemtest(size=_2G, memuse=16 + 2)
+ @bigmemtest(size=_2G, memuse=16 + 2 * character_size)
def test_large_subn(self, size):
# Issue #10182: indices were 32-bit-truncated.
s = 'a' * size
@@ -844,7 +880,7 @@ class ReTests(unittest.TestCase):
def run_re_tests():
from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
if verbose:
- print 'Running re_tests test suite'
+ print('Running re_tests test suite')
else:
# To save time, only run the first and last 10 tests
#tests = tests[:10] + tests[-10:]
@@ -858,30 +894,30 @@ def run_re_tests():
elif len(t) == 3:
pattern, s, outcome = t
else:
- raise ValueError, ('Test tuples should have 3 or 5 fields', t)
+ raise ValueError('Test tuples should have 3 or 5 fields', t)
try:
obj = re.compile(pattern)
except re.error:
if outcome == SYNTAX_ERROR: pass # Expected a syntax error
else:
- print '=== Syntax error:', t
+ print('=== Syntax error:', t)
except KeyboardInterrupt: raise KeyboardInterrupt
except:
- print '*** Unexpected error ***', t
+ print('*** Unexpected error ***', t)
if verbose:
traceback.print_exc(file=sys.stdout)
else:
try:
result = obj.search(s)
- except re.error, msg:
- print '=== Unexpected exception', t, repr(msg)
+ except re.error as msg:
+ print('=== Unexpected exception', t, repr(msg))
if outcome == SYNTAX_ERROR:
# This should have been a syntax error; forget it.
pass
elif outcome == FAIL:
if result is None: pass # No match, as expected
- else: print '=== Succeeded incorrectly', t
+ else: print('=== Succeeded incorrectly', t)
elif outcome == SUCCEED:
if result is not None:
# Matched, as expected, so now we compute the
@@ -909,28 +945,30 @@ def run_re_tests():
vardict[i] = gi
repl = eval(repl, vardict)
if repl != expected:
- print '=== grouping error', t,
- print repr(repl) + ' should be ' + repr(expected)
+ print('=== grouping error', t, end=' ')
+ print(repr(repl) + ' should be ' + repr(expected))
else:
- print '=== Failed incorrectly', t
+ print('=== Failed incorrectly', t)
- # Try the match on a unicode string, and check that it
- # still succeeds.
+ # Try the match with both pattern and string converted to
+ # bytes, and check that it still succeeds.
try:
- result = obj.search(unicode(s, "latin-1"))
- if result is None:
- print '=== Fails on unicode match', t
- except NameError:
- continue # 1.5.2
- except TypeError:
- continue # unicode test case
-
- # Try the match on a unicode pattern, and check that it
- # still succeeds.
- obj=re.compile(unicode(pattern, "latin-1"))
- result = obj.search(s)
- if result is None:
- print '=== Fails on unicode pattern match', t
+ bpat = bytes(pattern, "ascii")
+ bs = bytes(s, "ascii")
+ except UnicodeEncodeError:
+ # skip non-ascii tests
+ pass
+ else:
+ try:
+ bpat = re.compile(bpat)
+ except Exception:
+ print('=== Fails on bytes pattern compile', t)
+ if verbose:
+ traceback.print_exc(file=sys.stdout)
+ else:
+ bytes_result = bpat.search(bs)
+ if bytes_result is None:
+ print('=== Fails on bytes pattern match', t)
# Try the match with the search area limited to the extent
# of the match and see if it still succeeds. \B will
@@ -942,28 +980,30 @@ def run_re_tests():
obj = re.compile(pattern)
result = obj.search(s, result.start(0), result.end(0) + 1)
if result is None:
- print '=== Failed on range-limited match', t
+ print('=== Failed on range-limited match', t)
# Try the match with IGNORECASE enabled, and check that it
# still succeeds.
obj = re.compile(pattern, re.IGNORECASE)
result = obj.search(s)
if result is None:
- print '=== Fails on case-insensitive match', t
+ print('=== Fails on case-insensitive match', t)
# Try the match with LOCALE enabled, and check that it
# still succeeds.
- obj = re.compile(pattern, re.LOCALE)
- result = obj.search(s)
- if result is None:
- print '=== Fails on locale-sensitive match', t
+ if '(?u)' not in pattern:
+ obj = re.compile(pattern, re.LOCALE)
+ result = obj.search(s)
+ if result is None:
+ print('=== Fails on locale-sensitive match', t)
# Try the match with UNICODE locale enabled, and check
# that it still succeeds.
obj = re.compile(pattern, re.UNICODE)
result = obj.search(s)
if result is None:
- print '=== Fails on unicode-sensitive match', t
+ print('=== Fails on unicode-sensitive match', t)
+
def test_main():
run_unittest(ReTests)