diff options
Diffstat (limited to 'Lib/test/test_re.py')
-rw-r--r-- | Lib/test/test_re.py | 300 |
1 files changed, 170 insertions, 130 deletions
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 70702e63483..0c8c676ed8b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1,5 +1,5 @@ -from test.test_support import verbose, run_unittest, import_module -from test.test_support import precisionbigmemtest, _2G +from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G +import io import re from re import Scanner import sys @@ -7,6 +7,8 @@ import string import traceback from weakref import proxy +from test.test_bigmem import character_size + # Misc tests from Tim Peters' re.doc @@ -18,6 +20,17 @@ import unittest class ReTests(unittest.TestCase): + def test_keep_buffer(self): + # See bug 14212 + b = bytearray(b'x') + it = re.finditer(b'a', b) + with self.assertRaises(BufferError): + b.extend(b'x'*400) + list(it) + del it + gc_collect() + b.extend(b'x'*400) + def test_weakref(self): s = 'QabbbcR' x = re.compile('ab+c') @@ -84,31 +97,6 @@ class ReTests(unittest.TestCase): self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 'abc\ndef\n') - def test_bug_1140(self): - # re.sub(x, y, u'') should return u'', not '', and - # re.sub(x, y, '') should return '', not u''. - # Also: - # re.sub(x, y, unicode(x)) should return unicode(y), and - # re.sub(x, y, str(x)) should return - # str(y) if isinstance(y, str) else unicode(y). - for x in 'x', u'x': - for y in 'y', u'y': - z = re.sub(x, y, u'') - self.assertEqual(z, u'') - self.assertEqual(type(z), unicode) - # - z = re.sub(x, y, '') - self.assertEqual(z, '') - self.assertEqual(type(z), str) - # - z = re.sub(x, y, unicode(x)) - self.assertEqual(z, y) - self.assertEqual(type(z), unicode) - # - z = re.sub(x, y, str(x)) - self.assertEqual(z, y) - self.assertEqual(type(z), type(y)) - def test_bug_1661(self): # Verify that flags do not get silently ignored with compiled patterns pattern = re.compile('.') @@ -359,6 +347,13 @@ class ReTests(unittest.TestCase): self.assertNotEqual(re.match("^x{}$", "x{}"), None) def test_getattr(self): + self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") + self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) + self.assertEqual(re.compile("(?i)(a)(b)").groups, 2) + self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {}) + self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex, + {'first': 1, 'other': 2}) + self.assertEqual(re.match("(a)", "a").pos, 0) self.assertEqual(re.match("(a)", "a").endpos, 1) self.assertEqual(re.match("(a)", "a").string, "a") @@ -382,12 +377,12 @@ class ReTests(unittest.TestCase): self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) self.assertEqual(re.search(r"\b(b.)\b", - u"abcd abc bcd bx").group(1), "bx") + "abcd abc bcd bx").group(1), "bx") self.assertEqual(re.search(r"\B(b.)\B", - u"abc bcd bc abxd").group(1), "bx") - self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") - self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) + "abc bcd bc abxd").group(1), "bx") + self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a").group(0), "1aa! a") self.assertEqual(re.search(r"\d\D\w\W\s\S", @@ -422,10 +417,10 @@ class ReTests(unittest.TestCase): self.assertEqual(len(re.findall(r"\B", " ")), 2) def test_bigcharset(self): - self.assertEqual(re.match(u"([\u2222\u2223])", - u"\u2222").group(1), u"\u2222") - self.assertEqual(re.match(u"([\u2222\u2223])", - u"\u2222", re.UNICODE).group(1), u"\u2222") + self.assertEqual(re.match("([\u2222\u2223])", + "\u2222").group(1), "\u2222") + self.assertEqual(re.match("([\u2222\u2223])", + "\u2222", re.UNICODE).group(1), "\u2222") def test_big_codesize(self): # Issue #1160 @@ -455,7 +450,7 @@ class ReTests(unittest.TestCase): def test_ignore_case(self): self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") - self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") + self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") @@ -475,7 +470,7 @@ class ReTests(unittest.TestCase): self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") - self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") + self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") def test_not_literal(self): self.assertEqual(re.search("\s([^a])", " b").group(1), "b") @@ -501,24 +496,25 @@ class ReTests(unittest.TestCase): def test_re_escape(self): alnum_chars = string.ascii_letters + string.digits - p = u''.join(unichr(i) for i in range(256)) + p = ''.join(chr(i) for i in range(256)) for c in p: if c in alnum_chars: self.assertEqual(re.escape(c), c) - elif c == u'\x00': - self.assertEqual(re.escape(c), u'\\000') + elif c == '\x00': + self.assertEqual(re.escape(c), '\\000') else: - self.assertEqual(re.escape(c), u'\\' + c) + self.assertEqual(re.escape(c), '\\' + c) self.assertMatch(re.escape(c), c) self.assertMatch(re.escape(p), p) def test_re_escape_byte(self): alnum_chars = (string.ascii_letters + string.digits).encode('ascii') - p = ''.join(chr(i) for i in range(256)) - for b in p: + p = bytes(range(256)) + for i in p: + b = bytes([i]) if b in alnum_chars: self.assertEqual(re.escape(b), b) - elif b == b'\x00': + elif i == 0: self.assertEqual(re.escape(b), b'\\000') else: self.assertEqual(re.escape(b), b'\\' + b) @@ -526,30 +522,21 @@ class ReTests(unittest.TestCase): self.assertMatch(re.escape(p), p) def test_re_escape_non_ascii(self): - s = u'xxx\u2620\u2620\u2620xxx' + s = 'xxx\u2620\u2620\u2620xxx' s_escaped = re.escape(s) - self.assertEqual(s_escaped, u'xxx\\\u2620\\\u2620\\\u2620xxx') + self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') self.assertMatch(s_escaped, s) - self.assertMatch(u'.%s+.' % re.escape(u'\u2620'), s, - u'x\u2620\u2620\u2620x', (2, 7), re.search) + self.assertMatch('.%s+.' % re.escape('\u2620'), s, + 'x\u2620\u2620\u2620x', (2, 7), re.search) def test_re_escape_non_ascii_bytes(self): - b = u'y\u2620y\u2620y'.encode('utf-8') + b = 'y\u2620y\u2620y'.encode('utf-8') b_escaped = re.escape(b) self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') self.assertMatch(b_escaped, b) - res = re.findall(re.escape(u'\u2620'.encode('utf-8')), b) + res = re.findall(re.escape('\u2620'.encode('utf-8')), b) self.assertEqual(len(res), 2) - def test_pickling(self): - import pickle - self.pickle_test(pickle) - import cPickle - self.pickle_test(cPickle) - # old pickles expect the _compile() reconstructor in sre module - import_module("sre", deprecated=True) - from sre import _compile - def pickle_test(self, pickle): oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') s = pickle.dumps(oldpat) @@ -618,7 +605,7 @@ class ReTests(unittest.TestCase): self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) def test_bug_612074(self): - pat=u"["+re.escape(u"\u2039")+u"]" + pat="["+re.escape("\u2039")+"]" self.assertEqual(re.compile(pat) and 1, 1) def test_stack_overflow(self): @@ -685,11 +672,7 @@ class ReTests(unittest.TestCase): def test_bug_764548(self): # bug 764548, re.compile() barfs on str/unicode subclasses - try: - unicode - except NameError: - return # no problem if we have no unicode - class my_unicode(unicode): pass + class my_unicode(str): pass pat = re.compile(my_unicode("abc")) self.assertEqual(pat.match("xyz"), None) @@ -699,26 +682,18 @@ class ReTests(unittest.TestCase): [":", "::", ":::"]) def test_bug_926075(self): - try: - unicode - except NameError: - return # no problem if we have no unicode self.assertTrue(re.compile('bug_926075') is not - re.compile(eval("u'bug_926075'"))) + re.compile(b'bug_926075')) def test_bug_931848(self): - try: - unicode - except NameError: - pass - pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') + pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"') self.assertEqual(re.compile(pattern).split("a.b.c"), ['a','b','c']) def test_bug_581080(self): iter = re.finditer(r"\s", "a b") - self.assertEqual(iter.next().span(), (1,2)) - self.assertRaises(StopIteration, iter.next) + self.assertEqual(next(iter).span(), (1,2)) + self.assertRaises(StopIteration, next, iter) scanner = re.compile(r"\s").scanner("a b") self.assertEqual(scanner.search().span(), (1, 2)) @@ -726,43 +701,43 @@ class ReTests(unittest.TestCase): def test_bug_817234(self): iter = re.finditer(r".*", "asdf") - self.assertEqual(iter.next().span(), (0, 4)) - self.assertEqual(iter.next().span(), (4, 4)) - self.assertRaises(StopIteration, iter.next) + self.assertEqual(next(iter).span(), (0, 4)) + self.assertEqual(next(iter).span(), (4, 4)) + self.assertRaises(StopIteration, next, iter) def test_bug_6561(self): # '\d' should match characters in Unicode category 'Nd' # (Number, Decimal Digit), but not those in 'Nl' (Number, # Letter) or 'No' (Number, Other). decimal_digits = [ - u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd' - u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' - u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' + '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' + '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' + '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' ] for x in decimal_digits: - self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x) + self.assertEqual(re.match('^\d$', x).group(0), x) not_decimal_digits = [ - u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' - u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' - u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No' - u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' + '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' + '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' + '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' + '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' ] for x in not_decimal_digits: - self.assertIsNone(re.match('^\d$', x, re.UNICODE)) + self.assertIsNone(re.match('^\d$', x)) def test_empty_array(self): # SF buf 1647541 import array - for typecode in 'cbBuhHiIlLfd': + for typecode in 'bBuhHiIlLfd': a = array.array(typecode) - self.assertEqual(re.compile("bla").match(a), None) - self.assertEqual(re.compile("").match(a).groups(), ()) + self.assertEqual(re.compile(b"bla").match(a), None) + self.assertEqual(re.compile(b"").match(a).groups(), ()) def test_inline_flags(self): # Bug #1700 - upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow - lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow + upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow + lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow p = re.compile(upper_char, re.I | re.U) q = p.match(lower_char) @@ -800,6 +775,66 @@ class ReTests(unittest.TestCase): self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') + def test_bytes_str_mixing(self): + # Mixing str and bytes is disallowed + pat = re.compile('.') + bpat = re.compile(b'.') + self.assertRaises(TypeError, pat.match, b'b') + self.assertRaises(TypeError, bpat.match, 'b') + self.assertRaises(TypeError, pat.sub, b'b', 'c') + self.assertRaises(TypeError, pat.sub, 'b', b'c') + self.assertRaises(TypeError, pat.sub, b'b', b'c') + self.assertRaises(TypeError, bpat.sub, b'b', 'c') + self.assertRaises(TypeError, bpat.sub, 'b', b'c') + self.assertRaises(TypeError, bpat.sub, 'b', 'c') + + def test_ascii_and_unicode_flag(self): + # String patterns + for flags in (0, re.UNICODE): + pat = re.compile('\xc0', flags | re.IGNORECASE) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\w', flags) + self.assertNotEqual(pat.match('\xe0'), None) + pat = re.compile('\xc0', re.ASCII | re.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('(?a)\xc0', re.IGNORECASE) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('\w', re.ASCII) + self.assertEqual(pat.match('\xe0'), None) + pat = re.compile('(?a)\w') + self.assertEqual(pat.match('\xe0'), None) + # Bytes patterns + for flags in (0, re.ASCII): + pat = re.compile(b'\xc0', re.IGNORECASE) + self.assertEqual(pat.match(b'\xe0'), None) + pat = re.compile(b'\w') + self.assertEqual(pat.match(b'\xe0'), None) + # Incompatibilities + self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE) + self.assertRaises(ValueError, re.compile, b'(?u)\w') + self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII) + self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII) + self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE) + self.assertRaises(ValueError, re.compile, '(?au)\w') + + def test_bug_6509(self): + # Replacement strings of both types must parse properly. + # all strings + pat = re.compile('a(\w)') + self.assertEqual(pat.sub('b\\1', 'ac'), 'bc') + pat = re.compile('a(.)') + self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234') + pat = re.compile('..') + self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str') + + # all bytes + pat = re.compile(b'a(\w)') + self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc') + pat = re.compile(b'a(.)') + self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD') + pat = re.compile(b'..') + self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') + def test_dealloc(self): # issue 3299: check for segfault in debug build import _sre @@ -810,6 +845,7 @@ class ReTests(unittest.TestCase): long_overflow = 2**128 self.assertRaises(TypeError, re.finditer, "a", {}) self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) + self.assertRaises(TypeError, _sre.compile, {}, 0, []) def test_compile(self): # Test return value when given string and pattern as parameter @@ -821,7 +857,7 @@ class ReTests(unittest.TestCase): # Test behaviour when not given a string or pattern as parameter self.assertRaises(TypeError, re.compile, 0) - @precisionbigmemtest(size=_2G, memuse=1) + @bigmemtest(size=_2G, memuse=character_size) def test_large_search(self, size): # Issue #10182: indices were 32-bit-truncated. s = 'a' * size @@ -832,7 +868,7 @@ class ReTests(unittest.TestCase): # The huge memuse is because of re.sub() using a list and a join() # to create the replacement result. - @precisionbigmemtest(size=_2G, memuse=16 + 2) + @bigmemtest(size=_2G, memuse=16 + 2 * character_size) def test_large_subn(self, size): # Issue #10182: indices were 32-bit-truncated. s = 'a' * size @@ -844,7 +880,7 @@ class ReTests(unittest.TestCase): def run_re_tests(): from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR if verbose: - print 'Running re_tests test suite' + print('Running re_tests test suite') else: # To save time, only run the first and last 10 tests #tests = tests[:10] + tests[-10:] @@ -858,30 +894,30 @@ def run_re_tests(): elif len(t) == 3: pattern, s, outcome = t else: - raise ValueError, ('Test tuples should have 3 or 5 fields', t) + raise ValueError('Test tuples should have 3 or 5 fields', t) try: obj = re.compile(pattern) except re.error: if outcome == SYNTAX_ERROR: pass # Expected a syntax error else: - print '=== Syntax error:', t + print('=== Syntax error:', t) except KeyboardInterrupt: raise KeyboardInterrupt except: - print '*** Unexpected error ***', t + print('*** Unexpected error ***', t) if verbose: traceback.print_exc(file=sys.stdout) else: try: result = obj.search(s) - except re.error, msg: - print '=== Unexpected exception', t, repr(msg) + except re.error as msg: + print('=== Unexpected exception', t, repr(msg)) if outcome == SYNTAX_ERROR: # This should have been a syntax error; forget it. pass elif outcome == FAIL: if result is None: pass # No match, as expected - else: print '=== Succeeded incorrectly', t + else: print('=== Succeeded incorrectly', t) elif outcome == SUCCEED: if result is not None: # Matched, as expected, so now we compute the @@ -909,28 +945,30 @@ def run_re_tests(): vardict[i] = gi repl = eval(repl, vardict) if repl != expected: - print '=== grouping error', t, - print repr(repl) + ' should be ' + repr(expected) + print('=== grouping error', t, end=' ') + print(repr(repl) + ' should be ' + repr(expected)) else: - print '=== Failed incorrectly', t + print('=== Failed incorrectly', t) - # Try the match on a unicode string, and check that it - # still succeeds. + # Try the match with both pattern and string converted to + # bytes, and check that it still succeeds. try: - result = obj.search(unicode(s, "latin-1")) - if result is None: - print '=== Fails on unicode match', t - except NameError: - continue # 1.5.2 - except TypeError: - continue # unicode test case - - # Try the match on a unicode pattern, and check that it - # still succeeds. - obj=re.compile(unicode(pattern, "latin-1")) - result = obj.search(s) - if result is None: - print '=== Fails on unicode pattern match', t + bpat = bytes(pattern, "ascii") + bs = bytes(s, "ascii") + except UnicodeEncodeError: + # skip non-ascii tests + pass + else: + try: + bpat = re.compile(bpat) + except Exception: + print('=== Fails on bytes pattern compile', t) + if verbose: + traceback.print_exc(file=sys.stdout) + else: + bytes_result = bpat.search(bs) + if bytes_result is None: + print('=== Fails on bytes pattern match', t) # Try the match with the search area limited to the extent # of the match and see if it still succeeds. \B will @@ -942,28 +980,30 @@ def run_re_tests(): obj = re.compile(pattern) result = obj.search(s, result.start(0), result.end(0) + 1) if result is None: - print '=== Failed on range-limited match', t + print('=== Failed on range-limited match', t) # Try the match with IGNORECASE enabled, and check that it # still succeeds. obj = re.compile(pattern, re.IGNORECASE) result = obj.search(s) if result is None: - print '=== Fails on case-insensitive match', t + print('=== Fails on case-insensitive match', t) # Try the match with LOCALE enabled, and check that it # still succeeds. - obj = re.compile(pattern, re.LOCALE) - result = obj.search(s) - if result is None: - print '=== Fails on locale-sensitive match', t + if '(?u)' not in pattern: + obj = re.compile(pattern, re.LOCALE) + result = obj.search(s) + if result is None: + print('=== Fails on locale-sensitive match', t) # Try the match with UNICODE locale enabled, and check # that it still succeeds. obj = re.compile(pattern, re.UNICODE) result = obj.search(s) if result is None: - print '=== Fails on unicode-sensitive match', t + print('=== Fails on unicode-sensitive match', t) + def test_main(): run_unittest(ReTests) |