diff options
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 168 |
1 files changed, 65 insertions, 103 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 550ea158afd..1241a01c3ea 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -13,19 +13,13 @@ import _sre import sre_parse from sre_constants import * -from _sre import MAXREPEAT assert _sre.MAGIC == MAGIC, "SRE module mismatch" -if _sre.CODESIZE == 2: - MAXCODE = 65535 -else: - MAXCODE = 0xFFFFFFFF - -_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) -_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) -_SUCCESS_CODES = set([SUCCESS, FAILURE]) -_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) +_LITERAL_CODES = {LITERAL, NOT_LITERAL} +_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} +_SUCCESS_CODES = {SUCCESS, FAILURE} +_ASSERT_CODES = {ASSERT, ASSERT_NOT} # Sets of lowercase characters which have the same uppercase. _equivalences = ( @@ -86,75 +80,75 @@ def _compile(code, pattern, flags): if flags & SRE_FLAG_IGNORECASE: lo = _sre.getlower(av, flags) if fixes and lo in fixes: - emit(OPCODES[IN_IGNORE]) + emit(IN_IGNORE) skip = _len(code); emit(0) if op is NOT_LITERAL: - emit(OPCODES[NEGATE]) + emit(NEGATE) for k in (lo,) + fixes[lo]: - emit(OPCODES[LITERAL]) + emit(LITERAL) emit(k) - emit(OPCODES[FAILURE]) + emit(FAILURE) code[skip] = _len(code) - skip else: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) emit(lo) else: - emit(OPCODES[op]) + emit(op) emit(av) elif op is IN: if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) def fixup(literal, flags=flags): return _sre.getlower(literal, flags) else: - emit(OPCODES[op]) + emit(op) fixup = None skip = _len(code); emit(0) _compile_charset(av, flags, code, fixup, fixes) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: - emit(OPCODES[ANY_ALL]) + emit(ANY_ALL) else: - emit(OPCODES[ANY]) + emit(ANY) elif op in REPEATING_CODES: if flags & SRE_FLAG_TEMPLATE: raise error("internal: unsupported template operator") elif _simple(av) and op is not REPEAT: if op is MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) + emit(REPEAT_ONE) else: - emit(OPCODES[MIN_REPEAT_ONE]) + emit(MIN_REPEAT_ONE) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip else: - emit(OPCODES[REPEAT]) + emit(REPEAT) skip = _len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) code[skip] = _len(code) - skip if op is MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) + emit(MAX_UNTIL) else: - emit(OPCODES[MIN_UNTIL]) + emit(MIN_UNTIL) elif op is SUBPATTERN: if av[0]: - emit(OPCODES[MARK]) + emit(MARK) emit((av[0]-1)*2) # _compile_info(code, av[1], flags) _compile(code, av[1], flags) if av[0]: - emit(OPCODES[MARK]) + emit(MARK) emit((av[0]-1)*2+1) elif op in SUCCESS_CODES: - emit(OPCODES[op]) + emit(op) elif op in ASSERT_CODES: - emit(OPCODES[op]) + emit(op) skip = _len(code); emit(0) if av[0] >= 0: emit(0) # look ahead @@ -164,57 +158,57 @@ def _compile(code, pattern, flags): raise error("look-behind requires fixed-width pattern") emit(lo) # look behind _compile(code, av[1], flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip elif op is CALL: - emit(OPCODES[op]) + emit(op) skip = _len(code); emit(0) _compile(code, av, flags) - emit(OPCODES[SUCCESS]) + emit(SUCCESS) code[skip] = _len(code) - skip elif op is AT: - emit(OPCODES[op]) + emit(op) if flags & SRE_FLAG_MULTILINE: av = AT_MULTILINE.get(av, av) if flags & SRE_FLAG_LOCALE: av = AT_LOCALE.get(av, av) elif flags & SRE_FLAG_UNICODE: av = AT_UNICODE.get(av, av) - emit(ATCODES[av]) + emit(av) elif op is BRANCH: - emit(OPCODES[op]) + emit(op) tail = [] tailappend = tail.append for av in av[1]: skip = _len(code); emit(0) # _compile_info(code, av, flags) _compile(code, av, flags) - emit(OPCODES[JUMP]) + emit(JUMP) tailappend(_len(code)); emit(0) code[skip] = _len(code) - skip - emit(0) # end of branch + emit(FAILURE) # end of branch for tail in tail: code[tail] = _len(code) - tail elif op is CATEGORY: - emit(OPCODES[op]) + emit(op) if flags & SRE_FLAG_LOCALE: av = CH_LOCALE[av] elif flags & SRE_FLAG_UNICODE: av = CH_UNICODE[av] - emit(CHCODES[av]) + emit(av) elif op is GROUPREF: if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) + emit(OP_IGNORE[op]) else: - emit(OPCODES[op]) + emit(op) emit(av-1) elif op is GROUPREF_EXISTS: - emit(OPCODES[op]) + emit(op) emit(av[0]-1) skipyes = _len(code); emit(0) _compile(code, av[1], flags) if av[2]: - emit(OPCODES[JUMP]) + emit(JUMP) skipno = _len(code); emit(0) code[skipyes] = _len(code) - skipyes + 1 _compile(code, av[2], flags) @@ -227,14 +221,13 @@ def _compile(code, pattern, flags): def _compile_charset(charset, flags, code, fixup=None, fixes=None): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup, fixes, - flags & SRE_FLAG_UNICODE): - emit(OPCODES[op]) + for op, av in _optimize_charset(charset, fixup, fixes): + emit(op) if op is NEGATE: pass elif op is LITERAL: emit(av) - elif op is RANGE: + elif op is RANGE or op is RANGE_IGNORE: emit(av[0]) emit(av[1]) elif op is CHARSET: @@ -243,16 +236,16 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): code.extend(av) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + emit(CH_LOCALE[av]) elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) + emit(CH_UNICODE[av]) else: - emit(CHCODES[av]) + emit(av) else: raise error("internal: unsupported set operator") - emit(OPCODES[FAILURE]) + emit(FAILURE) -def _optimize_charset(charset, fixup, fixes, isunicode): +def _optimize_charset(charset, fixup, fixes): # internal: optimize character set out = [] tail = [] @@ -262,10 +255,10 @@ def _optimize_charset(charset, fixup, fixes, isunicode): try: if op is LITERAL: if fixup: - i = fixup(av) - charmap[i] = 1 - if fixes and i in fixes: - for k in fixes[i]: + lo = fixup(av) + charmap[lo] = 1 + if fixes and lo in fixes: + for k in fixes[lo]: charmap[k] = 1 else: charmap[av] = 1 @@ -291,21 +284,13 @@ def _optimize_charset(charset, fixup, fixes, isunicode): # character set contains non-UCS1 character codes charmap += b'\0' * 0xff00 continue - # character set contains non-BMP character codes - if fixup and isunicode and op is RANGE: - lo, hi = av - ranges = [av] - # There are only two ranges of cased astral characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi). - _fixup_range(max(0x10000, lo), min(0x11fff, hi), - ranges, fixup) - for lo, hi in ranges: - if lo == hi: - tail.append((LITERAL, hi)) - else: - tail.append((RANGE, (lo, hi))) - else: - tail.append((op, av)) + # Character set contains non-BMP character codes. + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if fixup and op is RANGE: + op = RANGE_IGNORE + tail.append((op, av)) break # compress character map @@ -383,25 +368,8 @@ def _optimize_charset(charset, fixup, fixes, isunicode): out += tail return out -def _fixup_range(lo, hi, ranges, fixup): - for i in map(fixup, range(lo, hi+1)): - for k, (lo, hi) in enumerate(ranges): - if i < lo: - if l == lo - 1: - ranges[k] = (i, hi) - else: - ranges.insert(k, (i, i)) - break - elif i > hi: - if i == hi + 1: - ranges[k] = (lo, i) - break - else: - break - else: - ranges.append((i, i)) - _CODEBITS = _sre.CODESIZE * 8 +MAXCODE = (1 << _CODEBITS) - 1 _BITS_TRANS = b'0' + b'1' * 255 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): s = bits.translate(_BITS_TRANS)[::-1] @@ -505,21 +473,21 @@ def _compile_info(code, pattern, flags): elif op is IN: charset = av ## if prefix: -## print "*** PREFIX", prefix, prefix_skip +## print("*** PREFIX", prefix, prefix_skip) ## if charset: -## print "*** CHARSET", charset +## print("*** CHARSET", charset) # add an info block emit = code.append - emit(OPCODES[INFO]) + emit(INFO) skip = len(code); emit(0) # literal flag mask = 0 if prefix: mask = SRE_INFO_PREFIX if len(prefix) == prefix_skip == len(pattern.data): - mask = mask + SRE_INFO_LITERAL + mask = mask | SRE_INFO_LITERAL elif charset: - mask = mask + SRE_INFO_CHARSET + mask = mask | SRE_INFO_CHARSET emit(mask) # pattern length if lo < MAXCODE: @@ -556,7 +524,7 @@ def _code(p, flags): # compile the pattern _compile(code, p.data, flags) - code.append(OPCODES[SUCCESS]) + code.append(SUCCESS) return code @@ -571,13 +539,7 @@ def compile(p, flags=0): code = _code(p, flags) - # print code - - # XXX: <fl> get rid of this limitation! - if p.pattern.groups > 100: - raise AssertionError( - "sorry, but this version only supports 100 named groups" - ) + # print(code) # map in either direction groupindex = p.pattern.groupdict |