diff options
Diffstat (limited to 'Lib/pickletools.py')
-rw-r--r-- | Lib/pickletools.py | 565 |
1 files changed, 346 insertions, 219 deletions
diff --git a/Lib/pickletools.py b/Lib/pickletools.py index d717728d417..ec6cc53ae31 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -10,8 +10,14 @@ dis(pickle, out=None, memo=None, indentlevel=4) Print a symbolic disassembly of a pickle. ''' +import codecs +import pickle +import re + __all__ = ['dis', 'genops', 'optimize'] +bytes_types = pickle.bytes_types + # Other ideas: # # - A pickle verifier: read a pickle and check it exhaustively for @@ -136,7 +142,7 @@ this and there isn't a use case that warrants the expense of such an analysis. To this end, all tests for __safe_for_unpickling__ or for -copy_reg.safe_constructors are removed from the unpickling code. +copyreg.safe_constructors are removed from the unpickling code. References to these variables in the descriptions below are to be seen as describing unpickling in Python 2.2 and before. """ @@ -200,14 +206,14 @@ from struct import unpack as _unpack def read_uint1(f): r""" - >>> import StringIO - >>> read_uint1(StringIO.StringIO('\xff')) + >>> import io + >>> read_uint1(io.BytesIO(b'\xff')) 255 """ data = f.read(1) if data: - return ord(data) + return data[0] raise ValueError("not enough data in stream to read uint1") uint1 = ArgumentDescriptor( @@ -219,10 +225,10 @@ uint1 = ArgumentDescriptor( def read_uint2(f): r""" - >>> import StringIO - >>> read_uint2(StringIO.StringIO('\xff\x00')) + >>> import io + >>> read_uint2(io.BytesIO(b'\xff\x00')) 255 - >>> read_uint2(StringIO.StringIO('\xff\xff')) + >>> read_uint2(io.BytesIO(b'\xff\xff')) 65535 """ @@ -240,10 +246,10 @@ uint2 = ArgumentDescriptor( def read_int4(f): r""" - >>> import StringIO - >>> read_int4(StringIO.StringIO('\xff\x00\x00\x00')) + >>> import io + >>> read_int4(io.BytesIO(b'\xff\x00\x00\x00')) 255 - >>> read_int4(StringIO.StringIO('\x00\x00\x00\x80')) == -(2**31) + >>> read_int4(io.BytesIO(b'\x00\x00\x00\x80')) == -(2**31) True """ @@ -261,38 +267,38 @@ int4 = ArgumentDescriptor( def read_stringnl(f, decode=True, stripquotes=True): r""" - >>> import StringIO - >>> read_stringnl(StringIO.StringIO("'abcd'\nefg\n")) + >>> import io + >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n")) 'abcd' - >>> read_stringnl(StringIO.StringIO("\n")) + >>> read_stringnl(io.BytesIO(b"\n")) Traceback (most recent call last): ... - ValueError: no string quotes around '' + ValueError: no string quotes around b'' - >>> read_stringnl(StringIO.StringIO("\n"), stripquotes=False) + >>> read_stringnl(io.BytesIO(b"\n"), stripquotes=False) '' - >>> read_stringnl(StringIO.StringIO("''\n")) + >>> read_stringnl(io.BytesIO(b"''\n")) '' - >>> read_stringnl(StringIO.StringIO('"abcd"')) + >>> read_stringnl(io.BytesIO(b'"abcd"')) Traceback (most recent call last): ... ValueError: no newline found when trying to read stringnl Embedded escapes are undone in the result. - >>> read_stringnl(StringIO.StringIO(r"'a\n\\b\x00c\td'" + "\n'e'")) + >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'")) 'a\n\\b\x00c\td' """ data = f.readline() - if not data.endswith('\n'): + if not data.endswith(b'\n'): raise ValueError("no newline found when trying to read stringnl") data = data[:-1] # lose the newline if stripquotes: - for q in "'\"": + for q in (b'"', b"'"): if data.startswith(q): if not data.endswith(q): raise ValueError("strinq quote %r not found at both " @@ -302,10 +308,8 @@ def read_stringnl(f, decode=True, stripquotes=True): else: raise ValueError("no string quotes around %r" % data) - # I'm not sure when 'string_escape' was added to the std codecs; it's - # crazy not to use it if it's there. if decode: - data = data.decode('string_escape') + data = codecs.escape_decode(data)[0].decode("ascii") return data stringnl = ArgumentDescriptor( @@ -319,7 +323,7 @@ stringnl = ArgumentDescriptor( """) def read_stringnl_noescape(f): - return read_stringnl(f, decode=False, stripquotes=False) + return read_stringnl(f, stripquotes=False) stringnl_noescape = ArgumentDescriptor( name='stringnl_noescape', @@ -334,8 +338,8 @@ stringnl_noescape = ArgumentDescriptor( def read_stringnl_noescape_pair(f): r""" - >>> import StringIO - >>> read_stringnl_noescape_pair(StringIO.StringIO("Queue\nEmpty\njunk")) + >>> import io + >>> read_stringnl_noescape_pair(io.BytesIO(b"Queue\nEmpty\njunk")) 'Queue Empty' """ @@ -356,12 +360,12 @@ stringnl_noescape_pair = ArgumentDescriptor( def read_string4(f): r""" - >>> import StringIO - >>> read_string4(StringIO.StringIO("\x00\x00\x00\x00abc")) + >>> import io + >>> read_string4(io.BytesIO(b"\x00\x00\x00\x00abc")) '' - >>> read_string4(StringIO.StringIO("\x03\x00\x00\x00abcdef")) + >>> read_string4(io.BytesIO(b"\x03\x00\x00\x00abcdef")) 'abc' - >>> read_string4(StringIO.StringIO("\x00\x00\x00\x03abcdef")) + >>> read_string4(io.BytesIO(b"\x00\x00\x00\x03abcdef")) Traceback (most recent call last): ... ValueError: expected 50331648 bytes in a string4, but only 6 remain @@ -372,7 +376,7 @@ def read_string4(f): raise ValueError("string4 byte count < 0: %d" % n) data = f.read(n) if len(data) == n: - return data + return data.decode("latin-1") raise ValueError("expected %d bytes in a string4, but only %d remain" % (n, len(data))) @@ -390,10 +394,10 @@ string4 = ArgumentDescriptor( def read_string1(f): r""" - >>> import StringIO - >>> read_string1(StringIO.StringIO("\x00")) + >>> import io + >>> read_string1(io.BytesIO(b"\x00")) '' - >>> read_string1(StringIO.StringIO("\x03abcdef")) + >>> read_string1(io.BytesIO(b"\x03abcdef")) 'abc' """ @@ -401,7 +405,7 @@ def read_string1(f): assert n >= 0 data = f.read(n) if len(data) == n: - return data + return data.decode("latin-1") raise ValueError("expected %d bytes in a string1, but only %d remain" % (n, len(data))) @@ -419,17 +423,17 @@ string1 = ArgumentDescriptor( def read_unicodestringnl(f): r""" - >>> import StringIO - >>> read_unicodestringnl(StringIO.StringIO("abc\uabcd\njunk")) - u'abc\uabcd' + >>> import io + >>> read_unicodestringnl(io.BytesIO(b"abc\\uabcd\njunk")) == 'abc\uabcd' + True """ data = f.readline() - if not data.endswith('\n'): + if not data.endswith(b'\n'): raise ValueError("no newline found when trying to read " "unicodestringnl") data = data[:-1] # lose the newline - return unicode(data, 'raw-unicode-escape') + return str(data, 'raw-unicode-escape') unicodestringnl = ArgumentDescriptor( name='unicodestringnl', @@ -444,17 +448,17 @@ unicodestringnl = ArgumentDescriptor( def read_unicodestring4(f): r""" - >>> import StringIO - >>> s = u'abcd\uabcd' + >>> import io + >>> s = 'abcd\uabcd' >>> enc = s.encode('utf-8') >>> enc - 'abcd\xea\xaf\x8d' - >>> n = chr(len(enc)) + chr(0) * 3 # little-endian 4-byte length - >>> t = read_unicodestring4(StringIO.StringIO(n + enc + 'junk')) + b'abcd\xea\xaf\x8d' + >>> n = bytes([len(enc), 0, 0, 0]) # little-endian 4-byte length + >>> t = read_unicodestring4(io.BytesIO(n + enc + b'junk')) >>> s == t True - >>> read_unicodestring4(StringIO.StringIO(n + enc[:-1])) + >>> read_unicodestring4(io.BytesIO(n + enc[:-1])) Traceback (most recent call last): ... ValueError: expected 7 bytes in a unicodestring4, but only 6 remain @@ -465,7 +469,7 @@ def read_unicodestring4(f): raise ValueError("unicodestring4 byte count < 0: %d" % n) data = f.read(n) if len(data) == n: - return unicode(data, 'utf-8') + return str(data, 'utf-8', 'surrogatepass') raise ValueError("expected %d bytes in a unicodestring4, but only %d " "remain" % (n, len(data))) @@ -484,55 +488,48 @@ unicodestring4 = ArgumentDescriptor( def read_decimalnl_short(f): r""" - >>> import StringIO - >>> read_decimalnl_short(StringIO.StringIO("1234\n56")) + >>> import io + >>> read_decimalnl_short(io.BytesIO(b"1234\n56")) 1234 - >>> read_decimalnl_short(StringIO.StringIO("1234L\n56")) + >>> read_decimalnl_short(io.BytesIO(b"1234L\n56")) Traceback (most recent call last): ... - ValueError: trailing 'L' not allowed in '1234L' + ValueError: trailing 'L' not allowed in b'1234L' """ s = read_stringnl(f, decode=False, stripquotes=False) - if s.endswith("L"): + if s.endswith(b"L"): raise ValueError("trailing 'L' not allowed in %r" % s) # It's not necessarily true that the result fits in a Python short int: # the pickle may have been written on a 64-bit box. There's also a hack # for True and False here. - if s == "00": + if s == b"00": return False - elif s == "01": + elif s == b"01": return True try: return int(s) except OverflowError: - return long(s) + return int(s) def read_decimalnl_long(f): r""" - >>> import StringIO + >>> import io - >>> read_decimalnl_long(StringIO.StringIO("1234\n56")) - Traceback (most recent call last): - ... - ValueError: trailing 'L' required in '1234' - - Someday the trailing 'L' will probably go away from this output. - - >>> read_decimalnl_long(StringIO.StringIO("1234L\n56")) - 1234L + >>> read_decimalnl_long(io.BytesIO(b"1234L\n56")) + 1234 - >>> read_decimalnl_long(StringIO.StringIO("123456789012345678901234L\n6")) - 123456789012345678901234L + >>> read_decimalnl_long(io.BytesIO(b"123456789012345678901234L\n6")) + 123456789012345678901234 """ s = read_stringnl(f, decode=False, stripquotes=False) - if not s.endswith("L"): - raise ValueError("trailing 'L' required in %r" % s) - return long(s) + if s[-1:] == b'L': + s = s[:-1] + return int(s) decimalnl_short = ArgumentDescriptor( @@ -561,8 +558,8 @@ decimalnl_long = ArgumentDescriptor( def read_floatnl(f): r""" - >>> import StringIO - >>> read_floatnl(StringIO.StringIO("-1.25\n6")) + >>> import io + >>> read_floatnl(io.BytesIO(b"-1.25\n6")) -1.25 """ s = read_stringnl(f, decode=False, stripquotes=False) @@ -583,11 +580,11 @@ floatnl = ArgumentDescriptor( def read_float8(f): r""" - >>> import StringIO, struct + >>> import io, struct >>> raw = struct.pack(">d", -1.25) >>> raw - '\xbf\xf4\x00\x00\x00\x00\x00\x00' - >>> read_float8(StringIO.StringIO(raw + "\n")) + b'\xbf\xf4\x00\x00\x00\x00\x00\x00' + >>> read_float8(io.BytesIO(raw + b"\n")) -1.25 """ @@ -604,7 +601,7 @@ float8 = ArgumentDescriptor( doc="""An 8-byte binary representation of a float, big-endian. The format is unique to Python, and shared with the struct - module (format string '>d') "in theory" (the struct and cPickle + module (format string '>d') "in theory" (the struct and pickle implementations don't share the code -- they should). It's strongly related to the IEEE-754 double format, and, in normal cases, is in fact identical to the big-endian 754 double format. @@ -621,17 +618,17 @@ from pickle import decode_long def read_long1(f): r""" - >>> import StringIO - >>> read_long1(StringIO.StringIO("\x00")) - 0L - >>> read_long1(StringIO.StringIO("\x02\xff\x00")) - 255L - >>> read_long1(StringIO.StringIO("\x02\xff\x7f")) - 32767L - >>> read_long1(StringIO.StringIO("\x02\x00\xff")) - -256L - >>> read_long1(StringIO.StringIO("\x02\x00\x80")) - -32768L + >>> import io + >>> read_long1(io.BytesIO(b"\x00")) + 0 + >>> read_long1(io.BytesIO(b"\x02\xff\x00")) + 255 + >>> read_long1(io.BytesIO(b"\x02\xff\x7f")) + 32767 + >>> read_long1(io.BytesIO(b"\x02\x00\xff")) + -256 + >>> read_long1(io.BytesIO(b"\x02\x00\x80")) + -32768 """ n = read_uint1(f) @@ -653,17 +650,17 @@ long1 = ArgumentDescriptor( def read_long4(f): r""" - >>> import StringIO - >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\xff\x00")) - 255L - >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\xff\x7f")) - 32767L - >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\x00\xff")) - -256L - >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\x00\x80")) - -32768L - >>> read_long1(StringIO.StringIO("\x00\x00\x00\x00")) - 0L + >>> import io + >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x00")) + 255 + >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x7f")) + 32767 + >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\xff")) + -256 + >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\x80")) + -32768 + >>> read_long1(io.BytesIO(b"\x00\x00\x00\x00")) + 0 """ n = read_int4(f) @@ -683,7 +680,7 @@ long4 = ArgumentDescriptor( This first reads four bytes as a signed size (but requires the size to be >= 0), then reads that many bytes and interprets them as a little-endian 2's-complement long. If the size is 0, that's taken - as a shortcut for the long 0L, although LONG1 should really be used + as a shortcut for the int 0, although LONG1 should really be used then instead (and in any case where # of bytes < 256). """) @@ -731,12 +728,12 @@ pyint = StackObject( pylong = StackObject( name='long', - obtype=long, + obtype=int, doc="A long (as opposed to short) Python integer object.") pyinteger_or_bool = StackObject( name='int_or_bool', - obtype=(int, long, bool), + obtype=(int, bool), doc="A Python integer object (short or long), or " "a Python bool.") @@ -751,14 +748,19 @@ pyfloat = StackObject( doc="A Python float object.") pystring = StackObject( - name='str', - obtype=str, - doc="A Python string object.") + name='string', + obtype=bytes, + doc="A Python (8-bit) string object.") + +pybytes = StackObject( + name='bytes', + obtype=bytes, + doc="A Python bytes object.") pyunicode = StackObject( - name='unicode', - obtype=unicode, - doc="A Python Unicode string object.") + name='str', + obtype=str, + doc="A Python (Unicode) string object.") pynone = StackObject( name="None", @@ -873,7 +875,7 @@ class OpcodeInfo(object): assert isinstance(x, StackObject) self.stack_after = stack_after - assert isinstance(proto, int) and 0 <= proto <= 2 + assert isinstance(proto, int) and 0 <= proto <= 3 self.proto = proto assert isinstance(doc, str) @@ -1000,7 +1002,9 @@ opcodes = [ The argument is a repr-style string, with bracketing quote characters, and perhaps embedded escapes. The argument extends until the next - newline character. + newline character. (Actually, they are decoded into a str instance + using the encoding given to the Unpickler constructor. or the default, + 'ASCII'.) """), I(name='BINSTRING', @@ -1013,7 +1017,9 @@ opcodes = [ There are two arguments: the first is a 4-byte little-endian signed int giving the number of bytes in the string, and the second is that many - bytes, which are taken literally as the string content. + bytes, which are taken literally as the string content. (Actually, + they are decoded into a str instance using the encoding given to the + Unpickler constructor. or the default, 'ASCII'.) """), I(name='SHORT_BINSTRING', @@ -1026,6 +1032,36 @@ opcodes = [ There are two arguments: the first is a 1-byte unsigned int giving the number of bytes in the string, and the second is that many bytes, + which are taken literally as the string content. (Actually, they + are decoded into a str instance using the encoding given to the + Unpickler constructor. or the default, 'ASCII'.) + """), + + # Bytes (protocol 3 only; older protocols don't support bytes at all) + + I(name='BINBYTES', + code='B', + arg=string4, + stack_before=[], + stack_after=[pybytes], + proto=3, + doc="""Push a Python bytes object. + + There are two arguments: the first is a 4-byte little-endian signed int + giving the number of bytes in the string, and the second is that many + bytes, which are taken literally as the bytes content. + """), + + I(name='SHORT_BINBYTES', + code='C', + arg=string1, + stack_before=[], + stack_after=[pybytes], + proto=3, + doc="""Push a Python string object. + + There are two arguments: the first is a 1-byte unsigned int giving + the number of bytes in the string, and the second is that many bytes, which are taken literally as the string content. """), @@ -1527,8 +1563,8 @@ opcodes = [ opcode is followed by code to create setstate's argument, and then a BUILD opcode to apply __setstate__ to that argument. - If type(callable) is not ClassType, REDUCE complains unless the - callable has been registered with the copy_reg module's + If not isinstance(callable, type), REDUCE complains unless the + callable has been registered with the copyreg module's safe_constructors dict, or the callable has a magic '__safe_for_unpickling__' attribute with a true value. I'm not sure why it does this, but I've sure seen this complaint often enough when @@ -1558,13 +1594,6 @@ opcodes = [ the object is updated via anyobject.__dict__.update(argument) - - This may raise RuntimeError in restricted execution mode (which - disallows access to __dict__ directly); in that case, the object - is updated instead via - - for k, v in argument.items(): - anyobject[k] = v """), I(name='INST', @@ -1590,9 +1619,6 @@ opcodes = [ + The argtuple is empty (markobject was at the top of the stack at the start). - + It's an old-style class object (the type of the class object is - ClassType). - + The class object does not have a __getinitargs__ attribute. then we want to create an old-style class instance without invoking @@ -1600,20 +1626,15 @@ opcodes = [ calling __init__() is current wisdom). In this case, an instance of an old-style dummy class is created, and then we try to rebind its __class__ attribute to the desired class object. If this succeeds, - the new instance object is pushed on the stack, and we're done. In - restricted execution mode it can fail (assignment to __class__ is - disallowed), and I'm not really sure what happens then -- it looks - like the code ends up calling the class object's __init__ anyway, - via falling into the next case. + the new instance object is pushed on the stack, and we're done. Else (the argtuple is not empty, it's not an old-style class object, or the class object does have a __getinitargs__ attribute), the code first insists that the class object have a __safe_for_unpickling__ attribute. Unlike as for the __safe_for_unpickling__ check in REDUCE, it doesn't matter whether this attribute has a true or false value, it - only matters whether it exists (XXX this is a bug; cPickle - requires the attribute to be true). If __safe_for_unpickling__ - doesn't exist, UnpicklingError is raised. + only matters whether it exists (XXX this is a bug). If + __safe_for_unpickling__ doesn't exist, UnpicklingError is raised. Else (the class object does have a __safe_for_unpickling__ attr), the class object obtained from INST's arguments is applied to the @@ -1648,8 +1669,7 @@ opcodes = [ As for INST, the remainder of the stack above the markobject is gathered into an argument tuple, and then the logic seems identical, except that no __safe_for_unpickling__ check is done (XXX this is - a bug; cPickle does test __safe_for_unpickling__). See INST for - the gory details. + a bug). See INST for the gory details. NOTE: In Python 2.3, INST and OBJ are identical except for how they get the class object. That was always the intent; the implementations @@ -1761,24 +1781,24 @@ for d in opcodes: del d def assure_pickle_consistency(verbose=False): - import pickle, re copy = code2op.copy() for name in pickle.__all__: if not re.match("[A-Z][A-Z0-9_]+$", name): if verbose: - print "skipping %r: it doesn't look like an opcode name" % name + print("skipping %r: it doesn't look like an opcode name" % name) continue picklecode = getattr(pickle, name) - if not isinstance(picklecode, str) or len(picklecode) != 1: + if not isinstance(picklecode, bytes) or len(picklecode) != 1: if verbose: - print ("skipping %r: value %r doesn't look like a pickle " - "code" % (name, picklecode)) + print(("skipping %r: value %r doesn't look like a pickle " + "code" % (name, picklecode))) continue + picklecode = picklecode.decode("latin-1") if picklecode in copy: if verbose: - print "checking name %r w/ code %r for consistency" % ( - name, picklecode) + print("checking name %r w/ code %r for consistency" % ( + name, picklecode)) d = copy[picklecode] if d.name != name: raise ValueError("for pickle code %r, pickle.py uses name %r " @@ -1822,16 +1842,15 @@ def genops(pickle): is None. If the pickle has a tell() method, pos was the value of pickle.tell() - before reading the current opcode. If the pickle is a string object, - it's wrapped in a StringIO object, and the latter's tell() result is + before reading the current opcode. If the pickle is a bytes object, + it's wrapped in a BytesIO object, and the latter's tell() result is used. Else (the pickle doesn't have a tell(), and it's not obvious how to query its current position) pos is None. """ - import cStringIO as StringIO - - if isinstance(pickle, str): - pickle = StringIO.StringIO(pickle) + if isinstance(pickle, bytes_types): + import io + pickle = io.BytesIO(pickle) if hasattr(pickle, "tell"): getpos = pickle.tell @@ -1841,9 +1860,9 @@ def genops(pickle): while True: pos = getpos() code = pickle.read(1) - opcode = code2op.get(code) + opcode = code2op.get(code.decode("latin-1")) if opcode is None: - if code == "": + if code == b"": raise ValueError("pickle exhausted before seeing STOP") else: raise ValueError("at position %s, opcode %r unknown" % ( @@ -1854,7 +1873,7 @@ def genops(pickle): else: arg = opcode.arg.reader(pickle) yield opcode, arg, pos - if code == '.': + if code == b'.': assert opcode.name == 'STOP' break @@ -1883,12 +1902,12 @@ def optimize(p): s.append(p[i:j]) i = stop s.append(p[i:]) - return ''.join(s) + return b''.join(s) ############################################################################## # A symbolic pickle disassembler. -def dis(pickle, out=None, memo=None, indentlevel=4): +def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0): """Produce a symbolic disassembly of a pickle. 'pickle' is a file-like object, or string, containing a (at least one) @@ -1904,9 +1923,15 @@ def dis(pickle, out=None, memo=None, indentlevel=4): to proceed across multiple pickles that were all created by the same pickler with the same memo. Ordinarily you don't need to worry about this. - Optional arg indentlevel is the number of blanks by which to indent + Optional arg 'indentlevel' is the number of blanks by which to indent a new MARK level. It defaults to 4. + Optional arg 'annotate' if nonzero instructs dis() to add short + description of the opcode on each line of disassembled output. + The value given to 'annotate' must be an integer and is used as a + hint for the column where annotation should start. The default + value is 0, meaning no annotations. + In addition to printing the disassembly, some sanity checks are made: + All embedded opcode arguments "make sense". @@ -1934,9 +1959,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4): markstack = [] # bytecode positions of MARK opcodes indentchunk = ' ' * indentlevel errormsg = None + annocol = annotate # columnt hint for annotations for opcode, arg, pos in genops(pickle): if pos is not None: - print >> out, "%5d:" % pos, + print("%5d:" % pos, end=' ', file=out) line = "%-4s %s%s" % (repr(opcode.code)[1:-1], indentchunk * len(markstack), @@ -2001,7 +2027,14 @@ def dis(pickle, out=None, memo=None, indentlevel=4): line += ' ' + repr(arg) if markmsg: line += ' ' + markmsg - print >> out, line + if annotate: + line += ' ' * (annocol - len(line)) + # make a mild effort to align annotations + annocol = len(line) + if annocol > 50: + annocol = annotate + line += ' ' + opcode.doc.split('\n', 1)[0] + print(line, file=out) if errormsg: # Note that we delayed complaining until the offending opcode @@ -2020,7 +2053,7 @@ def dis(pickle, out=None, memo=None, indentlevel=4): stack.extend(after) - print >> out, "highest protocol among opcodes =", maxproto + print("highest protocol among opcodes =", maxproto, file=out) if stack: raise ValueError("stack not empty after STOP: %r" % stack) @@ -2031,38 +2064,47 @@ class _Example: _dis_test = r""" >>> import pickle ->>> x = [1, 2, (3, 4), {'abc': u"def"}] ->>> pkl = pickle.dumps(x, 0) ->>> dis(pkl) +>>> x = [1, 2, (3, 4), {b'abc': "def"}] +>>> pkl0 = pickle.dumps(x, 0) +>>> dis(pkl0) 0: ( MARK 1: l LIST (MARK at 0) 2: p PUT 0 - 5: I INT 1 - 8: a APPEND - 9: I INT 2 - 12: a APPEND - 13: ( MARK - 14: I INT 3 - 17: I INT 4 - 20: t TUPLE (MARK at 13) - 21: p PUT 1 - 24: a APPEND - 25: ( MARK - 26: d DICT (MARK at 25) - 27: p PUT 2 - 30: S STRING 'abc' - 37: p PUT 3 - 40: V UNICODE u'def' - 45: p PUT 4 - 48: s SETITEM - 49: a APPEND - 50: . STOP + 5: L LONG 1 + 9: a APPEND + 10: L LONG 2 + 14: a APPEND + 15: ( MARK + 16: L LONG 3 + 20: L LONG 4 + 24: t TUPLE (MARK at 15) + 25: p PUT 1 + 28: a APPEND + 29: ( MARK + 30: d DICT (MARK at 29) + 31: p PUT 2 + 34: c GLOBAL '_codecs encode' + 50: p PUT 3 + 53: ( MARK + 54: V UNICODE 'abc' + 59: p PUT 4 + 62: V UNICODE 'latin1' + 70: p PUT 5 + 73: t TUPLE (MARK at 53) + 74: p PUT 6 + 77: R REDUCE + 78: p PUT 7 + 81: V UNICODE 'def' + 86: p PUT 8 + 89: s SETITEM + 90: a APPEND + 91: . STOP highest protocol among opcodes = 0 Try again with a "binary" pickle. ->>> pkl = pickle.dumps(x, 1) ->>> dis(pkl) +>>> pkl1 = pickle.dumps(x, 1) +>>> dis(pkl1) 0: ] EMPTY_LIST 1: q BINPUT 0 3: ( MARK @@ -2075,13 +2117,22 @@ Try again with a "binary" pickle. 14: q BINPUT 1 16: } EMPTY_DICT 17: q BINPUT 2 - 19: U SHORT_BINSTRING 'abc' - 24: q BINPUT 3 - 26: X BINUNICODE u'def' - 34: q BINPUT 4 - 36: s SETITEM - 37: e APPENDS (MARK at 3) - 38: . STOP + 19: c GLOBAL '_codecs encode' + 35: q BINPUT 3 + 37: ( MARK + 38: X BINUNICODE 'abc' + 46: q BINPUT 4 + 48: X BINUNICODE 'latin1' + 59: q BINPUT 5 + 61: t TUPLE (MARK at 37) + 62: q BINPUT 6 + 64: R REDUCE + 65: q BINPUT 7 + 67: X BINUNICODE 'def' + 75: q BINPUT 8 + 77: s SETITEM + 78: e APPENDS (MARK at 3) + 79: . STOP highest protocol among opcodes = 1 Exercise the INST/OBJ/BUILD family. @@ -2099,42 +2150,58 @@ highest protocol among opcodes = 0 0: ( MARK 1: l LIST (MARK at 0) 2: p PUT 0 - 5: ( MARK - 6: i INST 'pickletools _Example' (MARK at 5) - 28: p PUT 1 - 31: ( MARK - 32: d DICT (MARK at 31) - 33: p PUT 2 - 36: S STRING 'value' - 45: p PUT 3 - 48: I INT 42 - 52: s SETITEM - 53: b BUILD - 54: a APPEND - 55: g GET 1 - 58: a APPEND - 59: . STOP + 5: c GLOBAL 'copy_reg _reconstructor' + 30: p PUT 1 + 33: ( MARK + 34: c GLOBAL 'pickletools _Example' + 56: p PUT 2 + 59: c GLOBAL '__builtin__ object' + 79: p PUT 3 + 82: N NONE + 83: t TUPLE (MARK at 33) + 84: p PUT 4 + 87: R REDUCE + 88: p PUT 5 + 91: ( MARK + 92: d DICT (MARK at 91) + 93: p PUT 6 + 96: V UNICODE 'value' + 103: p PUT 7 + 106: L LONG 42 + 111: s SETITEM + 112: b BUILD + 113: a APPEND + 114: g GET 5 + 117: a APPEND + 118: . STOP highest protocol among opcodes = 0 >>> dis(pickle.dumps(x, 1)) 0: ] EMPTY_LIST 1: q BINPUT 0 3: ( MARK - 4: ( MARK - 5: c GLOBAL 'pickletools _Example' - 27: q BINPUT 1 - 29: o OBJ (MARK at 4) - 30: q BINPUT 2 - 32: } EMPTY_DICT - 33: q BINPUT 3 - 35: U SHORT_BINSTRING 'value' - 42: q BINPUT 4 - 44: K BININT1 42 - 46: s SETITEM - 47: b BUILD - 48: h BINGET 2 - 50: e APPENDS (MARK at 3) - 51: . STOP + 4: c GLOBAL 'copy_reg _reconstructor' + 29: q BINPUT 1 + 31: ( MARK + 32: c GLOBAL 'pickletools _Example' + 54: q BINPUT 2 + 56: c GLOBAL '__builtin__ object' + 76: q BINPUT 3 + 78: N NONE + 79: t TUPLE (MARK at 31) + 80: q BINPUT 4 + 82: R REDUCE + 83: q BINPUT 5 + 85: } EMPTY_DICT + 86: q BINPUT 6 + 88: X BINUNICODE 'value' + 98: q BINPUT 7 + 100: K BININT1 42 + 102: s SETITEM + 103: b BUILD + 104: h BINGET 5 + 106: e APPENDS (MARK at 3) + 107: . STOP highest protocol among opcodes = 1 Try "the canonical" recursive-object test. @@ -2232,17 +2299,34 @@ highest protocol among opcodes = 2 12: h BINGET 1 14: . STOP highest protocol among opcodes = 2 + +Try protocol 3 with annotations: + +>>> dis(pickle.dumps(T, 3), annotate=1) + 0: \x80 PROTO 3 Protocol version indicator. + 2: ] EMPTY_LIST Push an empty list. + 3: q BINPUT 0 Store the stack top into the memo. The stack is not popped. + 5: h BINGET 0 Read an object from the memo and push it on the stack. + 7: \x85 TUPLE1 Build a one-tuple out of the topmost item on the stack. + 8: q BINPUT 1 Store the stack top into the memo. The stack is not popped. + 10: a APPEND Append an object to a list. + 11: 0 POP Discard the top stack item, shrinking the stack by one item. + 12: h BINGET 1 Read an object from the memo and push it on the stack. + 14: . STOP Stop the unpickling machine. +highest protocol among opcodes = 2 + """ _memo_test = r""" >>> import pickle ->>> from StringIO import StringIO ->>> f = StringIO() +>>> import io +>>> f = io.BytesIO() >>> p = pickle.Pickler(f, 2) >>> x = [1, 2, 3] >>> p.dump(x) >>> p.dump(x) >>> f.seek(0) +0 >>> memo = {} >>> dis(f, memo=memo) 0: \x80 PROTO 2 @@ -2271,4 +2355,47 @@ def _test(): return doctest.testmod() if __name__ == "__main__": - _test() + import sys, argparse + parser = argparse.ArgumentParser( + description='disassemble one or more pickle files') + parser.add_argument( + 'pickle_file', type=argparse.FileType('br'), + nargs='*', help='the pickle file') + parser.add_argument( + '-o', '--output', default=sys.stdout, type=argparse.FileType('w'), + help='the file where the output should be written') + parser.add_argument( + '-m', '--memo', action='store_true', + help='preserve memo between disassemblies') + parser.add_argument( + '-l', '--indentlevel', default=4, type=int, + help='the number of blanks by which to indent a new MARK level') + parser.add_argument( + '-a', '--annotate', action='store_true', + help='annotate each line with a short opcode description') + parser.add_argument( + '-p', '--preamble', default="==> {name} <==", + help='if more than one pickle file is specified, print this before' + ' each disassembly') + parser.add_argument( + '-t', '--test', action='store_true', + help='run self-test suite') + parser.add_argument( + '-v', action='store_true', + help='run verbosely; only affects self-test run') + args = parser.parse_args() + if args.test: + _test() + else: + annotate = 30 if args.annotate else 0 + if not args.pickle_file: + parser.print_help() + elif len(args.pickle_file) == 1: + dis(args.pickle_file[0], args.output, None, + args.indentlevel, annotate) + else: + memo = {} if args.memo else None + for f in args.pickle_file: + preamble = args.preamble.format(name=f.name) + args.output.write(preamble + '\n') + dis(f, args.output, memo, args.indentlevel, annotate) |