path: root/Lib/pickletools.py
Diffstat (limited to 'Lib/pickletools.py')
-rw-r--r--  Lib/pickletools.py  565
1 file changed, 346 insertions, 219 deletions
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index d717728d417..ec6cc53ae31 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -10,8 +10,14 @@ dis(pickle, out=None, memo=None, indentlevel=4)
Print a symbolic disassembly of a pickle.
'''
+import codecs
+import pickle
+import re
+
__all__ = ['dis', 'genops', 'optimize']
+bytes_types = pickle.bytes_types
+
# Other ideas:
#
# - A pickle verifier: read a pickle and check it exhaustively for
@@ -136,7 +142,7 @@ this and there isn't a use case that warrants the expense of such an
analysis.
To this end, all tests for __safe_for_unpickling__ or for
-copy_reg.safe_constructors are removed from the unpickling code.
+copyreg.safe_constructors are removed from the unpickling code.
References to these variables in the descriptions below are to be seen
as describing unpickling in Python 2.2 and before.
"""
@@ -200,14 +206,14 @@ from struct import unpack as _unpack
def read_uint1(f):
r"""
- >>> import StringIO
- >>> read_uint1(StringIO.StringIO('\xff'))
+ >>> import io
+ >>> read_uint1(io.BytesIO(b'\xff'))
255
"""
data = f.read(1)
if data:
- return ord(data)
+ return data[0]
raise ValueError("not enough data in stream to read uint1")
uint1 = ArgumentDescriptor(
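
The ord(data) -> data[0] change above works because indexing a bytes object in
Python 3 already yields an int; a minimal illustration (not part of the patch):

    >>> data = b'\xff'
    >>> data[0]
    255
    >>> ord(data)    # also 255 for a length-1 bytes object, but indexing is idiomatic
    255
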
@@ -219,10 +225,10 @@ uint1 = ArgumentDescriptor(
def read_uint2(f):
r"""
- >>> import StringIO
- >>> read_uint2(StringIO.StringIO('\xff\x00'))
+ >>> import io
+ >>> read_uint2(io.BytesIO(b'\xff\x00'))
255
- >>> read_uint2(StringIO.StringIO('\xff\xff'))
+ >>> read_uint2(io.BytesIO(b'\xff\xff'))
65535
"""
@@ -240,10 +246,10 @@ uint2 = ArgumentDescriptor(
def read_int4(f):
r"""
- >>> import StringIO
- >>> read_int4(StringIO.StringIO('\xff\x00\x00\x00'))
+ >>> import io
+ >>> read_int4(io.BytesIO(b'\xff\x00\x00\x00'))
255
- >>> read_int4(StringIO.StringIO('\x00\x00\x00\x80')) == -(2**31)
+ >>> read_int4(io.BytesIO(b'\x00\x00\x00\x80')) == -(2**31)
True
"""
@@ -261,38 +267,38 @@ int4 = ArgumentDescriptor(
def read_stringnl(f, decode=True, stripquotes=True):
r"""
- >>> import StringIO
- >>> read_stringnl(StringIO.StringIO("'abcd'\nefg\n"))
+ >>> import io
+ >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
'abcd'
- >>> read_stringnl(StringIO.StringIO("\n"))
+ >>> read_stringnl(io.BytesIO(b"\n"))
Traceback (most recent call last):
...
- ValueError: no string quotes around ''
+ ValueError: no string quotes around b''
- >>> read_stringnl(StringIO.StringIO("\n"), stripquotes=False)
+ >>> read_stringnl(io.BytesIO(b"\n"), stripquotes=False)
''
- >>> read_stringnl(StringIO.StringIO("''\n"))
+ >>> read_stringnl(io.BytesIO(b"''\n"))
''
- >>> read_stringnl(StringIO.StringIO('"abcd"'))
+ >>> read_stringnl(io.BytesIO(b'"abcd"'))
Traceback (most recent call last):
...
ValueError: no newline found when trying to read stringnl
Embedded escapes are undone in the result.
- >>> read_stringnl(StringIO.StringIO(r"'a\n\\b\x00c\td'" + "\n'e'"))
+ >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'"))
'a\n\\b\x00c\td'
"""
data = f.readline()
- if not data.endswith('\n'):
+ if not data.endswith(b'\n'):
raise ValueError("no newline found when trying to read stringnl")
data = data[:-1] # lose the newline
if stripquotes:
- for q in "'\"":
+ for q in (b'"', b"'"):
if data.startswith(q):
if not data.endswith(q):
raise ValueError("strinq quote %r not found at both "
@@ -302,10 +308,8 @@ def read_stringnl(f, decode=True, stripquotes=True):
else:
raise ValueError("no string quotes around %r" % data)
- # I'm not sure when 'string_escape' was added to the std codecs; it's
- # crazy not to use it if it's there.
if decode:
- data = data.decode('string_escape')
+ data = codecs.escape_decode(data)[0].decode("ascii")
return data
stringnl = ArgumentDescriptor(
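
The Python 2-only 'string_escape' text codec no longer exists, so the decode
step above goes through codecs.escape_decode(), which takes bytes and returns a
(bytes, length) pair -- hence the [0] and the final .decode("ascii"). A quick
illustration (not part of the patch):

    >>> import codecs
    >>> codecs.escape_decode(br"a\n\\b\x00c")[0]
    b'a\n\\b\x00c'
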
@@ -319,7 +323,7 @@ stringnl = ArgumentDescriptor(
""")
def read_stringnl_noescape(f):
- return read_stringnl(f, decode=False, stripquotes=False)
+ return read_stringnl(f, stripquotes=False)
stringnl_noescape = ArgumentDescriptor(
name='stringnl_noescape',
@@ -334,8 +338,8 @@ stringnl_noescape = ArgumentDescriptor(
def read_stringnl_noescape_pair(f):
r"""
- >>> import StringIO
- >>> read_stringnl_noescape_pair(StringIO.StringIO("Queue\nEmpty\njunk"))
+ >>> import io
+ >>> read_stringnl_noescape_pair(io.BytesIO(b"Queue\nEmpty\njunk"))
'Queue Empty'
"""
@@ -356,12 +360,12 @@ stringnl_noescape_pair = ArgumentDescriptor(
def read_string4(f):
r"""
- >>> import StringIO
- >>> read_string4(StringIO.StringIO("\x00\x00\x00\x00abc"))
+ >>> import io
+ >>> read_string4(io.BytesIO(b"\x00\x00\x00\x00abc"))
''
- >>> read_string4(StringIO.StringIO("\x03\x00\x00\x00abcdef"))
+ >>> read_string4(io.BytesIO(b"\x03\x00\x00\x00abcdef"))
'abc'
- >>> read_string4(StringIO.StringIO("\x00\x00\x00\x03abcdef"))
+ >>> read_string4(io.BytesIO(b"\x00\x00\x00\x03abcdef"))
Traceback (most recent call last):
...
ValueError: expected 50331648 bytes in a string4, but only 6 remain
@@ -372,7 +376,7 @@ def read_string4(f):
raise ValueError("string4 byte count < 0: %d" % n)
data = f.read(n)
if len(data) == n:
- return data
+ return data.decode("latin-1")
raise ValueError("expected %d bytes in a string4, but only %d remain" %
(n, len(data)))
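
Decoding with 'latin-1' presents the old 8-bit string payload as str without
losing information: every byte value 0-255 maps to the code point with the same
ordinal. A spot check (illustrative only):

    >>> bytes(range(256)).decode("latin-1") == ''.join(map(chr, range(256)))
    True
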
@@ -390,10 +394,10 @@ string4 = ArgumentDescriptor(
def read_string1(f):
r"""
- >>> import StringIO
- >>> read_string1(StringIO.StringIO("\x00"))
+ >>> import io
+ >>> read_string1(io.BytesIO(b"\x00"))
''
- >>> read_string1(StringIO.StringIO("\x03abcdef"))
+ >>> read_string1(io.BytesIO(b"\x03abcdef"))
'abc'
"""
@@ -401,7 +405,7 @@ def read_string1(f):
assert n >= 0
data = f.read(n)
if len(data) == n:
- return data
+ return data.decode("latin-1")
raise ValueError("expected %d bytes in a string1, but only %d remain" %
(n, len(data)))
@@ -419,17 +423,17 @@ string1 = ArgumentDescriptor(
def read_unicodestringnl(f):
r"""
- >>> import StringIO
- >>> read_unicodestringnl(StringIO.StringIO("abc\uabcd\njunk"))
- u'abc\uabcd'
+ >>> import io
+ >>> read_unicodestringnl(io.BytesIO(b"abc\\uabcd\njunk")) == 'abc\uabcd'
+ True
"""
data = f.readline()
- if not data.endswith('\n'):
+ if not data.endswith(b'\n'):
raise ValueError("no newline found when trying to read "
"unicodestringnl")
data = data[:-1] # lose the newline
- return unicode(data, 'raw-unicode-escape')
+ return str(data, 'raw-unicode-escape')
unicodestringnl = ArgumentDescriptor(
name='unicodestringnl',
@@ -444,17 +448,17 @@ unicodestringnl = ArgumentDescriptor(
def read_unicodestring4(f):
r"""
- >>> import StringIO
- >>> s = u'abcd\uabcd'
+ >>> import io
+ >>> s = 'abcd\uabcd'
>>> enc = s.encode('utf-8')
>>> enc
- 'abcd\xea\xaf\x8d'
- >>> n = chr(len(enc)) + chr(0) * 3 # little-endian 4-byte length
- >>> t = read_unicodestring4(StringIO.StringIO(n + enc + 'junk'))
+ b'abcd\xea\xaf\x8d'
+ >>> n = bytes([len(enc), 0, 0, 0]) # little-endian 4-byte length
+ >>> t = read_unicodestring4(io.BytesIO(n + enc + b'junk'))
>>> s == t
True
- >>> read_unicodestring4(StringIO.StringIO(n + enc[:-1]))
+ >>> read_unicodestring4(io.BytesIO(n + enc[:-1]))
Traceback (most recent call last):
...
ValueError: expected 7 bytes in a unicodestring4, but only 6 remain
@@ -465,7 +469,7 @@ def read_unicodestring4(f):
raise ValueError("unicodestring4 byte count < 0: %d" % n)
data = f.read(n)
if len(data) == n:
- return unicode(data, 'utf-8')
+ return str(data, 'utf-8', 'surrogatepass')
raise ValueError("expected %d bytes in a unicodestring4, but only %d "
"remain" % (n, len(data)))
@@ -484,55 +488,48 @@ unicodestring4 = ArgumentDescriptor(
def read_decimalnl_short(f):
r"""
- >>> import StringIO
- >>> read_decimalnl_short(StringIO.StringIO("1234\n56"))
+ >>> import io
+ >>> read_decimalnl_short(io.BytesIO(b"1234\n56"))
1234
- >>> read_decimalnl_short(StringIO.StringIO("1234L\n56"))
+ >>> read_decimalnl_short(io.BytesIO(b"1234L\n56"))
Traceback (most recent call last):
...
- ValueError: trailing 'L' not allowed in '1234L'
+ ValueError: trailing 'L' not allowed in b'1234L'
"""
s = read_stringnl(f, decode=False, stripquotes=False)
- if s.endswith("L"):
+ if s.endswith(b"L"):
raise ValueError("trailing 'L' not allowed in %r" % s)
# It's not necessarily true that the result fits in a Python short int:
# the pickle may have been written on a 64-bit box. There's also a hack
# for True and False here.
- if s == "00":
+ if s == b"00":
return False
- elif s == "01":
+ elif s == b"01":
return True
try:
return int(s)
except OverflowError:
- return long(s)
+ return int(s)
def read_decimalnl_long(f):
r"""
- >>> import StringIO
+ >>> import io
- >>> read_decimalnl_long(StringIO.StringIO("1234\n56"))
- Traceback (most recent call last):
- ...
- ValueError: trailing 'L' required in '1234'
-
- Someday the trailing 'L' will probably go away from this output.
-
- >>> read_decimalnl_long(StringIO.StringIO("1234L\n56"))
- 1234L
+ >>> read_decimalnl_long(io.BytesIO(b"1234L\n56"))
+ 1234
- >>> read_decimalnl_long(StringIO.StringIO("123456789012345678901234L\n6"))
- 123456789012345678901234L
+ >>> read_decimalnl_long(io.BytesIO(b"123456789012345678901234L\n6"))
+ 123456789012345678901234
"""
s = read_stringnl(f, decode=False, stripquotes=False)
- if not s.endswith("L"):
- raise ValueError("trailing 'L' required in %r" % s)
- return long(s)
+ if s[-1:] == b'L':
+ s = s[:-1]
+ return int(s)
decimalnl_short = ArgumentDescriptor(
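
Since Python 3 has a single int type, the reader now simply strips a trailing
b'L' (as written by Python 2 picklers) instead of requiring or rejecting it;
for example (illustrative only):

    >>> s = b"123456789012345678901234L"
    >>> int(s[:-1] if s[-1:] == b'L' else s)
    123456789012345678901234
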
@@ -561,8 +558,8 @@ decimalnl_long = ArgumentDescriptor(
def read_floatnl(f):
r"""
- >>> import StringIO
- >>> read_floatnl(StringIO.StringIO("-1.25\n6"))
+ >>> import io
+ >>> read_floatnl(io.BytesIO(b"-1.25\n6"))
-1.25
"""
s = read_stringnl(f, decode=False, stripquotes=False)
@@ -583,11 +580,11 @@ floatnl = ArgumentDescriptor(
def read_float8(f):
r"""
- >>> import StringIO, struct
+ >>> import io, struct
>>> raw = struct.pack(">d", -1.25)
>>> raw
- '\xbf\xf4\x00\x00\x00\x00\x00\x00'
- >>> read_float8(StringIO.StringIO(raw + "\n"))
+ b'\xbf\xf4\x00\x00\x00\x00\x00\x00'
+ >>> read_float8(io.BytesIO(raw + b"\n"))
-1.25
"""
@@ -604,7 +601,7 @@ float8 = ArgumentDescriptor(
doc="""An 8-byte binary representation of a float, big-endian.
The format is unique to Python, and shared with the struct
- module (format string '>d') "in theory" (the struct and cPickle
+ module (format string '>d') "in theory" (the struct and pickle
implementations don't share the code -- they should). It's
strongly related to the IEEE-754 double format, and, in normal
cases, is in fact identical to the big-endian 754 double format.
@@ -621,17 +618,17 @@ from pickle import decode_long
def read_long1(f):
r"""
- >>> import StringIO
- >>> read_long1(StringIO.StringIO("\x00"))
- 0L
- >>> read_long1(StringIO.StringIO("\x02\xff\x00"))
- 255L
- >>> read_long1(StringIO.StringIO("\x02\xff\x7f"))
- 32767L
- >>> read_long1(StringIO.StringIO("\x02\x00\xff"))
- -256L
- >>> read_long1(StringIO.StringIO("\x02\x00\x80"))
- -32768L
+ >>> import io
+ >>> read_long1(io.BytesIO(b"\x00"))
+ 0
+ >>> read_long1(io.BytesIO(b"\x02\xff\x00"))
+ 255
+ >>> read_long1(io.BytesIO(b"\x02\xff\x7f"))
+ 32767
+ >>> read_long1(io.BytesIO(b"\x02\x00\xff"))
+ -256
+ >>> read_long1(io.BytesIO(b"\x02\x00\x80"))
+ -32768
"""
n = read_uint1(f)
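
read_long1 hands the payload to pickle.decode_long, which interprets it as a
little-endian two's-complement integer -- it behaves like
int.from_bytes(data, 'little', signed=True). A couple of spot checks matching
the doctest above (illustrative only):

    >>> from pickle import decode_long
    >>> decode_long(b'\xff\x7f'), decode_long(b'\x00\x80')
    (32767, -32768)
    >>> int.from_bytes(b'\x00\x80', 'little', signed=True)
    -32768
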
@@ -653,17 +650,17 @@ long1 = ArgumentDescriptor(
def read_long4(f):
r"""
- >>> import StringIO
- >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\xff\x00"))
- 255L
- >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\xff\x7f"))
- 32767L
- >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\x00\xff"))
- -256L
- >>> read_long4(StringIO.StringIO("\x02\x00\x00\x00\x00\x80"))
- -32768L
- >>> read_long1(StringIO.StringIO("\x00\x00\x00\x00"))
- 0L
+ >>> import io
+ >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x00"))
+ 255
+ >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\xff\x7f"))
+ 32767
+ >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\xff"))
+ -256
+ >>> read_long4(io.BytesIO(b"\x02\x00\x00\x00\x00\x80"))
+ -32768
+ >>> read_long1(io.BytesIO(b"\x00\x00\x00\x00"))
+ 0
"""
n = read_int4(f)
@@ -683,7 +680,7 @@ long4 = ArgumentDescriptor(
This first reads four bytes as a signed size (but requires the
size to be >= 0), then reads that many bytes and interprets them
as a little-endian 2's-complement long. If the size is 0, that's taken
- as a shortcut for the long 0L, although LONG1 should really be used
+ as a shortcut for the int 0, although LONG1 should really be used
then instead (and in any case where # of bytes < 256).
""")
@@ -731,12 +728,12 @@ pyint = StackObject(
pylong = StackObject(
name='long',
- obtype=long,
+ obtype=int,
doc="A long (as opposed to short) Python integer object.")
pyinteger_or_bool = StackObject(
name='int_or_bool',
- obtype=(int, long, bool),
+ obtype=(int, bool),
doc="A Python integer object (short or long), or "
"a Python bool.")
@@ -751,14 +748,19 @@ pyfloat = StackObject(
doc="A Python float object.")
pystring = StackObject(
- name='str',
- obtype=str,
- doc="A Python string object.")
+ name='string',
+ obtype=bytes,
+ doc="A Python (8-bit) string object.")
+
+pybytes = StackObject(
+ name='bytes',
+ obtype=bytes,
+ doc="A Python bytes object.")
pyunicode = StackObject(
- name='unicode',
- obtype=unicode,
- doc="A Python Unicode string object.")
+ name='str',
+ obtype=str,
+ doc="A Python (Unicode) string object.")
pynone = StackObject(
name="None",
@@ -873,7 +875,7 @@ class OpcodeInfo(object):
assert isinstance(x, StackObject)
self.stack_after = stack_after
- assert isinstance(proto, int) and 0 <= proto <= 2
+ assert isinstance(proto, int) and 0 <= proto <= 3
self.proto = proto
assert isinstance(doc, str)
@@ -1000,7 +1002,9 @@ opcodes = [
The argument is a repr-style string, with bracketing quote characters,
and perhaps embedded escapes. The argument extends until the next
- newline character.
+ newline character. (Actually, they are decoded into a str instance
+      using the encoding given to the Unpickler constructor, or the default,
+ 'ASCII'.)
"""),
I(name='BINSTRING',
@@ -1013,7 +1017,9 @@ opcodes = [
There are two arguments: the first is a 4-byte little-endian signed int
giving the number of bytes in the string, and the second is that many
- bytes, which are taken literally as the string content.
+ bytes, which are taken literally as the string content. (Actually,
+ they are decoded into a str instance using the encoding given to the
+      Unpickler constructor, or the default, 'ASCII'.)
"""),
I(name='SHORT_BINSTRING',
@@ -1026,6 +1032,36 @@ opcodes = [
There are two arguments: the first is a 1-byte unsigned int giving
the number of bytes in the string, and the second is that many bytes,
+ which are taken literally as the string content. (Actually, they
+ are decoded into a str instance using the encoding given to the
+      Unpickler constructor, or the default, 'ASCII'.)
+ """),
+
+ # Bytes (protocol 3 only; older protocols don't support bytes at all)
+
+ I(name='BINBYTES',
+ code='B',
+ arg=string4,
+ stack_before=[],
+ stack_after=[pybytes],
+ proto=3,
+ doc="""Push a Python bytes object.
+
+ There are two arguments: the first is a 4-byte little-endian signed int
+ giving the number of bytes in the string, and the second is that many
+ bytes, which are taken literally as the bytes content.
+ """),
+
+ I(name='SHORT_BINBYTES',
+ code='C',
+ arg=string1,
+ stack_before=[],
+ stack_after=[pybytes],
+ proto=3,
+ doc="""Push a Python string object.
+
+ There are two arguments: the first is a 1-byte unsigned int giving
+ the number of bytes in the string, and the second is that many bytes,
which are taken literally as the string content.
"""),
@@ -1527,8 +1563,8 @@ opcodes = [
opcode is followed by code to create setstate's argument, and then a
BUILD opcode to apply __setstate__ to that argument.
- If type(callable) is not ClassType, REDUCE complains unless the
- callable has been registered with the copy_reg module's
+ If not isinstance(callable, type), REDUCE complains unless the
+ callable has been registered with the copyreg module's
safe_constructors dict, or the callable has a magic
'__safe_for_unpickling__' attribute with a true value. I'm not sure
why it does this, but I've sure seen this complaint often enough when
@@ -1558,13 +1594,6 @@ opcodes = [
the object is updated via
anyobject.__dict__.update(argument)
-
- This may raise RuntimeError in restricted execution mode (which
- disallows access to __dict__ directly); in that case, the object
- is updated instead via
-
- for k, v in argument.items():
- anyobject[k] = v
"""),
I(name='INST',
@@ -1590,9 +1619,6 @@ opcodes = [
+ The argtuple is empty (markobject was at the top of the stack
at the start).
- + It's an old-style class object (the type of the class object is
- ClassType).
-
+ The class object does not have a __getinitargs__ attribute.
then we want to create an old-style class instance without invoking
@@ -1600,20 +1626,15 @@ opcodes = [
calling __init__() is current wisdom). In this case, an instance of
an old-style dummy class is created, and then we try to rebind its
__class__ attribute to the desired class object. If this succeeds,
- the new instance object is pushed on the stack, and we're done. In
- restricted execution mode it can fail (assignment to __class__ is
- disallowed), and I'm not really sure what happens then -- it looks
- like the code ends up calling the class object's __init__ anyway,
- via falling into the next case.
+ the new instance object is pushed on the stack, and we're done.
Else (the argtuple is not empty, it's not an old-style class object,
or the class object does have a __getinitargs__ attribute), the code
first insists that the class object have a __safe_for_unpickling__
attribute. Unlike as for the __safe_for_unpickling__ check in REDUCE,
it doesn't matter whether this attribute has a true or false value, it
- only matters whether it exists (XXX this is a bug; cPickle
- requires the attribute to be true). If __safe_for_unpickling__
- doesn't exist, UnpicklingError is raised.
+ only matters whether it exists (XXX this is a bug). If
+ __safe_for_unpickling__ doesn't exist, UnpicklingError is raised.
Else (the class object does have a __safe_for_unpickling__ attr),
the class object obtained from INST's arguments is applied to the
@@ -1648,8 +1669,7 @@ opcodes = [
As for INST, the remainder of the stack above the markobject is
gathered into an argument tuple, and then the logic seems identical,
except that no __safe_for_unpickling__ check is done (XXX this is
- a bug; cPickle does test __safe_for_unpickling__). See INST for
- the gory details.
+ a bug). See INST for the gory details.
NOTE: In Python 2.3, INST and OBJ are identical except for how they
get the class object. That was always the intent; the implementations
@@ -1761,24 +1781,24 @@ for d in opcodes:
del d
def assure_pickle_consistency(verbose=False):
- import pickle, re
copy = code2op.copy()
for name in pickle.__all__:
if not re.match("[A-Z][A-Z0-9_]+$", name):
if verbose:
- print "skipping %r: it doesn't look like an opcode name" % name
+ print("skipping %r: it doesn't look like an opcode name" % name)
continue
picklecode = getattr(pickle, name)
- if not isinstance(picklecode, str) or len(picklecode) != 1:
+ if not isinstance(picklecode, bytes) or len(picklecode) != 1:
if verbose:
- print ("skipping %r: value %r doesn't look like a pickle "
- "code" % (name, picklecode))
+ print(("skipping %r: value %r doesn't look like a pickle "
+ "code" % (name, picklecode)))
continue
+ picklecode = picklecode.decode("latin-1")
if picklecode in copy:
if verbose:
- print "checking name %r w/ code %r for consistency" % (
- name, picklecode)
+ print("checking name %r w/ code %r for consistency" % (
+ name, picklecode))
d = copy[picklecode]
if d.name != name:
raise ValueError("for pickle code %r, pickle.py uses name %r "
@@ -1822,16 +1842,15 @@ def genops(pickle):
is None.
If the pickle has a tell() method, pos was the value of pickle.tell()
- before reading the current opcode. If the pickle is a string object,
- it's wrapped in a StringIO object, and the latter's tell() result is
+ before reading the current opcode. If the pickle is a bytes object,
+ it's wrapped in a BytesIO object, and the latter's tell() result is
used. Else (the pickle doesn't have a tell(), and it's not obvious how
to query its current position) pos is None.
"""
- import cStringIO as StringIO
-
- if isinstance(pickle, str):
- pickle = StringIO.StringIO(pickle)
+ if isinstance(pickle, bytes_types):
+ import io
+ pickle = io.BytesIO(pickle)
if hasattr(pickle, "tell"):
getpos = pickle.tell
@@ -1841,9 +1860,9 @@ def genops(pickle):
while True:
pos = getpos()
code = pickle.read(1)
- opcode = code2op.get(code)
+ opcode = code2op.get(code.decode("latin-1"))
if opcode is None:
- if code == "":
+ if code == b"":
raise ValueError("pickle exhausted before seeing STOP")
else:
raise ValueError("at position %s, opcode %r unknown" % (
@@ -1854,7 +1873,7 @@ def genops(pickle):
else:
arg = opcode.arg.reader(pickle)
yield opcode, arg, pos
- if code == '.':
+ if code == b'.':
assert opcode.name == 'STOP'
break
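
genops() can also be driven directly from user code; each iteration yields an
(opcode, arg, pos) triple, so listing the opcode names in a tiny pickle looks
like this (an illustration, not part of the patch):

    >>> import pickle, pickletools
    >>> [op.name for op, arg, pos in pickletools.genops(pickle.dumps(None, 0))]
    ['NONE', 'STOP']
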
@@ -1883,12 +1902,12 @@ def optimize(p):
s.append(p[i:j])
i = stop
s.append(p[i:])
- return ''.join(s)
+ return b''.join(s)
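
optimize() still removes PUT/BINPUT opcodes that are never fetched by a GET,
only now it joins the surviving pieces back into bytes. A quick illustration
(not part of the patch):

    >>> import pickle, pickletools
    >>> pkl = pickle.dumps([1, 2, 3], 2)
    >>> shorter = pickletools.optimize(pkl)
    >>> len(shorter) < len(pkl) and pickle.loads(shorter) == [1, 2, 3]
    True
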
##############################################################################
# A symbolic pickle disassembler.
-def dis(pickle, out=None, memo=None, indentlevel=4):
+def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
"""Produce a symbolic disassembly of a pickle.
'pickle' is a file-like object, or string, containing a (at least one)
@@ -1904,9 +1923,15 @@ def dis(pickle, out=None, memo=None, indentlevel=4):
to proceed across multiple pickles that were all created by the same
pickler with the same memo. Ordinarily you don't need to worry about this.
- Optional arg indentlevel is the number of blanks by which to indent
+ Optional arg 'indentlevel' is the number of blanks by which to indent
a new MARK level. It defaults to 4.
+    Optional arg 'annotate', if nonzero, instructs dis() to add a short
+    description of the opcode on each line of disassembled output.
+    The value given to 'annotate' must be an integer and is used as a
+    hint for the column where the annotation should start.  The default
+    value is 0, meaning no annotations.
+
In addition to printing the disassembly, some sanity checks are made:
+ All embedded opcode arguments "make sense".
@@ -1934,9 +1959,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4):
markstack = [] # bytecode positions of MARK opcodes
indentchunk = ' ' * indentlevel
errormsg = None
+    annocol = annotate # column hint for annotations
for opcode, arg, pos in genops(pickle):
if pos is not None:
- print >> out, "%5d:" % pos,
+ print("%5d:" % pos, end=' ', file=out)
line = "%-4s %s%s" % (repr(opcode.code)[1:-1],
indentchunk * len(markstack),
@@ -2001,7 +2027,14 @@ def dis(pickle, out=None, memo=None, indentlevel=4):
line += ' ' + repr(arg)
if markmsg:
line += ' ' + markmsg
- print >> out, line
+ if annotate:
+ line += ' ' * (annocol - len(line))
+ # make a mild effort to align annotations
+ annocol = len(line)
+ if annocol > 50:
+ annocol = annotate
+ line += ' ' + opcode.doc.split('\n', 1)[0]
+ print(line, file=out)
if errormsg:
# Note that we delayed complaining until the offending opcode
@@ -2020,7 +2053,7 @@ def dis(pickle, out=None, memo=None, indentlevel=4):
stack.extend(after)
- print >> out, "highest protocol among opcodes =", maxproto
+ print("highest protocol among opcodes =", maxproto, file=out)
if stack:
raise ValueError("stack not empty after STOP: %r" % stack)
@@ -2031,38 +2064,47 @@ class _Example:
_dis_test = r"""
>>> import pickle
->>> x = [1, 2, (3, 4), {'abc': u"def"}]
->>> pkl = pickle.dumps(x, 0)
->>> dis(pkl)
+>>> x = [1, 2, (3, 4), {b'abc': "def"}]
+>>> pkl0 = pickle.dumps(x, 0)
+>>> dis(pkl0)
0: ( MARK
1: l LIST (MARK at 0)
2: p PUT 0
- 5: I INT 1
- 8: a APPEND
- 9: I INT 2
- 12: a APPEND
- 13: ( MARK
- 14: I INT 3
- 17: I INT 4
- 20: t TUPLE (MARK at 13)
- 21: p PUT 1
- 24: a APPEND
- 25: ( MARK
- 26: d DICT (MARK at 25)
- 27: p PUT 2
- 30: S STRING 'abc'
- 37: p PUT 3
- 40: V UNICODE u'def'
- 45: p PUT 4
- 48: s SETITEM
- 49: a APPEND
- 50: . STOP
+ 5: L LONG 1
+ 9: a APPEND
+ 10: L LONG 2
+ 14: a APPEND
+ 15: ( MARK
+ 16: L LONG 3
+ 20: L LONG 4
+ 24: t TUPLE (MARK at 15)
+ 25: p PUT 1
+ 28: a APPEND
+ 29: ( MARK
+ 30: d DICT (MARK at 29)
+ 31: p PUT 2
+ 34: c GLOBAL '_codecs encode'
+ 50: p PUT 3
+ 53: ( MARK
+ 54: V UNICODE 'abc'
+ 59: p PUT 4
+ 62: V UNICODE 'latin1'
+ 70: p PUT 5
+ 73: t TUPLE (MARK at 53)
+ 74: p PUT 6
+ 77: R REDUCE
+ 78: p PUT 7
+ 81: V UNICODE 'def'
+ 86: p PUT 8
+ 89: s SETITEM
+ 90: a APPEND
+ 91: . STOP
highest protocol among opcodes = 0
Try again with a "binary" pickle.
->>> pkl = pickle.dumps(x, 1)
->>> dis(pkl)
+>>> pkl1 = pickle.dumps(x, 1)
+>>> dis(pkl1)
0: ] EMPTY_LIST
1: q BINPUT 0
3: ( MARK
@@ -2075,13 +2117,22 @@ Try again with a "binary" pickle.
14: q BINPUT 1
16: } EMPTY_DICT
17: q BINPUT 2
- 19: U SHORT_BINSTRING 'abc'
- 24: q BINPUT 3
- 26: X BINUNICODE u'def'
- 34: q BINPUT 4
- 36: s SETITEM
- 37: e APPENDS (MARK at 3)
- 38: . STOP
+ 19: c GLOBAL '_codecs encode'
+ 35: q BINPUT 3
+ 37: ( MARK
+ 38: X BINUNICODE 'abc'
+ 46: q BINPUT 4
+ 48: X BINUNICODE 'latin1'
+ 59: q BINPUT 5
+ 61: t TUPLE (MARK at 37)
+ 62: q BINPUT 6
+ 64: R REDUCE
+ 65: q BINPUT 7
+ 67: X BINUNICODE 'def'
+ 75: q BINPUT 8
+ 77: s SETITEM
+ 78: e APPENDS (MARK at 3)
+ 79: . STOP
highest protocol among opcodes = 1
Exercise the INST/OBJ/BUILD family.
@@ -2099,42 +2150,58 @@ highest protocol among opcodes = 0
0: ( MARK
1: l LIST (MARK at 0)
2: p PUT 0
- 5: ( MARK
- 6: i INST 'pickletools _Example' (MARK at 5)
- 28: p PUT 1
- 31: ( MARK
- 32: d DICT (MARK at 31)
- 33: p PUT 2
- 36: S STRING 'value'
- 45: p PUT 3
- 48: I INT 42
- 52: s SETITEM
- 53: b BUILD
- 54: a APPEND
- 55: g GET 1
- 58: a APPEND
- 59: . STOP
+ 5: c GLOBAL 'copy_reg _reconstructor'
+ 30: p PUT 1
+ 33: ( MARK
+ 34: c GLOBAL 'pickletools _Example'
+ 56: p PUT 2
+ 59: c GLOBAL '__builtin__ object'
+ 79: p PUT 3
+ 82: N NONE
+ 83: t TUPLE (MARK at 33)
+ 84: p PUT 4
+ 87: R REDUCE
+ 88: p PUT 5
+ 91: ( MARK
+ 92: d DICT (MARK at 91)
+ 93: p PUT 6
+ 96: V UNICODE 'value'
+ 103: p PUT 7
+ 106: L LONG 42
+ 111: s SETITEM
+ 112: b BUILD
+ 113: a APPEND
+ 114: g GET 5
+ 117: a APPEND
+ 118: . STOP
highest protocol among opcodes = 0
>>> dis(pickle.dumps(x, 1))
0: ] EMPTY_LIST
1: q BINPUT 0
3: ( MARK
- 4: ( MARK
- 5: c GLOBAL 'pickletools _Example'
- 27: q BINPUT 1
- 29: o OBJ (MARK at 4)
- 30: q BINPUT 2
- 32: } EMPTY_DICT
- 33: q BINPUT 3
- 35: U SHORT_BINSTRING 'value'
- 42: q BINPUT 4
- 44: K BININT1 42
- 46: s SETITEM
- 47: b BUILD
- 48: h BINGET 2
- 50: e APPENDS (MARK at 3)
- 51: . STOP
+ 4: c GLOBAL 'copy_reg _reconstructor'
+ 29: q BINPUT 1
+ 31: ( MARK
+ 32: c GLOBAL 'pickletools _Example'
+ 54: q BINPUT 2
+ 56: c GLOBAL '__builtin__ object'
+ 76: q BINPUT 3
+ 78: N NONE
+ 79: t TUPLE (MARK at 31)
+ 80: q BINPUT 4
+ 82: R REDUCE
+ 83: q BINPUT 5
+ 85: } EMPTY_DICT
+ 86: q BINPUT 6
+ 88: X BINUNICODE 'value'
+ 98: q BINPUT 7
+ 100: K BININT1 42
+ 102: s SETITEM
+ 103: b BUILD
+ 104: h BINGET 5
+ 106: e APPENDS (MARK at 3)
+ 107: . STOP
highest protocol among opcodes = 1
Try "the canonical" recursive-object test.
@@ -2232,17 +2299,34 @@ highest protocol among opcodes = 2
12: h BINGET 1
14: . STOP
highest protocol among opcodes = 2
+
+Try protocol 3 with annotations:
+
+>>> dis(pickle.dumps(T, 3), annotate=1)
+ 0: \x80 PROTO 3 Protocol version indicator.
+ 2: ] EMPTY_LIST Push an empty list.
+ 3: q BINPUT 0 Store the stack top into the memo. The stack is not popped.
+ 5: h BINGET 0 Read an object from the memo and push it on the stack.
+ 7: \x85 TUPLE1 Build a one-tuple out of the topmost item on the stack.
+ 8: q BINPUT 1 Store the stack top into the memo. The stack is not popped.
+ 10: a APPEND Append an object to a list.
+ 11: 0 POP Discard the top stack item, shrinking the stack by one item.
+ 12: h BINGET 1 Read an object from the memo and push it on the stack.
+ 14: . STOP Stop the unpickling machine.
+highest protocol among opcodes = 2
+
"""
_memo_test = r"""
>>> import pickle
->>> from StringIO import StringIO
->>> f = StringIO()
+>>> import io
+>>> f = io.BytesIO()
>>> p = pickle.Pickler(f, 2)
>>> x = [1, 2, 3]
>>> p.dump(x)
>>> p.dump(x)
>>> f.seek(0)
+0
>>> memo = {}
>>> dis(f, memo=memo)
0: \x80 PROTO 2
@@ -2271,4 +2355,47 @@ def _test():
return doctest.testmod()
if __name__ == "__main__":
- _test()
+ import sys, argparse
+ parser = argparse.ArgumentParser(
+ description='disassemble one or more pickle files')
+ parser.add_argument(
+ 'pickle_file', type=argparse.FileType('br'),
+ nargs='*', help='the pickle file')
+ parser.add_argument(
+ '-o', '--output', default=sys.stdout, type=argparse.FileType('w'),
+ help='the file where the output should be written')
+ parser.add_argument(
+ '-m', '--memo', action='store_true',
+ help='preserve memo between disassemblies')
+ parser.add_argument(
+ '-l', '--indentlevel', default=4, type=int,
+ help='the number of blanks by which to indent a new MARK level')
+ parser.add_argument(
+ '-a', '--annotate', action='store_true',
+ help='annotate each line with a short opcode description')
+ parser.add_argument(
+ '-p', '--preamble', default="==> {name} <==",
+ help='if more than one pickle file is specified, print this before'
+ ' each disassembly')
+ parser.add_argument(
+ '-t', '--test', action='store_true',
+ help='run self-test suite')
+ parser.add_argument(
+ '-v', action='store_true',
+ help='run verbosely; only affects self-test run')
+ args = parser.parse_args()
+ if args.test:
+ _test()
+ else:
+ annotate = 30 if args.annotate else 0
+ if not args.pickle_file:
+ parser.print_help()
+ elif len(args.pickle_file) == 1:
+ dis(args.pickle_file[0], args.output, None,
+ args.indentlevel, annotate)
+ else:
+ memo = {} if args.memo else None
+ for f in args.pickle_file:
+ preamble = args.preamble.format(name=f.name)
+ args.output.write(preamble + '\n')
+ dis(f, args.output, memo, args.indentlevel, annotate)
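
The new __main__ block turns the module into a small command-line tool (e.g.
"python -m pickletools -a somefile.pickle"); programmatically, the single-file
case boils down to something like this sketch (the file name is hypothetical):

    import pickletools

    with open("somefile.pickle", "rb") as f:   # hypothetical input file
        pickletools.dis(f, annotate=30)        # -a maps to annotate=30, as above
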