Diffstat (limited to 'Lib/compression/zstd')
 Lib/compression/zstd/__init__.py  | 242 ++++++++++++++++++++++++++
 Lib/compression/zstd/_zstdfile.py | 345 +++++++++++++++++++++++++++++++
 2 files changed, 587 insertions(+), 0 deletions(-)
diff --git a/Lib/compression/zstd/__init__.py b/Lib/compression/zstd/__init__.py
new file mode 100644
index 00000000000..84b25914b0a
--- /dev/null
+++ b/Lib/compression/zstd/__init__.py
@@ -0,0 +1,242 @@
+"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
+
+__all__ = (
+ # compression.zstd
+ 'COMPRESSION_LEVEL_DEFAULT',
+ 'compress',
+ 'CompressionParameter',
+ 'decompress',
+ 'DecompressionParameter',
+ 'finalize_dict',
+ 'get_frame_info',
+ 'Strategy',
+ 'train_dict',
+
+ # compression.zstd._zstdfile
+ 'open',
+ 'ZstdFile',
+
+ # _zstd
+ 'get_frame_size',
+ 'zstd_version',
+ 'zstd_version_info',
+ 'ZstdCompressor',
+ 'ZstdDecompressor',
+ 'ZstdDict',
+ 'ZstdError',
+)
+
+import _zstd
+import enum
+from _zstd import (ZstdCompressor, ZstdDecompressor, ZstdDict, ZstdError,
+ get_frame_size, zstd_version)
+from compression.zstd._zstdfile import ZstdFile, open, _nbytes
+
+# zstd_version_number is (MAJOR * 100 * 100 + MINOR * 100 + RELEASE)
+zstd_version_info = (*divmod(_zstd.zstd_version_number // 100, 100),
+ _zstd.zstd_version_number % 100)
+"""Version number of the runtime zstd library as a tuple of integers."""
+
+COMPRESSION_LEVEL_DEFAULT = _zstd.ZSTD_CLEVEL_DEFAULT
+"""The default compression level for Zstandard, currently '3'."""
+
+
+class FrameInfo:
+ """Information about a Zstandard frame."""
+
+ __slots__ = 'decompressed_size', 'dictionary_id'
+
+ def __init__(self, decompressed_size, dictionary_id):
+ super().__setattr__('decompressed_size', decompressed_size)
+ super().__setattr__('dictionary_id', dictionary_id)
+
+ def __repr__(self):
+ return (f'FrameInfo(decompressed_size={self.decompressed_size}, '
+ f'dictionary_id={self.dictionary_id})')
+
+ def __setattr__(self, name, _):
+ raise AttributeError(f"can't set attribute {name!r}")
+
+
+def get_frame_info(frame_buffer):
+ """Get Zstandard frame information from a frame header.
+
+ *frame_buffer* is a bytes-like object. It should start from the beginning
+ of a frame, and needs to include at least the frame header (6 to 18 bytes).
+
+ The returned FrameInfo object has two attributes.
+ 'decompressed_size' is the size in bytes of the data in the frame when
+ decompressed, or None when the decompressed size is unknown.
+    'dictionary_id' is an int in the range [0, 2**32). The special value 0
+    means that the dictionary ID was not recorded in the frame header;
+    the frame may or may not need a dictionary to be decoded,
+    and the ID of such a dictionary is not specified.
+ """
+ return FrameInfo(*_zstd.get_frame_info(frame_buffer))
+
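As an illustration (not part of the patch), reading the header of a freshly compressed frame with the API added above might look like the following, on Python versions that provide compression.zstd (3.14+):

    from compression import zstd

    frame = zstd.compress(b'hello world')
    info = zstd.get_frame_info(frame)
    # decompressed_size is the size recorded in the frame header (None if the
    # encoder did not record it); dictionary_id is 0 because no dictionary
    # was used here.
    print(info.decompressed_size, info.dictionary_id)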
+
+def train_dict(samples, dict_size):
+ """Return a ZstdDict representing a trained Zstandard dictionary.
+
+ *samples* is an iterable of samples, where a sample is a bytes-like
+ object representing a file.
+
+ *dict_size* is the dictionary's maximum size, in bytes.
+ """
+ if not isinstance(dict_size, int):
+ ds_cls = type(dict_size).__qualname__
+ raise TypeError(f'dict_size must be an int object, not {ds_cls!r}.')
+
+ samples = tuple(samples)
+ chunks = b''.join(samples)
+ chunk_sizes = tuple(_nbytes(sample) for sample in samples)
+ if not chunks:
+ raise ValueError("samples contained no data; can't train dictionary.")
+ dict_content = _zstd.train_dict(chunks, chunk_sizes, dict_size)
+ return ZstdDict(dict_content)
+
+
+def finalize_dict(zstd_dict, /, samples, dict_size, level):
+ """Return a ZstdDict representing a finalized Zstandard dictionary.
+
+    Given custom content as a basis for the dictionary, and a set of samples,
+    finalize *zstd_dict* by adding headers and statistics according to the
+    Zstandard dictionary format.
+
+    You may compose an effective dictionary content by hand, use it as the
+    basis dictionary, and pass some samples to finalize it. The basis
+    dictionary may be a "raw content" dictionary; see *is_raw* in ZstdDict.
+
+ *samples* is an iterable of samples, where a sample is a bytes-like object
+ representing a file.
+ *dict_size* is the dictionary's maximum size, in bytes.
+ *level* is the expected compression level. The statistics for each
+ compression level differ, so tuning the dictionary to the compression level
+ can provide improvements.
+ """
+
+ if not isinstance(zstd_dict, ZstdDict):
+ raise TypeError('zstd_dict argument should be a ZstdDict object.')
+ if not isinstance(dict_size, int):
+ raise TypeError('dict_size argument should be an int object.')
+ if not isinstance(level, int):
+ raise TypeError('level argument should be an int object.')
+
+ samples = tuple(samples)
+ chunks = b''.join(samples)
+ chunk_sizes = tuple(_nbytes(sample) for sample in samples)
+ if not chunks:
+        raise ValueError("samples contained no data; can't finalize the "
+                         "dictionary.")
+ dict_content = _zstd.finalize_dict(zstd_dict.dict_content, chunks,
+ chunk_sizes, dict_size, level)
+ return ZstdDict(dict_content)
+
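A sketch of the dictionary workflow built on train_dict() (not part of the patch): 'corpus' is a hypothetical directory of small, similarly structured files, and training only succeeds with a sufficiently large and varied sample set.

    from pathlib import Path
    from compression import zstd

    # Hypothetical corpus of small, similarly structured files.
    samples = [p.read_bytes() for p in Path('corpus').iterdir() if p.is_file()]
    zd = zstd.train_dict(samples, dict_size=112_640)          # ~110 KiB
    blob = zstd.compress(samples[0], zstd_dict=zd)
    assert zstd.decompress(blob, zstd_dict=zd) == samples[0]  # same dict needed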
+
+def compress(data, level=None, options=None, zstd_dict=None):
+ """Return Zstandard compressed *data* as bytes.
+
+ *level* is an int specifying the compression level to use, defaulting to
+ COMPRESSION_LEVEL_DEFAULT ('3').
+ *options* is a dict object that contains advanced compression
+ parameters. See CompressionParameter for more on options.
+ *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
+ the function train_dict for how to train a ZstdDict on sample data.
+
+ For incremental compression, use a ZstdCompressor instead.
+ """
+ comp = ZstdCompressor(level=level, options=options, zstd_dict=zstd_dict)
+ return comp.compress(data, mode=ZstdCompressor.FLUSH_FRAME)
+
+
+def decompress(data, zstd_dict=None, options=None):
+ """Decompress one or more frames of Zstandard compressed *data*.
+
+ *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
+ the function train_dict for how to train a ZstdDict on sample data.
+ *options* is a dict object that contains advanced compression
+ parameters. See DecompressionParameter for more on options.
+
+ For incremental decompression, use a ZstdDecompressor instead.
+ """
+ results = []
+ while True:
+ decomp = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
+ results.append(decomp.decompress(data))
+ if not decomp.eof:
+ raise ZstdError('Compressed data ended before the '
+ 'end-of-stream marker was reached')
+ data = decomp.unused_data
+ if not data:
+ break
+ return b''.join(results)
+
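For orientation (not part of the patch), a one-shot round trip with compress() and decompress(); note that decompress() walks every complete frame in the input and concatenates the results:

    from compression import zstd

    part1 = zstd.compress(b'first frame ')
    part2 = zstd.compress(b'second frame')
    # Two complete frames back to back decompress to the concatenated payload.
    assert zstd.decompress(part1 + part2) == b'first frame second frame'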
+
+class CompressionParameter(enum.IntEnum):
+ """Compression parameters."""
+
+ compression_level = _zstd.ZSTD_c_compressionLevel
+ window_log = _zstd.ZSTD_c_windowLog
+ hash_log = _zstd.ZSTD_c_hashLog
+ chain_log = _zstd.ZSTD_c_chainLog
+ search_log = _zstd.ZSTD_c_searchLog
+ min_match = _zstd.ZSTD_c_minMatch
+ target_length = _zstd.ZSTD_c_targetLength
+ strategy = _zstd.ZSTD_c_strategy
+
+ enable_long_distance_matching = _zstd.ZSTD_c_enableLongDistanceMatching
+ ldm_hash_log = _zstd.ZSTD_c_ldmHashLog
+ ldm_min_match = _zstd.ZSTD_c_ldmMinMatch
+ ldm_bucket_size_log = _zstd.ZSTD_c_ldmBucketSizeLog
+ ldm_hash_rate_log = _zstd.ZSTD_c_ldmHashRateLog
+
+ content_size_flag = _zstd.ZSTD_c_contentSizeFlag
+ checksum_flag = _zstd.ZSTD_c_checksumFlag
+ dict_id_flag = _zstd.ZSTD_c_dictIDFlag
+
+ nb_workers = _zstd.ZSTD_c_nbWorkers
+ job_size = _zstd.ZSTD_c_jobSize
+ overlap_log = _zstd.ZSTD_c_overlapLog
+
+ def bounds(self):
+ """Return the (lower, upper) int bounds of a compression parameter.
+
+ Both the lower and upper bounds are inclusive.
+ """
+ return _zstd.get_param_bounds(self.value, is_compress=True)
+
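By way of example (not part of the patch), advanced parameters are passed as a dict keyed by CompressionParameter members, and bounds() reports the valid range for the zstd library in use:

    from compression import zstd
    from compression.zstd import CompressionParameter

    lo, hi = CompressionParameter.compression_level.bounds()
    options = {
        CompressionParameter.compression_level: min(hi, 19),
        CompressionParameter.checksum_flag: 1,   # append a 4-byte checksum
    }
    blob = zstd.compress(b'x' * 10_000, options=options)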
+
+class DecompressionParameter(enum.IntEnum):
+ """Decompression parameters."""
+
+ window_log_max = _zstd.ZSTD_d_windowLogMax
+
+ def bounds(self):
+ """Return the (lower, upper) int bounds of a decompression parameter.
+
+ Both the lower and upper bounds are inclusive.
+ """
+ return _zstd.get_param_bounds(self.value, is_compress=False)
+
+
+class Strategy(enum.IntEnum):
+ """Compression strategies, listed from fastest to strongest.
+
+ Note that new strategies might be added in the future.
+    Only the order (from fast to strong) is guaranteed;
+    the numeric values might change.
+ """
+
+ fast = _zstd.ZSTD_fast
+ dfast = _zstd.ZSTD_dfast
+ greedy = _zstd.ZSTD_greedy
+ lazy = _zstd.ZSTD_lazy
+ lazy2 = _zstd.ZSTD_lazy2
+ btlazy2 = _zstd.ZSTD_btlazy2
+ btopt = _zstd.ZSTD_btopt
+ btultra = _zstd.ZSTD_btultra
+ btultra2 = _zstd.ZSTD_btultra2
+
+
+# Check validity of the CompressionParameter & DecompressionParameter types
+_zstd.set_parameter_types(CompressionParameter, DecompressionParameter)
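To round out the module-level API (again, not part of the patch), a Strategy member can also be supplied through the options dict:

    from compression.zstd import CompressionParameter, Strategy, compress

    options = {CompressionParameter.strategy: Strategy.btultra2}
    blob = compress(b'data to squeeze hard' * 50, options=options)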
diff --git a/Lib/compression/zstd/_zstdfile.py b/Lib/compression/zstd/_zstdfile.py
new file mode 100644
index 00000000000..d709f5efc65
--- /dev/null
+++ b/Lib/compression/zstd/_zstdfile.py
@@ -0,0 +1,345 @@
+import io
+from os import PathLike
+from _zstd import ZstdCompressor, ZstdDecompressor, ZSTD_DStreamOutSize
+from compression._common import _streams
+
+__all__ = ('ZstdFile', 'open')
+
+_MODE_CLOSED = 0
+_MODE_READ = 1
+_MODE_WRITE = 2
+
+
+def _nbytes(dat, /):
+ if isinstance(dat, (bytes, bytearray)):
+ return len(dat)
+ with memoryview(dat) as mv:
+ return mv.nbytes
+
+
+class ZstdFile(_streams.BaseStream):
+ """A file-like object providing transparent Zstandard (de)compression.
+
+ A ZstdFile can act as a wrapper for an existing file object, or refer
+ directly to a named file on disk.
+
+    ZstdFile provides a *binary* file interface. Data is read and returned as
+    bytes, and data to be written must be given as an object that supports
+    the Buffer Protocol.
+ """
+
+ FLUSH_BLOCK = ZstdCompressor.FLUSH_BLOCK
+ FLUSH_FRAME = ZstdCompressor.FLUSH_FRAME
+
+ def __init__(self, file, /, mode='r', *,
+ level=None, options=None, zstd_dict=None):
+ """Open a Zstandard compressed file in binary mode.
+
+        *file* can be either a file-like object or a file name to open.
+
+ *mode* can be 'r' for reading (default), 'w' for (over)writing, 'x' for
+ creating exclusively, or 'a' for appending. These can equivalently be
+ given as 'rb', 'wb', 'xb' and 'ab' respectively.
+
+ *level* is an optional int specifying the compression level to use,
+ or COMPRESSION_LEVEL_DEFAULT if not given.
+
+ *options* is an optional dict for advanced compression parameters.
+ See CompressionParameter and DecompressionParameter for the possible
+ options.
+
+ *zstd_dict* is an optional ZstdDict object, a pre-trained Zstandard
+        dictionary. See train_dict() to train a ZstdDict on sample data.
+ """
+ self._fp = None
+ self._close_fp = False
+ self._mode = _MODE_CLOSED
+ self._buffer = None
+
+ if not isinstance(mode, str):
+ raise ValueError('mode must be a str')
+ if options is not None and not isinstance(options, dict):
+ raise TypeError('options must be a dict or None')
+ mode = mode.removesuffix('b') # handle rb, wb, xb, ab
+ if mode == 'r':
+ if level is not None:
+ raise TypeError('level is illegal in read mode')
+ self._mode = _MODE_READ
+ elif mode in {'w', 'a', 'x'}:
+ if level is not None and not isinstance(level, int):
+ raise TypeError('level must be int or None')
+ self._mode = _MODE_WRITE
+ self._compressor = ZstdCompressor(level=level, options=options,
+ zstd_dict=zstd_dict)
+ self._pos = 0
+ else:
+ raise ValueError(f'Invalid mode: {mode!r}')
+
+ if isinstance(file, (str, bytes, PathLike)):
+ self._fp = io.open(file, f'{mode}b')
+ self._close_fp = True
+ elif ((mode == 'r' and hasattr(file, 'read'))
+ or (mode != 'r' and hasattr(file, 'write'))):
+ self._fp = file
+ else:
+ raise TypeError('file must be a file-like object '
+ 'or a str, bytes, or PathLike object')
+
+ if self._mode == _MODE_READ:
+ raw = _streams.DecompressReader(
+ self._fp,
+ ZstdDecompressor,
+ zstd_dict=zstd_dict,
+ options=options,
+ )
+ self._buffer = io.BufferedReader(raw)
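Since ZstdFile also accepts an existing file object, a minimal in-memory round trip (illustrative only, not part of the patch) could look like:

    import io
    from compression.zstd import ZstdFile

    buf = io.BytesIO()
    with ZstdFile(buf, 'w') as f:   # buf stays open after f is closed
        f.write(b'wrapped stream')
    buf.seek(0)
    with ZstdFile(buf) as f:        # default mode is 'r'
        assert f.read() == b'wrapped stream'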
+
+ def close(self):
+ """Flush and close the file.
+
+ May be called multiple times. Once the file has been closed,
+ any other operation on it will raise ValueError.
+ """
+ if self._fp is None:
+ return
+ try:
+ if self._mode == _MODE_READ:
+ if getattr(self, '_buffer', None):
+ self._buffer.close()
+ self._buffer = None
+ elif self._mode == _MODE_WRITE:
+ self.flush(self.FLUSH_FRAME)
+ self._compressor = None
+ finally:
+ self._mode = _MODE_CLOSED
+ try:
+ if self._close_fp:
+ self._fp.close()
+ finally:
+ self._fp = None
+ self._close_fp = False
+
+ def write(self, data, /):
+ """Write a bytes-like object *data* to the file.
+
+ Returns the number of uncompressed bytes written, which is
+ always the length of data in bytes. Note that due to buffering,
+ the file on disk may not reflect the data written until .flush()
+ or .close() is called.
+ """
+ self._check_can_write()
+
+ length = _nbytes(data)
+
+ compressed = self._compressor.compress(data)
+ self._fp.write(compressed)
+ self._pos += length
+ return length
+
+ def flush(self, mode=FLUSH_BLOCK):
+ """Flush remaining data to the underlying stream.
+
+        The mode argument can be FLUSH_BLOCK or FLUSH_FRAME. Flushing too
+        often reduces the compression ratio, so use it only when necessary.
+
+        If the program is interrupted afterwards, all data written so far can
+        be recovered. To ensure the data actually reaches the disk, also call
+        os.fsync(fd).
+
+ This method does nothing in reading mode.
+ """
+ if self._mode == _MODE_READ:
+ return
+ self._check_not_closed()
+ if mode not in {self.FLUSH_BLOCK, self.FLUSH_FRAME}:
+ raise ValueError('Invalid mode argument, expected either '
+ 'ZstdFile.FLUSH_FRAME or '
+ 'ZstdFile.FLUSH_BLOCK')
+ if self._compressor.last_mode == mode:
+ return
+ # Flush zstd block/frame, and write.
+ data = self._compressor.flush(mode)
+ self._fp.write(data)
+ if hasattr(self._fp, 'flush'):
+ self._fp.flush()
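A hedged sketch (not part of the patch) of using FLUSH_FRAME to delimit independently recoverable records while writing; 'events.zst' is an illustrative path:

    from compression.zstd import ZstdFile

    with ZstdFile('events.zst', 'w') as f:   # illustrative file name
        f.write(b'event 1\n')
        f.flush(ZstdFile.FLUSH_FRAME)        # close the current frame
        f.write(b'event 2\n')                # starts a new frame
    # close() flushes the final frame automatically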
+
+ def read(self, size=-1):
+ """Read up to size uncompressed bytes from the file.
+
+ If size is negative or omitted, read until EOF is reached.
+ Returns b'' if the file is already at EOF.
+ """
+ if size is None:
+ size = -1
+ self._check_can_read()
+ return self._buffer.read(size)
+
+ def read1(self, size=-1):
+ """Read up to size uncompressed bytes, while trying to avoid
+ making multiple reads from the underlying stream. Reads up to a
+ buffer's worth of data if size is negative.
+
+ Returns b'' if the file is at EOF.
+ """
+ self._check_can_read()
+ if size < 0:
+ # Note this should *not* be io.DEFAULT_BUFFER_SIZE.
+ # ZSTD_DStreamOutSize is the minimum amount to read guaranteeing
+ # a full block is read.
+ size = ZSTD_DStreamOutSize
+ return self._buffer.read1(size)
+
+ def readinto(self, b):
+ """Read bytes into b.
+
+ Returns the number of bytes read (0 for EOF).
+ """
+ self._check_can_read()
+ return self._buffer.readinto(b)
+
+ def readinto1(self, b):
+ """Read bytes into b, while trying to avoid making multiple reads
+ from the underlying stream.
+
+ Returns the number of bytes read (0 for EOF).
+ """
+ self._check_can_read()
+ return self._buffer.readinto1(b)
+
+ def readline(self, size=-1):
+ """Read a line of uncompressed bytes from the file.
+
+ The terminating newline (if present) is retained. If size is
+ non-negative, no more than size bytes will be read (in which
+ case the line may be incomplete). Returns b'' if already at EOF.
+ """
+ self._check_can_read()
+ return self._buffer.readline(size)
+
+ def seek(self, offset, whence=io.SEEK_SET):
+ """Change the file position.
+
+ The new position is specified by offset, relative to the
+ position indicated by whence. Possible values for whence are:
+
+        0: start of stream (default); offset must not be negative
+ 1: current stream position
+ 2: end of stream; offset must not be positive
+
+ Returns the new file position.
+
+ Note that seeking is emulated, so depending on the arguments,
+ this operation may be extremely slow.
+ """
+ self._check_can_read()
+
+ # BufferedReader.seek() checks seekable
+ return self._buffer.seek(offset, whence)
+
+ def peek(self, size=-1):
+ """Return buffered data without advancing the file position.
+
+ Always returns at least one byte of data, unless at EOF.
+ The exact number of bytes returned is unspecified.
+ """
+ # Relies on the undocumented fact that BufferedReader.peek() always
+ # returns at least one byte (except at EOF)
+ self._check_can_read()
+ return self._buffer.peek(size)
+
+ def __next__(self):
+ if ret := self._buffer.readline():
+ return ret
+ raise StopIteration
+
+ def tell(self):
+ """Return the current file position."""
+ self._check_not_closed()
+ if self._mode == _MODE_READ:
+ return self._buffer.tell()
+ elif self._mode == _MODE_WRITE:
+ return self._pos
+
+ def fileno(self):
+ """Return the file descriptor for the underlying file."""
+ self._check_not_closed()
+ return self._fp.fileno()
+
+ @property
+ def name(self):
+ self._check_not_closed()
+ return self._fp.name
+
+ @property
+ def mode(self):
+ return 'wb' if self._mode == _MODE_WRITE else 'rb'
+
+ @property
+ def closed(self):
+ """True if this file is closed."""
+ return self._mode == _MODE_CLOSED
+
+ def seekable(self):
+ """Return whether the file supports seeking."""
+ return self.readable() and self._buffer.seekable()
+
+ def readable(self):
+ """Return whether the file was opened for reading."""
+ self._check_not_closed()
+ return self._mode == _MODE_READ
+
+ def writable(self):
+ """Return whether the file was opened for writing."""
+ self._check_not_closed()
+ return self._mode == _MODE_WRITE
+
+
+def open(file, /, mode='rb', *, level=None, options=None, zstd_dict=None,
+ encoding=None, errors=None, newline=None):
+ """Open a Zstandard compressed file in binary or text mode.
+
+ file can be either a file name (given as a str, bytes, or PathLike object),
+ in which case the named file is opened, or it can be an existing file object
+ to read from or write to.
+
+ The mode parameter can be 'r', 'rb' (default), 'w', 'wb', 'x', 'xb', 'a',
+ 'ab' for binary mode, or 'rt', 'wt', 'xt', 'at' for text mode.
+
+    The level, options, and zstd_dict parameters specify the same settings as
+    the ZstdFile constructor.
+
+ When using read mode (decompression), the options parameter is a dict
+ representing advanced decompression options. The level parameter is not
+ supported in this case. When using write mode (compression), only one of
+ level, an int representing the compression level, or options, a dict
+ representing advanced compression options, may be passed. In both modes,
+ zstd_dict is a ZstdDict instance containing a trained Zstandard dictionary.
+
+ For binary mode, this function is equivalent to the ZstdFile constructor:
+ ZstdFile(filename, mode, ...). In this case, the encoding, errors and
+ newline parameters must not be provided.
+
+    For text mode, a ZstdFile object is created and wrapped in an
+ io.TextIOWrapper instance with the specified encoding, error handling
+ behavior, and line ending(s).
+ """
+
+ text_mode = 't' in mode
+ mode = mode.replace('t', '')
+
+ if text_mode:
+ if 'b' in mode:
+ raise ValueError(f'Invalid mode: {mode!r}')
+ else:
+ if encoding is not None:
+ raise ValueError('Argument "encoding" not supported in binary mode')
+ if errors is not None:
+ raise ValueError('Argument "errors" not supported in binary mode')
+ if newline is not None:
+ raise ValueError('Argument "newline" not supported in binary mode')
+
+ binary_file = ZstdFile(file, mode, level=level, options=options,
+ zstd_dict=zstd_dict)
+
+ if text_mode:
+ return io.TextIOWrapper(binary_file, encoding, errors, newline)
+ else:
+ return binary_file
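Finally, a usage sketch for the module-level open() helper (not part of the patch; 'example.zst' is an illustrative path):

    from compression import zstd

    with zstd.open('example.zst', 'wb', level=5) as f:
        f.write(b'line one\nline two\n')

    with zstd.open('example.zst', 'rb') as f:
        assert f.read() == b'line one\nline two\n'

    # Text mode wraps the ZstdFile in an io.TextIOWrapper.
    with zstd.open('example.zst', 'rt', encoding='utf-8') as f:
        print(f.readline(), end='')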