diff options
Diffstat (limited to 'Lib/compression/zstd')
-rw-r--r-- | Lib/compression/zstd/__init__.py | 242 | ||||
-rw-r--r-- | Lib/compression/zstd/_zstdfile.py | 345 |
2 files changed, 587 insertions, 0 deletions
diff --git a/Lib/compression/zstd/__init__.py b/Lib/compression/zstd/__init__.py new file mode 100644 index 00000000000..84b25914b0a --- /dev/null +++ b/Lib/compression/zstd/__init__.py @@ -0,0 +1,242 @@ +"""Python bindings to the Zstandard (zstd) compression library (RFC-8878).""" + +__all__ = ( + # compression.zstd + 'COMPRESSION_LEVEL_DEFAULT', + 'compress', + 'CompressionParameter', + 'decompress', + 'DecompressionParameter', + 'finalize_dict', + 'get_frame_info', + 'Strategy', + 'train_dict', + + # compression.zstd._zstdfile + 'open', + 'ZstdFile', + + # _zstd + 'get_frame_size', + 'zstd_version', + 'zstd_version_info', + 'ZstdCompressor', + 'ZstdDecompressor', + 'ZstdDict', + 'ZstdError', +) + +import _zstd +import enum +from _zstd import (ZstdCompressor, ZstdDecompressor, ZstdDict, ZstdError, + get_frame_size, zstd_version) +from compression.zstd._zstdfile import ZstdFile, open, _nbytes + +# zstd_version_number is (MAJOR * 100 * 100 + MINOR * 100 + RELEASE) +zstd_version_info = (*divmod(_zstd.zstd_version_number // 100, 100), + _zstd.zstd_version_number % 100) +"""Version number of the runtime zstd library as a tuple of integers.""" + +COMPRESSION_LEVEL_DEFAULT = _zstd.ZSTD_CLEVEL_DEFAULT +"""The default compression level for Zstandard, currently '3'.""" + + +class FrameInfo: + """Information about a Zstandard frame.""" + + __slots__ = 'decompressed_size', 'dictionary_id' + + def __init__(self, decompressed_size, dictionary_id): + super().__setattr__('decompressed_size', decompressed_size) + super().__setattr__('dictionary_id', dictionary_id) + + def __repr__(self): + return (f'FrameInfo(decompressed_size={self.decompressed_size}, ' + f'dictionary_id={self.dictionary_id})') + + def __setattr__(self, name, _): + raise AttributeError(f"can't set attribute {name!r}") + + +def get_frame_info(frame_buffer): + """Get Zstandard frame information from a frame header. + + *frame_buffer* is a bytes-like object. It should start from the beginning + of a frame, and needs to include at least the frame header (6 to 18 bytes). + + The returned FrameInfo object has two attributes. + 'decompressed_size' is the size in bytes of the data in the frame when + decompressed, or None when the decompressed size is unknown. + 'dictionary_id' is an int in the range (0, 2**32). The special value 0 + means that the dictionary ID was not recorded in the frame header, + the frame may or may not need a dictionary to be decoded, + and the ID of such a dictionary is not specified. + """ + return FrameInfo(*_zstd.get_frame_info(frame_buffer)) + + +def train_dict(samples, dict_size): + """Return a ZstdDict representing a trained Zstandard dictionary. + + *samples* is an iterable of samples, where a sample is a bytes-like + object representing a file. + + *dict_size* is the dictionary's maximum size, in bytes. + """ + if not isinstance(dict_size, int): + ds_cls = type(dict_size).__qualname__ + raise TypeError(f'dict_size must be an int object, not {ds_cls!r}.') + + samples = tuple(samples) + chunks = b''.join(samples) + chunk_sizes = tuple(_nbytes(sample) for sample in samples) + if not chunks: + raise ValueError("samples contained no data; can't train dictionary.") + dict_content = _zstd.train_dict(chunks, chunk_sizes, dict_size) + return ZstdDict(dict_content) + + +def finalize_dict(zstd_dict, /, samples, dict_size, level): + """Return a ZstdDict representing a finalized Zstandard dictionary. + + Given a custom content as a basis for dictionary, and a set of samples, + finalize *zstd_dict* by adding headers and statistics according to the + Zstandard dictionary format. + + You may compose an effective dictionary content by hand, which is used as + basis dictionary, and use some samples to finalize a dictionary. The basis + dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict. + + *samples* is an iterable of samples, where a sample is a bytes-like object + representing a file. + *dict_size* is the dictionary's maximum size, in bytes. + *level* is the expected compression level. The statistics for each + compression level differ, so tuning the dictionary to the compression level + can provide improvements. + """ + + if not isinstance(zstd_dict, ZstdDict): + raise TypeError('zstd_dict argument should be a ZstdDict object.') + if not isinstance(dict_size, int): + raise TypeError('dict_size argument should be an int object.') + if not isinstance(level, int): + raise TypeError('level argument should be an int object.') + + samples = tuple(samples) + chunks = b''.join(samples) + chunk_sizes = tuple(_nbytes(sample) for sample in samples) + if not chunks: + raise ValueError("The samples are empty content, can't finalize the " + "dictionary.") + dict_content = _zstd.finalize_dict(zstd_dict.dict_content, chunks, + chunk_sizes, dict_size, level) + return ZstdDict(dict_content) + + +def compress(data, level=None, options=None, zstd_dict=None): + """Return Zstandard compressed *data* as bytes. + + *level* is an int specifying the compression level to use, defaulting to + COMPRESSION_LEVEL_DEFAULT ('3'). + *options* is a dict object that contains advanced compression + parameters. See CompressionParameter for more on options. + *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See + the function train_dict for how to train a ZstdDict on sample data. + + For incremental compression, use a ZstdCompressor instead. + """ + comp = ZstdCompressor(level=level, options=options, zstd_dict=zstd_dict) + return comp.compress(data, mode=ZstdCompressor.FLUSH_FRAME) + + +def decompress(data, zstd_dict=None, options=None): + """Decompress one or more frames of Zstandard compressed *data*. + + *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See + the function train_dict for how to train a ZstdDict on sample data. + *options* is a dict object that contains advanced compression + parameters. See DecompressionParameter for more on options. + + For incremental decompression, use a ZstdDecompressor instead. + """ + results = [] + while True: + decomp = ZstdDecompressor(options=options, zstd_dict=zstd_dict) + results.append(decomp.decompress(data)) + if not decomp.eof: + raise ZstdError('Compressed data ended before the ' + 'end-of-stream marker was reached') + data = decomp.unused_data + if not data: + break + return b''.join(results) + + +class CompressionParameter(enum.IntEnum): + """Compression parameters.""" + + compression_level = _zstd.ZSTD_c_compressionLevel + window_log = _zstd.ZSTD_c_windowLog + hash_log = _zstd.ZSTD_c_hashLog + chain_log = _zstd.ZSTD_c_chainLog + search_log = _zstd.ZSTD_c_searchLog + min_match = _zstd.ZSTD_c_minMatch + target_length = _zstd.ZSTD_c_targetLength + strategy = _zstd.ZSTD_c_strategy + + enable_long_distance_matching = _zstd.ZSTD_c_enableLongDistanceMatching + ldm_hash_log = _zstd.ZSTD_c_ldmHashLog + ldm_min_match = _zstd.ZSTD_c_ldmMinMatch + ldm_bucket_size_log = _zstd.ZSTD_c_ldmBucketSizeLog + ldm_hash_rate_log = _zstd.ZSTD_c_ldmHashRateLog + + content_size_flag = _zstd.ZSTD_c_contentSizeFlag + checksum_flag = _zstd.ZSTD_c_checksumFlag + dict_id_flag = _zstd.ZSTD_c_dictIDFlag + + nb_workers = _zstd.ZSTD_c_nbWorkers + job_size = _zstd.ZSTD_c_jobSize + overlap_log = _zstd.ZSTD_c_overlapLog + + def bounds(self): + """Return the (lower, upper) int bounds of a compression parameter. + + Both the lower and upper bounds are inclusive. + """ + return _zstd.get_param_bounds(self.value, is_compress=True) + + +class DecompressionParameter(enum.IntEnum): + """Decompression parameters.""" + + window_log_max = _zstd.ZSTD_d_windowLogMax + + def bounds(self): + """Return the (lower, upper) int bounds of a decompression parameter. + + Both the lower and upper bounds are inclusive. + """ + return _zstd.get_param_bounds(self.value, is_compress=False) + + +class Strategy(enum.IntEnum): + """Compression strategies, listed from fastest to strongest. + + Note that new strategies might be added in the future. + Only the order (from fast to strong) is guaranteed, + the numeric value might change. + """ + + fast = _zstd.ZSTD_fast + dfast = _zstd.ZSTD_dfast + greedy = _zstd.ZSTD_greedy + lazy = _zstd.ZSTD_lazy + lazy2 = _zstd.ZSTD_lazy2 + btlazy2 = _zstd.ZSTD_btlazy2 + btopt = _zstd.ZSTD_btopt + btultra = _zstd.ZSTD_btultra + btultra2 = _zstd.ZSTD_btultra2 + + +# Check validity of the CompressionParameter & DecompressionParameter types +_zstd.set_parameter_types(CompressionParameter, DecompressionParameter) diff --git a/Lib/compression/zstd/_zstdfile.py b/Lib/compression/zstd/_zstdfile.py new file mode 100644 index 00000000000..d709f5efc65 --- /dev/null +++ b/Lib/compression/zstd/_zstdfile.py @@ -0,0 +1,345 @@ +import io +from os import PathLike +from _zstd import ZstdCompressor, ZstdDecompressor, ZSTD_DStreamOutSize +from compression._common import _streams + +__all__ = ('ZstdFile', 'open') + +_MODE_CLOSED = 0 +_MODE_READ = 1 +_MODE_WRITE = 2 + + +def _nbytes(dat, /): + if isinstance(dat, (bytes, bytearray)): + return len(dat) + with memoryview(dat) as mv: + return mv.nbytes + + +class ZstdFile(_streams.BaseStream): + """A file-like object providing transparent Zstandard (de)compression. + + A ZstdFile can act as a wrapper for an existing file object, or refer + directly to a named file on disk. + + ZstdFile provides a *binary* file interface. Data is read and returned as + bytes, and may only be written to objects that support the Buffer Protocol. + """ + + FLUSH_BLOCK = ZstdCompressor.FLUSH_BLOCK + FLUSH_FRAME = ZstdCompressor.FLUSH_FRAME + + def __init__(self, file, /, mode='r', *, + level=None, options=None, zstd_dict=None): + """Open a Zstandard compressed file in binary mode. + + *file* can be either an file-like object, or a file name to open. + + *mode* can be 'r' for reading (default), 'w' for (over)writing, 'x' for + creating exclusively, or 'a' for appending. These can equivalently be + given as 'rb', 'wb', 'xb' and 'ab' respectively. + + *level* is an optional int specifying the compression level to use, + or COMPRESSION_LEVEL_DEFAULT if not given. + + *options* is an optional dict for advanced compression parameters. + See CompressionParameter and DecompressionParameter for the possible + options. + + *zstd_dict* is an optional ZstdDict object, a pre-trained Zstandard + dictionary. See train_dict() to train ZstdDict on sample data. + """ + self._fp = None + self._close_fp = False + self._mode = _MODE_CLOSED + self._buffer = None + + if not isinstance(mode, str): + raise ValueError('mode must be a str') + if options is not None and not isinstance(options, dict): + raise TypeError('options must be a dict or None') + mode = mode.removesuffix('b') # handle rb, wb, xb, ab + if mode == 'r': + if level is not None: + raise TypeError('level is illegal in read mode') + self._mode = _MODE_READ + elif mode in {'w', 'a', 'x'}: + if level is not None and not isinstance(level, int): + raise TypeError('level must be int or None') + self._mode = _MODE_WRITE + self._compressor = ZstdCompressor(level=level, options=options, + zstd_dict=zstd_dict) + self._pos = 0 + else: + raise ValueError(f'Invalid mode: {mode!r}') + + if isinstance(file, (str, bytes, PathLike)): + self._fp = io.open(file, f'{mode}b') + self._close_fp = True + elif ((mode == 'r' and hasattr(file, 'read')) + or (mode != 'r' and hasattr(file, 'write'))): + self._fp = file + else: + raise TypeError('file must be a file-like object ' + 'or a str, bytes, or PathLike object') + + if self._mode == _MODE_READ: + raw = _streams.DecompressReader( + self._fp, + ZstdDecompressor, + zstd_dict=zstd_dict, + options=options, + ) + self._buffer = io.BufferedReader(raw) + + def close(self): + """Flush and close the file. + + May be called multiple times. Once the file has been closed, + any other operation on it will raise ValueError. + """ + if self._fp is None: + return + try: + if self._mode == _MODE_READ: + if getattr(self, '_buffer', None): + self._buffer.close() + self._buffer = None + elif self._mode == _MODE_WRITE: + self.flush(self.FLUSH_FRAME) + self._compressor = None + finally: + self._mode = _MODE_CLOSED + try: + if self._close_fp: + self._fp.close() + finally: + self._fp = None + self._close_fp = False + + def write(self, data, /): + """Write a bytes-like object *data* to the file. + + Returns the number of uncompressed bytes written, which is + always the length of data in bytes. Note that due to buffering, + the file on disk may not reflect the data written until .flush() + or .close() is called. + """ + self._check_can_write() + + length = _nbytes(data) + + compressed = self._compressor.compress(data) + self._fp.write(compressed) + self._pos += length + return length + + def flush(self, mode=FLUSH_BLOCK): + """Flush remaining data to the underlying stream. + + The mode argument can be FLUSH_BLOCK or FLUSH_FRAME. Abuse of this + method will reduce compression ratio, use it only when necessary. + + If the program is interrupted afterwards, all data can be recovered. + To ensure saving to disk, also need to use os.fsync(fd). + + This method does nothing in reading mode. + """ + if self._mode == _MODE_READ: + return + self._check_not_closed() + if mode not in {self.FLUSH_BLOCK, self.FLUSH_FRAME}: + raise ValueError('Invalid mode argument, expected either ' + 'ZstdFile.FLUSH_FRAME or ' + 'ZstdFile.FLUSH_BLOCK') + if self._compressor.last_mode == mode: + return + # Flush zstd block/frame, and write. + data = self._compressor.flush(mode) + self._fp.write(data) + if hasattr(self._fp, 'flush'): + self._fp.flush() + + def read(self, size=-1): + """Read up to size uncompressed bytes from the file. + + If size is negative or omitted, read until EOF is reached. + Returns b'' if the file is already at EOF. + """ + if size is None: + size = -1 + self._check_can_read() + return self._buffer.read(size) + + def read1(self, size=-1): + """Read up to size uncompressed bytes, while trying to avoid + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. + + Returns b'' if the file is at EOF. + """ + self._check_can_read() + if size < 0: + # Note this should *not* be io.DEFAULT_BUFFER_SIZE. + # ZSTD_DStreamOutSize is the minimum amount to read guaranteeing + # a full block is read. + size = ZSTD_DStreamOutSize + return self._buffer.read1(size) + + def readinto(self, b): + """Read bytes into b. + + Returns the number of bytes read (0 for EOF). + """ + self._check_can_read() + return self._buffer.readinto(b) + + def readinto1(self, b): + """Read bytes into b, while trying to avoid making multiple reads + from the underlying stream. + + Returns the number of bytes read (0 for EOF). + """ + self._check_can_read() + return self._buffer.readinto1(b) + + def readline(self, size=-1): + """Read a line of uncompressed bytes from the file. + + The terminating newline (if present) is retained. If size is + non-negative, no more than size bytes will be read (in which + case the line may be incomplete). Returns b'' if already at EOF. + """ + self._check_can_read() + return self._buffer.readline(size) + + def seek(self, offset, whence=io.SEEK_SET): + """Change the file position. + + The new position is specified by offset, relative to the + position indicated by whence. Possible values for whence are: + + 0: start of stream (default): offset must not be negative + 1: current stream position + 2: end of stream; offset must not be positive + + Returns the new file position. + + Note that seeking is emulated, so depending on the arguments, + this operation may be extremely slow. + """ + self._check_can_read() + + # BufferedReader.seek() checks seekable + return self._buffer.seek(offset, whence) + + def peek(self, size=-1): + """Return buffered data without advancing the file position. + + Always returns at least one byte of data, unless at EOF. + The exact number of bytes returned is unspecified. + """ + # Relies on the undocumented fact that BufferedReader.peek() always + # returns at least one byte (except at EOF) + self._check_can_read() + return self._buffer.peek(size) + + def __next__(self): + if ret := self._buffer.readline(): + return ret + raise StopIteration + + def tell(self): + """Return the current file position.""" + self._check_not_closed() + if self._mode == _MODE_READ: + return self._buffer.tell() + elif self._mode == _MODE_WRITE: + return self._pos + + def fileno(self): + """Return the file descriptor for the underlying file.""" + self._check_not_closed() + return self._fp.fileno() + + @property + def name(self): + self._check_not_closed() + return self._fp.name + + @property + def mode(self): + return 'wb' if self._mode == _MODE_WRITE else 'rb' + + @property + def closed(self): + """True if this file is closed.""" + return self._mode == _MODE_CLOSED + + def seekable(self): + """Return whether the file supports seeking.""" + return self.readable() and self._buffer.seekable() + + def readable(self): + """Return whether the file was opened for reading.""" + self._check_not_closed() + return self._mode == _MODE_READ + + def writable(self): + """Return whether the file was opened for writing.""" + self._check_not_closed() + return self._mode == _MODE_WRITE + + +def open(file, /, mode='rb', *, level=None, options=None, zstd_dict=None, + encoding=None, errors=None, newline=None): + """Open a Zstandard compressed file in binary or text mode. + + file can be either a file name (given as a str, bytes, or PathLike object), + in which case the named file is opened, or it can be an existing file object + to read from or write to. + + The mode parameter can be 'r', 'rb' (default), 'w', 'wb', 'x', 'xb', 'a', + 'ab' for binary mode, or 'rt', 'wt', 'xt', 'at' for text mode. + + The level, options, and zstd_dict parameters specify the settings the same + as ZstdFile. + + When using read mode (decompression), the options parameter is a dict + representing advanced decompression options. The level parameter is not + supported in this case. When using write mode (compression), only one of + level, an int representing the compression level, or options, a dict + representing advanced compression options, may be passed. In both modes, + zstd_dict is a ZstdDict instance containing a trained Zstandard dictionary. + + For binary mode, this function is equivalent to the ZstdFile constructor: + ZstdFile(filename, mode, ...). In this case, the encoding, errors and + newline parameters must not be provided. + + For text mode, an ZstdFile object is created, and wrapped in an + io.TextIOWrapper instance with the specified encoding, error handling + behavior, and line ending(s). + """ + + text_mode = 't' in mode + mode = mode.replace('t', '') + + if text_mode: + if 'b' in mode: + raise ValueError(f'Invalid mode: {mode!r}') + else: + if encoding is not None: + raise ValueError('Argument "encoding" not supported in binary mode') + if errors is not None: + raise ValueError('Argument "errors" not supported in binary mode') + if newline is not None: + raise ValueError('Argument "newline" not supported in binary mode') + + binary_file = ZstdFile(file, mode, level=level, options=options, + zstd_dict=zstd_dict) + + if text_mode: + return io.TextIOWrapper(binary_file, encoding, errors, newline) + else: + return binary_file |