1 files changed, 242 insertions, 0 deletions
diff --git a/Lib/compression/zstd/__init__.py b/Lib/compression/zstd/__init__.py
new file mode 100644
index 00000000000..84b25914b0a
--- /dev/null
+++ b/Lib/compression/zstd/__init__.py
@@ -0,0 +1,242 @@
+"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
+
+__all__ = (
+    # defined in this module (compression.zstd)
+    'COMPRESSION_LEVEL_DEFAULT',
+    'compress',
+    'CompressionParameter',
+    'decompress',
+    'DecompressionParameter',
+    'finalize_dict',
+    'get_frame_info',
+    'Strategy',
+    'train_dict',
+
+    # imported from compression.zstd._zstdfile
+    'open',
+    'ZstdFile',
+
+    # imported from _zstd, except zstd_version_info which is computed below
+    'get_frame_size',
+    'zstd_version',
+    'zstd_version_info',
+    'ZstdCompressor',
+    'ZstdDecompressor',
+    'ZstdDict',
+    'ZstdError',
+)
+
+import _zstd
+import enum
+from _zstd import (ZstdCompressor, ZstdDecompressor, ZstdDict, ZstdError,
+                   get_frame_size, zstd_version)
+from compression.zstd._zstdfile import ZstdFile, open, _nbytes
+
+# zstd_version_number is MAJOR*10000 + MINOR*100 + RELEASE; 10507 -> (1, 5, 7)
+zstd_version_info = (*divmod(_zstd.zstd_version_number // 100, 100),
+                     _zstd.zstd_version_number % 100)
+"""Version number of the runtime zstd library as a tuple of integers."""
+
+COMPRESSION_LEVEL_DEFAULT = _zstd.ZSTD_CLEVEL_DEFAULT
+"""The default compression level for Zstandard, currently '3'."""
+
+
+class FrameInfo:
+    """Read-only record of the values reported for a Zstandard frame."""
+
+    __slots__ = ('decompressed_size', 'dictionary_id')
+
+    def __setattr__(self, name, _):  # __init__ bypasses this via super()
+        raise AttributeError(f"can't set attribute {name!r}")
+
+    def __init__(self, decompressed_size, dictionary_id):
+        super().__setattr__('decompressed_size', decompressed_size)
+        super().__setattr__('dictionary_id', dictionary_id)
+
+    def __repr__(self):
+        size, dict_id = self.decompressed_size, self.dictionary_id
+        return f'FrameInfo(decompressed_size={size}, dictionary_id={dict_id})'
+
+
+def get_frame_info(frame_buffer):
+    """Read the header of a Zstandard frame and return a FrameInfo.
+
+    *frame_buffer* is a bytes-like object positioned at the start of a
+    frame; it must hold at least the whole frame header (6 to 18 bytes).
+
+    The result exposes two attributes:
+    'decompressed_size' is the byte length of the frame's data once
+    decompressed, or None if that size is unknown.
+    'dictionary_id' is an int in the range [0, 2**32). A value of 0 means
+    the header records no dictionary ID: the frame may or may not need a
+    dictionary to be decoded, and which one is left unspecified.
+    """
+    header_fields = _zstd.get_frame_info(frame_buffer)
+    return FrameInfo(*header_fields)
+
+
+def train_dict(samples, dict_size):
+    """Train a Zstandard dictionary and return it as a ZstdDict.
+
+    *samples* is an iterable of bytes-like objects, each one representing
+    a file.
+
+    *dict_size* is the upper limit, in bytes, on the dictionary's size.
+    """
+    if not isinstance(dict_size, int):
+        raise TypeError('dict_size must be an int object, '
+                        f'not {type(dict_size).__qualname__!r}.')
+
+    sample_seq = tuple(samples)  # materialized: iterated twice below
+    joined = b''.join(sample_seq)
+    sizes = tuple(map(_nbytes, sample_seq))
+    if not joined:
+        raise ValueError("samples contained no data; can't train dictionary.")
+    trained = _zstd.train_dict(joined, sizes, dict_size)
+    return ZstdDict(trained)
+
+
+def finalize_dict(zstd_dict, /, samples, dict_size, level):
+    """Finalize *zstd_dict* and return the result as a new ZstdDict.
+
+    Starting from custom dictionary content and a set of samples, add the
+    headers and statistics required by the Zstandard dictionary format.
+
+    This lets you hand-compose effective dictionary content, use it as the
+    basis dictionary, and finalize it with some samples. The basis
+    dictionary may be a "raw content" dictionary; see *is_raw* in ZstdDict.
+
+    *samples* is an iterable of bytes-like objects, each representing a file.
+    *dict_size* is the dictionary's maximum size, in bytes.
+    *level* is the compression level the dictionary is expected to be used
+    with. Statistics differ between levels, so tuning the dictionary to
+    that level can provide improvements.
+    """
+    type_checks = (
+        (zstd_dict, ZstdDict,
+         'zstd_dict argument should be a ZstdDict object.'),
+        (dict_size, int, 'dict_size argument should be an int object.'),
+        (level, int, 'level argument should be an int object.'),
+    )
+    for value, expected, message in type_checks:
+        if not isinstance(value, expected):
+            raise TypeError(message)
+
+    sample_seq = tuple(samples)
+    joined = b''.join(sample_seq)
+    sizes = tuple(map(_nbytes, sample_seq))
+    if not joined:
+        raise ValueError("The samples are empty content, can't finalize the "
+                         "dictionary.")
+    finalized = _zstd.finalize_dict(zstd_dict.dict_content, joined, sizes,
+                                    dict_size, level)
+    return ZstdDict(finalized)
+
+
+def compress(data, level=None, options=None, zstd_dict=None):
+    """Compress *data* with Zstandard in one call and return bytes.
+
+    *level* is the int compression level; COMPRESSION_LEVEL_DEFAULT ('3')
+    applies when it is omitted.
+    *options* is a dict of advanced compression parameters, described by
+    CompressionParameter.
+    *zstd_dict* is a ZstdDict (a pre-trained Zstandard dictionary); the
+    train_dict function shows how to build one from sample data.
+    Use a ZstdCompressor instead when compressing incrementally.
+    """
+    encoder = ZstdCompressor(level=level, options=options, zstd_dict=zstd_dict)
+    frame_mode = ZstdCompressor.FLUSH_FRAME
+    return encoder.compress(data, mode=frame_mode)
+
+
+def decompress(data, zstd_dict=None, options=None):
+    """Decompress the Zstandard frame(s) in *data* and return the result.
+
+    *zstd_dict* is a ZstdDict (a pre-trained Zstandard dictionary); the
+    train_dict function shows how to build one from sample data.
+    *options* is a dict of advanced decompression parameters, described
+    by DecompressionParameter.
+    ZstdError is raised when *data* ends before an end-of-stream marker.
+    Use a ZstdDecompressor instead when decompressing incrementally.
+    """
+    pieces = []
+    pending = data
+    while True:
+        decoder = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
+        pieces.append(decoder.decompress(pending))
+        if not decoder.eof:
+            raise ZstdError('Compressed data ended before the '
+                            'end-of-stream marker was reached')
+        pending = decoder.unused_data
+        if not pending:
+            return b''.join(pieces)
+
+
+class CompressionParameter(enum.IntEnum):
+    """Advanced compression parameters, as accepted in an *options* dict."""
+
+    compression_level = _zstd.ZSTD_c_compressionLevel
+    window_log = _zstd.ZSTD_c_windowLog
+    hash_log = _zstd.ZSTD_c_hashLog
+    chain_log = _zstd.ZSTD_c_chainLog
+    search_log = _zstd.ZSTD_c_searchLog
+    min_match = _zstd.ZSTD_c_minMatch
+    target_length = _zstd.ZSTD_c_targetLength
+    strategy = _zstd.ZSTD_c_strategy
+
+    enable_long_distance_matching = _zstd.ZSTD_c_enableLongDistanceMatching
+    ldm_hash_log = _zstd.ZSTD_c_ldmHashLog
+    ldm_min_match = _zstd.ZSTD_c_ldmMinMatch
+    ldm_bucket_size_log = _zstd.ZSTD_c_ldmBucketSizeLog
+    ldm_hash_rate_log = _zstd.ZSTD_c_ldmHashRateLog
+
+    content_size_flag = _zstd.ZSTD_c_contentSizeFlag
+    checksum_flag = _zstd.ZSTD_c_checksumFlag
+    dict_id_flag = _zstd.ZSTD_c_dictIDFlag
+
+    nb_workers = _zstd.ZSTD_c_nbWorkers
+    job_size = _zstd.ZSTD_c_jobSize
+    overlap_log = _zstd.ZSTD_c_overlapLog
+
+    def bounds(self):
+        """Return this compression parameter's (lower, upper) int bounds.
+
+        Both bounds are inclusive; they come from _zstd.get_param_bounds().
+        """
+        return _zstd.get_param_bounds(self.value, is_compress=True)
+
+
+class DecompressionParameter(enum.IntEnum):
+    """Advanced decompression parameters, as accepted in an *options* dict."""
+
+    window_log_max = _zstd.ZSTD_d_windowLogMax
+
+    def bounds(self):
+        """Return this decompression parameter's (lower, upper) int bounds.
+
+        Both bounds are inclusive; they come from _zstd.get_param_bounds().
+        """
+        return _zstd.get_param_bounds(self.value, is_compress=False)
+
+
+class Strategy(enum.IntEnum):
+    """Compression strategies, listed from fastest to strongest.
+
+    Member values are taken from the _zstd module's ZSTD_* constants.
+    More strategies might be added in the future; only the ordering (from
+    fast to strong) is guaranteed, and the numeric values might change.
+    """
+
+    fast = _zstd.ZSTD_fast
+    dfast = _zstd.ZSTD_dfast
+    greedy = _zstd.ZSTD_greedy
+    lazy = _zstd.ZSTD_lazy
+    lazy2 = _zstd.ZSTD_lazy2
+    btlazy2 = _zstd.ZSTD_btlazy2
+    btopt = _zstd.ZSTD_btopt
+    btultra = _zstd.ZSTD_btultra
+    btultra2 = _zstd.ZSTD_btultra2
+
+
+# Hand both enums to _zstd, which checks the validity of the parameter types
+_zstd.set_parameter_types(CompressionParameter, DecompressionParameter)