diff options
Diffstat (limited to 'Modules/_zstd/decompressor.c')
-rw-r--r-- | Modules/_zstd/decompressor.c | 717 |
1 files changed, 717 insertions, 0 deletions
diff --git a/Modules/_zstd/decompressor.c b/Modules/_zstd/decompressor.c new file mode 100644 index 00000000000..c53d6e4cb05 --- /dev/null +++ b/Modules/_zstd/decompressor.c @@ -0,0 +1,717 @@ +/* Low level interface to the Zstandard algorthm & the zstd library. */ + +/* ZstdDecompressor class definition */ + +/*[clinic input] +module _zstd +class _zstd.ZstdDecompressor "ZstdDecompressor *" "&zstd_decompressor_type_spec" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=e2969ddf48a203e0]*/ + +#ifndef Py_BUILD_CORE_BUILTIN +# define Py_BUILD_CORE_MODULE 1 +#endif + +#include "Python.h" + +#include "_zstdmodule.h" +#include "buffer.h" +#include "internal/pycore_lock.h" // PyMutex_IsLocked + +#include <stdbool.h> // bool +#include <stddef.h> // offsetof() +#include <zstd.h> // ZSTD_*() + +typedef struct { + PyObject_HEAD + + /* Decompression context */ + ZSTD_DCtx *dctx; + + /* ZstdDict object in use */ + PyObject *dict; + + /* Unconsumed input data */ + char *input_buffer; + size_t input_buffer_size; + size_t in_begin, in_end; + + /* Unused data */ + PyObject *unused_data; + + /* 0 if decompressor has (or may has) unconsumed input data, 0 or 1. */ + bool needs_input; + + /* For ZstdDecompressor, 0 or 1. + 1 means the end of the first frame has been reached. */ + bool eof; + + /* Lock to protect the decompression context */ + PyMutex lock; +} ZstdDecompressor; + +#define ZstdDecompressor_CAST(op) ((ZstdDecompressor *)op) + +#include "clinic/decompressor.c.h" + +static inline ZSTD_DDict * +_get_DDict(ZstdDict *self) +{ + assert(PyMutex_IsLocked(&self->lock)); + ZSTD_DDict *ret; + + if (self->d_dict == NULL) { + /* Create ZSTD_DDict instance from dictionary content */ + Py_BEGIN_ALLOW_THREADS + ret = ZSTD_createDDict(self->dict_buffer, self->dict_len); + Py_END_ALLOW_THREADS + self->d_dict = ret; + + if (self->d_dict == NULL) { + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + if (mod_state != NULL) { + PyErr_SetString(mod_state->ZstdError, + "Failed to create a ZSTD_DDict instance from " + "Zstandard dictionary content."); + } + } + } + + return self->d_dict; +} + +static int +_zstd_set_d_parameters(ZstdDecompressor *self, PyObject *options) +{ + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + if (mod_state == NULL) { + return -1; + } + + if (!PyDict_Check(options)) { + PyErr_Format(PyExc_TypeError, + "ZstdDecompressor() argument 'options' must be dict, not %T", + options); + return -1; + } + + Py_ssize_t pos = 0; + PyObject *key, *value; + while (PyDict_Next(options, &pos, &key, &value)) { + /* Check key type */ + if (Py_TYPE(key) == mod_state->CParameter_type) { + PyErr_SetString(PyExc_TypeError, + "compression options dictionary key must not be a " + "CompressionParameter attribute"); + return -1; + } + + Py_INCREF(key); + Py_INCREF(value); + int key_v = PyLong_AsInt(key); + Py_DECREF(key); + if (key_v == -1 && PyErr_Occurred()) { + return -1; + } + + int value_v = PyLong_AsInt(value); + Py_DECREF(value); + if (value_v == -1 && PyErr_Occurred()) { + return -1; + } + + /* Set parameter to compression context */ + size_t zstd_ret = ZSTD_DCtx_setParameter(self->dctx, key_v, value_v); + + /* Check error */ + if (ZSTD_isError(zstd_ret)) { + set_parameter_error(0, key_v, value_v); + return -1; + } + } + return 0; +} + +static int +_zstd_load_impl(ZstdDecompressor *self, ZstdDict *zd, + _zstd_state *mod_state, int type) +{ + size_t zstd_ret; + if (type == DICT_TYPE_DIGESTED) { + /* Get ZSTD_DDict */ + ZSTD_DDict *d_dict = _get_DDict(zd); + if (d_dict == NULL) { + return -1; + } + /* Reference a prepared dictionary */ + zstd_ret = ZSTD_DCtx_refDDict(self->dctx, d_dict); + } + else if (type == DICT_TYPE_UNDIGESTED) { + /* Load a dictionary */ + zstd_ret = ZSTD_DCtx_loadDictionary(self->dctx, zd->dict_buffer, + zd->dict_len); + } + else if (type == DICT_TYPE_PREFIX) { + /* Load a prefix */ + zstd_ret = ZSTD_DCtx_refPrefix(self->dctx, zd->dict_buffer, + zd->dict_len); + } + else { + /* Impossible code path */ + PyErr_SetString(PyExc_SystemError, + "load_d_dict() impossible code path"); + return -1; + } + + /* Check error */ + if (ZSTD_isError(zstd_ret)) { + set_zstd_error(mod_state, ERR_LOAD_D_DICT, zstd_ret); + return -1; + } + return 0; +} + +/* Load dictionary or prefix to decompression context */ +static int +_zstd_load_d_dict(ZstdDecompressor *self, PyObject *dict) +{ + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + /* When decompressing, use digested dictionary by default. */ + int type = DICT_TYPE_DIGESTED; + ZstdDict *zd = _Py_parse_zstd_dict(mod_state, dict, &type); + if (zd == NULL) { + return -1; + } + int ret; + PyMutex_Lock(&zd->lock); + ret = _zstd_load_impl(self, zd, mod_state, type); + PyMutex_Unlock(&zd->lock); + return ret; +} + +/* + Decompress implementation in pseudo code: + + initialize_output_buffer + while True: + decompress_data + set_object_flag # .eof + + if output_buffer_exhausted: + if output_buffer_reached_max_length: + finish + grow_output_buffer + elif input_buffer_exhausted: + finish + + ZSTD_decompressStream()'s size_t return value: + - 0 when a frame is completely decoded and fully flushed, + zstd's internal buffer has no data. + - An error code, which can be tested using ZSTD_isError(). + - Or any other value > 0, which means there is still some decoding or + flushing to do to complete current frame. + + Note, decompressing "an empty input" in any case will make it > 0. +*/ +static PyObject * +decompress_lock_held(ZstdDecompressor *self, ZSTD_inBuffer *in, + Py_ssize_t max_length) +{ + size_t zstd_ret; + ZSTD_outBuffer out; + _BlocksOutputBuffer buffer = {.list = NULL}; + PyObject *ret; + + /* Initialize the output buffer */ + if (_OutputBuffer_InitAndGrow(&buffer, &out, max_length) < 0) { + goto error; + } + assert(out.pos == 0); + + while (1) { + /* Decompress */ + Py_BEGIN_ALLOW_THREADS + zstd_ret = ZSTD_decompressStream(self->dctx, &out, in); + Py_END_ALLOW_THREADS + + /* Check error */ + if (ZSTD_isError(zstd_ret)) { + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + set_zstd_error(mod_state, ERR_DECOMPRESS, zstd_ret); + goto error; + } + + /* Set .eof flag */ + if (zstd_ret == 0) { + /* Stop when a frame is decompressed */ + self->eof = 1; + break; + } + + /* Need to check out before in. Maybe zstd's internal buffer still has + a few bytes that can be output, grow the buffer and continue. */ + if (out.pos == out.size) { + /* Output buffer exhausted */ + + /* Output buffer reached max_length */ + if (_OutputBuffer_ReachedMaxLength(&buffer, &out)) { + break; + } + + /* Grow output buffer */ + if (_OutputBuffer_Grow(&buffer, &out) < 0) { + goto error; + } + assert(out.pos == 0); + + } + else if (in->pos == in->size) { + /* Finished */ + break; + } + } + + /* Return a bytes object */ + ret = _OutputBuffer_Finish(&buffer, &out); + if (ret != NULL) { + return ret; + } + +error: + _OutputBuffer_OnError(&buffer); + return NULL; +} + +static void +decompressor_reset_session_lock_held(ZstdDecompressor *self) +{ + assert(PyMutex_IsLocked(&self->lock)); + + /* Reset variables */ + self->in_begin = 0; + self->in_end = 0; + + Py_CLEAR(self->unused_data); + + /* Reset variables in one operation */ + self->needs_input = 1; + self->eof = 0; + + /* Resetting session is guaranteed to never fail */ + ZSTD_DCtx_reset(self->dctx, ZSTD_reset_session_only); +} + +static PyObject * +stream_decompress_lock_held(ZstdDecompressor *self, Py_buffer *data, + Py_ssize_t max_length) +{ + assert(PyMutex_IsLocked(&self->lock)); + ZSTD_inBuffer in; + PyObject *ret = NULL; + int use_input_buffer; + + /* Check .eof flag */ + if (self->eof) { + PyErr_SetString(PyExc_EOFError, + "Already at the end of a Zstandard frame."); + assert(ret == NULL); + return NULL; + } + + /* Prepare input buffer w/wo unconsumed data */ + if (self->in_begin == self->in_end) { + /* No unconsumed data */ + use_input_buffer = 0; + + in.src = data->buf; + in.size = data->len; + in.pos = 0; + } + else if (data->len == 0) { + /* Has unconsumed data, fast path for b'' */ + assert(self->in_begin < self->in_end); + + use_input_buffer = 1; + + in.src = self->input_buffer + self->in_begin; + in.size = self->in_end - self->in_begin; + in.pos = 0; + } + else { + /* Has unconsumed data */ + use_input_buffer = 1; + + /* Unconsumed data size in input_buffer */ + size_t used_now = self->in_end - self->in_begin; + assert(self->in_end > self->in_begin); + + /* Number of bytes we can append to input buffer */ + size_t avail_now = self->input_buffer_size - self->in_end; + assert(self->input_buffer_size >= self->in_end); + + /* Number of bytes we can append if we move existing contents to + beginning of buffer */ + size_t avail_total = self->input_buffer_size - used_now; + assert(self->input_buffer_size >= used_now); + + if (avail_total < (size_t) data->len) { + char *tmp; + size_t new_size = used_now + data->len; + + /* Allocate with new size */ + tmp = PyMem_Malloc(new_size); + if (tmp == NULL) { + PyErr_NoMemory(); + goto error; + } + + /* Copy unconsumed data to the beginning of new buffer */ + memcpy(tmp, + self->input_buffer + self->in_begin, + used_now); + + /* Switch to new buffer */ + PyMem_Free(self->input_buffer); + self->input_buffer = tmp; + self->input_buffer_size = new_size; + + /* Set begin & end position */ + self->in_begin = 0; + self->in_end = used_now; + } + else if (avail_now < (size_t) data->len) { + /* Move unconsumed data to the beginning. + Overlap is possible, so use memmove(). */ + memmove(self->input_buffer, + self->input_buffer + self->in_begin, + used_now); + + /* Set begin & end position */ + self->in_begin = 0; + self->in_end = used_now; + } + + /* Copy data to input buffer */ + memcpy(self->input_buffer + self->in_end, data->buf, data->len); + self->in_end += data->len; + + in.src = self->input_buffer + self->in_begin; + in.size = used_now + data->len; + in.pos = 0; + } + assert(in.pos == 0); + + /* Decompress */ + ret = decompress_lock_held(self, &in, max_length); + if (ret == NULL) { + goto error; + } + + /* Unconsumed input data */ + if (in.pos == in.size) { + if (Py_SIZE(ret) == max_length || self->eof) { + self->needs_input = 0; + } + else { + self->needs_input = 1; + } + + if (use_input_buffer) { + /* Clear input_buffer */ + self->in_begin = 0; + self->in_end = 0; + } + } + else { + size_t data_size = in.size - in.pos; + + self->needs_input = 0; + + if (!use_input_buffer) { + /* Discard buffer if it's too small + (resizing it may needlessly copy the current contents) */ + if (self->input_buffer != NULL + && self->input_buffer_size < data_size) + { + PyMem_Free(self->input_buffer); + self->input_buffer = NULL; + self->input_buffer_size = 0; + } + + /* Allocate if necessary */ + if (self->input_buffer == NULL) { + self->input_buffer = PyMem_Malloc(data_size); + if (self->input_buffer == NULL) { + PyErr_NoMemory(); + goto error; + } + self->input_buffer_size = data_size; + } + + /* Copy unconsumed data */ + memcpy(self->input_buffer, (char*)in.src + in.pos, data_size); + self->in_begin = 0; + self->in_end = data_size; + } + else { + /* Use input buffer */ + self->in_begin += in.pos; + } + } + + return ret; + +error: + /* Reset decompressor's states/session */ + decompressor_reset_session_lock_held(self); + + Py_CLEAR(ret); + return NULL; +} + + +/*[clinic input] +@classmethod +_zstd.ZstdDecompressor.__new__ as _zstd_ZstdDecompressor_new + zstd_dict: object = None + A ZstdDict object, a pre-trained Zstandard dictionary. + options: object = None + A dict object that contains advanced decompression parameters. + +Create a decompressor object for decompressing data incrementally. + +Thread-safe at method level. For one-shot decompression, use the decompress() +function instead. +[clinic start generated code]*/ + +static PyObject * +_zstd_ZstdDecompressor_new_impl(PyTypeObject *type, PyObject *zstd_dict, + PyObject *options) +/*[clinic end generated code: output=590ca65c1102ff4a input=213daa57e3ea4062]*/ +{ + ZstdDecompressor* self = PyObject_GC_New(ZstdDecompressor, type); + if (self == NULL) { + goto error; + } + + self->input_buffer = NULL; + self->input_buffer_size = 0; + self->in_begin = -1; + self->in_end = -1; + self->unused_data = NULL; + self->eof = 0; + self->dict = NULL; + self->lock = (PyMutex){0}; + + /* needs_input flag */ + self->needs_input = 1; + + /* Decompression context */ + self->dctx = ZSTD_createDCtx(); + if (self->dctx == NULL) { + _zstd_state* mod_state = PyType_GetModuleState(Py_TYPE(self)); + if (mod_state != NULL) { + PyErr_SetString(mod_state->ZstdError, + "Unable to create ZSTD_DCtx instance."); + } + goto error; + } + + /* Load Zstandard dictionary to decompression context */ + if (zstd_dict != Py_None) { + if (_zstd_load_d_dict(self, zstd_dict) < 0) { + goto error; + } + Py_INCREF(zstd_dict); + self->dict = zstd_dict; + } + + /* Set options dictionary */ + if (options != Py_None) { + if (_zstd_set_d_parameters(self, options) < 0) { + goto error; + } + } + + // We can only start GC tracking once self->dict is set. + PyObject_GC_Track(self); + + return (PyObject*)self; + +error: + Py_XDECREF(self); + return NULL; +} + +static void +ZstdDecompressor_dealloc(PyObject *ob) +{ + ZstdDecompressor *self = ZstdDecompressor_CAST(ob); + + PyObject_GC_UnTrack(self); + + /* Free decompression context */ + if (self->dctx) { + ZSTD_freeDCtx(self->dctx); + } + + assert(!PyMutex_IsLocked(&self->lock)); + + /* Py_CLEAR the dict after free decompression context */ + Py_CLEAR(self->dict); + + /* Free unconsumed input data buffer */ + PyMem_Free(self->input_buffer); + + /* Free unused data */ + Py_CLEAR(self->unused_data); + + PyTypeObject *tp = Py_TYPE(self); + PyObject_GC_Del(ob); + Py_DECREF(tp); +} + +/*[clinic input] +@getter +_zstd.ZstdDecompressor.unused_data + +A bytes object of un-consumed input data. + +When ZstdDecompressor object stops after a frame is +decompressed, unused input data after the frame. Otherwise this will be b''. +[clinic start generated code]*/ + +static PyObject * +_zstd_ZstdDecompressor_unused_data_get_impl(ZstdDecompressor *self) +/*[clinic end generated code: output=f3a20940f11b6b09 input=54d41ecd681a3444]*/ +{ + PyObject *ret; + + PyMutex_Lock(&self->lock); + + if (!self->eof) { + PyMutex_Unlock(&self->lock); + return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES); + } + else { + if (self->unused_data == NULL) { + self->unused_data = PyBytes_FromStringAndSize( + self->input_buffer + self->in_begin, + self->in_end - self->in_begin); + ret = self->unused_data; + Py_XINCREF(ret); + } + else { + ret = self->unused_data; + Py_INCREF(ret); + } + } + + PyMutex_Unlock(&self->lock); + return ret; +} + +/*[clinic input] +_zstd.ZstdDecompressor.decompress + + data: Py_buffer + A bytes-like object, Zstandard data to be decompressed. + max_length: Py_ssize_t = -1 + Maximum size of returned data. When it is negative, the size of + output buffer is unlimited. When it is nonnegative, returns at + most max_length bytes of decompressed data. + +Decompress *data*, returning uncompressed bytes if possible, or b'' otherwise. + +If *max_length* is nonnegative, returns at most *max_length* bytes of +decompressed data. If this limit is reached and further output can be +produced, *self.needs_input* will be set to ``False``. In this case, the next +call to *decompress()* may provide *data* as b'' to obtain more of the output. + +If all of the input data was decompressed and returned (either because this +was less than *max_length* bytes, or because *max_length* was negative), +*self.needs_input* will be set to True. + +Attempting to decompress data after the end of a frame is reached raises an +EOFError. Any data found after the end of the frame is ignored and saved in +the self.unused_data attribute. +[clinic start generated code]*/ + +static PyObject * +_zstd_ZstdDecompressor_decompress_impl(ZstdDecompressor *self, + Py_buffer *data, + Py_ssize_t max_length) +/*[clinic end generated code: output=a4302b3c940dbec6 input=6463dfdf98091caa]*/ +{ + PyObject *ret; + /* Thread-safe code */ + PyMutex_Lock(&self->lock); + ret = stream_decompress_lock_held(self, data, max_length); + PyMutex_Unlock(&self->lock); + return ret; +} + +static PyMethodDef ZstdDecompressor_methods[] = { + _ZSTD_ZSTDDECOMPRESSOR_DECOMPRESS_METHODDEF + {NULL, NULL} +}; + +PyDoc_STRVAR(ZstdDecompressor_eof_doc, +"True means the end of the first frame has been reached. If decompress data\n" +"after that, an EOFError exception will be raised."); + +PyDoc_STRVAR(ZstdDecompressor_needs_input_doc, +"If the max_length output limit in .decompress() method has been reached,\n" +"and the decompressor has (or may has) unconsumed input data, it will be set\n" +"to False. In this case, passing b'' to the .decompress() method may output\n" +"further data."); + +static PyMemberDef ZstdDecompressor_members[] = { + {"eof", Py_T_BOOL, offsetof(ZstdDecompressor, eof), + Py_READONLY, ZstdDecompressor_eof_doc}, + {"needs_input", Py_T_BOOL, offsetof(ZstdDecompressor, needs_input), + Py_READONLY, ZstdDecompressor_needs_input_doc}, + {NULL} +}; + +static PyGetSetDef ZstdDecompressor_getset[] = { + _ZSTD_ZSTDDECOMPRESSOR_UNUSED_DATA_GETSETDEF + {NULL} +}; + +static int +ZstdDecompressor_traverse(PyObject *ob, visitproc visit, void *arg) +{ + ZstdDecompressor *self = ZstdDecompressor_CAST(ob); + Py_VISIT(self->dict); + return 0; +} + +static int +ZstdDecompressor_clear(PyObject *ob) +{ + ZstdDecompressor *self = ZstdDecompressor_CAST(ob); + Py_CLEAR(self->dict); + Py_CLEAR(self->unused_data); + return 0; +} + +static PyType_Slot ZstdDecompressor_slots[] = { + {Py_tp_new, _zstd_ZstdDecompressor_new}, + {Py_tp_dealloc, ZstdDecompressor_dealloc}, + {Py_tp_methods, ZstdDecompressor_methods}, + {Py_tp_members, ZstdDecompressor_members}, + {Py_tp_getset, ZstdDecompressor_getset}, + {Py_tp_doc, (void *)_zstd_ZstdDecompressor_new__doc__}, + {Py_tp_traverse, ZstdDecompressor_traverse}, + {Py_tp_clear, ZstdDecompressor_clear}, + {0, 0} +}; + +PyType_Spec zstd_decompressor_type_spec = { + .name = "compression.zstd.ZstdDecompressor", + .basicsize = sizeof(ZstdDecompressor), + .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE + | Py_TPFLAGS_HAVE_GC, + .slots = ZstdDecompressor_slots, +}; |