diff options
author | Bénédikt Tran <10796600+picnixz@users.noreply.github.com> | 2025-02-14 18:34:32 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-14 18:34:32 +0100 |
commit | 1775091dc163d1fa76c33b01b9c82dc2430ffac8 (patch) | |
tree | c453f3722fd8e280dfca462cbfdbc67efc050ea7 /Python/codecs.c | |
parent | 303043f5062c1e7ffb7907abde61dbf13c98f8e9 (diff) | |
download | cpython-1775091dc163d1fa76c33b01b9c82dc2430ffac8.tar.gz cpython-1775091dc163d1fa76c33b01b9c82dc2430ffac8.zip |
gh-129173: Use `_PyUnicodeError_GetParams` in `PyCodec_SurrogatePassErrors` (GH-129134)
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 282 |
1 files changed, 162 insertions, 120 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 6c9f8222079..406d48b56dd 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) #define ENC_UTF32LE 4 static int -get_standard_encoding(const char *encoding, int *bytelength) +get_standard_encoding_impl(const char *encoding, int *bytelength) { if (Py_TOLOWER(encoding[0]) == 'u' && Py_TOLOWER(encoding[1]) == 't' && @@ -1153,165 +1153,204 @@ get_standard_encoding(const char *encoding, int *bytelength) return ENC_UNKNOWN; } -/* This handler is declared static until someone demonstrates - a need to call it directly. */ + +static int +get_standard_encoding(PyObject *encoding, int *code, int *bytelength) +{ + const char *encoding_cstr = PyUnicode_AsUTF8(encoding); + if (encoding_cstr == NULL) { + return -1; + } + *code = get_standard_encoding_impl(encoding_cstr, bytelength); + return 0; +} + + +// --- handler: 'surrogatepass' ----------------------------------------------- + static PyObject * -PyCodec_SurrogatePassErrors(PyObject *exc) +_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc) { - PyObject *restuple; - PyObject *object; - PyObject *encode; - const char *encoding; - int code; - int bytelength; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; + PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc); + if (encoding == NULL) { + return NULL; + } + int code, bytelength; + int rc = get_standard_encoding(encoding, &code, &bytelength); + Py_DECREF(encoding); + if (rc < 0) { + return NULL; + } + if (code == ENC_UNKNOWN) { + goto bail; + } - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - unsigned char *outp; - if (PyUnicodeEncodeError_GetStart(exc, &start)) - return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeEncodeError_GetObject(exc))) - return NULL; - if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { - Py_DECREF(object); - return NULL; - } - if (!(encoding = PyUnicode_AsUTF8(encode))) { - Py_DECREF(object); - Py_DECREF(encode); - return NULL; - } - code = get_standard_encoding(encoding, &bytelength); - Py_DECREF(encode); - if (code == ENC_UNKNOWN) { - /* Not supported, fail with original exception */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - Py_DECREF(object); - return NULL; - } + PyObject *obj; + Py_ssize_t objlen, start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, + &start, &end, &slen, false) < 0) + { + return NULL; + } - if (end - start > PY_SSIZE_T_MAX / bytelength) - end = start + PY_SSIZE_T_MAX / bytelength; - res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); - if (!res) { - Py_DECREF(object); - return NULL; + if (slen > PY_SSIZE_T_MAX / bytelength) { + end = start + PY_SSIZE_T_MAX / bytelength; + end = Py_MIN(end, objlen); + slen = Py_MAX(0, end - start); + } + + PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen); + if (res == NULL) { + Py_DECREF(obj); + return NULL; + } + + unsigned char *outp = (unsigned char *)PyBytes_AsString(res); + for (Py_ssize_t i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + /* Not a surrogate, fail with original exception */ + Py_DECREF(obj); + Py_DECREF(res); + goto bail; } - outp = (unsigned char*)PyBytes_AsString(res); - for (i = start; i < end; i++) { - /* object is guaranteed to be "ready" */ - Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); - if (!Py_UNICODE_IS_SURROGATE(ch)) { - /* Not a surrogate, fail with original exception */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - Py_DECREF(res); - Py_DECREF(object); - return NULL; - } - switch (code) { - case ENC_UTF8: + switch (code) { + case ENC_UTF8: { *outp++ = (unsigned char)(0xe0 | (ch >> 12)); *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); break; - case ENC_UTF16LE: - *outp++ = (unsigned char) ch; + } + case ENC_UTF16LE: { + *outp++ = (unsigned char)ch; *outp++ = (unsigned char)(ch >> 8); break; - case ENC_UTF16BE: + } + case ENC_UTF16BE: { *outp++ = (unsigned char)(ch >> 8); - *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)ch; break; - case ENC_UTF32LE: - *outp++ = (unsigned char) ch; + } + case ENC_UTF32LE: { + *outp++ = (unsigned char)ch; *outp++ = (unsigned char)(ch >> 8); *outp++ = (unsigned char)(ch >> 16); *outp++ = (unsigned char)(ch >> 24); break; - case ENC_UTF32BE: + } + case ENC_UTF32BE: { *outp++ = (unsigned char)(ch >> 24); *outp++ = (unsigned char)(ch >> 16); *outp++ = (unsigned char)(ch >> 8); - *outp++ = (unsigned char) ch; + *outp++ = (unsigned char)ch; break; } } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - Py_DECREF(object); - return restuple; } - else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { - const unsigned char *p; - Py_UCS4 ch = 0; - if (PyUnicodeDecodeError_GetStart(exc, &start)) - return NULL; - if (PyUnicodeDecodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeDecodeError_GetObject(exc))) - return NULL; - p = (const unsigned char*)PyBytes_AS_STRING(object); - if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { - Py_DECREF(object); - return NULL; - } - if (!(encoding = PyUnicode_AsUTF8(encode))) { - Py_DECREF(object); - Py_DECREF(encode); - return NULL; - } - code = get_standard_encoding(encoding, &bytelength); - Py_DECREF(encode); - if (code == ENC_UNKNOWN) { - /* Not supported, fail with original exception */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - Py_DECREF(object); - return NULL; - } - /* Try decoding a single surrogate character. If - there are more, let the codec call us again. */ - p += start; - if (PyBytes_GET_SIZE(object) - start >= bytelength) { - switch (code) { - case ENC_UTF8: + Py_DECREF(obj); + PyObject *restuple = Py_BuildValue("(Nn)", res, end); + return restuple; + +bail: + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; +} + + +static PyObject * +_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc) +{ + PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc); + if (encoding == NULL) { + return NULL; + } + int code, bytelength; + int rc = get_standard_encoding(encoding, &code, &bytelength); + Py_DECREF(encoding); + if (rc < 0) { + return NULL; + } + if (code == ENC_UNKNOWN) { + goto bail; + } + + PyObject *obj; + Py_ssize_t objlen, start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, + &start, &end, &slen, true) < 0) + { + return NULL; + } + + /* Try decoding a single surrogate character. If + there are more, let the codec call us again. */ + Py_UCS4 ch = 0; + const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj); + p += start; + + if (objlen - start >= bytelength) { + switch (code) { + case ENC_UTF8: { if ((p[0] & 0xf0) == 0xe0 && (p[1] & 0xc0) == 0x80 && - (p[2] & 0xc0) == 0x80) { + (p[2] & 0xc0) == 0x80) + { /* it's a three-byte code */ - ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + ch = ((p[0] & 0x0f) << 12) + + ((p[1] & 0x3f) << 6) + + (p[2] & 0x3f); } break; - case ENC_UTF16LE: + } + case ENC_UTF16LE: { ch = p[1] << 8 | p[0]; break; - case ENC_UTF16BE: + } + case ENC_UTF16BE: { ch = p[0] << 8 | p[1]; break; - case ENC_UTF32LE: + } + case ENC_UTF32LE: { ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; break; - case ENC_UTF32BE: + } + case ENC_UTF32BE: { ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; break; } } + } + Py_DECREF(obj); + if (!Py_UNICODE_IS_SURROGATE(ch)) { + goto bail; + } - Py_DECREF(object); - if (!Py_UNICODE_IS_SURROGATE(ch)) { - /* it's not a surrogate - fail */ - PyErr_SetObject(PyExceptionInstance_Class(exc), exc); - return NULL; - } - res = PyUnicode_FromOrdinal(ch); - if (res == NULL) - return NULL; - return Py_BuildValue("(Nn)", res, start + bytelength); + PyObject *res = PyUnicode_FromOrdinal(ch); + if (res == NULL) { + return NULL; + } + return Py_BuildValue("(Nn)", res, start + bytelength); + +bail: + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; +} + + +/* This handler is declared static until someone demonstrates + a need to call it directly. */ +static PyObject * +PyCodec_SurrogatePassErrors(PyObject *exc) +{ + if (_PyIsUnicodeEncodeError(exc)) { + return _PyCodec_SurrogatePassUnicodeEncodeError(exc); + } + else if (_PyIsUnicodeDecodeError(exc)) { + return _PyCodec_SurrogatePassUnicodeDecodeError(exc); } else { wrong_exception_type(exc); @@ -1319,6 +1358,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } } + static PyObject * PyCodec_SurrogateEscapeErrors(PyObject *exc) { @@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc) } -static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) +static inline PyObject * +surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc) { return PyCodec_SurrogatePassErrors(exc); } + static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) { return PyCodec_SurrogateEscapeErrors(exc); |