aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Python/codecs.c
diff options
context:
space:
mode:
authorBénédikt Tran <10796600+picnixz@users.noreply.github.com>2025-02-14 18:34:32 +0100
committerGitHub <noreply@github.com>2025-02-14 18:34:32 +0100
commit1775091dc163d1fa76c33b01b9c82dc2430ffac8 (patch)
treec453f3722fd8e280dfca462cbfdbc67efc050ea7 /Python/codecs.c
parent303043f5062c1e7ffb7907abde61dbf13c98f8e9 (diff)
downloadcpython-1775091dc163d1fa76c33b01b9c82dc2430ffac8.tar.gz
cpython-1775091dc163d1fa76c33b01b9c82dc2430ffac8.zip
gh-129173: Use `_PyUnicodeError_GetParams` in `PyCodec_SurrogatePassErrors` (GH-129134)
Diffstat (limited to 'Python/codecs.c')
-rw-r--r--Python/codecs.c282
1 files changed, 162 insertions, 120 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 6c9f8222079..406d48b56dd 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -1095,7 +1095,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
#define ENC_UTF32LE 4
static int
-get_standard_encoding(const char *encoding, int *bytelength)
+get_standard_encoding_impl(const char *encoding, int *bytelength)
{
if (Py_TOLOWER(encoding[0]) == 'u' &&
Py_TOLOWER(encoding[1]) == 't' &&
@@ -1153,165 +1153,204 @@ get_standard_encoding(const char *encoding, int *bytelength)
return ENC_UNKNOWN;
}
-/* This handler is declared static until someone demonstrates
- a need to call it directly. */
+
+static int
+get_standard_encoding(PyObject *encoding, int *code, int *bytelength)
+{
+ const char *encoding_cstr = PyUnicode_AsUTF8(encoding);
+ if (encoding_cstr == NULL) {
+ return -1;
+ }
+ *code = get_standard_encoding_impl(encoding_cstr, bytelength);
+ return 0;
+}
+
+
+// --- handler: 'surrogatepass' -----------------------------------------------
+
static PyObject *
-PyCodec_SurrogatePassErrors(PyObject *exc)
+_PyCodec_SurrogatePassUnicodeEncodeError(PyObject *exc)
{
- PyObject *restuple;
- PyObject *object;
- PyObject *encode;
- const char *encoding;
- int code;
- int bytelength;
- Py_ssize_t i;
- Py_ssize_t start;
- Py_ssize_t end;
- PyObject *res;
+ PyObject *encoding = PyUnicodeEncodeError_GetEncoding(exc);
+ if (encoding == NULL) {
+ return NULL;
+ }
+ int code, bytelength;
+ int rc = get_standard_encoding(encoding, &code, &bytelength);
+ Py_DECREF(encoding);
+ if (rc < 0) {
+ return NULL;
+ }
+ if (code == ENC_UNKNOWN) {
+ goto bail;
+ }
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
- unsigned char *outp;
- if (PyUnicodeEncodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeEncodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeEncodeError_GetObject(exc)))
- return NULL;
- if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
- Py_DECREF(object);
- return NULL;
- }
- if (!(encoding = PyUnicode_AsUTF8(encode))) {
- Py_DECREF(object);
- Py_DECREF(encode);
- return NULL;
- }
- code = get_standard_encoding(encoding, &bytelength);
- Py_DECREF(encode);
- if (code == ENC_UNKNOWN) {
- /* Not supported, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(object);
- return NULL;
- }
+ PyObject *obj;
+ Py_ssize_t objlen, start, end, slen;
+ if (_PyUnicodeError_GetParams(exc,
+ &obj, &objlen,
+ &start, &end, &slen, false) < 0)
+ {
+ return NULL;
+ }
- if (end - start > PY_SSIZE_T_MAX / bytelength)
- end = start + PY_SSIZE_T_MAX / bytelength;
- res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
- if (!res) {
- Py_DECREF(object);
- return NULL;
+ if (slen > PY_SSIZE_T_MAX / bytelength) {
+ end = start + PY_SSIZE_T_MAX / bytelength;
+ end = Py_MIN(end, objlen);
+ slen = Py_MAX(0, end - start);
+ }
+
+ PyObject *res = PyBytes_FromStringAndSize(NULL, bytelength * slen);
+ if (res == NULL) {
+ Py_DECREF(obj);
+ return NULL;
+ }
+
+ unsigned char *outp = (unsigned char *)PyBytes_AsString(res);
+ for (Py_ssize_t i = start; i < end; i++) {
+ /* object is guaranteed to be "ready" */
+ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
+ /* Not a surrogate, fail with original exception */
+ Py_DECREF(obj);
+ Py_DECREF(res);
+ goto bail;
}
- outp = (unsigned char*)PyBytes_AsString(res);
- for (i = start; i < end; i++) {
- /* object is guaranteed to be "ready" */
- Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- /* Not a surrogate, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(res);
- Py_DECREF(object);
- return NULL;
- }
- switch (code) {
- case ENC_UTF8:
+ switch (code) {
+ case ENC_UTF8: {
*outp++ = (unsigned char)(0xe0 | (ch >> 12));
*outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (unsigned char)(0x80 | (ch & 0x3f));
break;
- case ENC_UTF16LE:
- *outp++ = (unsigned char) ch;
+ }
+ case ENC_UTF16LE: {
+ *outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
break;
- case ENC_UTF16BE:
+ }
+ case ENC_UTF16BE: {
*outp++ = (unsigned char)(ch >> 8);
- *outp++ = (unsigned char) ch;
+ *outp++ = (unsigned char)ch;
break;
- case ENC_UTF32LE:
- *outp++ = (unsigned char) ch;
+ }
+ case ENC_UTF32LE: {
+ *outp++ = (unsigned char)ch;
*outp++ = (unsigned char)(ch >> 8);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 24);
break;
- case ENC_UTF32BE:
+ }
+ case ENC_UTF32BE: {
*outp++ = (unsigned char)(ch >> 24);
*outp++ = (unsigned char)(ch >> 16);
*outp++ = (unsigned char)(ch >> 8);
- *outp++ = (unsigned char) ch;
+ *outp++ = (unsigned char)ch;
break;
}
}
- restuple = Py_BuildValue("(On)", res, end);
- Py_DECREF(res);
- Py_DECREF(object);
- return restuple;
}
- else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
- const unsigned char *p;
- Py_UCS4 ch = 0;
- if (PyUnicodeDecodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeDecodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeDecodeError_GetObject(exc)))
- return NULL;
- p = (const unsigned char*)PyBytes_AS_STRING(object);
- if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
- Py_DECREF(object);
- return NULL;
- }
- if (!(encoding = PyUnicode_AsUTF8(encode))) {
- Py_DECREF(object);
- Py_DECREF(encode);
- return NULL;
- }
- code = get_standard_encoding(encoding, &bytelength);
- Py_DECREF(encode);
- if (code == ENC_UNKNOWN) {
- /* Not supported, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(object);
- return NULL;
- }
- /* Try decoding a single surrogate character. If
- there are more, let the codec call us again. */
- p += start;
- if (PyBytes_GET_SIZE(object) - start >= bytelength) {
- switch (code) {
- case ENC_UTF8:
+ Py_DECREF(obj);
+ PyObject *restuple = Py_BuildValue("(Nn)", res, end);
+ return restuple;
+
+bail:
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ return NULL;
+}
+
+
+static PyObject *
+_PyCodec_SurrogatePassUnicodeDecodeError(PyObject *exc)
+{
+ PyObject *encoding = PyUnicodeDecodeError_GetEncoding(exc);
+ if (encoding == NULL) {
+ return NULL;
+ }
+ int code, bytelength;
+ int rc = get_standard_encoding(encoding, &code, &bytelength);
+ Py_DECREF(encoding);
+ if (rc < 0) {
+ return NULL;
+ }
+ if (code == ENC_UNKNOWN) {
+ goto bail;
+ }
+
+ PyObject *obj;
+ Py_ssize_t objlen, start, end, slen;
+ if (_PyUnicodeError_GetParams(exc,
+ &obj, &objlen,
+ &start, &end, &slen, true) < 0)
+ {
+ return NULL;
+ }
+
+ /* Try decoding a single surrogate character. If
+ there are more, let the codec call us again. */
+ Py_UCS4 ch = 0;
+ const unsigned char *p = (const unsigned char *)PyBytes_AS_STRING(obj);
+ p += start;
+
+ if (objlen - start >= bytelength) {
+ switch (code) {
+ case ENC_UTF8: {
if ((p[0] & 0xf0) == 0xe0 &&
(p[1] & 0xc0) == 0x80 &&
- (p[2] & 0xc0) == 0x80) {
+ (p[2] & 0xc0) == 0x80)
+ {
/* it's a three-byte code */
- ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+ ch = ((p[0] & 0x0f) << 12) +
+ ((p[1] & 0x3f) << 6) +
+ (p[2] & 0x3f);
}
break;
- case ENC_UTF16LE:
+ }
+ case ENC_UTF16LE: {
ch = p[1] << 8 | p[0];
break;
- case ENC_UTF16BE:
+ }
+ case ENC_UTF16BE: {
ch = p[0] << 8 | p[1];
break;
- case ENC_UTF32LE:
+ }
+ case ENC_UTF32LE: {
ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
break;
- case ENC_UTF32BE:
+ }
+ case ENC_UTF32BE: {
ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
break;
}
}
+ }
+ Py_DECREF(obj);
+ if (!Py_UNICODE_IS_SURROGATE(ch)) {
+ goto bail;
+ }
- Py_DECREF(object);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- /* it's not a surrogate - fail */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- return NULL;
- }
- res = PyUnicode_FromOrdinal(ch);
- if (res == NULL)
- return NULL;
- return Py_BuildValue("(Nn)", res, start + bytelength);
+ PyObject *res = PyUnicode_FromOrdinal(ch);
+ if (res == NULL) {
+ return NULL;
+ }
+ return Py_BuildValue("(Nn)", res, start + bytelength);
+
+bail:
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ return NULL;
+}
+
+
+/* This handler is declared static until someone demonstrates
+ a need to call it directly. */
+static PyObject *
+PyCodec_SurrogatePassErrors(PyObject *exc)
+{
+ if (_PyIsUnicodeEncodeError(exc)) {
+ return _PyCodec_SurrogatePassUnicodeEncodeError(exc);
+ }
+ else if (_PyIsUnicodeDecodeError(exc)) {
+ return _PyCodec_SurrogatePassUnicodeDecodeError(exc);
}
else {
wrong_exception_type(exc);
@@ -1319,6 +1358,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
}
}
+
static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject *exc)
{
@@ -1438,11 +1478,13 @@ namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
}
-static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
+static inline PyObject *
+surrogatepass_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);
}
+
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogateEscapeErrors(exc);