aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Python
diff options
context:
space:
mode:
authorBénédikt Tran <10796600+picnixz@users.noreply.github.com>2025-02-08 16:01:57 +0100
committerGitHub <noreply@github.com>2025-02-08 16:01:57 +0100
commita56ead089caf4e975d7e36036ad98e932822d7dc (patch)
tree0672655fc0d8ccafd491701b26df97922ee199fb /Python
parent167cf3ace0db553dce50b2068dfae1f9eae4da6e (diff)
downloadcpython-a56ead089caf4e975d7e36036ad98e932822d7dc.tar.gz
cpython-a56ead089caf4e975d7e36036ad98e932822d7dc.zip
gh-129173: Use `_PyUnicodeError_GetParams` in `PyCodec_NameReplaceErrors` (GH-129135)
Diffstat (limited to 'Python')
-rw-r--r--Python/codecs.c234
1 files changed, 123 insertions, 111 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 53680a79082..6c9f8222079 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -676,6 +676,60 @@ wrong_exception_type(PyObject *exc)
PyObject_TypeCheck(EXC, (PyTypeObject *)PyExc_UnicodeTranslateError)
+// --- codecs handlers: utilities ---------------------------------------------
+
+/*
+ * Return the number of characters (including special prefixes)
+ * needed to represent 'ch' by codec_handler_write_unicode_hex().
+ */
+static inline Py_ssize_t
+codec_handler_unicode_hex_width(Py_UCS4 ch)
+{
+ if (ch >= 0x10000) {
+ // format: '\\' + 'U' + 8 hex digits
+ return 1 + 1 + 8;
+ }
+ else if (ch >= 0x100) {
+ // format: '\\' + 'u' + 4 hex digits
+ return 1 + 1 + 4;
+ }
+ else {
+ // format: '\\' + 'x' + 2 hex digits
+ return 1 + 1 + 2;
+ }
+}
+
+
+/*
+ * Write the hexadecimal representation of 'ch' to the buffer pointed by 'p'
+ * using 2, 4, or 8 characters prefixed by '\x', '\u', or '\U' respectively.
+ */
+static inline void
+codec_handler_write_unicode_hex(Py_UCS1 **p, Py_UCS4 ch)
+{
+ *(*p)++ = '\\';
+ if (ch >= 0x10000) {
+ *(*p)++ = 'U';
+ *(*p)++ = Py_hexdigits[(ch >> 28) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 24) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 20) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 16) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
+ }
+ else if (ch >= 0x100) {
+ *(*p)++ = 'u';
+ *(*p)++ = Py_hexdigits[(ch >> 12) & 0xf];
+ *(*p)++ = Py_hexdigits[(ch >> 8) & 0xf];
+ }
+ else {
+ *(*p)++ = 'x';
+ }
+ *(*p)++ = Py_hexdigits[(ch >> 4) & 0xf];
+ *(*p)++ = Py_hexdigits[ch & 0xf];
+}
+
+
// --- handler: 'strict' ------------------------------------------------------
PyObject *PyCodec_StrictErrors(PyObject *exc)
@@ -942,17 +996,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) {
- /* object is guaranteed to be "ready" */
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
- if (c >= 0x10000) {
- ressize += 1 + 1 + 8;
- }
- else if (c >= 0x100) {
- ressize += 1 + 1 + 4;
- }
- else {
- ressize += 1 + 1 + 2;
- }
+ ressize += codec_handler_unicode_hex_width(c);
}
PyObject *res = PyUnicode_New(ressize, 127);
if (res == NULL) {
@@ -962,122 +1007,86 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
for (Py_ssize_t i = start; i < end; ++i) {
Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
- *outp++ = '\\';
- if (c >= 0x00010000) {
- *outp++ = 'U';
- *outp++ = Py_hexdigits[(c >> 28) & 0xf];
- *outp++ = Py_hexdigits[(c >> 24) & 0xf];
- *outp++ = Py_hexdigits[(c >> 20) & 0xf];
- *outp++ = Py_hexdigits[(c >> 16) & 0xf];
- *outp++ = Py_hexdigits[(c >> 12) & 0xf];
- *outp++ = Py_hexdigits[(c >> 8) & 0xf];
- }
- else if (c >= 0x100) {
- *outp++ = 'u';
- *outp++ = Py_hexdigits[(c >> 12) & 0xf];
- *outp++ = Py_hexdigits[(c >> 8) & 0xf];
- }
- else {
- *outp++ = 'x';
- }
- *outp++ = Py_hexdigits[(c >> 4) & 0xf];
- *outp++ = Py_hexdigits[c & 0xf];
+ codec_handler_write_unicode_hex(&outp, c);
}
assert(_PyUnicode_CheckConsistency(res, 1));
Py_DECREF(obj);
return Py_BuildValue("(Nn)", res, end);
}
+
+// --- handler: 'namereplace' -------------------------------------------------
+
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
- PyObject *restuple;
- PyObject *object;
- Py_ssize_t i;
- Py_ssize_t start;
- Py_ssize_t end;
- PyObject *res;
- Py_UCS1 *outp;
- Py_ssize_t ressize;
- int replsize;
- Py_UCS4 c;
- char buffer[256]; /* NAME_MAXLEN */
- if (PyUnicodeEncodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeEncodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeEncodeError_GetObject(exc)))
- return NULL;
- _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
- if (ucnhash_capi == NULL) {
- return NULL;
+ if (!_PyIsUnicodeEncodeError(exc)) {
+ wrong_exception_type(exc);
+ return NULL;
+ }
+
+ _PyUnicode_Name_CAPI *ucnhash_capi = _PyUnicode_GetNameCAPI();
+ if (ucnhash_capi == NULL) {
+ return NULL;
+ }
+
+ PyObject *obj;
+ Py_ssize_t start, end;
+ if (_PyUnicodeError_GetParams(exc,
+ &obj, NULL,
+ &start, &end, NULL, false) < 0)
+ {
+ return NULL;
+ }
+
+ char buffer[256]; /* NAME_MAXLEN in unicodename_db.h */
+ Py_ssize_t imax = start, ressize = 0, replsize;
+ for (; imax < end; ++imax) {
+ Py_UCS4 c = PyUnicode_READ_CHAR(obj, imax);
+ if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
+ // If 'c' is recognized by getname(), the corresponding replacement
+ // is '\\' + 'N' + '{' + NAME + '}', i.e. 1 + 1 + 1 + len(NAME) + 1
+ // characters. Failures of getname() are ignored by the handler.
+ replsize = 1 + 1 + 1 + strlen(buffer) + 1;
}
- for (i = start, ressize = 0; i < end; ++i) {
- /* object is guaranteed to be "ready" */
- c = PyUnicode_READ_CHAR(object, i);
- if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
- replsize = 1+1+1+(int)strlen(buffer)+1;
- }
- else if (c >= 0x10000) {
- replsize = 1+1+8;
- }
- else if (c >= 0x100) {
- replsize = 1+1+4;
- }
- else
- replsize = 1+1+2;
- if (ressize > PY_SSIZE_T_MAX - replsize)
- break;
- ressize += replsize;
+ else {
+ replsize = codec_handler_unicode_hex_width(c);
}
- end = i;
- res = PyUnicode_New(ressize, 127);
- if (res==NULL)
- return NULL;
- for (i = start, outp = PyUnicode_1BYTE_DATA(res);
- i < end; ++i) {
- c = PyUnicode_READ_CHAR(object, i);
- *outp++ = '\\';
- if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
- *outp++ = 'N';
- *outp++ = '{';
- strcpy((char *)outp, buffer);
- outp += strlen(buffer);
- *outp++ = '}';
- continue;
- }
- if (c >= 0x00010000) {
- *outp++ = 'U';
- *outp++ = Py_hexdigits[(c>>28)&0xf];
- *outp++ = Py_hexdigits[(c>>24)&0xf];
- *outp++ = Py_hexdigits[(c>>20)&0xf];
- *outp++ = Py_hexdigits[(c>>16)&0xf];
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
- }
- else if (c >= 0x100) {
- *outp++ = 'u';
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
- }
- else
- *outp++ = 'x';
- *outp++ = Py_hexdigits[(c>>4)&0xf];
- *outp++ = Py_hexdigits[c&0xf];
+ if (ressize > PY_SSIZE_T_MAX - replsize) {
+ break;
}
-
- assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
- assert(_PyUnicode_CheckConsistency(res, 1));
- restuple = Py_BuildValue("(Nn)", res, end);
- Py_DECREF(object);
- return restuple;
+ ressize += replsize;
}
- else {
- wrong_exception_type(exc);
+
+ PyObject *res = PyUnicode_New(ressize, 127);
+ if (res == NULL) {
+ Py_DECREF(obj);
return NULL;
}
+
+ Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
+ for (Py_ssize_t i = start; i < imax; ++i) {
+ Py_UCS4 c = PyUnicode_READ_CHAR(obj, i);
+ if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
+ *outp++ = '\\';
+ *outp++ = 'N';
+ *outp++ = '{';
+ (void)strcpy((char *)outp, buffer);
+ outp += strlen(buffer);
+ *outp++ = '}';
+ }
+ else {
+ codec_handler_write_unicode_hex(&outp, c);
+ }
+ }
+
+ assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
+ assert(_PyUnicode_CheckConsistency(res, 1));
+ PyObject *restuple = Py_BuildValue("(Nn)", res, imax);
+ Py_DECREF(obj);
+ return restuple;
}
+
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
@@ -1421,11 +1430,14 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}
-static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
+
+static inline PyObject *
+namereplace_errors(PyObject *Py_UNUSED(self), PyObject *exc)
{
return PyCodec_NameReplaceErrors(exc);
}
+
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);