aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2024-06-24 18:07:07 +0300
committerGitHub <noreply@github.com>2024-06-24 18:07:07 +0300
commit6eb23b1311e7eebf2459076703460ee7f8044f05 (patch)
treea125eacbb2e6e8f14ed25e719b1107da9f92d527
parent22b8a35d6e6660cf7457ed6636cb8c12fff7e8e7 (diff)
downloadcpython-6eb23b1311e7eebf2459076703460ee7f8044f05.tar.gz
cpython-6eb23b1311e7eebf2459076703460ee7f8044f05.zip
gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365)
PyUnicode_FromFormat() no longer produces a trailing \ufffd character for a truncated C string when precision is used with %s or %V. It now truncates the string before the start of a truncated multibyte sequence.
-rw-r--r--Lib/test/test_capi/test_unicode.py46
-rw-r--r--Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst4
-rw-r--r--Objects/unicodeobject.c13
3 files changed, 59 insertions, 4 deletions
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index 36106b0730d..48a802c3f8b 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -419,8 +419,29 @@ class CAPITest(unittest.TestCase):
# truncated string
check_format('abc',
b'%.3s', b'abcdef')
+ check_format('abc[',
+ b'%.6s', 'abc[\u20ac]'.encode('utf8'))
+ check_format('abc[\u20ac',
+ b'%.7s', 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\ufffd',
- b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+ b'%.5s', b'abc[\xff]')
+ check_format('abc[',
+ b'%.6s', b'abc[\xe2\x82]')
+ check_format('abc[\ufffd]',
+ b'%.7s', b'abc[\xe2\x82]')
+ check_format('abc[\ufffd',
+ b'%.7s', b'abc[\xe2\x82\0')
+ check_format(' abc[',
+ b'%10.6s', 'abc[\u20ac]'.encode('utf8'))
+ check_format(' abc[\u20ac',
+ b'%10.7s', 'abc[\u20ac]'.encode('utf8'))
+ check_format(' abc[\ufffd',
+ b'%10.5s', b'abc[\xff]')
+ check_format(' abc[',
+ b'%10.6s', b'abc[\xe2\x82]')
+ check_format(' abc[\ufffd]',
+ b'%10.7s', b'abc[\xe2\x82]')
+
check_format("'\\u20acABC'",
b'%A', '\u20acABC')
check_format("'\\u20",
@@ -433,10 +454,31 @@ class CAPITest(unittest.TestCase):
b'%.3S', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3U', '\u20acABCDEF')
+
check_format('\u20acAB',
b'%.3V', '\u20acABCDEF', None)
+ check_format('abc[',
+ b'%.6V', None, 'abc[\u20ac]'.encode('utf8'))
+ check_format('abc[\u20ac',
+ b'%.7V', None, 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\ufffd',
- b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+ b'%.5V', None, b'abc[\xff]')
+ check_format('abc[',
+ b'%.6V', None, b'abc[\xe2\x82]')
+ check_format('abc[\ufffd]',
+ b'%.7V', None, b'abc[\xe2\x82]')
+ check_format(' abc[',
+ b'%10.6V', None, 'abc[\u20ac]'.encode('utf8'))
+ check_format(' abc[\u20ac',
+ b'%10.7V', None, 'abc[\u20ac]'.encode('utf8'))
+ check_format(' abc[\ufffd',
+ b'%10.5V', None, b'abc[\xff]')
+ check_format(' abc[',
+ b'%10.6V', None, b'abc[\xe2\x82]')
+ check_format(' abc[\ufffd]',
+ b'%10.7V', None, b'abc[\xe2\x82]')
+ check_format(' abc[\ufffd',
+ b'%10.7V', None, b'abc[\xe2\x82\0')
# following tests comes from #7330
# test width modifier and precision modifier with %S
diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst
new file mode 100644
index 00000000000..1eca36a86bc
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst
@@ -0,0 +1,4 @@
+:c:func:`PyUnicode_FromFormat` no longer produces a trailing ``\ufffd``
+character for a truncated C string when precision is used with ``%s`` or
+``%V``. It now truncates the string before the start of a truncated
+multibyte sequence.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 279cdaa668e..d11a9dca14b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
+ Py_ssize_t *pconsumed = NULL;
Py_ssize_t length;
if (precision == -1) {
length = strlen(str);
@@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
while (length < precision && str[length]) {
length++;
}
+ if (length == precision) {
+ /* No NUL byte was found within the first `precision` bytes, so the
+ * input may have been cut in the middle of a UTF-8 sequence. If it ends
+ * with an incomplete sequence, truncate the string just before it.
+ * Incomplete sequences in the middle and sequences which cannot be
+ * valid prefixes are still treated as errors and replaced with U+FFFD. */
+ pconsumed = &length;
+ }
}
if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
- _Py_ERROR_REPLACE, "replace", NULL);
+ _Py_ERROR_REPLACE, "replace", pconsumed);
}
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
- "replace", NULL);
+ "replace", pconsumed);
if (unicode == NULL)
return -1;