From 6eb23b1311e7eebf2459076703460ee7f8044f05 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 24 Jun 2024 18:07:07 +0300 Subject: [PATCH] gh-70278: Fix PyUnicode_FromFormat() with precision for %s and %V (GH-120365) PyUnicode_FromFormat() no longer produces the ending \ufffd character for a truncated C string when precision is used with %s and %V. It now truncates the string before the start of truncated multibyte sequences. --- Lib/test/test_capi/test_unicode.py | 46 ++++++++++++++++++- ...4-06-11-21-38-32.gh-issue-70278.WDE4zM.rst | 4 ++ Objects/unicodeobject.c | 13 +++++- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 36106b0730dd26..48a802c3f8bcb2 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -419,8 +419,29 @@ def check_format(expected, format, *args): # truncated string check_format('abc', b'%.3s', b'abcdef') + check_format('abc[', + b'%.6s', 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7s', 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5s', 'abc[\u20ac]'.encode('utf8')) + b'%.5s', b'abc[\xff]') + check_format('abc[', + b'%.6s', b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7s', b'abc[\xe2\x82]') + check_format('abc[\ufffd', + b'%.7s', b'abc[\xe2\x82\0') + check_format(' abc[', + b'%10.6s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7s', 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5s', b'abc[\xff]') + check_format(' abc[', + b'%10.6s', b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7s', b'abc[\xe2\x82]') + check_format("'\\u20acABC'", b'%A', '\u20acABC') check_format("'\\u20", @@ -433,10 +454,31 @@ def check_format(expected, format, *args): b'%.3S', '\u20acABCDEF') check_format('\u20acAB', b'%.3U', '\u20acABCDEF') + 
check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None) + check_format('abc[', + b'%.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7V', None, 'abc[\u20ac]'.encode('utf8')) check_format('abc[\ufffd', - b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + b'%.5V', None, b'abc[\xff]') + check_format('abc[', + b'%.6V', None, b'abc[\xe2\x82]') + check_format('abc[\ufffd]', + b'%.7V', None, b'abc[\xe2\x82]') + check_format(' abc[', + b'%10.6V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\u20ac', + b'%10.7V', None, 'abc[\u20ac]'.encode('utf8')) + check_format(' abc[\ufffd', + b'%10.5V', None, b'abc[\xff]') + check_format(' abc[', + b'%10.6V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd]', + b'%10.7V', None, b'abc[\xe2\x82]') + check_format(' abc[\ufffd', + b'%10.7V', None, b'abc[\xe2\x82\0') # following tests comes from #7330 # test width modifier and precision modifier with %S diff --git a/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst new file mode 100644 index 00000000000000..1eca36a86bc97e --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-11-21-38-32.gh-issue-70278.WDE4zM.rst @@ -0,0 +1,4 @@ +:c:func:`PyUnicode_FromFormat` no longer produces the ending ``\ufffd`` +character for a truncated C string when precision is used with ``%s`` and ``%V``. +It now truncates the string before the start of truncated multibyte +sequences. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 279cdaa668e291..d11a9dca14b280 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2581,6 +2581,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ + Py_ssize_t *pconsumed = NULL; Py_ssize_t length; if (precision == -1) { length = strlen(str); @@ -2590,15 +2591,23 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, while (length < precision && str[length]) { length++; } + if (length == precision) { + /* The input string is not NUL-terminated. If it ends with an + * incomplete UTF-8 sequence, truncate the string just before it. + * Incomplete sequences in the middle and sequences which cannot + * be valid prefixes are still treated as errors and replaced + * with \ufffd. */ + pconsumed = &length; + } } if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", NULL); + _Py_ERROR_REPLACE, "replace", pconsumed); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, - "replace", NULL); + "replace", pconsumed); if (unicode == NULL) return -1;