diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 1483 |
1 files changed, 753 insertions, 730 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1fcc83e63a3..0226e429c3a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "Python.h" #include "ucnhash.h" #include "bytes_methods.h" +#include "stringlib/eq.h" #ifdef MS_WINDOWS #include <windows.h> @@ -162,6 +163,14 @@ extern "C" { *_to++ = (to_type) *_iter++; \ } while (0) +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + /* This dictionary holds all interned unicode strings. Note that references to strings in this dictionary are *not* counted in the string's ob_refcnt. When the interned string reaches a refcnt of 0 the string deallocation @@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject, const char *reason); /* Same for linebreaks */ -static unsigned char ascii_linebreak[] = { +static const unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ /* 0x000B, * LINE TABULATION */ @@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = { #include "clinic/unicodeobject.c.h" +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_BACKSLASHREPLACE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ + if (errors == NULL || strcmp(errors, "strict") == 0) + return _Py_ERROR_STRICT; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "replace") == 0) + return _Py_ERROR_REPLACE; + if (strcmp(errors, "ignore") == 0) + return _Py_ERROR_IGNORE; + if (strcmp(errors, "backslashreplace") == 0) + return _Py_ERROR_BACKSLASHREPLACE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; + if (strcmp(errors, "xmlcharrefreplace") == 0) + return _Py_ERROR_XMLCHARREFREPLACE; + return _Py_ERROR_OTHER; +} + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode) return _PyUnicode_Copy(unicode); } +/* Implementation of the "backslashreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. */ +static char* +backslashreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 0x100) + incr = 2+2; + else if (ch < 0x10000) + incr = 2+4; + else { + assert(ch <= MAX_UNICODE); + incr = 2+8; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + *str++ = '\\'; + if (ch >= 0x00010000) { + *str++ = 'U'; + *str++ = Py_hexdigits[(ch>>28)&0xf]; + *str++ = Py_hexdigits[(ch>>24)&0xf]; + *str++ = Py_hexdigits[(ch>>20)&0xf]; + *str++ = Py_hexdigits[(ch>>16)&0xf]; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else if (ch >= 0x100) { + *str++ = 'u'; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else + *str++ = 'x'; + *str++ = Py_hexdigits[(ch>>4)&0xf]; + *str++ = Py_hexdigits[ch&0xf]; + } + return str; +} + +/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. */ +static char* +xmlcharrefreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 10) + incr = 2+1+1; + else if (ch < 100) + incr = 2+2+1; + else if (ch < 1000) + incr = 2+3+1; + else if (ch < 10000) + incr = 2+4+1; + else if (ch < 100000) + incr = 2+5+1; + else if (ch < 1000000) + incr = 2+6+1; + else { + assert(ch <= MAX_UNICODE); + incr = 2+7+1; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + } + return str; +} + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. @@ -587,6 +751,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len) #undef BLOOM_UPDATE } +static int +ensure_unicode(PyObject *obj) +{ + if (!PyUnicode_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "must be str, not %.100s", + Py_TYPE(obj)->tp_name); + return -1; + } + return PyUnicode_READY(obj); +} + /* Compilation of templated routines */ #include "stringlib/asciilib.h" @@ -647,27 +823,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch, int direction) { - int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; - switch (kind) { case PyUnicode_1BYTE_KIND: - { - Py_UCS1 ch1 = (Py_UCS1) ch; - if (ch1 == ch) - return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); - else - return -1; - } + if ((Py_UCS1) ch != ch) + return -1; + if (direction > 0) + return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); + else + return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch); case PyUnicode_2BYTE_KIND: - { - Py_UCS2 ch2 = (Py_UCS2) ch; - if (ch2 == ch) - return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); - else - return -1; - } + if ((Py_UCS2) ch != ch) + return -1; + if (direction > 0) + return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); + else + return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch); case PyUnicode_4BYTE_KIND: - return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); + if (direction > 0) + return ucs4lib_find_char((Py_UCS4 *) s, size, ch); + else + return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch); default: assert(0); return -1; @@ -2903,7 +3078,7 @@ PyUnicode_FromEncodedObject(PyObject *obj, /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { PyErr_Format(PyExc_TypeError, - "coercing to str: need a bytes-like object, %.80s found", + "decoding to str: need a bytes-like object, %.80s found", Py_TYPE(obj)->tp_name); return NULL; } @@ -3167,24 +3342,22 @@ wcstombs_errorpos(const wchar_t *wstr) static int locale_error_handler(const char *errors, int *surrogateescape) { - if (errors == NULL) { - *surrogateescape = 0; - return 0; - } - - if (strcmp(errors, "strict") == 0) { + _Py_error_handler error_handler = get_error_handler(errors); + switch (error_handler) + { + case _Py_ERROR_STRICT: *surrogateescape = 0; return 0; - } - if (strcmp(errors, "surrogateescape") == 0) { + case _Py_ERROR_SURROGATEESCAPE: *surrogateescape = 1; return 0; + default: + PyErr_Format(PyExc_ValueError, + "only 'strict' and 'surrogateescape' error handlers " + "are supported, not '%s'", + errors); + return -1; } - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; } PyObject * @@ -3626,19 +3799,17 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) output = arg; Py_INCREF(output); } - else { - arg = PyUnicode_FromObject(arg); - if (!arg) - return 0; + else if (PyUnicode_Check(arg)) { output = PyUnicode_EncodeFSDefault(arg); - Py_DECREF(arg); if (!output) return 0; - if (!PyBytes_Check(output)) { - Py_DECREF(output); - PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); - return 0; - } + assert(PyBytes_Check(output)); + } + else { + PyErr_Format(PyExc_TypeError, + "must be str or bytes, not %.100s", + Py_TYPE(arg)->tp_name); + return 0; } size = PyBytes_GET_SIZE(output); data = PyBytes_AS_STRING(output); @@ -3666,7 +3837,13 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) output = arg; Py_INCREF(output); } - else if (PyObject_CheckBuffer(arg)) { + else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) { + if (!PyBytes_Check(arg) && + PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "path should be string or bytes, not %.200s", + Py_TYPE(arg)->tp_name)) { + return 0; + } arg = PyBytes_FromObject(arg); if (!arg) return 0; @@ -3675,11 +3852,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) Py_DECREF(arg); if (!output) return 0; - if (!PyUnicode_Check(output)) { - Py_DECREF(output); - PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); - return 0; - } } else { PyErr_Format(PyExc_TypeError, @@ -3716,7 +3888,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) if (PyUnicode_UTF8(unicode) == NULL) { assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); - bytes = _PyUnicode_AsUTF8String(unicode, "strict"); + bytes = _PyUnicode_AsUTF8String(unicode, NULL); if (bytes == NULL) return NULL; _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); @@ -3982,7 +4154,7 @@ unicode_decode_call_errorhandler_wchar( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, PyObject **output, Py_ssize_t *outpos) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4090,7 +4262,7 @@ unicode_decode_call_errorhandler_writer( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4696,8 +4868,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4722,6 +4895,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4760,24 +4934,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + { + Py_ssize_t i; + + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (i=startinpos; i<endinpos; i++) { + ch = (Py_UCS4)(unsigned char)(starts[i]); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + ch + 0xdc00); + writer.pos++; + } + s += (endinpos - startinpos); + break; + } + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; @@ -5868,11 +6074,10 @@ PyObject * PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { Py_ssize_t i, len; - PyObject *repr; char *p; int kind; void *data; - Py_ssize_t expandsize = 0; + _PyBytesWriter writer; /* Initial allocation is based on the longest-possible character escape. @@ -5888,35 +6093,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + len = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); - switch (kind) { - case PyUnicode_1BYTE_KIND: expandsize = 4; break; - case PyUnicode_2BYTE_KIND: expandsize = 6; break; - case PyUnicode_4BYTE_KIND: expandsize = 10; break; - } - if (len == 0) - return PyBytes_FromStringAndSize(NULL, 0); - - if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) - return PyErr_NoMemory(); - - repr = PyBytes_FromStringAndSize(NULL, - 2 - + expandsize*len - + 1); - if (repr == NULL) - return NULL; - - p = PyBytes_AS_STRING(repr); + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); /* Escape backslashes */ if (ch == '\\') { + /* -1: subtract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = (char) ch; continue; @@ -5925,6 +6123,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 21-bit characters to '\U00xxxxxx' */ else if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; @@ -5940,6 +6143,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256) { + p = _PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; @@ -5950,20 +6157,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map special whitespace to '\t', \n', '\r' */ else if (ch == '\t') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 't'; } else if (ch == '\n') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'n'; } else if (ch == '\r') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'r'; } /* Map non-printable US ASCII to '\xhh' */ else if (ch < ' ' || ch >= 0x7F) { + /* -1: subtract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 4-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'x'; *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; @@ -5975,10 +6199,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p - PyBytes_AS_STRING(repr) > 0); - if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6107,13 +6332,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject * PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) { - PyObject *repr; char *p; - char *q; - Py_ssize_t expandsize, pos; + Py_ssize_t pos; int kind; void *data; Py_ssize_t len; + _PyBytesWriter writer; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); @@ -6121,28 +6345,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); - /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 - bytes, and 1 byte characters 4. */ - expandsize = kind * 2 + 2; - - if (len > PY_SSIZE_T_MAX / expandsize) - return PyErr_NoMemory(); - repr = PyBytes_FromStringAndSize(NULL, expandsize * len); - if (repr == NULL) - return NULL; - if (len == 0) - return repr; + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; - p = q = PyBytes_AS_STRING(repr); for (pos = 0; pos < len; pos++) { Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* Map 32-bit characters to '\Uxxxxxxxx' */ if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + /* -1: subtract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0xf]; @@ -6156,6 +6381,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } /* Map 16-bit characters to '\uxxxx' */ else if (ch >= 256) { + /* -1: subtract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0xf]; @@ -6168,10 +6398,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p > q); - if (_PyBytes_Resize(&repr, p - q) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6348,7 +6579,7 @@ unicode_encode_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; + static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; Py_ssize_t len; PyObject *restuple; PyObject *resunicode; @@ -6402,25 +6633,22 @@ unicode_encode_call_errorhandler(const char *errors, static PyObject * unicode_encode_ucs1(PyObject *unicode, const char *errors, - unsigned int limit) + const Py_UCS4 limit) { /* input state */ Py_ssize_t pos=0, size; int kind; void *data; - /* output object */ - PyObject *res; /* pointer into the output */ char *str; - /* current output position */ - Py_ssize_t ressize; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; + PyObject *rep = NULL; + /* output object */ + _PyBytesWriter writer; if (PyUnicode_READY(unicode) == -1) return NULL; @@ -6431,186 +6659,157 @@ unicode_encode_ucs1(PyObject *unicode, replacements, if we need more, we'll resize */ if (size == 0) return PyBytes_FromStringAndSize(NULL, 0); - res = PyBytes_FromStringAndSize(NULL, size); - if (res == NULL) + + _PyBytesWriter_Init(&writer); + str = _PyBytesWriter_Alloc(&writer, size); + if (str == NULL) return NULL; - str = PyBytes_AS_STRING(res); - ressize = size; while (pos < size) { - Py_UCS4 c = PyUnicode_READ(kind, data, pos); + Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* can we encode this? */ - if (c<limit) { + if (ch < limit) { /* no overflow check, because we know that the space is enough */ - *str++ = (char)c; + *str++ = (char)ch; ++pos; } else { - Py_ssize_t requiredsize; - PyObject *repunicode; - Py_ssize_t repsize, newpos, respos, i; + Py_ssize_t newpos, i; /* startpos for collecting unencodable chars */ Py_ssize_t collstart = pos; - Py_ssize_t collend = pos; + Py_ssize_t collend = collstart + 1; /* find all unecodable characters */ + while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) ++collend; + + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (collend < size); + /* cache callback name lookup (if not done yet, i.e. it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); goto onError; - case 2: /* replace */ - while (collstart++ < collend) - *str++ = '?'; /* fall through */ - case 3: /* ignore */ + + case _Py_ERROR_REPLACE: + memset(str, '?', collend - collstart); + str += (collend - collstart); + /* fall through ignore error handler */ + case _Py_ERROR_IGNORE: + pos = collend; + break; + + case _Py_ERROR_BACKSLASHREPLACE: + /* subtract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = backslashreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; + pos = collend; + break; + + case _Py_ERROR_XMLCHARREFREPLACE: + /* subtract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = xmlcharrefreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; pos = collend; break; - case 4: /* xmlcharrefreplace */ - respos = str - PyBytes_AS_STRING(res); - requiredsize = respos; - /* determine replacement size */ + + case _Py_ERROR_SURROGATEESCAPE: for (i = collstart; i < collend; ++i) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_ssize_t incr; - if (ch < 10) - incr = 2+1+1; - else if (ch < 100) - incr = 2+2+1; - else if (ch < 1000) - incr = 2+3+1; - else if (ch < 10000) - incr = 2+4+1; - else if (ch < 100000) - incr = 2+5+1; - else if (ch < 1000000) - incr = 2+6+1; - else { - assert(ch <= MAX_UNICODE); - incr = 2+7+1; + ch = PyUnicode_READ(kind, data, i); + if (ch < 0xdc80 || 0xdcff < ch) { + /* Not a UTF-8b surrogate */ + break; } - if (requiredsize > PY_SSIZE_T_MAX - incr) - goto overflow; - requiredsize += incr; + *str++ = (char)(ch - 0xdc00); + ++pos; } - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) - goto onError; - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; - } - /* generate replacement */ - for (i = collstart; i < collend; ++i) { - str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); - } - pos = collend; - break; + if (i >= collend) + break; + collstart = pos; + assert(collstart != collend); + /* fallback to general error handling */ + default: - repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, - encoding, reason, unicode, &exc, - collstart, collend, &newpos); - if (repunicode == NULL || (PyUnicode_Check(repunicode) && - PyUnicode_READY(repunicode) == -1)) + rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, + encoding, reason, unicode, &exc, + collstart, collend, &newpos); + if (rep == NULL) goto onError; - if (PyBytes_Check(repunicode)) { + + /* subtract preallocated bytes */ + writer.min_size -= 1; + + if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ - repsize = PyBytes_Size(repunicode); - if (repsize > 1) { - /* Make room for all additional bytes. */ - respos = str - PyBytes_AS_STRING(res); - if (ressize > PY_SSIZE_T_MAX - repsize - 1) { - Py_DECREF(repunicode); - goto overflow; - } - if (_PyBytes_Resize(&res, ressize+repsize-1)) { - Py_DECREF(repunicode); - goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize += repsize-1; - } - memcpy(str, PyBytes_AsString(repunicode), repsize); - str += repsize; - pos = newpos; - Py_DECREF(repunicode); - break; - } - /* need more space? (at least enough for what we - have+the replacement+the rest of the string, so - we won't have to check space for encodable characters) */ - respos = str - PyBytes_AS_STRING(res); - repsize = PyUnicode_GET_LENGTH(repunicode); - requiredsize = respos; - if (requiredsize > PY_SSIZE_T_MAX - repsize) - goto overflow; - requiredsize += repsize; - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) { - Py_DECREF(repunicode); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); + if (str == NULL) goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; } - /* check if there is anything unencodable in the replacement - and copy it to the output */ - for (i = 0; repsize-->0; ++i, ++str) { - c = PyUnicode_READ_CHAR(repunicode, i); - if (c >= limit) { - raise_encode_exception(&exc, encoding, unicode, - pos, pos+1, reason); - Py_DECREF(repunicode); + else { + assert(PyUnicode_Check(rep)); + + if (PyUnicode_READY(rep) < 0) goto onError; + + if (PyUnicode_IS_ASCII(rep)) { + /* Fast path: all characters are smaller than limit */ + assert(limit >= 128); + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); + } + else { + Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); + + str = _PyBytesWriter_Prepare(&writer, str, repsize); + if (str == NULL) + goto onError; + + /* check if there is anything unencodable in the + replacement and copy it to the output */ + for (i = 0; repsize-->0; ++i, ++str) { + ch = PyUnicode_READ_CHAR(rep, i); + if (ch >= limit) { + raise_encode_exception(&exc, encoding, unicode, + pos, pos+1, reason); + goto onError; + } + *str = (char)ch; + } } - *str = (char)c; } pos = newpos; - Py_DECREF(repunicode); + Py_CLEAR(rep); } + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ + assert(writer.overallocate || pos == size); } } - /* Resize if we allocated to much */ - size = str - PyBytes_AS_STRING(res); - if (size < ressize) { /* If this falls res will be NULL */ - assert(size >= 0); - if (_PyBytes_Resize(&res, size) < 0) - goto onError; - } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - return res; - - overflow: - PyErr_SetString(PyExc_OverflowError, - "encoded result is too long for a Python string"); + return _PyBytesWriter_Finish(&writer, str); onError: - Py_XDECREF(res); - Py_XDECREF(errorHandler); + Py_XDECREF(rep); + _PyBytesWriter_Dealloc(&writer); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6670,8 +6869,9 @@ PyUnicode_DecodeASCII(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6700,12 +6900,42 @@ PyUnicode_DecodeASCII(const char *s, PyUnicode_WRITE(kind, data, writer.pos, c); writer.pos++; ++s; + continue; } - else { + + /* byte outsize range 0x00..0x7f: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case _Py_ERROR_SURROGATEESCAPE: + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + kind = writer.kind; + data = writer.data; + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + break; + + case _Py_ERROR_IGNORE: + ++s; + break; + + default: startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, + errors, &error_handler_obj, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, &writer)) @@ -6714,13 +6944,13 @@ PyUnicode_DecodeASCII(const char *s, data = writer.data; } } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6775,7 +7005,7 @@ PyUnicode_AsASCIIString(PyObject *unicode) # define WC_ERR_INVALID_CHARS 0x0080 #endif -static char* +static const char* code_page_name(UINT code_page, PyObject **obj) { *obj = NULL; @@ -6883,7 +7113,7 @@ decode_code_page_errors(UINT code_page, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char *encoding; DWORD err; int ret = -1; @@ -7119,7 +7349,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, BOOL usedDefaultChar = FALSE; BOOL *pusedDefaultChar = &usedDefaultChar; int outsize; - PyObject *exc = NULL; wchar_t *p; Py_ssize_t size; const DWORD flags = encode_code_page_flags(code_page, NULL); @@ -7228,7 +7457,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char *encoding; Py_ssize_t newpos, newoutsize; PyObject *rep; int ret = -1; @@ -8086,7 +8315,7 @@ static int charmap_encoding_error( PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, - int *known_errorHandler, PyObject **errorHandler, const char *errors, + _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ @@ -8133,23 +8362,15 @@ charmap_encoding_error( } /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ - if (*known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - *known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - *known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - *known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - *known_errorHandler = 4; - else - *known_errorHandler = 0; - } - switch (*known_errorHandler) { - case 1: /* strict */ + if (*error_handler == _Py_ERROR_UNKNOWN) + *error_handler = get_error_handler(errors); + + switch (*error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); return -1; - case 2: /* replace */ + + case _Py_ERROR_REPLACE: for (collpos = collstartpos; collpos<collendpos; ++collpos) { x = charmapencode_output('?', mapping, res, respos); if (x==enc_EXCEPTION) { @@ -8161,10 +8382,11 @@ charmap_encoding_error( } } /* fall through */ - case 3: /* ignore */ + case _Py_ERROR_IGNORE: *inpos = collendpos; break; - case 4: /* xmlcharrefreplace */ + + case _Py_ERROR_XMLCHARREFREPLACE: /* generate replacement (temporarily (mis)uses p) */ for (collpos = collstartpos; collpos < collendpos; ++collpos) { char buffer[2+29+1+1]; @@ -8182,8 +8404,9 @@ charmap_encoding_error( } *inpos = collendpos; break; + default: - repunicode = unicode_encode_call_errorhandler(errors, errorHandler, + repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, encoding, reason, unicode, exceptionObject, collstartpos, collendpos, &newpos); if (repunicode == NULL) @@ -8246,12 +8469,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, Py_ssize_t size; /* current output position */ Py_ssize_t respos = 0; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; void *data; int kind; @@ -8282,7 +8502,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(unicode, &inpos, mapping, &exc, - &known_errorHandler, &errorHandler, errors, + &error_handler, &error_handler_obj, errors, &res, &respos)) { goto onError; } @@ -8298,13 +8518,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, goto onError; Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return res; onError: Py_XDECREF(res); Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return NULL; } @@ -8371,7 +8591,7 @@ unicode_translate_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "O!n;translating error handler must return (str, int) tuple"; + static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; Py_ssize_t i_newpos; PyObject *restuple; @@ -8628,7 +8848,7 @@ exit: return res; } -PyObject * +static PyObject * _PyUnicode_TranslateCharmap(PyObject *input, PyObject *mapping, const char *errors) @@ -8657,10 +8877,8 @@ _PyUnicode_TranslateCharmap(PyObject *input, kind = PyUnicode_KIND(input); size = PyUnicode_GET_LENGTH(input); - if (size == 0) { - Py_INCREF(input); - return input; - } + if (size == 0) + return PyUnicode_FromObject(input); /* allocate enough for a simple 1:1 translation without replacements, if we need more, we'll resize */ @@ -8771,14 +8989,9 @@ PyUnicode_Translate(PyObject *str, PyObject *mapping, const char *errors) { - PyObject *result; - - str = PyUnicode_FromObject(str); - if (str == NULL) + if (ensure_unicode(str) < 0) return NULL; - result = _PyUnicode_TranslateCharmap(str, mapping, errors); - Py_DECREF(str); - return result; + return _PyUnicode_TranslateCharmap(str, mapping, errors); } static Py_UCS4 @@ -8960,9 +9173,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, } static Py_ssize_t -any_find_slice(int direction, PyObject* s1, PyObject* s2, +any_find_slice(PyObject* s1, PyObject* s2, Py_ssize_t start, - Py_ssize_t end) + Py_ssize_t end, + int direction) { int kind1, kind2; void *buf1, *buf2; @@ -9131,54 +9345,35 @@ PyUnicode_Count(PyObject *str, Py_ssize_t end) { Py_ssize_t result; - PyObject* str_obj; - PyObject* sub_obj; int kind1, kind2; void *buf1 = NULL, *buf2 = NULL; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str); - if (!str_obj) - return -1; - sub_obj = PyUnicode_FromObject(substr); - if (!sub_obj) { - Py_DECREF(str_obj); - return -1; - } - if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -1; - } - kind1 = PyUnicode_KIND(str_obj); - kind2 = PyUnicode_KIND(sub_obj); - if (kind1 < kind2) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); + kind1 = PyUnicode_KIND(str); + kind2 = PyUnicode_KIND(substr); + if (kind1 < kind2) return 0; - } - len1 = PyUnicode_GET_LENGTH(str_obj); - len2 = PyUnicode_GET_LENGTH(sub_obj); + len1 = PyUnicode_GET_LENGTH(str); + len2 = PyUnicode_GET_LENGTH(substr); ADJUST_INDICES(start, end, len1); - if (end - start < len2) { - Py_DECREF(sub_obj); - Py_DECREF(str_obj); + if (end - start < len2) return 0; - } - buf1 = PyUnicode_DATA(str_obj); - buf2 = PyUnicode_DATA(sub_obj); + buf1 = PyUnicode_DATA(str); + buf2 = PyUnicode_DATA(substr); if (kind2 != kind1) { - buf2 = _PyUnicode_AsKind(sub_obj, kind1); + buf2 = _PyUnicode_AsKind(substr, kind1); if (!buf2) goto onError; } switch (kind1) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) + if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) result = asciilib_count( ((Py_UCS1*)buf1) + start, end - start, buf2, len2, PY_SSIZE_T_MAX @@ -9205,16 +9400,11 @@ PyUnicode_Count(PyObject *str, assert(0); result = 0; } - Py_DECREF(sub_obj); - Py_DECREF(str_obj); - if (kind2 != kind1) PyMem_Free(buf2); return result; onError: - Py_DECREF(sub_obj); - Py_DECREF(str_obj); if (kind2 != kind1 && buf2) PyMem_Free(buf2); return -1; @@ -9222,35 +9412,15 @@ PyUnicode_Count(PyObject *str, Py_ssize_t PyUnicode_Find(PyObject *str, - PyObject *sub, + PyObject *substr, Py_ssize_t start, Py_ssize_t end, int direction) { - Py_ssize_t result; - - str = PyUnicode_FromObject(str); - if (!str) - return -2; - sub = PyUnicode_FromObject(sub); - if (!sub) { - Py_DECREF(str); + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -2; - } - if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { - Py_DECREF(sub); - Py_DECREF(str); - return -2; - } - - result = any_find_slice(direction, - str, sub, start, end - ); - - Py_DECREF(str); - Py_DECREF(sub); - return result; + return any_find_slice(str, substr, start, end, direction); } Py_ssize_t @@ -9353,22 +9523,10 @@ PyUnicode_Tailmatch(PyObject *str, Py_ssize_t end, int direction) { - Py_ssize_t result; - - str = PyUnicode_FromObject(str); - if (str == NULL) + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -1; - substr = PyUnicode_FromObject(substr); - if (substr == NULL) { - Py_DECREF(str); - return -1; - } - result = tailmatch(str, substr, - start, end, direction); - Py_DECREF(str); - Py_DECREF(substr); - return result; + return tailmatch(str, substr, start, end, direction); } /* Apply fixfct filter to the Unicode object self and return a @@ -9974,13 +10132,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) { PyObject *list; - string = PyUnicode_FromObject(string); - if (string == NULL) - return NULL; - if (PyUnicode_READY(string) == -1) { - Py_DECREF(string); + if (ensure_unicode(string) < 0) return NULL; - } switch (PyUnicode_KIND(string)) { case PyUnicode_1BYTE_KIND: @@ -10007,7 +10160,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends) assert(0); list = 0; } - Py_DECREF(string); return list; } @@ -10568,28 +10720,27 @@ unicode_casefold(PyObject *self) } -/* Argument converter. Coerces to a single unicode character */ +/* Argument converter. Accepts a single Unicode character. */ static int convert_uc(PyObject *obj, void *addr) { Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; - PyObject *uniobj; - uniobj = PyUnicode_FromObject(obj); - if (uniobj == NULL) { - PyErr_SetString(PyExc_TypeError, - "The fill character cannot be converted to Unicode"); + if (!PyUnicode_Check(obj)) { + PyErr_Format(PyExc_TypeError, + "The fill character must be a unicode character, " + "not %.100s", Py_TYPE(obj)->tp_name); return 0; } - if (PyUnicode_GET_LENGTH(uniobj) != 1) { + if (PyUnicode_READY(obj) < 0) + return 0; + if (PyUnicode_GET_LENGTH(obj) != 1) { PyErr_SetString(PyExc_TypeError, "The fill character must be exactly one character long"); - Py_DECREF(uniobj); return 0; } - *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); - Py_DECREF(uniobj); + *fillcharloc = PyUnicode_READ_CHAR(obj, 0); return 1; } @@ -10905,59 +11056,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) } int -PyUnicode_Contains(PyObject *container, PyObject *element) +_PyUnicode_EQ(PyObject *aa, PyObject *bb) +{ + return unicode_eq(aa, bb); +} + +int +PyUnicode_Contains(PyObject *str, PyObject *substr) { - PyObject *str, *sub; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; int result; - /* Coerce the two arguments */ - sub = PyUnicode_FromObject(element); - if (!sub) { + if (!PyUnicode_Check(substr)) { PyErr_Format(PyExc_TypeError, - "'in <string>' requires string as left operand, not %s", - element->ob_type->tp_name); + "'in <string>' requires string as left operand, not %.100s", + Py_TYPE(substr)->tp_name); return -1; } - - str = PyUnicode_FromObject(container); - if (!str) { - Py_DECREF(sub); + if (PyUnicode_READY(substr) == -1) + return -1; + if (ensure_unicode(str) < 0) return -1; - } kind1 = PyUnicode_KIND(str); - kind2 = PyUnicode_KIND(sub); - if (kind1 < kind2) { - Py_DECREF(sub); - Py_DECREF(str); + kind2 = PyUnicode_KIND(substr); + if (kind1 < kind2) return 0; - } len1 = PyUnicode_GET_LENGTH(str); - len2 = PyUnicode_GET_LENGTH(sub); - if (len1 < len2) { - Py_DECREF(sub); - Py_DECREF(str); + len2 = PyUnicode_GET_LENGTH(substr); + if (len1 < len2) return 0; - } buf1 = PyUnicode_DATA(str); - buf2 = PyUnicode_DATA(sub); + buf2 = PyUnicode_DATA(substr); if (len2 == 1) { Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; - Py_DECREF(sub); - Py_DECREF(str); return result; } if (kind2 != kind1) { - buf2 = _PyUnicode_AsKind(sub, kind1); - if (!buf2) { - Py_DECREF(sub); - Py_DECREF(str); + buf2 = _PyUnicode_AsKind(substr, kind1); + if (!buf2) return -1; - } } switch (kind1) { @@ -10975,9 +11116,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element) assert(0); } - Py_DECREF(str); - Py_DECREF(sub); - if (kind2 != kind1) PyMem_Free(buf2); @@ -10989,56 +11127,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element) PyObject * PyUnicode_Concat(PyObject *left, PyObject *right) { - PyObject *u = NULL, *v = NULL, *w; + PyObject *result; Py_UCS4 maxchar, maxchar2; - Py_ssize_t u_len, v_len, new_len; + Py_ssize_t left_len, right_len, new_len; - /* Coerce the two arguments */ - u = PyUnicode_FromObject(left); - if (u == NULL) - goto onError; - v = PyUnicode_FromObject(right); - if (v == NULL) - goto onError; + if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0) + return NULL; /* Shortcuts */ - if (v == unicode_empty) { - Py_DECREF(v); - return u; - } - if (u == unicode_empty) { - Py_DECREF(u); - return v; - } + if (left == unicode_empty) + return PyUnicode_FromObject(right); + if (right == unicode_empty) + return PyUnicode_FromObject(left); - u_len = PyUnicode_GET_LENGTH(u); - v_len = PyUnicode_GET_LENGTH(v); - if (u_len > PY_SSIZE_T_MAX - v_len) { + left_len = PyUnicode_GET_LENGTH(left); + right_len = PyUnicode_GET_LENGTH(right); + if (left_len > PY_SSIZE_T_MAX - right_len) { PyErr_SetString(PyExc_OverflowError, "strings are too large to concat"); - goto onError; + return NULL; } - new_len = u_len + v_len; + new_len = left_len + right_len; - maxchar = PyUnicode_MAX_CHAR_VALUE(u); - maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); + maxchar = PyUnicode_MAX_CHAR_VALUE(left); + maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); maxchar = Py_MAX(maxchar, maxchar2); /* Concat the two Unicode strings */ - w = PyUnicode_New(new_len, maxchar); - if (w == NULL) - goto onError; - _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); - _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); - Py_DECREF(u); - Py_DECREF(v); - assert(_PyUnicode_CheckConsistency(w, 1)); - return w; - - onError: - Py_XDECREF(u); - Py_XDECREF(v); - return NULL; + result = PyUnicode_New(new_len, maxchar); + if (result == NULL) + return NULL; + _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); + _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); + assert(_PyUnicode_CheckConsistency(result, 1)); + return result; } void @@ -11129,6 +11251,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) Py_XDECREF(right); } +/* +Wraps stringlib_parse_args_finds() and additionally ensures that the +first argument is a unicode object. +*/ + +Py_LOCAL_INLINE(int) +parse_args_finds_unicode(const char * function_name, PyObject *args, + PyObject **substring, + Py_ssize_t *start, Py_ssize_t *end) +{ + if(stringlib_parse_args_finds(function_name, args, substring, + start, end)) { + if (ensure_unicode(*substring) < 0) + return 0; + return 1; + } + return 0; +} + PyDoc_STRVAR(count__doc__, "S.count(sub[, start[, end]]) -> int\n\ \n\ @@ -11147,31 +11288,26 @@ unicode_count(PyObject *self, PyObject *args) void *buf1, *buf2; Py_ssize_t len1, len2, iresult; - if (!stringlib_parse_args_finds_unicode("count", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) return NULL; kind1 = PyUnicode_KIND(self); kind2 = PyUnicode_KIND(substring); - if (kind1 < kind2) { - Py_DECREF(substring); + if (kind1 < kind2) return PyLong_FromLong(0); - } + len1 = PyUnicode_GET_LENGTH(self); len2 = PyUnicode_GET_LENGTH(substring); ADJUST_INDICES(start, end, len1); - if (end - start < len2) { - Py_DECREF(substring); + if (end - start < len2) return PyLong_FromLong(0); - } + buf1 = PyUnicode_DATA(self); buf2 = PyUnicode_DATA(substring); if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(substring, kind1); - if (!buf2) { - Py_DECREF(substring); + if (!buf2) return NULL; - } } switch (kind1) { case PyUnicode_1BYTE_KIND: @@ -11201,8 +11337,6 @@ unicode_count(PyObject *self, PyObject *args) if (kind2 != kind1) PyMem_Free(buf2); - Py_DECREF(substring); - return result; } @@ -11336,22 +11470,13 @@ unicode_find(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("find", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("find", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - - result = any_find_slice(1, self, substring, start, end); - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, 1); if (result == -2) return NULL; @@ -11424,22 +11549,13 @@ unicode_index(PyObject *self, PyObject *args) Py_ssize_t start = 0; Py_ssize_t end = 0; - if (!stringlib_parse_args_finds_unicode("index", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("index", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - result = any_find_slice(1, self, substring, start, end); - - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, 1); if (result == -2) return NULL; @@ -11953,7 +12069,7 @@ unicode_lower(PyObject *self) #define BOTHSTRIP 2 /* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; +static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) @@ -12248,40 +12364,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len) } PyObject * -PyUnicode_Replace(PyObject *obj, - PyObject *subobj, - PyObject *replobj, +PyUnicode_Replace(PyObject *str, + PyObject *substr, + PyObject *replstr, Py_ssize_t maxcount) { - PyObject *self; - PyObject *str1; - PyObject *str2; - PyObject *result; - - self = PyUnicode_FromObject(obj); - if (self == NULL) + if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || + ensure_unicode(replstr) < 0) return NULL; - str1 = PyUnicode_FromObject(subobj); - if (str1 == NULL) { - Py_DECREF(self); - return NULL; - } - str2 = PyUnicode_FromObject(replobj); - if (str2 == NULL) { - Py_DECREF(self); - Py_DECREF(str1); - return NULL; - } - if (PyUnicode_READY(self) == -1 || - PyUnicode_READY(str1) == -1 || - PyUnicode_READY(str2) == -1) - result = NULL; - else - result = replace(self, str1, str2, maxcount); - Py_DECREF(self); - Py_DECREF(str1); - Py_DECREF(str2); - return result; + return replace(str, substr, replstr, maxcount); } PyDoc_STRVAR(replace__doc__, @@ -12297,28 +12388,12 @@ unicode_replace(PyObject *self, PyObject *args) PyObject *str1; PyObject *str2; Py_ssize_t maxcount = -1; - PyObject *result; - if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) + if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount)) return NULL; if (PyUnicode_READY(self) == -1) return NULL; - str1 = PyUnicode_FromObject(str1); - if (str1 == NULL) - return NULL; - str2 = PyUnicode_FromObject(str2); - if (str2 == NULL) { - Py_DECREF(str1); - return NULL; - } - if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) - result = NULL; - else - result = replace(self, str1, str2, maxcount); - - Py_DECREF(str1); - Py_DECREF(str2); - return result; + return replace(self, str1, str2, maxcount); } static PyObject * @@ -12503,22 +12578,13 @@ unicode_rfind(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - - result = any_find_slice(-1, self, substring, start, end); - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, -1); if (result == -2) return NULL; @@ -12540,22 +12606,13 @@ unicode_rindex(PyObject *self, PyObject *args) Py_ssize_t end = 0; Py_ssize_t result; - if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, - &start, &end)) + if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end)) return NULL; - if (PyUnicode_READY(self) == -1) { - Py_DECREF(substring); - return NULL; - } - if (PyUnicode_READY(substring) == -1) { - Py_DECREF(substring); + if (PyUnicode_READY(self) == -1) return NULL; - } - - result = any_find_slice(-1, self, substring, start, end); - Py_DECREF(substring); + result = any_find_slice(self, substring, start, end, -1); if (result == -2) return NULL; @@ -12595,24 +12652,10 @@ unicode_rjust(PyObject *self, PyObject *args) PyObject * PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { - PyObject *result; - - s = PyUnicode_FromObject(s); - if (s == NULL) + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - if (sep != NULL) { - sep = PyUnicode_FromObject(sep); - if (sep == NULL) { - Py_DECREF(s); - return NULL; - } - } - result = split(s, sep, maxsplit); - - Py_DECREF(s); - Py_XDECREF(sep); - return result; + return split(s, sep, maxsplit); } PyDoc_STRVAR(split__doc__, @@ -12637,35 +12680,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds) if (substring == Py_None) return split(self, NULL, maxcount); - else if (PyUnicode_Check(substring)) + + if (PyUnicode_Check(substring)) return split(self, substring, maxcount); - else - return PyUnicode_Split(self, substring, maxcount); + + PyErr_Format(PyExc_TypeError, + "must be str or None, not %.100s", + Py_TYPE(substring)->tp_name); + return NULL; } PyObject * -PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) +PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj) { - PyObject* str_obj; - PyObject* sep_obj; PyObject* out; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str_in); - if (!str_obj) - return NULL; - sep_obj = PyUnicode_FromObject(sep_in); - if (!sep_obj) { - Py_DECREF(str_obj); - return NULL; - } - if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { - Py_DECREF(sep_obj); - Py_DECREF(str_obj); + if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) return NULL; - } kind1 = PyUnicode_KIND(str_obj); kind2 = PyUnicode_KIND(sep_obj); @@ -12679,8 +12713,6 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); Py_DECREF(unicode_empty); } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); return out; } buf1 = PyUnicode_DATA(str_obj); @@ -12688,7 +12720,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(sep_obj, kind1); if (!buf2) - goto onError; + return NULL; } switch (kind1) { @@ -12709,39 +12741,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) out = 0; } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); if (kind2 != kind1) PyMem_Free(buf2); return out; - onError: - Py_DECREF(sep_obj); - Py_DECREF(str_obj); - if (kind2 != kind1 && buf2) - PyMem_Free(buf2); - return NULL; } PyObject * -PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) +PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj) { - PyObject* str_obj; - PyObject* sep_obj; PyObject* out; int kind1, kind2; void *buf1, *buf2; Py_ssize_t len1, len2; - str_obj = PyUnicode_FromObject(str_in); - if (!str_obj) + if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) return NULL; - sep_obj = PyUnicode_FromObject(sep_in); - if (!sep_obj) { - Py_DECREF(str_obj); - return NULL; - } kind1 = PyUnicode_KIND(str_obj); kind2 = PyUnicode_KIND(sep_obj); @@ -12755,8 +12771,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); Py_DECREF(unicode_empty); } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); return out; } buf1 = PyUnicode_DATA(str_obj); @@ -12764,7 +12778,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) if (kind2 != kind1) { buf2 = _PyUnicode_AsKind(sep_obj, kind1); if (!buf2) - goto onError; + return NULL; } switch (kind1) { @@ -12785,18 +12799,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) out = 0; } - Py_DECREF(sep_obj); - Py_DECREF(str_obj); if (kind2 != kind1) PyMem_Free(buf2); return out; - onError: - Py_DECREF(sep_obj); - Py_DECREF(str_obj); - if (kind2 != kind1 && buf2) - PyMem_Free(buf2); - return NULL; } PyDoc_STRVAR(partition__doc__, @@ -12828,24 +12834,10 @@ unicode_rpartition(PyObject *self, PyObject *separator) PyObject * PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) { - PyObject *result; - - s = PyUnicode_FromObject(s); - if (s == NULL) + if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - if (sep != NULL) { - sep = PyUnicode_FromObject(sep); - if (sep == NULL) { - Py_DECREF(s); - return NULL; - } - } - result = rsplit(s, sep, maxsplit); - - Py_DECREF(s); - Py_XDECREF(sep); - return result; + return rsplit(s, sep, maxsplit); } PyDoc_STRVAR(rsplit__doc__, @@ -12870,10 +12862,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) if (substring == Py_None) return rsplit(self, NULL, maxcount); - else if (PyUnicode_Check(substring)) + + if (PyUnicode_Check(substring)) return rsplit(self, substring, maxcount); - else - return PyUnicode_RSplit(self, substring, maxcount); + + PyErr_Format(PyExc_TypeError, + "must be str or None, not %.100s", + Py_TYPE(substring)->tp_name); + return NULL; } PyDoc_STRVAR(splitlines__doc__, @@ -13154,11 +13150,15 @@ unicode_startswith(PyObject *self, if (PyTuple_Check(subobj)) { Py_ssize_t i; for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); - if (substring == NULL) + substring = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substring)) { + PyErr_Format(PyExc_TypeError, + "tuple for startswith must only contain str, " + "not %.100s", + Py_TYPE(substring)->tp_name); return NULL; + } result = tailmatch(self, substring, start, end, -1); - Py_DECREF(substring); if (result == -1) return NULL; if (result) { @@ -13168,15 +13168,13 @@ unicode_startswith(PyObject *self, /* nothing matched */ Py_RETURN_FALSE; } - substring = PyUnicode_FromObject(subobj); - if (substring == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " - "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "startswith first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); return NULL; } - result = tailmatch(self, substring, start, end, -1); - Py_DECREF(substring); + result = tailmatch(self, subobj, start, end, -1); if (result == -1) return NULL; return PyBool_FromLong(result); @@ -13206,12 +13204,15 @@ unicode_endswith(PyObject *self, if (PyTuple_Check(subobj)) { Py_ssize_t i; for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { - substring = PyUnicode_FromObject( - PyTuple_GET_ITEM(subobj, i)); - if (substring == NULL) + substring = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substring)) { + PyErr_Format(PyExc_TypeError, + "tuple for endswith must only contain str, " + "not %.100s", + Py_TYPE(substring)->tp_name); return NULL; + } result = tailmatch(self, substring, start, end, +1); - Py_DECREF(substring); if (result == -1) return NULL; if (result) { @@ -13220,15 +13221,13 @@ unicode_endswith(PyObject *self, } Py_RETURN_FALSE; } - substring = PyUnicode_FromObject(subobj); - if (substring == NULL) { - if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " - "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); + if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "endswith first arg must be str or " + "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name); return NULL; } - result = tailmatch(self, substring, start, end, +1); - Py_DECREF(substring); + result = tailmatch(self, subobj, start, end, +1); if (result == -1) return NULL; return PyBool_FromLong(result); @@ -13237,44 +13236,50 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - if (!writer->readonly) + writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + + if (!writer->readonly) { + writer->kind = PyUnicode_KIND(writer->buffer); writer->size = PyUnicode_GET_LENGTH(writer->buffer); + } else { + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); + /* Copy-on-write mode: set buffer size to 0 so * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on * next write. */ writer->size = 0; } - writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); - writer->data = PyUnicode_DATA(writer->buffer); - writer->kind = PyUnicode_KIND(writer->buffer); } void _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); -#ifdef Py_DEBUG - writer->kind = 5; /* invalid kind */ -#endif + + /* ASCII is the bare minimum */ writer->min_char = 127; + + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); } int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) { -#ifdef MS_WINDOWS - /* On Windows, overallocate by 50% is the best factor */ -# define OVERALLOCATE_FACTOR 2 -#else - /* On Linux, overallocate by 25% is the best factor */ -# define OVERALLOCATE_FACTOR 4 -#endif Py_ssize_t newlen; PyObject *newbuffer; - assert(length > 0); + /* ensure that the _PyUnicodeWriter_Prepare macro was used */ + assert((maxchar > writer->maxchar && length >= 0) + || length > 0); if (length > PY_SSIZE_T_MAX - writer->pos) { PyErr_NoMemory(); @@ -13340,6 +13345,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, #undef OVERALLOCATE_FACTOR } +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + enum PyUnicode_Kind kind) +{ + Py_UCS4 maxchar; + + /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ + assert(writer->kind < kind); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; + case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; + case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + default: + assert(0 && "invalid kind"); + return -1; + } + + return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} + Py_LOCAL_INLINE(int) _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) { @@ -13510,17 +13537,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) assert(PyUnicode_GET_LENGTH(str) == writer->pos); return str; } - if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { - PyObject *newbuffer; - newbuffer = resize_compact(writer->buffer, writer->pos); - if (newbuffer == NULL) { - Py_CLEAR(writer->buffer); - return NULL; + if (writer->pos == 0) { + Py_CLEAR(writer->buffer); + + /* Get the empty Unicode string singleton ('') */ + _Py_INCREF_UNICODE_EMPTY(); + str = unicode_empty; + } + else { + str = writer->buffer; + writer->buffer = NULL; + + if (PyUnicode_GET_LENGTH(str) != writer->pos) { + PyObject *str2; + str2 = resize_compact(str, writer->pos); + if (str2 == NULL) + return NULL; + str = str2; } - writer->buffer = newbuffer; } - str = writer->buffer; - writer->buffer = NULL; + assert(_PyUnicode_CheckConsistency(str, 1)); return unicode_result_ready(str); } @@ -14661,13 +14697,10 @@ PyUnicode_Format(PyObject *format, PyObject *args) return NULL; } - ctx.fmtstr = PyUnicode_FromObject(format); - if (ctx.fmtstr == NULL) + if (ensure_unicode(format) < 0) return NULL; - if (PyUnicode_READY(ctx.fmtstr) == -1) { - Py_DECREF(ctx.fmtstr); - return NULL; - } + + ctx.fmtstr = format; ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); @@ -14727,11 +14760,9 @@ PyUnicode_Format(PyObject *format, PyObject *args) if (ctx.args_owned) { Py_DECREF(ctx.args); } - Py_DECREF(ctx.fmtstr); return _PyUnicodeWriter_Finish(&ctx.writer); onError: - Py_DECREF(ctx.fmtstr); _PyUnicodeWriter_Dealloc(&ctx.writer); if (ctx.args_owned) { Py_DECREF(ctx.args); @@ -15009,26 +15040,18 @@ PyUnicode_InternInPlace(PyObject **p) return; } } - /* It might be that the GetItem call fails even - though the key is present in the dictionary, - namely when this happens during a stack overflow. */ Py_ALLOW_RECURSION - t = PyDict_GetItem(interned, s); + t = PyDict_SetDefault(interned, s, s); Py_END_ALLOW_RECURSION - - if (t) { - Py_INCREF(t); - Py_SETREF(*p, t); + if (t == NULL) { + PyErr_Clear(); return; } - - PyThreadState_GET()->recursion_critical = 1; - if (PyDict_SetItem(interned, s, s) < 0) { - PyErr_Clear(); - PyThreadState_GET()->recursion_critical = 0; + if (t != s) { + Py_INCREF(t); + Py_SETREF(*p, t); return; } - PyThreadState_GET()->recursion_critical = 0; /* The two references in interned are not counted by refcnt. The deallocator will take care of this */ Py_REFCNT(s) -= 2; |