aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c1483
1 files changed, 753 insertions, 730 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 1fcc83e63a3..0226e429c3a 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
+#include "stringlib/eq.h"
#ifdef MS_WINDOWS
#include <windows.h>
@@ -162,6 +163,14 @@ extern "C" {
*_to++ = (to_type) *_iter++; \
} while (0)
+#ifdef MS_WINDOWS
+ /* On Windows, overallocating by 50% gives the best results.
+    NOTE(review): presumably the extra room is size / OVERALLOCATE_FACTOR;
+    confirm at the sites that use this macro. */
+# define OVERALLOCATE_FACTOR 2
+#else
+ /* On Linux, overallocating by 25% gives the best results.  This branch
+    is taken on every platform where MS_WINDOWS is not defined. */
+# define OVERALLOCATE_FACTOR 4
+#endif
+
/* This dictionary holds all interned unicode strings. Note that references
to strings in this dictionary are *not* counted in the string's ob_refcnt.
When the interned string reaches a refcnt of 0 the string deallocation
@@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject,
const char *reason);
/* Same for linebreaks */
-static unsigned char ascii_linebreak[] = {
+static const unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x000A, * LINE FEED */
/* 0x000B, * LINE TABULATION */
@@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = {
#include "clinic/unicodeobject.c.h"
+typedef enum {   /* symbolic identifiers for codec error handler names */
+ _Py_ERROR_UNKNOWN=0,          /* not looked up yet; used as the initial cache value */
+ _Py_ERROR_STRICT,             /* "strict", also chosen when errors is NULL */
+ _Py_ERROR_SURROGATEESCAPE,    /* "surrogateescape" */
+ _Py_ERROR_REPLACE,            /* "replace" */
+ _Py_ERROR_IGNORE,             /* "ignore" */
+ _Py_ERROR_BACKSLASHREPLACE,   /* "backslashreplace" */
+ _Py_ERROR_SURROGATEPASS,      /* "surrogatepass" */
+ _Py_ERROR_XMLCHARREFREPLACE,  /* "xmlcharrefreplace" */
+ _Py_ERROR_OTHER               /* any name not listed above */
+} _Py_error_handler;
+
+/* Translate a codec "errors" argument into its _Py_error_handler
+   identifier.
+
+   A NULL pointer means "strict".  A name that matches none of the
+   entries in the table below yields _Py_ERROR_OTHER.  The comparison is
+   an exact, case-sensitive strcmp(). */
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    static const struct {
+        const char *name;
+        _Py_error_handler handler;
+    } known_handlers[] = {
+        {"strict", _Py_ERROR_STRICT},
+        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
+        {"replace", _Py_ERROR_REPLACE},
+        {"ignore", _Py_ERROR_IGNORE},
+        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
+        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
+        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
+    };
+    size_t idx;
+
+    /* No handler name given: behave like "strict" */
+    if (errors == NULL)
+        return _Py_ERROR_STRICT;
+
+    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
+        if (strcmp(errors, known_handlers[idx].name) == 0)
+            return known_handlers[idx].handler;
+    }
+    return _Py_ERROR_OTHER;
+}
+
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
@@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode)
return _PyUnicode_Copy(unicode);
}
+/* Implementation of the "backslashreplace" error handler for 8-bit
+   encodings: ASCII, Latin1, UTF-8, etc.
+
+   Every character of unicode[collstart:collend] is written to the output
+   as \xHH (below 0x100), \uHHHH (below 0x10000) or \UHHHHHHHH (all
+   others).  The room for all escapes is requested from the writer with
+   _PyBytesWriter_Prepare() before the first byte is stored.
+
+   Return the output position that follows the last escape, or NULL on
+   failure: OverflowError is set here when the total size does not fit in
+   Py_ssize_t, and a NULL result of _PyBytesWriter_Prepare() is passed
+   through unchanged. */
+static char*
+backslashreplace(_PyBytesWriter *writer, char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t total, pos;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    /* First pass: add up the length of all escapes */
+    total = 0;
+    for (pos = collstart; pos < collend; ++pos) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
+        Py_ssize_t width;
+
+        if (ch >= 0x10000) {
+            assert(ch <= MAX_UNICODE);
+            width = 2+8;
+        }
+        else if (ch >= 0x100)
+            width = 2+4;
+        else
+            width = 2+2;
+
+        if (total > PY_SSIZE_T_MAX - width) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        total += width;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, total);
+    if (str == NULL)
+        return NULL;
+
+    /* Second pass: write the escapes */
+    for (pos = collstart; pos < collend; ++pos) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
+        int shift;
+
+        *str++ = '\\';
+        /* Pick the escape letter and the bit offset of the first hex
+           digit: 8, 4 or 2 digits are written in total */
+        if (ch >= 0x00010000) {
+            *str++ = 'U';
+            shift = 28;
+        }
+        else if (ch >= 0x100) {
+            *str++ = 'u';
+            shift = 12;
+        }
+        else {
+            *str++ = 'x';
+            shift = 4;
+        }
+        for (; shift >= 0; shift -= 4)
+            *str++ = Py_hexdigits[(ch >> shift) & 0xf];
+    }
+    return str;
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit
+   encodings: ASCII, Latin1, UTF-8, etc.
+
+   Every character of unicode[collstart:collend] is written to the output
+   as the decimal character reference "&#NNN;".  The room for all
+   references is requested from the writer with _PyBytesWriter_Prepare()
+   before the first byte is stored, and exactly that many bytes are
+   written: no terminating NUL is stored after the last reference.
+
+   Return the output position that follows the last reference, or NULL on
+   failure: OverflowError is set here when the total size does not fit in
+   Py_ssize_t, and a NULL result of _PyBytesWriter_Prepare() is passed
+   through unchanged. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size: "&#" + decimal digits + ";" */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 10)
+            incr = 2+1+1;
+        else if (ch < 100)
+            incr = 2+2+1;
+        else if (ch < 1000)
+            incr = 2+3+1;
+        else if (ch < 10000)
+            incr = 2+4+1;
+        else if (ch < 100000)
+            incr = 2+5+1;
+        else if (ch < 1000000)
+            incr = 2+6+1;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+7+1;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement.
+
+       The digits are produced by hand instead of with sprintf():
+       sprintf() always stores a terminating NUL, which would land one
+       byte past the `size` bytes reserved above, and "%d" expects an int
+       argument whereas the character is an unsigned Py_UCS4. */
+    for (i = collstart; i < collend; ++i) {
+        char digits[10];    /* enough for any 32-bit value */
+        int ndigits = 0;
+
+        ch = PyUnicode_READ(kind, data, i);
+        /* collect the digits, least significant first */
+        do {
+            digits[ndigits++] = (char)('0' + (ch % 10));
+            ch /= 10;
+        } while (ch != 0);
+
+        *str++ = '&';
+        *str++ = '#';
+        while (ndigits > 0)
+            *str++ = digits[--ndigits];
+        *str++ = ';';
+    }
+    return str;
+}
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -587,6 +751,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
#undef BLOOM_UPDATE
}
+/* Check that obj is a str object and make it ready.
+
+   For a str object the result of PyUnicode_READY() is returned.  For any
+   other object TypeError is set, naming the offending type, and -1 is
+   returned. */
+static int
+ensure_unicode(PyObject *obj)
+{
+    if (PyUnicode_Check(obj))
+        return PyUnicode_READY(obj);
+
+    PyErr_Format(PyExc_TypeError,
+                 "must be str, not %.100s",
+                 Py_TYPE(obj)->tp_name);
+    return -1;
+}
+
/* Compilation of templated routines */
#include "stringlib/asciilib.h"
@@ -647,27 +823,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
int direction)
{
- int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
-
switch (kind) {
case PyUnicode_1BYTE_KIND:
- {
- Py_UCS1 ch1 = (Py_UCS1) ch;
- if (ch1 == ch)
- return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
- else
- return -1;
- }
+ if ((Py_UCS1) ch != ch)
+ return -1;
+ if (direction > 0)
+ return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
+ else
+ return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
case PyUnicode_2BYTE_KIND:
- {
- Py_UCS2 ch2 = (Py_UCS2) ch;
- if (ch2 == ch)
- return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
- else
- return -1;
- }
+ if ((Py_UCS2) ch != ch)
+ return -1;
+ if (direction > 0)
+ return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
+ else
+ return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
case PyUnicode_4BYTE_KIND:
- return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
+ if (direction > 0)
+ return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
+ else
+ return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
default:
assert(0);
return -1;
@@ -2903,7 +3078,7 @@ PyUnicode_FromEncodedObject(PyObject *obj,
/* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
PyErr_Format(PyExc_TypeError,
- "coercing to str: need a bytes-like object, %.80s found",
+ "decoding to str: need a bytes-like object, %.80s found",
Py_TYPE(obj)->tp_name);
return NULL;
}
@@ -3167,24 +3342,22 @@ wcstombs_errorpos(const wchar_t *wstr)
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
- if (errors == NULL) {
- *surrogateescape = 0;
- return 0;
- }
-
- if (strcmp(errors, "strict") == 0) {
+ _Py_error_handler error_handler = get_error_handler(errors); /* a NULL errors resolves to _Py_ERROR_STRICT */
+ switch (error_handler)
+ {
+ case _Py_ERROR_STRICT: /* report "no surrogateescape" and succeed */
*surrogateescape = 0;
return 0;
- }
- if (strcmp(errors, "surrogateescape") == 0) {
+ case _Py_ERROR_SURROGATEESCAPE: /* report "surrogateescape" and succeed */
*surrogateescape = 1;
return 0;
+ default: /* every other handler is rejected with ValueError */
+ PyErr_Format(PyExc_ValueError,
+ "only 'strict' and 'surrogateescape' error handlers "
+ "are supported, not '%s'",
+ errors); /* not NULL here: NULL was mapped to _Py_ERROR_STRICT above */
+ return -1;
}
- PyErr_Format(PyExc_ValueError,
- "only 'strict' and 'surrogateescape' error handlers "
- "are supported, not '%s'",
- errors);
- return -1;
}
PyObject *
@@ -3626,19 +3799,17 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
output = arg;
Py_INCREF(output);
}
- else {
- arg = PyUnicode_FromObject(arg);
- if (!arg)
- return 0;
+ else if (PyUnicode_Check(arg)) {
output = PyUnicode_EncodeFSDefault(arg);
- Py_DECREF(arg);
if (!output)
return 0;
- if (!PyBytes_Check(output)) {
- Py_DECREF(output);
- PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
- return 0;
- }
+ assert(PyBytes_Check(output));
+ }
+ else {
+ PyErr_Format(PyExc_TypeError,
+ "must be str or bytes, not %.100s",
+ Py_TYPE(arg)->tp_name);
+ return 0;
}
size = PyBytes_GET_SIZE(output);
data = PyBytes_AS_STRING(output);
@@ -3666,7 +3837,13 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
output = arg;
Py_INCREF(output);
}
- else if (PyObject_CheckBuffer(arg)) {
+ else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
+ if (!PyBytes_Check(arg) &&
+ PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+ "path should be string or bytes, not %.200s",
+ Py_TYPE(arg)->tp_name)) {
+ return 0;
+ }
arg = PyBytes_FromObject(arg);
if (!arg)
return 0;
@@ -3675,11 +3852,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
Py_DECREF(arg);
if (!output)
return 0;
- if (!PyUnicode_Check(output)) {
- Py_DECREF(output);
- PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
- return 0;
- }
}
else {
PyErr_Format(PyExc_TypeError,
@@ -3716,7 +3888,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
if (PyUnicode_UTF8(unicode) == NULL) {
assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
- bytes = _PyUnicode_AsUTF8String(unicode, "strict");
+ bytes = _PyUnicode_AsUTF8String(unicode, NULL);
if (bytes == NULL)
return NULL;
_PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
@@ -3982,7 +4154,7 @@ unicode_decode_call_errorhandler_wchar(
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
PyObject **output, Py_ssize_t *outpos)
{
- static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+ static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
@@ -4090,7 +4262,7 @@ unicode_decode_call_errorhandler_writer(
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
_PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
- static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+ static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
@@ -4696,8 +4868,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0) {
if (consumed)
@@ -4722,6 +4895,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
+
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4760,24 +4934,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue;
}
- if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
- "utf-8", errmsg,
- &starts, &end, &startinpos, &endinpos, &exc, &s,
- &writer))
- goto onError;
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_IGNORE:
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_REPLACE:
+ if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+ goto onError;
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_SURROGATEESCAPE:
+ {
+ Py_ssize_t i;
+
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ for (i=startinpos; i<endinpos; i++) {
+ ch = (Py_UCS4)(unsigned char)(starts[i]);
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+ ch + 0xdc00);
+ writer.pos++;
+ }
+ s += (endinpos - startinpos);
+ break;
+ }
+
+ default:
+ if (unicode_decode_call_errorhandler_writer(
+ errors, &error_handler_obj,
+ "utf-8", errmsg,
+ &starts, &end, &startinpos, &endinpos, &exc, &s,
+ &writer))
+ goto onError;
+ }
}
End:
if (consumed)
*consumed = s - starts;
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
@@ -5868,11 +6074,10 @@ PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
Py_ssize_t i, len;
- PyObject *repr;
char *p;
int kind;
void *data;
- Py_ssize_t expandsize = 0;
+ _PyBytesWriter writer;
/* Initial allocation is based on the longest-possible character
escape.
@@ -5888,35 +6093,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
len = PyUnicode_GET_LENGTH(unicode);
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
- switch (kind) {
- case PyUnicode_1BYTE_KIND: expandsize = 4; break;
- case PyUnicode_2BYTE_KIND: expandsize = 6; break;
- case PyUnicode_4BYTE_KIND: expandsize = 10; break;
- }
- if (len == 0)
- return PyBytes_FromStringAndSize(NULL, 0);
-
- if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
- return PyErr_NoMemory();
-
- repr = PyBytes_FromStringAndSize(NULL,
- 2
- + expandsize*len
- + 1);
- if (repr == NULL)
- return NULL;
-
- p = PyBytes_AS_STRING(repr);
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
for (i = 0; i < len; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
/* Escape backslashes */
if (ch == '\\') {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = (char) ch;
continue;
@@ -5925,6 +6123,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -5940,6 +6143,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -5950,20 +6157,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map special whitespace to '\t', '\n', '\r' */
else if (ch == '\t') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 't';
}
else if (ch == '\n') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'n';
}
else if (ch == '\r') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'r';
}
/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch >= 0x7F) {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 4-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'x';
*p++ = Py_hexdigits[(ch >> 4) & 0x000F];
@@ -5975,10 +6199,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p - PyBytes_AS_STRING(repr) > 0);
- if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *
@@ -6107,13 +6332,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
- PyObject *repr;
char *p;
- char *q;
- Py_ssize_t expandsize, pos;
+ Py_ssize_t pos;
int kind;
void *data;
Py_ssize_t len;
+ _PyBytesWriter writer;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
@@ -6121,28 +6345,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
len = PyUnicode_GET_LENGTH(unicode);
- /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
- bytes, and 1 byte characters 4. */
- expandsize = kind * 2 + 2;
-
- if (len > PY_SSIZE_T_MAX / expandsize)
- return PyErr_NoMemory();
- repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
- if (repr == NULL)
- return NULL;
- if (len == 0)
- return repr;
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
- p = q = PyBytes_AS_STRING(repr);
for (pos = 0; pos < len; pos++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* Map 32-bit characters to '\Uxxxxxxxx' */
if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6156,6 +6381,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
/* Map 16-bit characters to '\uxxxx' */
else if (ch >= 256) {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -6168,10 +6398,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p > q);
- if (_PyBytes_Resize(&repr, p - q) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *
@@ -6348,7 +6579,7 @@ unicode_encode_call_errorhandler(const char *errors,
Py_ssize_t startpos, Py_ssize_t endpos,
Py_ssize_t *newpos)
{
- static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
+ static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Py_ssize_t len;
PyObject *restuple;
PyObject *resunicode;
@@ -6402,25 +6633,22 @@ unicode_encode_call_errorhandler(const char *errors,
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
const char *errors,
- unsigned int limit)
+ const Py_UCS4 limit)
{
/* input state */
Py_ssize_t pos=0, size;
int kind;
void *data;
- /* output object */
- PyObject *res;
/* pointer into the output */
char *str;
- /* current output position */
- Py_ssize_t ressize;
const char *encoding = (limit == 256) ? "latin-1" : "ascii";
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+ PyObject *rep = NULL;
+ /* output object */
+ _PyBytesWriter writer;
if (PyUnicode_READY(unicode) == -1)
return NULL;
@@ -6431,186 +6659,157 @@ unicode_encode_ucs1(PyObject *unicode,
replacements, if we need more, we'll resize */
if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0);
- res = PyBytes_FromStringAndSize(NULL, size);
- if (res == NULL)
+
+ _PyBytesWriter_Init(&writer);
+ str = _PyBytesWriter_Alloc(&writer, size);
+ if (str == NULL)
return NULL;
- str = PyBytes_AS_STRING(res);
- ressize = size;
while (pos < size) {
- Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+ Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* can we encode this? */
- if (c<limit) {
+ if (ch < limit) {
/* no overflow check, because we know that the space is enough */
- *str++ = (char)c;
+ *str++ = (char)ch;
++pos;
}
else {
- Py_ssize_t requiredsize;
- PyObject *repunicode;
- Py_ssize_t repsize, newpos, respos, i;
+ Py_ssize_t newpos, i;
/* startpos for collecting unencodable chars */
Py_ssize_t collstart = pos;
- Py_ssize_t collend = pos;
+ Py_ssize_t collend = collstart + 1;
/* find all unencodable characters */
+
while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
++collend;
+
+ /* Only overallocate the buffer if it's not the last write */
+ writer.overallocate = (collend < size);
+
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
goto onError;
- case 2: /* replace */
- while (collstart++ < collend)
- *str++ = '?'; /* fall through */
- case 3: /* ignore */
+
+ case _Py_ERROR_REPLACE:
+ memset(str, '?', collend - collstart);
+ str += (collend - collstart);
+ /* fall through to the ignore error handler */
+ case _Py_ERROR_IGNORE:
+ pos = collend;
+ break;
+
+ case _Py_ERROR_BACKSLASHREPLACE:
+ /* subtract preallocated bytes */
+ writer.min_size -= (collend - collstart);
+ str = backslashreplace(&writer, str,
+ unicode, collstart, collend);
+ if (str == NULL)
+ goto onError;
+ pos = collend;
+ break;
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
+ /* subtract preallocated bytes */
+ writer.min_size -= (collend - collstart);
+ str = xmlcharrefreplace(&writer, str,
+ unicode, collstart, collend);
+ if (str == NULL)
+ goto onError;
pos = collend;
break;
- case 4: /* xmlcharrefreplace */
- respos = str - PyBytes_AS_STRING(res);
- requiredsize = respos;
- /* determine replacement size */
+
+ case _Py_ERROR_SURROGATEESCAPE:
for (i = collstart; i < collend; ++i) {
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
- Py_ssize_t incr;
- if (ch < 10)
- incr = 2+1+1;
- else if (ch < 100)
- incr = 2+2+1;
- else if (ch < 1000)
- incr = 2+3+1;
- else if (ch < 10000)
- incr = 2+4+1;
- else if (ch < 100000)
- incr = 2+5+1;
- else if (ch < 1000000)
- incr = 2+6+1;
- else {
- assert(ch <= MAX_UNICODE);
- incr = 2+7+1;
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch < 0xdc80 || 0xdcff < ch) {
+ /* Not a UTF-8b surrogate */
+ break;
}
- if (requiredsize > PY_SSIZE_T_MAX - incr)
- goto overflow;
- requiredsize += incr;
+ *str++ = (char)(ch - 0xdc00);
+ ++pos;
}
- if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
- goto overflow;
- requiredsize += size - collend;
- if (requiredsize > ressize) {
- if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
- requiredsize = 2*ressize;
- if (_PyBytes_Resize(&res, requiredsize))
- goto onError;
- str = PyBytes_AS_STRING(res) + respos;
- ressize = requiredsize;
- }
- /* generate replacement */
- for (i = collstart; i < collend; ++i) {
- str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
- }
- pos = collend;
- break;
+ if (i >= collend)
+ break;
+ collstart = pos;
+ assert(collstart != collend);
+ /* fallback to general error handling */
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
- encoding, reason, unicode, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
- PyUnicode_READY(repunicode) == -1))
+ rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+ encoding, reason, unicode, &exc,
+ collstart, collend, &newpos);
+ if (rep == NULL)
goto onError;
- if (PyBytes_Check(repunicode)) {
+
+ /* subtract preallocated bytes */
+ writer.min_size -= 1;
+
+ if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */
- repsize = PyBytes_Size(repunicode);
- if (repsize > 1) {
- /* Make room for all additional bytes. */
- respos = str - PyBytes_AS_STRING(res);
- if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
- Py_DECREF(repunicode);
- goto overflow;
- }
- if (_PyBytes_Resize(&res, ressize+repsize-1)) {
- Py_DECREF(repunicode);
- goto onError;
- }
- str = PyBytes_AS_STRING(res) + respos;
- ressize += repsize-1;
- }
- memcpy(str, PyBytes_AsString(repunicode), repsize);
- str += repsize;
- pos = newpos;
- Py_DECREF(repunicode);
- break;
- }
- /* need more space? (at least enough for what we
- have+the replacement+the rest of the string, so
- we won't have to check space for encodable characters) */
- respos = str - PyBytes_AS_STRING(res);
- repsize = PyUnicode_GET_LENGTH(repunicode);
- requiredsize = respos;
- if (requiredsize > PY_SSIZE_T_MAX - repsize)
- goto overflow;
- requiredsize += repsize;
- if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
- goto overflow;
- requiredsize += size - collend;
- if (requiredsize > ressize) {
- if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
- requiredsize = 2*ressize;
- if (_PyBytes_Resize(&res, requiredsize)) {
- Py_DECREF(repunicode);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyBytes_AS_STRING(rep),
+ PyBytes_GET_SIZE(rep));
+ if (str == NULL)
goto onError;
- }
- str = PyBytes_AS_STRING(res) + respos;
- ressize = requiredsize;
}
- /* check if there is anything unencodable in the replacement
- and copy it to the output */
- for (i = 0; repsize-->0; ++i, ++str) {
- c = PyUnicode_READ_CHAR(repunicode, i);
- if (c >= limit) {
- raise_encode_exception(&exc, encoding, unicode,
- pos, pos+1, reason);
- Py_DECREF(repunicode);
+ else {
+ assert(PyUnicode_Check(rep));
+
+ if (PyUnicode_READY(rep) < 0)
goto onError;
+
+ if (PyUnicode_IS_ASCII(rep)) {
+ /* Fast path: all characters are smaller than limit */
+ assert(limit >= 128);
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyUnicode_DATA(rep),
+ PyUnicode_GET_LENGTH(rep));
+ }
+ else {
+ Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
+
+ str = _PyBytesWriter_Prepare(&writer, str, repsize);
+ if (str == NULL)
+ goto onError;
+
+ /* check if there is anything unencodable in the
+ replacement and copy it to the output */
+ for (i = 0; repsize-->0; ++i, ++str) {
+ ch = PyUnicode_READ_CHAR(rep, i);
+ if (ch >= limit) {
+ raise_encode_exception(&exc, encoding, unicode,
+ pos, pos+1, reason);
+ goto onError;
+ }
+ *str = (char)ch;
+ }
}
- *str = (char)c;
}
pos = newpos;
- Py_DECREF(repunicode);
+ Py_CLEAR(rep);
}
+
+ /* If overallocation was disabled, ensure that it was the last
+ write. Otherwise, we missed an optimization */
+ assert(writer.overallocate || pos == size);
}
}
- /* Resize if we allocated to much */
- size = str - PyBytes_AS_STRING(res);
- if (size < ressize) { /* If this falls res will be NULL */
- assert(size >= 0);
- if (_PyBytes_Resize(&res, size) < 0)
- goto onError;
- }
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
- return res;
-
- overflow:
- PyErr_SetString(PyExc_OverflowError,
- "encoded result is too long for a Python string");
+ return _PyBytesWriter_Finish(&writer, str);
onError:
- Py_XDECREF(res);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(rep);
+ _PyBytesWriter_Dealloc(&writer);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -6670,8 +6869,9 @@ PyUnicode_DecodeASCII(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6700,12 +6900,42 @@ PyUnicode_DecodeASCII(const char *s,
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outside range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we may switch to UCS2 at the first write */
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ kind = writer.kind;
+ data = writer.data;
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s;
+ break;
+
+ default:
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
@@ -6714,13 +6944,13 @@ PyUnicode_DecodeASCII(const char *s,
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -6775,7 +7005,7 @@ PyUnicode_AsASCIIString(PyObject *unicode)
# define WC_ERR_INVALID_CHARS 0x0080
#endif
-static char*
+static const char*
code_page_name(UINT code_page, PyObject **obj)
{
*obj = NULL;
@@ -6883,7 +7113,7 @@ decode_code_page_errors(UINT code_page,
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *encoding_obj = NULL;
- char *encoding;
+ const char *encoding;
DWORD err;
int ret = -1;
@@ -7119,7 +7349,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
BOOL usedDefaultChar = FALSE;
BOOL *pusedDefaultChar = &usedDefaultChar;
int outsize;
- PyObject *exc = NULL;
wchar_t *p;
Py_ssize_t size;
const DWORD flags = encode_code_page_flags(code_page, NULL);
@@ -7228,7 +7457,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
PyObject *encoding_obj = NULL;
- char *encoding;
+ const char *encoding;
Py_ssize_t newpos, newoutsize;
PyObject *rep;
int ret = -1;
@@ -8086,7 +8315,7 @@ static int
charmap_encoding_error(
PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
- int *known_errorHandler, PyObject **errorHandler, const char *errors,
+ _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8133,23 +8362,15 @@ charmap_encoding_error(
}
/* cache callback name lookup
* (if not done yet, i.e. it's the first error) */
- if (*known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- *known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- *known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- *known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- *known_errorHandler = 4;
- else
- *known_errorHandler = 0;
- }
- switch (*known_errorHandler) {
- case 1: /* strict */
+ if (*error_handler == _Py_ERROR_UNKNOWN)
+ *error_handler = get_error_handler(errors);
+
+ switch (*error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
- case 2: /* replace */
+
+ case _Py_ERROR_REPLACE:
for (collpos = collstartpos; collpos<collendpos; ++collpos) {
x = charmapencode_output('?', mapping, res, respos);
if (x==enc_EXCEPTION) {
@@ -8161,10 +8382,11 @@ charmap_encoding_error(
}
}
/* fall through */
- case 3: /* ignore */
+ case _Py_ERROR_IGNORE:
*inpos = collendpos;
break;
- case 4: /* xmlcharrefreplace */
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
/* generate replacement (temporarily (mis)uses p) */
for (collpos = collstartpos; collpos < collendpos; ++collpos) {
char buffer[2+29+1+1];
@@ -8182,8 +8404,9 @@ charmap_encoding_error(
}
*inpos = collendpos;
break;
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+ repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
encoding, reason, unicode, exceptionObject,
collstartpos, collendpos, &newpos);
if (repunicode == NULL)
@@ -8246,12 +8469,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
Py_ssize_t size;
/* current output position */
Py_ssize_t respos = 0;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
void *data;
int kind;
@@ -8282,7 +8502,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
if (x==enc_FAILED) { /* unencodable character */
if (charmap_encoding_error(unicode, &inpos, mapping,
&exc,
- &known_errorHandler, &errorHandler, errors,
+ &error_handler, &error_handler_obj, errors,
&res, &respos)) {
goto onError;
}
@@ -8298,13 +8518,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
goto onError;
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return res;
onError:
Py_XDECREF(res);
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return NULL;
}
@@ -8371,7 +8591,7 @@ unicode_translate_call_errorhandler(const char *errors,
Py_ssize_t startpos, Py_ssize_t endpos,
Py_ssize_t *newpos)
{
- static char *argparse = "O!n;translating error handler must return (str, int) tuple";
+ static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Py_ssize_t i_newpos;
PyObject *restuple;
@@ -8628,7 +8848,7 @@ exit:
return res;
}
-PyObject *
+static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
PyObject *mapping,
const char *errors)
@@ -8657,10 +8877,8 @@ _PyUnicode_TranslateCharmap(PyObject *input,
kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input);
- if (size == 0) {
- Py_INCREF(input);
- return input;
- }
+ if (size == 0)
+ return PyUnicode_FromObject(input);
/* allocate enough for a simple 1:1 translation without
replacements, if we need more, we'll resize */
@@ -8771,14 +8989,9 @@ PyUnicode_Translate(PyObject *str,
PyObject *mapping,
const char *errors)
{
- PyObject *result;
-
- str = PyUnicode_FromObject(str);
- if (str == NULL)
+ if (ensure_unicode(str) < 0)
return NULL;
- result = _PyUnicode_TranslateCharmap(str, mapping, errors);
- Py_DECREF(str);
- return result;
+ return _PyUnicode_TranslateCharmap(str, mapping, errors);
}
static Py_UCS4
@@ -8960,9 +9173,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
}
static Py_ssize_t
-any_find_slice(int direction, PyObject* s1, PyObject* s2,
+any_find_slice(PyObject* s1, PyObject* s2,
Py_ssize_t start,
- Py_ssize_t end)
+ Py_ssize_t end,
+ int direction)
{
int kind1, kind2;
void *buf1, *buf2;
@@ -9131,54 +9345,35 @@ PyUnicode_Count(PyObject *str,
Py_ssize_t end)
{
Py_ssize_t result;
- PyObject* str_obj;
- PyObject* sub_obj;
int kind1, kind2;
void *buf1 = NULL, *buf2 = NULL;
Py_ssize_t len1, len2;
- str_obj = PyUnicode_FromObject(str);
- if (!str_obj)
- return -1;
- sub_obj = PyUnicode_FromObject(substr);
- if (!sub_obj) {
- Py_DECREF(str_obj);
- return -1;
- }
- if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
+ if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
return -1;
- }
- kind1 = PyUnicode_KIND(str_obj);
- kind2 = PyUnicode_KIND(sub_obj);
- if (kind1 < kind2) {
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
+ kind1 = PyUnicode_KIND(str);
+ kind2 = PyUnicode_KIND(substr);
+ if (kind1 < kind2)
return 0;
- }
- len1 = PyUnicode_GET_LENGTH(str_obj);
- len2 = PyUnicode_GET_LENGTH(sub_obj);
+ len1 = PyUnicode_GET_LENGTH(str);
+ len2 = PyUnicode_GET_LENGTH(substr);
ADJUST_INDICES(start, end, len1);
- if (end - start < len2) {
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
+ if (end - start < len2)
return 0;
- }
- buf1 = PyUnicode_DATA(str_obj);
- buf2 = PyUnicode_DATA(sub_obj);
+ buf1 = PyUnicode_DATA(str);
+ buf2 = PyUnicode_DATA(substr);
if (kind2 != kind1) {
- buf2 = _PyUnicode_AsKind(sub_obj, kind1);
+ buf2 = _PyUnicode_AsKind(substr, kind1);
if (!buf2)
goto onError;
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
- if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
+ if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
result = asciilib_count(
((Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
@@ -9205,16 +9400,11 @@ PyUnicode_Count(PyObject *str,
assert(0); result = 0;
}
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
-
if (kind2 != kind1)
PyMem_Free(buf2);
return result;
onError:
- Py_DECREF(sub_obj);
- Py_DECREF(str_obj);
if (kind2 != kind1 && buf2)
PyMem_Free(buf2);
return -1;
@@ -9222,35 +9412,15 @@ PyUnicode_Count(PyObject *str,
Py_ssize_t
PyUnicode_Find(PyObject *str,
- PyObject *sub,
+ PyObject *substr,
Py_ssize_t start,
Py_ssize_t end,
int direction)
{
- Py_ssize_t result;
-
- str = PyUnicode_FromObject(str);
- if (!str)
- return -2;
- sub = PyUnicode_FromObject(sub);
- if (!sub) {
- Py_DECREF(str);
+ if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
return -2; /* -2 is the failure result here; callers of any_find_slice() in this file also test for -2 */
- }
- if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
- Py_DECREF(sub);
- Py_DECREF(str);
- return -2;
- }
-
- result = any_find_slice(direction,
- str, sub, start, end
- );
-
- Py_DECREF(str);
- Py_DECREF(sub);
- return result;
+ return any_find_slice(str, substr, start, end, direction); /* direction is forwarded as given (unicode_find passes 1, unicode_rfind passes -1) */
}
Py_ssize_t
@@ -9353,22 +9523,10 @@ PyUnicode_Tailmatch(PyObject *str,
Py_ssize_t end,
int direction)
{
- Py_ssize_t result;
-
- str = PyUnicode_FromObject(str);
- if (str == NULL)
+ if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
return -1;
- substr = PyUnicode_FromObject(substr);
- if (substr == NULL) {
- Py_DECREF(str);
- return -1;
- }
- result = tailmatch(str, substr,
- start, end, direction);
- Py_DECREF(str);
- Py_DECREF(substr);
- return result;
+ return tailmatch(str, substr, start, end, direction);
}
/* Apply fixfct filter to the Unicode object self and return a
@@ -9974,13 +10132,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
{
PyObject *list;
- string = PyUnicode_FromObject(string);
- if (string == NULL)
- return NULL;
- if (PyUnicode_READY(string) == -1) {
- Py_DECREF(string);
+ if (ensure_unicode(string) < 0)
return NULL;
- }
switch (PyUnicode_KIND(string)) {
case PyUnicode_1BYTE_KIND:
@@ -10007,7 +10160,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends)
assert(0);
list = 0;
}
- Py_DECREF(string);
return list;
}
@@ -10568,28 +10720,27 @@ unicode_casefold(PyObject *self)
}
-/* Argument converter. Coerces to a single unicode character */
+/* Argument converter. Accepts a single Unicode character.
+   obj must pass PyUnicode_Check() and have a length of exactly 1; its
+   first character is stored through addr, which is read as a Py_UCS4
+   pointer.  Returns 1 on success.  Returns 0 with TypeError set when obj
+   is not a unicode object or is not exactly one character long, and
+   also returns 0 when PyUnicode_READY() reports a failure. */
static int
convert_uc(PyObject *obj, void *addr)
{
Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
- PyObject *uniobj;
- uniobj = PyUnicode_FromObject(obj);
- if (uniobj == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "The fill character cannot be converted to Unicode");
+ if (!PyUnicode_Check(obj)) {
+ PyErr_Format(PyExc_TypeError,
+ "The fill character must be a unicode character, "
+ "not %.100s", Py_TYPE(obj)->tp_name);
return 0;
}
- if (PyUnicode_GET_LENGTH(uniobj) != 1) {
+ if (PyUnicode_READY(obj) < 0)
+ return 0;
+ if (PyUnicode_GET_LENGTH(obj) != 1) {
PyErr_SetString(PyExc_TypeError,
"The fill character must be exactly one character long");
- Py_DECREF(uniobj);
return 0;
}
- *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
- Py_DECREF(uniobj);
+ *fillcharloc = PyUnicode_READ_CHAR(obj, 0); /* obj is read directly; no temporary object is created or released */
return 1;
}
@@ -10905,59 +11056,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
}
int
-PyUnicode_Contains(PyObject *container, PyObject *element)
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+ return unicode_eq(aa, bb);
+}
+
+int
+PyUnicode_Contains(PyObject *str, PyObject *substr)
{
- PyObject *str, *sub;
int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2;
int result;
- /* Coerce the two arguments */
- sub = PyUnicode_FromObject(element);
- if (!sub) {
+ if (!PyUnicode_Check(substr)) {
PyErr_Format(PyExc_TypeError,
- "'in <string>' requires string as left operand, not %s",
- element->ob_type->tp_name);
+ "'in <string>' requires string as left operand, not %.100s",
+ Py_TYPE(substr)->tp_name);
return -1;
}
-
- str = PyUnicode_FromObject(container);
- if (!str) {
- Py_DECREF(sub);
+ if (PyUnicode_READY(substr) == -1)
+ return -1;
+ if (ensure_unicode(str) < 0)
return -1;
- }
kind1 = PyUnicode_KIND(str);
- kind2 = PyUnicode_KIND(sub);
- if (kind1 < kind2) {
- Py_DECREF(sub);
- Py_DECREF(str);
+ kind2 = PyUnicode_KIND(substr);
+ if (kind1 < kind2)
return 0;
- }
len1 = PyUnicode_GET_LENGTH(str);
- len2 = PyUnicode_GET_LENGTH(sub);
- if (len1 < len2) {
- Py_DECREF(sub);
- Py_DECREF(str);
+ len2 = PyUnicode_GET_LENGTH(substr);
+ if (len1 < len2)
return 0;
- }
buf1 = PyUnicode_DATA(str);
- buf2 = PyUnicode_DATA(sub);
+ buf2 = PyUnicode_DATA(substr);
if (len2 == 1) {
Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
- Py_DECREF(sub);
- Py_DECREF(str);
return result;
}
if (kind2 != kind1) {
- buf2 = _PyUnicode_AsKind(sub, kind1);
- if (!buf2) {
- Py_DECREF(sub);
- Py_DECREF(str);
+ buf2 = _PyUnicode_AsKind(substr, kind1);
+ if (!buf2)
return -1;
- }
}
switch (kind1) {
@@ -10975,9 +11116,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
assert(0);
}
- Py_DECREF(str);
- Py_DECREF(sub);
-
if (kind2 != kind1)
PyMem_Free(buf2);
@@ -10989,56 +11127,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element)
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
- PyObject *u = NULL, *v = NULL, *w;
+ PyObject *result;
Py_UCS4 maxchar, maxchar2;
- Py_ssize_t u_len, v_len, new_len;
+ Py_ssize_t left_len, right_len, new_len;
- /* Coerce the two arguments */
- u = PyUnicode_FromObject(left);
- if (u == NULL)
- goto onError;
- v = PyUnicode_FromObject(right);
- if (v == NULL)
- goto onError;
+ if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
+ return NULL; /* an operand was rejected by ensure_unicode() */
/* Shortcuts: when one operand is the unicode_empty singleton, the
   result is produced from the other operand alone via
   PyUnicode_FromObject(). */
- if (v == unicode_empty) {
- Py_DECREF(v);
- return u;
- }
- if (u == unicode_empty) {
- Py_DECREF(u);
- return v;
- }
+ if (left == unicode_empty)
+ return PyUnicode_FromObject(right);
+ if (right == unicode_empty)
+ return PyUnicode_FromObject(left);
- u_len = PyUnicode_GET_LENGTH(u);
- v_len = PyUnicode_GET_LENGTH(v);
- if (u_len > PY_SSIZE_T_MAX - v_len) {
+ left_len = PyUnicode_GET_LENGTH(left);
+ right_len = PyUnicode_GET_LENGTH(right);
+ if (left_len > PY_SSIZE_T_MAX - right_len) { /* left_len + right_len would exceed PY_SSIZE_T_MAX */
PyErr_SetString(PyExc_OverflowError,
"strings are too large to concat");
- goto onError;
+ return NULL;
}
- new_len = u_len + v_len;
+ new_len = left_len + right_len;
- maxchar = PyUnicode_MAX_CHAR_VALUE(u);
- maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
+ maxchar = PyUnicode_MAX_CHAR_VALUE(left);
+ maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
maxchar = Py_MAX(maxchar, maxchar2); /* the result is allocated for the larger of the two maxchar values */
/* Concat the two Unicode strings: allocate new_len characters, then
   copy left to offset 0 and right to offset left_len. */
- w = PyUnicode_New(new_len, maxchar);
- if (w == NULL)
- goto onError;
- _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
- _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
- Py_DECREF(u);
- Py_DECREF(v);
- assert(_PyUnicode_CheckConsistency(w, 1));
- return w;
-
- onError:
- Py_XDECREF(u);
- Py_XDECREF(v);
- return NULL;
+ result = PyUnicode_New(new_len, maxchar);
+ if (result == NULL)
+ return NULL;
+ _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
+ _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
+ assert(_PyUnicode_CheckConsistency(result, 1));
+ return result;
}
void
@@ -11129,6 +11251,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
Py_XDECREF(right);
}
+/*
+Wraps stringlib_parse_args_finds() and additionally ensures that the
+substring it parsed is a unicode object.
+
+function_name, args, substring, start and end are forwarded unchanged to
+stringlib_parse_args_finds().  When that call succeeds, *substring is
+checked with ensure_unicode().
+
+Returns 1 on success.  Returns 0 when stringlib_parse_args_finds()
+returns false or when ensure_unicode() returns a negative value.
+*/
+
+Py_LOCAL_INLINE(int)
+parse_args_finds_unicode(const char * function_name, PyObject *args,
+ PyObject **substring,
+ Py_ssize_t *start, Py_ssize_t *end)
+{
+ if(stringlib_parse_args_finds(function_name, args, substring,
+ start, end)) {
+ if (ensure_unicode(*substring) < 0)
+ return 0;
+ return 1;
+ }
+ return 0;
+}
+
PyDoc_STRVAR(count__doc__,
"S.count(sub[, start[, end]]) -> int\n\
\n\
@@ -11147,31 +11288,26 @@ unicode_count(PyObject *self, PyObject *args)
void *buf1, *buf2;
Py_ssize_t len1, len2, iresult;
- if (!stringlib_parse_args_finds_unicode("count", args, &substring,
- &start, &end))
+ if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
return NULL;
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
- if (kind1 < kind2) {
- Py_DECREF(substring);
+ if (kind1 < kind2)
return PyLong_FromLong(0);
- }
+
len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
ADJUST_INDICES(start, end, len1);
- if (end - start < len2) {
- Py_DECREF(substring);
+ if (end - start < len2)
return PyLong_FromLong(0);
- }
+
buf1 = PyUnicode_DATA(self);
buf2 = PyUnicode_DATA(substring);
if (kind2 != kind1) {
buf2 = _PyUnicode_AsKind(substring, kind1);
- if (!buf2) {
- Py_DECREF(substring);
+ if (!buf2)
return NULL;
- }
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
@@ -11201,8 +11337,6 @@ unicode_count(PyObject *self, PyObject *args)
if (kind2 != kind1)
PyMem_Free(buf2);
- Py_DECREF(substring);
-
return result;
}
@@ -11336,22 +11470,13 @@ unicode_find(PyObject *self, PyObject *args)
Py_ssize_t end = 0;
Py_ssize_t result;
- if (!stringlib_parse_args_finds_unicode("find", args, &substring,
- &start, &end))
+ if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1) {
- Py_DECREF(substring);
- return NULL;
- }
- if (PyUnicode_READY(substring) == -1) {
- Py_DECREF(substring);
+ if (PyUnicode_READY(self) == -1)
return NULL;
- }
-
- result = any_find_slice(1, self, substring, start, end);
- Py_DECREF(substring);
+ result = any_find_slice(self, substring, start, end, 1);
if (result == -2)
return NULL;
@@ -11424,22 +11549,13 @@ unicode_index(PyObject *self, PyObject *args)
Py_ssize_t start = 0;
Py_ssize_t end = 0;
- if (!stringlib_parse_args_finds_unicode("index", args, &substring,
- &start, &end))
+ if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1) {
- Py_DECREF(substring);
- return NULL;
- }
- if (PyUnicode_READY(substring) == -1) {
- Py_DECREF(substring);
+ if (PyUnicode_READY(self) == -1)
return NULL;
- }
- result = any_find_slice(1, self, substring, start, end);
-
- Py_DECREF(substring);
+ result = any_find_slice(self, substring, start, end, 1);
if (result == -2)
return NULL;
@@ -11953,7 +12069,7 @@ unicode_lower(PyObject *self)
#define BOTHSTRIP 2
/* Arrays indexed by above */
-static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
+static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3)
@@ -12248,40 +12364,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
}
PyObject *
-PyUnicode_Replace(PyObject *obj,
- PyObject *subobj,
- PyObject *replobj,
+PyUnicode_Replace(PyObject *str,
+ PyObject *substr,
+ PyObject *replstr,
Py_ssize_t maxcount)
{
- PyObject *self;
- PyObject *str1;
- PyObject *str2;
- PyObject *result;
-
- self = PyUnicode_FromObject(obj);
- if (self == NULL)
+ if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
+ ensure_unicode(replstr) < 0)
return NULL; /* one of the three arguments was rejected by ensure_unicode() */
- str1 = PyUnicode_FromObject(subobj);
- if (str1 == NULL) {
- Py_DECREF(self);
- return NULL;
- }
- str2 = PyUnicode_FromObject(replobj);
- if (str2 == NULL) {
- Py_DECREF(self);
- Py_DECREF(str1);
- return NULL;
- }
- if (PyUnicode_READY(self) == -1 ||
- PyUnicode_READY(str1) == -1 ||
- PyUnicode_READY(str2) == -1)
- result = NULL;
- else
- result = replace(self, str1, str2, maxcount);
- Py_DECREF(self);
- Py_DECREF(str1);
- Py_DECREF(str2);
- return result;
+ return replace(str, substr, replstr, maxcount); /* the caller's objects are passed to replace() as given; nothing is released here */
}
PyDoc_STRVAR(replace__doc__,
@@ -12297,28 +12388,12 @@ unicode_replace(PyObject *self, PyObject *args)
PyObject *str1;
PyObject *str2;
Py_ssize_t maxcount = -1;
- PyObject *result;
- if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
+ if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
return NULL;
if (PyUnicode_READY(self) == -1)
return NULL;
- str1 = PyUnicode_FromObject(str1);
- if (str1 == NULL)
- return NULL;
- str2 = PyUnicode_FromObject(str2);
- if (str2 == NULL) {
- Py_DECREF(str1);
- return NULL;
- }
- if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
- result = NULL;
- else
- result = replace(self, str1, str2, maxcount);
-
- Py_DECREF(str1);
- Py_DECREF(str2);
- return result;
+ return replace(self, str1, str2, maxcount);
}
static PyObject *
@@ -12503,22 +12578,13 @@ unicode_rfind(PyObject *self, PyObject *args)
Py_ssize_t end = 0;
Py_ssize_t result;
- if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
- &start, &end))
+ if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1) {
- Py_DECREF(substring);
- return NULL;
- }
- if (PyUnicode_READY(substring) == -1) {
- Py_DECREF(substring);
+ if (PyUnicode_READY(self) == -1)
return NULL;
- }
-
- result = any_find_slice(-1, self, substring, start, end);
- Py_DECREF(substring);
+ result = any_find_slice(self, substring, start, end, -1);
if (result == -2)
return NULL;
@@ -12540,22 +12606,13 @@ unicode_rindex(PyObject *self, PyObject *args)
Py_ssize_t end = 0;
Py_ssize_t result;
- if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
- &start, &end))
+ if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
return NULL;
- if (PyUnicode_READY(self) == -1) {
- Py_DECREF(substring);
- return NULL;
- }
- if (PyUnicode_READY(substring) == -1) {
- Py_DECREF(substring);
+ if (PyUnicode_READY(self) == -1)
return NULL;
- }
-
- result = any_find_slice(-1, self, substring, start, end);
- Py_DECREF(substring);
+ result = any_find_slice(self, substring, start, end, -1);
if (result == -2)
return NULL;
@@ -12595,24 +12652,10 @@ unicode_rjust(PyObject *self, PyObject *args)
PyObject *
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{
- PyObject *result;
-
- s = PyUnicode_FromObject(s);
- if (s == NULL)
+ if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) /* sep is optional: it is only checked when it is not NULL */
return NULL;
- if (sep != NULL) {
- sep = PyUnicode_FromObject(sep);
- if (sep == NULL) {
- Py_DECREF(s);
- return NULL;
- }
- }
- result = split(s, sep, maxsplit);
-
- Py_DECREF(s);
- Py_XDECREF(sep);
- return result;
+ return split(s, sep, maxsplit); /* a NULL sep is handed to split() unchanged */
}
PyDoc_STRVAR(split__doc__,
@@ -12637,35 +12680,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
if (substring == Py_None)
return split(self, NULL, maxcount);
- else if (PyUnicode_Check(substring))
+
+ if (PyUnicode_Check(substring))
return split(self, substring, maxcount);
- else
- return PyUnicode_Split(self, substring, maxcount);
+
+ PyErr_Format(PyExc_TypeError,
+ "must be str or None, not %.100s",
+ Py_TYPE(substring)->tp_name);
+ return NULL;
}
PyObject *
-PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
- PyObject* str_obj;
- PyObject* sep_obj;
PyObject* out;
int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2;
- str_obj = PyUnicode_FromObject(str_in);
- if (!str_obj)
- return NULL;
- sep_obj = PyUnicode_FromObject(sep_in);
- if (!sep_obj) {
- Py_DECREF(str_obj);
- return NULL;
- }
- if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
+ if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
return NULL;
- }
kind1 = PyUnicode_KIND(str_obj);
kind2 = PyUnicode_KIND(sep_obj);
@@ -12679,8 +12713,6 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
Py_DECREF(unicode_empty);
}
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
return out;
}
buf1 = PyUnicode_DATA(str_obj);
@@ -12688,7 +12720,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
if (kind2 != kind1) {
buf2 = _PyUnicode_AsKind(sep_obj, kind1);
if (!buf2)
- goto onError;
+ return NULL;
}
switch (kind1) {
@@ -12709,39 +12741,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
out = 0;
}
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
if (kind2 != kind1)
PyMem_Free(buf2);
return out;
- onError:
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
- if (kind2 != kind1 && buf2)
- PyMem_Free(buf2);
- return NULL;
}
PyObject *
-PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
+PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
- PyObject* str_obj;
- PyObject* sep_obj;
PyObject* out;
int kind1, kind2;
void *buf1, *buf2;
Py_ssize_t len1, len2;
- str_obj = PyUnicode_FromObject(str_in);
- if (!str_obj)
+ if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
return NULL;
- sep_obj = PyUnicode_FromObject(sep_in);
- if (!sep_obj) {
- Py_DECREF(str_obj);
- return NULL;
- }
kind1 = PyUnicode_KIND(str_obj);
kind2 = PyUnicode_KIND(sep_obj);
@@ -12755,8 +12771,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
Py_DECREF(unicode_empty);
}
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
return out;
}
buf1 = PyUnicode_DATA(str_obj);
@@ -12764,7 +12778,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
if (kind2 != kind1) {
buf2 = _PyUnicode_AsKind(sep_obj, kind1);
if (!buf2)
- goto onError;
+ return NULL;
}
switch (kind1) {
@@ -12785,18 +12799,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
out = 0;
}
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
if (kind2 != kind1)
PyMem_Free(buf2);
return out;
- onError:
- Py_DECREF(sep_obj);
- Py_DECREF(str_obj);
- if (kind2 != kind1 && buf2)
- PyMem_Free(buf2);
- return NULL;
}
PyDoc_STRVAR(partition__doc__,
@@ -12828,24 +12834,10 @@ unicode_rpartition(PyObject *self, PyObject *separator)
PyObject *
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{
- PyObject *result;
-
- s = PyUnicode_FromObject(s);
- if (s == NULL)
+ if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) /* sep is optional: it is only checked when it is not NULL */
return NULL;
- if (sep != NULL) {
- sep = PyUnicode_FromObject(sep);
- if (sep == NULL) {
- Py_DECREF(s);
- return NULL;
- }
- }
- result = rsplit(s, sep, maxsplit);
-
- Py_DECREF(s);
- Py_XDECREF(sep);
- return result;
+ return rsplit(s, sep, maxsplit); /* a NULL sep is handed to rsplit() unchanged */
}
PyDoc_STRVAR(rsplit__doc__,
@@ -12870,10 +12862,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
if (substring == Py_None)
return rsplit(self, NULL, maxcount);
- else if (PyUnicode_Check(substring))
+
+ if (PyUnicode_Check(substring))
return rsplit(self, substring, maxcount);
- else
- return PyUnicode_RSplit(self, substring, maxcount);
+
+ PyErr_Format(PyExc_TypeError,
+ "must be str or None, not %.100s",
+ Py_TYPE(substring)->tp_name);
+ return NULL;
}
PyDoc_STRVAR(splitlines__doc__,
@@ -13154,11 +13150,15 @@ unicode_startswith(PyObject *self,
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
- substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
- if (substring == NULL)
+ substring = PyTuple_GET_ITEM(subobj, i);
+ if (!PyUnicode_Check(substring)) {
+ PyErr_Format(PyExc_TypeError,
+ "tuple for startswith must only contain str, "
+ "not %.100s",
+ Py_TYPE(substring)->tp_name);
return NULL;
+ }
result = tailmatch(self, substring, start, end, -1);
- Py_DECREF(substring);
if (result == -1)
return NULL;
if (result) {
@@ -13168,15 +13168,13 @@ unicode_startswith(PyObject *self,
/* nothing matched */
Py_RETURN_FALSE;
}
- substring = PyUnicode_FromObject(subobj);
- if (substring == NULL) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
- "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+ if (!PyUnicode_Check(subobj)) {
+ PyErr_Format(PyExc_TypeError,
+ "startswith first arg must be str or "
+ "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
return NULL;
}
- result = tailmatch(self, substring, start, end, -1);
- Py_DECREF(substring);
+ result = tailmatch(self, subobj, start, end, -1);
if (result == -1)
return NULL;
return PyBool_FromLong(result);
@@ -13206,12 +13204,15 @@ unicode_endswith(PyObject *self,
if (PyTuple_Check(subobj)) {
Py_ssize_t i;
for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
- substring = PyUnicode_FromObject(
- PyTuple_GET_ITEM(subobj, i));
- if (substring == NULL)
+ substring = PyTuple_GET_ITEM(subobj, i);
+ if (!PyUnicode_Check(substring)) {
+ PyErr_Format(PyExc_TypeError,
+ "tuple for endswith must only contain str, "
+ "not %.100s",
+ Py_TYPE(substring)->tp_name);
return NULL;
+ }
result = tailmatch(self, substring, start, end, +1);
- Py_DECREF(substring);
if (result == -1)
return NULL;
if (result) {
@@ -13220,15 +13221,13 @@ unicode_endswith(PyObject *self,
}
Py_RETURN_FALSE;
}
- substring = PyUnicode_FromObject(subobj);
- if (substring == NULL) {
- if (PyErr_ExceptionMatches(PyExc_TypeError))
- PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
- "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
+ if (!PyUnicode_Check(subobj)) {
+ PyErr_Format(PyExc_TypeError,
+ "endswith first arg must be str or "
+ "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
return NULL;
}
- result = tailmatch(self, substring, start, end, +1);
- Py_DECREF(substring);
+ result = tailmatch(self, subobj, start, end, +1);
if (result == -1)
return NULL;
return PyBool_FromLong(result);
@@ -13237,44 +13236,50 @@ unicode_endswith(PyObject *self,
Py_LOCAL_INLINE(void)
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
- if (!writer->readonly)
+ writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+ writer->data = PyUnicode_DATA(writer->buffer);
+ /* maxchar and data are refreshed from writer->buffer in both modes;
+ kind and size are assigned per mode below. */
+ if (!writer->readonly) {
+ writer->kind = PyUnicode_KIND(writer->buffer);
writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ }
else {
+ /* use a kind that is not greater than PyUnicode_1BYTE_KIND (checked
+ by the assert below) so _PyUnicodeWriter_PrepareKind() will copy
+ the buffer. */
+ writer->kind = PyUnicode_WCHAR_KIND;
+ assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
/* Copy-on-write mode: set buffer size to 0 so
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
* next write. */
writer->size = 0;
}
- writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
- writer->data = PyUnicode_DATA(writer->buffer);
- writer->kind = PyUnicode_KIND(writer->buffer);
}
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
memset(writer, 0, sizeof(*writer)); /* every field starts zeroed; only min_char and kind are assigned explicitly below */
-#ifdef Py_DEBUG
- writer->kind = 5; /* invalid kind */
-#endif
+
+ /* ASCII is the bare minimum */
writer->min_char = 127;
+
+ /* use a kind that is not greater than PyUnicode_1BYTE_KIND (checked
+ by the assert below) so _PyUnicodeWriter_PrepareKind() will copy
+ the buffer. */
+ writer->kind = PyUnicode_WCHAR_KIND;
+ assert(writer->kind <= PyUnicode_1BYTE_KIND);
}
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar)
{
-#ifdef MS_WINDOWS
- /* On Windows, overallocate by 50% is the best factor */
-# define OVERALLOCATE_FACTOR 2
-#else
- /* On Linux, overallocate by 25% is the best factor */
-# define OVERALLOCATE_FACTOR 4
-#endif
Py_ssize_t newlen;
PyObject *newbuffer;
- assert(length > 0);
+ /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+ assert((maxchar > writer->maxchar && length >= 0)
+ || length > 0);
if (length > PY_SSIZE_T_MAX - writer->pos) {
PyErr_NoMemory();
@@ -13340,6 +13345,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
#undef OVERALLOCATE_FACTOR
}
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+ enum PyUnicode_Kind kind)
+{
+ Py_UCS4 maxchar;
+ /* Selects the maxchar that corresponds to `kind` (0xff, 0xffff or
+ 0x10ffff) and passes it to _PyUnicodeWriter_PrepareInternal() with a
+ length of 0.  Returns that call's result, or -1 when `kind` is not
+ one of the three kinds handled by the switch below. */
+ /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+ assert(writer->kind < kind);
+
+ switch (kind)
+ {
+ case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+ case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+ case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+ default:
+ assert(0 && "invalid kind");
+ return -1;
+ }
+ /* the assert in _PyUnicodeWriter_PrepareInternal() accepts a length of
+ 0 only when maxchar is greater than writer->maxchar */
+ return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
@@ -13510,17 +13537,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
assert(PyUnicode_GET_LENGTH(str) == writer->pos);
return str;
}
- if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
- PyObject *newbuffer;
- newbuffer = resize_compact(writer->buffer, writer->pos);
- if (newbuffer == NULL) {
- Py_CLEAR(writer->buffer);
- return NULL;
+ if (writer->pos == 0) {
+ Py_CLEAR(writer->buffer);
+
+ /* Get the empty Unicode string singleton ('') */
+ _Py_INCREF_UNICODE_EMPTY();
+ str = unicode_empty;
+ }
+ else {
+ str = writer->buffer;
+ writer->buffer = NULL;
+
+ if (PyUnicode_GET_LENGTH(str) != writer->pos) {
+ PyObject *str2;
+ str2 = resize_compact(str, writer->pos);
+ if (str2 == NULL)
+ return NULL;
+ str = str2;
}
- writer->buffer = newbuffer;
}
- str = writer->buffer;
- writer->buffer = NULL;
+
assert(_PyUnicode_CheckConsistency(str, 1));
return unicode_result_ready(str);
}
@@ -14661,13 +14697,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)
return NULL;
}
- ctx.fmtstr = PyUnicode_FromObject(format);
- if (ctx.fmtstr == NULL)
+ if (ensure_unicode(format) < 0)
return NULL;
- if (PyUnicode_READY(ctx.fmtstr) == -1) {
- Py_DECREF(ctx.fmtstr);
- return NULL;
- }
+
+ ctx.fmtstr = format;
ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
@@ -14727,11 +14760,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)
if (ctx.args_owned) {
Py_DECREF(ctx.args);
}
- Py_DECREF(ctx.fmtstr);
return _PyUnicodeWriter_Finish(&ctx.writer);
onError:
- Py_DECREF(ctx.fmtstr);
_PyUnicodeWriter_Dealloc(&ctx.writer);
if (ctx.args_owned) {
Py_DECREF(ctx.args);
@@ -15009,26 +15040,18 @@ PyUnicode_InternInPlace(PyObject **p)
return;
}
}
- /* It might be that the GetItem call fails even
- though the key is present in the dictionary,
- namely when this happens during a stack overflow. */
Py_ALLOW_RECURSION
- t = PyDict_GetItem(interned, s);
+ t = PyDict_SetDefault(interned, s, s);
Py_END_ALLOW_RECURSION
-
- if (t) {
- Py_INCREF(t);
- Py_SETREF(*p, t);
+ if (t == NULL) {
+ PyErr_Clear();
return;
}
-
- PyThreadState_GET()->recursion_critical = 1;
- if (PyDict_SetItem(interned, s, s) < 0) {
- PyErr_Clear();
- PyThreadState_GET()->recursion_critical = 0;
+ if (t != s) {
+ Py_INCREF(t);
+ Py_SETREF(*p, t);
return;
}
- PyThreadState_GET()->recursion_critical = 0;
/* The two references in interned are not counted by refcnt.
The deallocator will take care of this */
Py_REFCNT(s) -= 2;