diff options
Diffstat (limited to 'Objects/stringlib/string_format.h')
-rw-r--r-- | Objects/stringlib/string_format.h | 831 |
1 files changed, 831 insertions, 0 deletions
diff --git a/Objects/stringlib/string_format.h b/Objects/stringlib/string_format.h new file mode 100644 index 00000000000..58032165d34 --- /dev/null +++ b/Objects/stringlib/string_format.h @@ -0,0 +1,831 @@ +/* + string_format.h -- implementation of string.format(). + + It uses the Objects/stringlib conventions, so that it can be + compiled for both unicode and string objects. +*/ + + +/* Defines for more efficiently reallocating the string buffer */ +#define INITIAL_SIZE_INCREMENT 100 +#define SIZE_MULTIPLIER 2 +#define MAX_SIZE_INCREMENT 3200 + + +/************************************************************************/ +/*********** Global data structures and forward declarations *********/ +/************************************************************************/ + +/* + A SubString consists of the characters between two string or + unicode pointers. +*/ +typedef struct { + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; +} SubString; + + +/* forward declaration for recursion */ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int *recursion_level); + + + +/************************************************************************/ +/************************** Utility functions ************************/ +/************************************************************************/ + +/* fill in a SubString from a pointer and length */ +Py_LOCAL_INLINE(void) +SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len) +{ + str->ptr = p; + if (p == NULL) + str->end = NULL; + else + str->end = str->ptr + len; +} + +Py_LOCAL_INLINE(PyObject *) +SubString_new_object(SubString *str) +{ + return STRINGLIB_NEW(str->ptr, str->end - str->ptr); +} + +/************************************************************************/ +/*********** Error handling and exception generation **************/ +/************************************************************************/ + +/* + Most of our errors are value errors, because to Python, the + format string is a "value". Also, it's convenient to return + a NULL when we are erroring out. + + XXX: need better error handling, per PEP 3101. +*/ +static void * +SetError(const char *s) +{ + /* PyErr_Format always returns NULL */ + return PyErr_Format(PyExc_ValueError, "%s in format string", s); +} + +/* + check_input returns True if we still have characters + left in the input string. + + XXX: make this function go away when better error handling is + implemented. +*/ +Py_LOCAL_INLINE(int) +check_input(SubString *input) +{ + if (input->ptr < input->end) + return 1; + PyErr_SetString(PyExc_ValueError, + "unterminated replacement field"); + return 0; +} + +/************************************************************************/ +/*********** Output string management functions ****************/ +/************************************************************************/ + +typedef struct { + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; + PyObject *obj; + Py_ssize_t size_increment; +} OutputString; + +/* initialize an OutputString object, reserving size characters */ +static int +output_initialize(OutputString *output, Py_ssize_t size) +{ + output->obj = STRINGLIB_NEW(NULL, size); + if (output->obj == NULL) + return 0; + + output->ptr = STRINGLIB_STR(output->obj); + output->end = STRINGLIB_LEN(output->obj) + output->ptr; + output->size_increment = INITIAL_SIZE_INCREMENT; + + return 1; +} + +/* + output_extend reallocates the output string buffer. + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ + +static int +output_extend(OutputString *output, Py_ssize_t count) +{ + STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj); + Py_ssize_t curlen = output->ptr - startptr; + Py_ssize_t maxlen = curlen + count + output->size_increment; + + if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0) + return 0; + startptr = STRINGLIB_STR(output->obj); + output->ptr = startptr + curlen; + output->end = startptr + maxlen; + if (output->size_increment < MAX_SIZE_INCREMENT) + output->size_increment *= SIZE_MULTIPLIER; + return 1; +} + +/* + output_data dumps characters into our output string + buffer. + + In some cases, it has to reallocate the string. + + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ +static int +output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count) +{ + if ((count > output->end - output->ptr) && !output_extend(output, count)) + return 0; + memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR)); + output->ptr += count; + return 1; +} + +/************************************************************************/ +/*********** Format string parsing -- integers and identifiers *********/ +/************************************************************************/ + +/* + end_identifier returns true if a character marks + the end of an identifier string. + + Although the PEP specifies that identifiers are + numbers or valid Python identifiers, we just let + getattr/getitem handle that, so the implementation + is more flexible than the PEP would indicate. +*/ +Py_LOCAL_INLINE(int) +end_identifier(STRINGLIB_CHAR c) +{ + switch (c) { + case '.': case '[': case ']': + return 1; + default: + return 0; + } +} + +/* + get_integer consumes 0 or more decimal digit characters from an + input string, updates *result with the corresponding positive + integer, and returns the number of digits consumed. + + returns -1 on error. +*/ +static int +get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, + Py_ssize_t *result) +{ + Py_ssize_t accumulator, digitval, oldaccumulator; + int numdigits; + accumulator = numdigits = 0; + for (;;(*ptr)++, numdigits++) { + if (*ptr >= end) + break; + digitval = STRINGLIB_TODECIMAL(**ptr); + if (digitval < 0) + break; + /* + This trick was copied from old Unicode format code. It's cute, + but would really suck on an old machine with a slow divide + implementation. Fortunately, in the normal case we do not + expect too many digits. + */ + oldaccumulator = accumulator; + accumulator *= 10; + if ((accumulator+10)/10 != oldaccumulator+1) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator += digitval; + } + *result = accumulator; + return numdigits; +} + +/* + get_identifier is a bit of a misnomer. It returns a value for use + with getattr or getindex. This value will a string/unicode + object. The input cannot be zero length. Continues until end of + input, or end_identifier() returns true. +*/ +static PyObject * +get_identifier(SubString *input) +{ + STRINGLIB_CHAR *start; + + for (start = input->ptr; + input->ptr < input->end && !end_identifier(*input->ptr); + input->ptr++) + ; + + return STRINGLIB_NEW(start, input->ptr - start); + + /* + We might want to add code here to check for invalid Python + identifiers. All identifiers are eventually passed to getattr + or getitem, so there is a check when used. However, we might + want to remove (or not) the ability to have strings like + "a/b" or " ab" or "-1" (which is not parsed as a number). + For now, this is left as an exercise for the first disgruntled + user... + + if (XXX -- need check function) { + Py_DECREF(result); + PyErr_SetString(PyExc_ValueError, + "Invalid embedded Python identifier"); + return NULL; + } + */ +} + +/************************************************************************/ +/******** Functions to get field objects and specification strings ******/ +/************************************************************************/ + +/* get_field_and_spec is the main function in this section. It parses + the format string well enough to return a field object to render along + with a field specification string. +*/ + +/* + look up key in our keyword arguments +*/ +static PyObject * +key_lookup(PyObject *kwargs, PyObject *key) +{ + PyObject *result; + + if (kwargs && (result = PyDict_GetItem(kwargs, key)) != NULL) { + Py_INCREF(result); + return result; + } + return NULL; +} + +/* + get_field_object returns the object inside {}, before the + format_spec. It handles getindex and getattr lookups and consumes + the entire input string. +*/ +static PyObject * +get_field_object(SubString *input, PyObject *args, PyObject *kwargs) +{ + PyObject *myobj, *subobj, *newobj; + STRINGLIB_CHAR c; + Py_ssize_t index; + int isindex, isnumeric, isargument; + + index = isnumeric = 0; /* Just to shut up the compiler warnings */ + + myobj = args; + Py_INCREF(myobj); + + for (isindex=1, isargument=1;;) { + if (!check_input(input)) + break; + if (!isindex) { + if ((subobj = get_identifier(input)) == NULL) + break; + newobj = PyObject_GetAttr(myobj, subobj); + Py_DECREF(subobj); + } else { + isnumeric = (STRINGLIB_ISDECIMAL(*input->ptr)); + if (isnumeric) + /* XXX: add error checking */ + get_integer(&input->ptr, input->end, &index); + + if (isnumeric && PySequence_Check(myobj)) + newobj = PySequence_GetItem(myobj, index); + else { + /* XXX -- do we need PyLong_FromLongLong? + Using ssizet, not int... */ + subobj = isnumeric ? + PyInt_FromLong(index) : + get_identifier(input); + if (subobj == NULL) + break; + if (isargument) { + newobj = key_lookup(kwargs, subobj); + } else { + newobj = PyObject_GetItem(myobj, subobj); + } + Py_DECREF(subobj); + } + } + Py_DECREF(myobj); + myobj = newobj; + if (myobj == NULL) + break; + if (!isargument && isindex) + if ((!check_input(input)) || (*(input->ptr++) != ']')) { + SetError("Expected ]"); + break; + } + + /* if at the end of input, return with myobj */ + if (input->ptr >= input->end) + return myobj; + + c = *input->ptr; + input->ptr++; + isargument = 0; + isindex = (c == '['); + if (!isindex && (c != '.')) { + SetError("Expected ., [, :, !, or }"); + break; + } + } + if ((myobj == NULL) && isargument) { + /* XXX: include more useful error information, like which + * keyword not found or which index missing */ + PyErr_Clear(); + return SetError(isnumeric + ? "Not enough positional arguments" + : "Keyword argument not found"); + } + Py_XDECREF(myobj); + return NULL; +} + +/************************************************************************/ +/***************** Field rendering functions **************************/ +/************************************************************************/ + +/* + render_field() is the main function in this section. It takes the + field object and field specification string generated by + get_field_and_spec, and renders the field into the output string. + + format() does the actual calling of the objects __format__ method. +*/ + + +/* returns fieldobj.__format__(format_spec) */ +static PyObject * +format(PyObject *fieldobj, SubString *format_spec) +{ + static PyObject *format_str = NULL; + PyObject *meth; + PyObject *spec = NULL; + PyObject *result = NULL; + + /* Initialize cached value */ + if (format_str == NULL) { + /* Initialize static variable needed by _PyType_Lookup */ + format_str = PyUnicode_FromString("__format__"); + if (format_str == NULL) + return NULL; + } + + /* Make sure the type is initialized. float gets initialized late */ + if (Py_Type(fieldobj)->tp_dict == NULL) + if (PyType_Ready(Py_Type(fieldobj)) < 0) + return NULL; + + /* we need to create an object out of the pointers we have */ + spec = SubString_new_object(format_spec); + if (spec == NULL) + goto done; + + /* Find the (unbound!) __format__ method (a borrowed reference) */ + meth = _PyType_Lookup(Py_Type(fieldobj), format_str); + if (meth == NULL) { + PyErr_Format(PyExc_TypeError, + "Type %.100s doesn't define __format__", + Py_Type(fieldobj)->tp_name); + goto done; + } + + /* And call it, binding it to the value */ + result = PyObject_CallFunctionObjArgs(meth, fieldobj, spec, NULL); + if (result == NULL) + goto done; + + if (!STRINGLIB_CHECK(result)) { + PyErr_SetString(PyExc_TypeError, + "__format__ method did not return " + STRINGLIB_TYPE_NAME); + Py_DECREF(result); + result = NULL; + goto done; + } + +done: + Py_XDECREF(spec); + return result; +} + +/* + render_field calls fieldobj.__format__(format_spec) method, and + appends to the output. +*/ +static int +render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output) +{ + int ok = 0; + PyObject *result = format(fieldobj, format_spec); + + if (result == NULL) + goto done; + + ok = output_data(output, + STRINGLIB_STR(result), STRINGLIB_LEN(result)); +done: + Py_XDECREF(result); + return ok; +} + +static int +parse_field(SubString *str, SubString *field_name, SubString *format_spec, + STRINGLIB_CHAR *conversion) +{ + STRINGLIB_CHAR c = 0; + + /* initialize these, as they may be empty */ + *conversion = '\0'; + SubString_init(format_spec, NULL, 0); + + /* search for the field name. it's terminated by the end of the + string, or a ':' or '!' */ + field_name->ptr = str->ptr; + while (str->ptr < str->end) { + switch (c = *(str->ptr++)) { + case ':': + case '!': + break; + default: + continue; + } + break; + } + + if (c == '!' || c == ':') { + /* we have a format specifier and/or a conversion */ + /* don't include the last character */ + field_name->end = str->ptr-1; + + /* the format specifier is the rest of the string */ + format_spec->ptr = str->ptr; + format_spec->end = str->end; + + /* see if there's a conversion specifier */ + if (c == '!') { + /* there must be another character present */ + if (format_spec->ptr >= format_spec->end) { + PyErr_SetString(PyExc_ValueError, + "end of format while looking for conversion " + "specifier"); + return 0; + } + *conversion = *(format_spec->ptr++); + + /* if there is another character, it must be a colon */ + if (format_spec->ptr < format_spec->end) { + c = *(format_spec->ptr++); + if (c != ':') { + PyErr_SetString(PyExc_ValueError, + "expected ':' after format specifier"); + return 0; + } + } + } + + return 1; + + } else { + /* end of string, there's no format_spec or conversion */ + field_name->end = str->ptr; + return 1; + } +} + +/************************************************************************/ +/******* Output string allocation and escape-to-markup processing ******/ +/************************************************************************/ + +/* MarkupIterator breaks the string into pieces of either literal + text, or things inside {} that need to be marked up. it is + designed to make it easy to wrap a Python iterator around it, for + use with the Formatter class */ + +typedef struct { + SubString str; + int in_markup; +} MarkupIterator; + +static int +MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len) +{ + SubString_init(&self->str, ptr, len); + self->in_markup = 0; + return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it got a + string (or something to be expanded) */ +static int +MarkupIterator_next(MarkupIterator *self, int *is_markup, SubString *literal, + SubString *field_name, SubString *format_spec, + STRINGLIB_CHAR *conversion, + int *format_spec_needs_expanding) +{ + int at_end; + STRINGLIB_CHAR c = 0; + STRINGLIB_CHAR *start; + int count; + Py_ssize_t len; + + *format_spec_needs_expanding = 0; + + /* no more input, end of iterator */ + if (self->str.ptr >= self->str.end) + return 1; + + *is_markup = self->in_markup; + start = self->str.ptr; + + if (self->in_markup) { + + /* prepare for next iteration */ + self->in_markup = 0; + + /* this is markup, find the end of the string by counting nested + braces. note that this prohibits escaped braces, so that + format_specs cannot have braces in them. */ + count = 1; + + /* we know we can't have a zero length string, so don't worry + about that case */ + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { + case '{': + /* the format spec needs to be recursively expanded. + this is an optimization, and not strictly needed */ + *format_spec_needs_expanding = 1; + count++; + break; + case '}': + count--; + if (count <= 0) { + /* we're done. parse and get out */ + literal->ptr = start; + literal->end = self->str.ptr-1; + + if (parse_field(literal, field_name, format_spec, + conversion) == 0) + return 0; + + /* success */ + return 2; + } + break; + } + } + /* end of string while searching for matching '}' */ + PyErr_SetString(PyExc_ValueError, "unmatched '{' in format"); + return 0; + + } else { + /* literal text, read until the end of string, an escaped { or }, + or an unescaped { */ + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { + case '{': + case '}': + self->in_markup = 1; + break; + default: + continue; + } + break; + } + + at_end = self->str.ptr >= self->str.end; + len = self->str.ptr - start; + + if ((c == '}') && (at_end || (c != *self->str.ptr))) + return (int)SetError("Single } encountered"); + if (at_end && c == '{') + return (int)SetError("Single { encountered"); + if (!at_end) { + if (c == *self->str.ptr) { + /* escaped } or {, skip it in the input */ + self->str.ptr++; + self->in_markup = 0; + } else + len--; + } + + /* this is just plain text, return it */ + literal->ptr = start; + literal->end = start + len; + return 2; + } +} + + +/* do the !r or !s conversion on obj */ +static PyObject * +do_conversion(PyObject *obj, STRINGLIB_CHAR conversion) +{ + /* XXX in pre-3.0, do we need to convert this to unicode, since it + might have returned a string? */ + switch (conversion) { + case 'r': + return PyObject_Repr(obj); + case 's': + return PyObject_Unicode(obj); + default: + PyErr_Format(PyExc_ValueError, + "Unknown converion specifier %c", + conversion); + return NULL; + } +} + +/* given: + + {field_name!conversion:format_spec} + + compute the result and write it to output. + format_spec_needs_expanding is an optimization. if it's false, + just output the string directly, otherwise recursively expand the + format_spec string. */ + +static int +output_markup(SubString *field_name, SubString *format_spec, + int format_spec_needs_expanding, STRINGLIB_CHAR conversion, + OutputString *output, PyObject *args, PyObject *kwargs, + int *recursion_level) +{ + PyObject *tmp = NULL; + PyObject *fieldobj = NULL; + SubString expanded_format_spec; + SubString *actual_format_spec; + int result = 0; + + /* convert field_name to an object */ + fieldobj = get_field_object(field_name, args, kwargs); + if (fieldobj == NULL) + goto done; + + if (conversion != '\0') { + tmp = do_conversion(fieldobj, conversion); + if (tmp == NULL) + goto done; + + /* do the assignment, transferring ownership: fieldobj = tmp */ + Py_DECREF(fieldobj); + fieldobj = tmp; + tmp = NULL; + } + + /* if needed, recurively compute the format_spec */ + if (format_spec_needs_expanding) { + tmp = build_string(format_spec, args, kwargs, recursion_level); + if (tmp == NULL) + goto done; + + /* note that in the case we're expanding the format string, + tmp must be kept around until after the call to + render_field. */ + SubString_init(&expanded_format_spec, + STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp)); + actual_format_spec = &expanded_format_spec; + } else + actual_format_spec = format_spec; + + if (render_field(fieldobj, actual_format_spec, output) == 0) + goto done; + + result = 1; + +done: + Py_XDECREF(fieldobj); + Py_XDECREF(tmp); + + return result; +} + +/* + do_markup is the top-level loop for the format() function. It + searches through the format string for escapes to markup codes, and + calls other functions to move non-markup text to the output, + and to perform the markup to the output. +*/ +static int +do_markup(SubString *input, PyObject *args, PyObject *kwargs, + OutputString *output, int *recursion_level) +{ + MarkupIterator iter; + int is_markup; + int format_spec_needs_expanding; + int result; + SubString str; + SubString field_name; + SubString format_spec; + STRINGLIB_CHAR conversion; + + MarkupIterator_init(&iter, input->ptr, input->end - input->ptr); + while ((result = MarkupIterator_next(&iter, &is_markup, &str, &field_name, + &format_spec, &conversion, + &format_spec_needs_expanding)) == 2) { + if (is_markup) { + if (!output_markup(&field_name, &format_spec, + format_spec_needs_expanding, conversion, output, + args, kwargs, recursion_level)) + return 0; + } else { + if (!output_data(output, str.ptr, str.end-str.ptr)) + return 0; + } + } + return result; +} + + +/* + build_string allocates the output string and then + calls do_markup to do the heavy lifting. +*/ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int *recursion_level) +{ + OutputString output; + PyObject *result = NULL; + Py_ssize_t count; + + output.obj = NULL; /* needed so cleanup code always works */ + + /* check the recursion level */ + (*recursion_level)--; + if (*recursion_level < 0) { + PyErr_SetString(PyExc_ValueError, + "Max string recursion exceeded"); + goto done; + } + + /* initial size is the length of the format string, plus the size + increment. seems like a reasonable default */ + if (!output_initialize(&output, + input->end - input->ptr + + INITIAL_SIZE_INCREMENT)) + goto done; + + if (!do_markup(input, args, kwargs, &output, recursion_level)) { + goto done; + } + + count = output.ptr - STRINGLIB_STR(output.obj); + if (STRINGLIB_RESIZE(&output.obj, count) < 0) { + goto done; + } + + /* transfer ownership to result */ + result = output.obj; + output.obj = NULL; + +done: + (*recursion_level)++; + Py_XDECREF(output.obj); + return result; +} + +/************************************************************************/ +/*********** main routine ***********************************************/ +/************************************************************************/ + +/* this is the main entry point */ +static PyObject * +do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) +{ + SubString input; + + /* PEP 3101 says only 2 levels, so that + "{0:{1}}".format('abc', 's') # works + "{0:{1:{2}}}".format('abc', 's', '') # fails + */ + int recursion_level = 2; + + SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self)); + return build_string(&input, args, kwargs, &recursion_level); +} |