diff options
Diffstat (limited to 'Parser/action_helpers.c')
-rw-r--r-- | Parser/action_helpers.c | 462 |
1 files changed, 338 insertions, 124 deletions
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 6a825b1abd3..3bcc0870882 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -965,9 +965,21 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv) if (conv_token->lineno != conv->lineno || conv_token->end_col_offset != conv->col_offset) { return RAISE_SYNTAX_ERROR_KNOWN_RANGE( conv_token, conv, - "f-string: conversion type must come right after the exclamanation mark" + "%c-string: conversion type must come right after the exclamanation mark", + TOK_GET_STRING_PREFIX(p->tok) ); } + + Py_UCS4 first = PyUnicode_READ_CHAR(conv->v.Name.id, 0); + if (PyUnicode_GET_LENGTH(conv->v.Name.id) > 1 || + !(first == 's' || first == 'r' || first == 'a')) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conv, + "%c-string: invalid conversion character %R: expected 's', 'r', or 'a'", + TOK_GET_STRING_PREFIX(p->tok), + conv->v.Name.id); + return NULL; + } + return result_token_with_metadata(p, conv, conv_token->metadata); } @@ -1070,6 +1082,9 @@ _PyPegen_get_expr_name(expr_ty e) case JoinedStr_kind: case FormattedValue_kind: return "f-string expression"; + case TemplateStr_kind: + case Interpolation_kind: + return "t-string expression"; case Constant_kind: { PyObject *value = e->v.Constant.value; if (value == Py_None) { @@ -1279,20 +1294,13 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok p->arena); } -expr_ty -_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { - - /* The parser might put multiple f-string values into an individual - * JoinedStr node at the top level due to stuff like f-string debugging - * expressions. This function flattens those and promotes them to the - * upper level. Only simplifies AST, but the compiler already takes care - * of the regular output, so this is not necessary if you are not going - * to expose the output AST to Python level. */ - - Py_ssize_t n_items = asdl_seq_LEN(expr); +static asdl_expr_seq * +_get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind) +{ + Py_ssize_t n_items = asdl_seq_LEN(raw_expressions); Py_ssize_t total_items = n_items; for (Py_ssize_t i = 0; i < n_items; i++) { - expr_ty item = asdl_seq_GET(expr, i); + expr_ty item = asdl_seq_GET(raw_expressions, i); if (item->kind == JoinedStr_kind) { total_items += asdl_seq_LEN(item->v.JoinedStr.values) - 1; } @@ -1311,17 +1319,19 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { Py_ssize_t index = 0; for (Py_ssize_t i = 0; i < n_items; i++) { - expr_ty item = asdl_seq_GET(expr, i); + expr_ty item = asdl_seq_GET(raw_expressions, i); // This should correspond to a JoinedStr node of two elements // created _PyPegen_formatted_value. This situation can only be the result of - // a f-string debug expression where the first element is a constant with the text and the second + // a (f|t)-string debug expression where the first element is a constant with the text and the second // a formatted value with the expression. if (item->kind == JoinedStr_kind) { asdl_expr_seq *values = item->v.JoinedStr.values; if (asdl_seq_LEN(values) != 2) { PyErr_Format(PyExc_SystemError, - "unexpected JoinedStr node without debug data in f-string at line %d", + string_kind == TSTRING + ? "unexpected TemplateStr node without debug data in t-string at line %d" + : "unexpected JoinedStr node without debug data in f-string at line %d", item->lineno); return NULL; } @@ -1331,7 +1341,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { asdl_seq_SET(seq, index++, first); expr_ty second = asdl_seq_GET(values, 1); - assert(second->kind == FormattedValue_kind); + assert((string_kind == TSTRING && second->kind == Interpolation_kind) || second->kind == FormattedValue_kind); asdl_seq_SET(seq, index++, second); continue; @@ -1367,7 +1377,22 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { else { resized_exprs = seq; } + return resized_exprs; +} +expr_ty +_PyPegen_template_str(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b) { + + asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, TSTRING); + return _PyAST_TemplateStr(resized_exprs, a->lineno, a->col_offset, + b->end_lineno, b->end_col_offset, + p->arena); +} + +expr_ty +_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) { + + asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, FSTRING); return _PyAST_JoinedStr(resized_exprs, a->lineno, a->col_offset, b->end_lineno, b->end_col_offset, p->arena); @@ -1434,138 +1459,239 @@ expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok) { return _PyAST_Constant(s, kind, tok->lineno, tok->col_offset, tok->end_lineno, tok->end_col_offset, p->arena); } -expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ResultTokenWithMetadata *conversion, - ResultTokenWithMetadata *format, Token *closing_brace, int lineno, int col_offset, - int end_lineno, int end_col_offset, PyArena *arena) { - int conversion_val = -1; +static int +_get_interpolation_conversion(Parser *p, Token *debug, ResultTokenWithMetadata *conversion, + ResultTokenWithMetadata *format) +{ if (conversion != NULL) { expr_ty conversion_expr = (expr_ty) conversion->result; assert(conversion_expr->kind == Name_kind); Py_UCS4 first = PyUnicode_READ_CHAR(conversion_expr->v.Name.id, 0); + return Py_SAFE_DOWNCAST(first, Py_UCS4, int); + } + else if (debug && !format) { + /* If no conversion is specified, use !r for debug expressions */ + return (int)'r'; + } + return -1; +} - if (PyUnicode_GET_LENGTH(conversion_expr->v.Name.id) > 1 || - !(first == 's' || first == 'r' || first == 'a')) { - RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion_expr, - "f-string: invalid conversion character %R: expected 's', 'r', or 'a'", - conversion_expr->v.Name.id); - return NULL; +static PyObject * +_strip_interpolation_expr(PyObject *exprstr) +{ + Py_ssize_t len = PyUnicode_GET_LENGTH(exprstr); + + for (Py_ssize_t i = len - 1; i >= 0; i--) { + Py_UCS4 c = PyUnicode_READ_CHAR(exprstr, i); + if (_PyUnicode_IsWhitespace(c) || c == '=') { + len--; + } + else { + break; } + } + + return PyUnicode_Substring(exprstr, 0, len); +} + +expr_ty _PyPegen_interpolation(Parser *p, expr_ty expression, Token *debug, ResultTokenWithMetadata *conversion, + ResultTokenWithMetadata *format, Token *closing_brace, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena) { - conversion_val = Py_SAFE_DOWNCAST(first, Py_UCS4, int); + int conversion_val = _get_interpolation_conversion(p, debug, conversion, format); + + /* Find the non whitespace token after the "=" */ + int debug_end_line, debug_end_offset; + PyObject *debug_metadata; + constant exprstr; + + if (conversion) { + debug_end_line = ((expr_ty) conversion->result)->lineno; + debug_end_offset = ((expr_ty) conversion->result)->col_offset; + debug_metadata = exprstr = conversion->metadata; } - else if (debug && !format) { - /* If no conversion is specified, use !r for debug expressions */ - conversion_val = (int)'r'; + else if (format) { + debug_end_line = ((expr_ty) format->result)->lineno; + debug_end_offset = ((expr_ty) format->result)->col_offset + 1; + debug_metadata = exprstr = format->metadata; + } + else { + debug_end_line = end_lineno; + debug_end_offset = end_col_offset; + debug_metadata = exprstr = closing_brace->metadata; + } + + assert(exprstr != NULL); + PyObject *final_exprstr = _strip_interpolation_expr(exprstr); + if (!final_exprstr || _PyArena_AddPyObject(arena, final_exprstr) < 0) { + Py_XDECREF(final_exprstr); + return NULL; + } + + expr_ty interpolation = _PyAST_Interpolation( + expression, final_exprstr, conversion_val, format ? (expr_ty) format->result : NULL, + lineno, col_offset, end_lineno, + end_col_offset, arena + ); + + if (!debug) { + return interpolation; } + expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line, + debug_end_offset - 1, p->arena); + if (!debug_text) { + return NULL; + } + + asdl_expr_seq *values = _Py_asdl_expr_seq_new(2, arena); + asdl_seq_SET(values, 0, debug_text); + asdl_seq_SET(values, 1, interpolation); + return _PyAST_JoinedStr(values, lineno, col_offset, debug_end_line, debug_end_offset, p->arena); +} + +expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, ResultTokenWithMetadata *conversion, + ResultTokenWithMetadata *format, Token *closing_brace, int lineno, int col_offset, + int end_lineno, int end_col_offset, PyArena *arena) { + int conversion_val = _get_interpolation_conversion(p, debug, conversion, format); + expr_ty formatted_value = _PyAST_FormattedValue( expression, conversion_val, format ? (expr_ty) format->result : NULL, lineno, col_offset, end_lineno, end_col_offset, arena ); - if (debug) { - /* Find the non whitespace token after the "=" */ - int debug_end_line, debug_end_offset; - PyObject *debug_metadata; + if (!debug) { + return formatted_value; + } - if (conversion) { - debug_end_line = ((expr_ty) conversion->result)->lineno; - debug_end_offset = ((expr_ty) conversion->result)->col_offset; - debug_metadata = conversion->metadata; - } - else if (format) { - debug_end_line = ((expr_ty) format->result)->lineno; - debug_end_offset = ((expr_ty) format->result)->col_offset + 1; - debug_metadata = format->metadata; - } - else { - debug_end_line = end_lineno; - debug_end_offset = end_col_offset; - debug_metadata = closing_brace->metadata; - } - expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line, - debug_end_offset - 1, p->arena); - if (!debug_text) { - return NULL; - } + /* Find the non whitespace token after the "=" */ + int debug_end_line, debug_end_offset; + PyObject *debug_metadata; - asdl_expr_seq *values = _Py_asdl_expr_seq_new(2, arena); - if (values == NULL) { - return NULL; - } - asdl_seq_SET(values, 0, debug_text); - asdl_seq_SET(values, 1, formatted_value); - return _PyAST_JoinedStr(values, lineno, col_offset, debug_end_line, debug_end_offset, p->arena); + if (conversion) { + debug_end_line = ((expr_ty) conversion->result)->lineno; + debug_end_offset = ((expr_ty) conversion->result)->col_offset; + debug_metadata = conversion->metadata; + } + else if (format) { + debug_end_line = ((expr_ty) format->result)->lineno; + debug_end_offset = ((expr_ty) format->result)->col_offset + 1; + debug_metadata = format->metadata; } else { - return formatted_value; + debug_end_line = end_lineno; + debug_end_offset = end_col_offset; + debug_metadata = closing_brace->metadata; + } + expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line, + debug_end_offset - 1, p->arena); + if (!debug_text) { + return NULL; } + + asdl_expr_seq *values = _Py_asdl_expr_seq_new(2, arena); + asdl_seq_SET(values, 0, debug_text); + asdl_seq_SET(values, 1, formatted_value); + return _PyAST_JoinedStr(values, lineno, col_offset, debug_end_line, debug_end_offset, p->arena); } -expr_ty -_PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, - int lineno, int col_offset, int end_lineno, - int end_col_offset, PyArena *arena) +static expr_ty +_build_concatenated_bytes(Parser *p, asdl_expr_seq *strings, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena) { Py_ssize_t len = asdl_seq_LEN(strings); assert(len > 0); - int f_string_found = 0; - int unicode_string_found = 0; - int bytes_found = 0; + PyObject* res = Py_GetConstant(Py_CONSTANT_EMPTY_BYTES); - Py_ssize_t i = 0; - Py_ssize_t n_flattened_elements = 0; - for (i = 0; i < len; i++) { + /* Bytes literals never get a kind, but just for consistency + since they are represented as Constant nodes, we'll mirror + the same behavior as unicode strings for determining the + kind. */ + PyObject* kind = asdl_seq_GET(strings, 0)->v.Constant.kind; + for (Py_ssize_t i = 0; i < len; i++) { expr_ty elem = asdl_seq_GET(strings, i); - switch(elem->kind) { - case Constant_kind: - if (PyBytes_CheckExact(elem->v.Constant.value)) { - bytes_found = 1; - } else { - unicode_string_found = 1; - } - n_flattened_elements++; - break; - case JoinedStr_kind: - n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values); - f_string_found = 1; - break; - default: - n_flattened_elements++; - f_string_found = 1; - break; - } + PyBytes_Concat(&res, elem->v.Constant.value); } + if (!res || _PyArena_AddPyObject(arena, res) < 0) { + Py_XDECREF(res); + return NULL; + } + return _PyAST_Constant(res, kind, lineno, col_offset, end_lineno, end_col_offset, p->arena); +} - if ((unicode_string_found || f_string_found) && bytes_found) { - RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals"); +static expr_ty +_build_concatenated_unicode(Parser *p, asdl_expr_seq *strings, int lineno, + int col_offset, int end_lineno, int end_col_offset, + PyArena *arena) +{ + Py_ssize_t len = asdl_seq_LEN(strings); + assert(len > 1); + + expr_ty first = asdl_seq_GET(strings, 0); + + /* When a string is getting concatenated, the kind of the string + is determined by the first string in the concatenation + sequence. + + u"abc" "def" -> u"abcdef" + "abc" u"abc" -> "abcabc" */ + PyObject *kind = first->v.Constant.kind; + + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { return NULL; } - if (bytes_found) { - PyObject* res = Py_GetConstant(Py_CONSTANT_EMPTY_BYTES); + for (Py_ssize_t i = 0; i < len; i++) { + expr_ty current_elem = asdl_seq_GET(strings, i); + assert(current_elem->kind == Constant_kind); - /* Bytes literals never get a kind, but just for consistency - since they are represented as Constant nodes, we'll mirror - the same behavior as unicode strings for determining the - kind. */ - PyObject* kind = asdl_seq_GET(strings, 0)->v.Constant.kind; - for (i = 0; i < len; i++) { - expr_ty elem = asdl_seq_GET(strings, i); - PyBytes_Concat(&res, elem->v.Constant.value); - } - if (!res || _PyArena_AddPyObject(arena, res) < 0) { - Py_XDECREF(res); + if (PyUnicodeWriter_WriteStr(writer, + current_elem->v.Constant.value)) { + PyUnicodeWriter_Discard(writer); return NULL; } - return _PyAST_Constant(res, kind, lineno, col_offset, end_lineno, end_col_offset, p->arena); } - if (!f_string_found && len == 1) { - return asdl_seq_GET(strings, 0); + PyObject *final = PyUnicodeWriter_Finish(writer); + if (final == NULL) { + return NULL; } + if (_PyArena_AddPyObject(p->arena, final) < 0) { + Py_DECREF(final); + return NULL; + } + return _PyAST_Constant(final, kind, lineno, col_offset, + end_lineno, end_col_offset, arena); +} + +static asdl_expr_seq * +_build_concatenated_str(Parser *p, asdl_expr_seq *strings, + int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) +{ + Py_ssize_t len = asdl_seq_LEN(strings); + assert(len > 0); + + Py_ssize_t n_flattened_elements = 0; + for (Py_ssize_t i = 0; i < len; i++) { + expr_ty elem = asdl_seq_GET(strings, i); + switch(elem->kind) { + case JoinedStr_kind: + n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values); + break; + case TemplateStr_kind: + n_flattened_elements += asdl_seq_LEN(elem->v.TemplateStr.values); + break; + default: + n_flattened_elements++; + break; + } + } + asdl_expr_seq* flattened = _Py_asdl_expr_seq_new(n_flattened_elements, p->arena); if (flattened == NULL) { @@ -1574,12 +1700,11 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, /* build flattened list */ Py_ssize_t current_pos = 0; - Py_ssize_t j = 0; - for (i = 0; i < len; i++) { + for (Py_ssize_t i = 0; i < len; i++) { expr_ty elem = asdl_seq_GET(strings, i); switch(elem->kind) { case JoinedStr_kind: - for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) { + for (Py_ssize_t j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) { expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j); if (subvalue == NULL) { return NULL; @@ -1587,6 +1712,15 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, asdl_seq_SET(flattened, current_pos++, subvalue); } break; + case TemplateStr_kind: + for (Py_ssize_t j = 0; j < asdl_seq_LEN(elem->v.TemplateStr.values); j++) { + expr_ty subvalue = asdl_seq_GET(elem->v.TemplateStr.values, j); + if (subvalue == NULL) { + return NULL; + } + asdl_seq_SET(flattened, current_pos++, subvalue); + } + break; default: asdl_seq_SET(flattened, current_pos++, elem); break; @@ -1596,13 +1730,13 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, /* calculate folded element count */ Py_ssize_t n_elements = 0; int prev_is_constant = 0; - for (i = 0; i < n_flattened_elements; i++) { + for (Py_ssize_t i = 0; i < n_flattened_elements; i++) { expr_ty elem = asdl_seq_GET(flattened, i); /* The concatenation of a FormattedValue and an empty Constant should lead to the FormattedValue itself. Thus, we will not take any empty constants into account, just as in `_PyPegen_joined_str` */ - if (f_string_found && elem->kind == Constant_kind && + if (elem->kind == Constant_kind && PyUnicode_CheckExact(elem->v.Constant.value) && PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0) continue; @@ -1620,7 +1754,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, /* build folded list */ current_pos = 0; - for (i = 0; i < n_flattened_elements; i++) { + for (Py_ssize_t i = 0; i < n_flattened_elements; i++) { expr_ty elem = asdl_seq_GET(flattened, i); /* if the current elem and the following are constants, @@ -1643,6 +1777,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, return NULL; } expr_ty last_elem = elem; + Py_ssize_t j; for (j = i; j < n_flattened_elements; j++) { expr_ty current_elem = asdl_seq_GET(flattened, j); if (current_elem->kind == Constant_kind) { @@ -1676,8 +1811,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, } /* Drop all empty contanst strings */ - if (f_string_found && - PyUnicode_CheckExact(elem->v.Constant.value) && + if (PyUnicode_CheckExact(elem->v.Constant.value) && PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0) { continue; } @@ -1686,17 +1820,97 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, asdl_seq_SET(values, current_pos++, elem); } - if (!f_string_found) { - assert(n_elements == 1); - expr_ty elem = asdl_seq_GET(values, 0); - assert(elem->kind == Constant_kind); - return elem; - } - assert(current_pos == n_elements); + return values; +} + +static expr_ty +_build_concatenated_joined_str(Parser *p, asdl_expr_seq *strings, + int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) +{ + asdl_expr_seq *values = _build_concatenated_str(p, strings, lineno, + col_offset, end_lineno, end_col_offset, arena); return _PyAST_JoinedStr(values, lineno, col_offset, end_lineno, end_col_offset, p->arena); } +static expr_ty +_build_concatenated_template_str(Parser *p, asdl_expr_seq *strings, + int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) +{ + asdl_expr_seq *values = _build_concatenated_str(p, strings, lineno, + col_offset, end_lineno, end_col_offset, arena); + return _PyAST_TemplateStr(values, lineno, col_offset, end_lineno, + end_col_offset, arena); +} + +expr_ty +_PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings, + int lineno, int col_offset, int end_lineno, + int end_col_offset, PyArena *arena) +{ + Py_ssize_t len = asdl_seq_LEN(strings); + assert(len > 0); + + int t_string_found = 0; + int f_string_found = 0; + int unicode_string_found = 0; + int bytes_found = 0; + + Py_ssize_t i = 0; + for (i = 0; i < len; i++) { + expr_ty elem = asdl_seq_GET(strings, i); + switch(elem->kind) { + case Constant_kind: + if (PyBytes_CheckExact(elem->v.Constant.value)) { + bytes_found = 1; + } else { + unicode_string_found = 1; + } + break; + case JoinedStr_kind: + f_string_found = 1; + break; + case TemplateStr_kind: + t_string_found = 1; + break; + default: + f_string_found = 1; + break; + } + } + + // Cannot mix unicode and bytes + if ((unicode_string_found || f_string_found || t_string_found) && bytes_found) { + RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals"); + return NULL; + } + + // If it's only bytes or only unicode string, do a simple concat + if (!f_string_found && !t_string_found) { + if (len == 1) { + return asdl_seq_GET(strings, 0); + } + else if (bytes_found) { + return _build_concatenated_bytes(p, strings, lineno, col_offset, + end_lineno, end_col_offset, arena); + } + else { + return _build_concatenated_unicode(p, strings, lineno, col_offset, + end_lineno, end_col_offset, arena); + } + } + + if (t_string_found) { + return _build_concatenated_template_str(p, strings, lineno, + col_offset, end_lineno, end_col_offset, arena); + } + + return _build_concatenated_joined_str(p, strings, lineno, + col_offset, end_lineno, end_col_offset, arena); +} + stmt_ty _PyPegen_checked_future_import(Parser *p, identifier module, asdl_alias_seq * names, int level, int lineno, int col_offset, int end_lineno, int end_col_offset, |