diff options
Diffstat (limited to 'py/lexer.c')
-rw-r--r-- | py/lexer.c | 107 |
1 files changed, 65 insertions, 42 deletions
diff --git a/py/lexer.c b/py/lexer.c index 58d54b6980..03605373d0 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -218,8 +218,7 @@ STATIC const char *tok_enc = "%e=" // % %= "^e=" // ^ ^= "=e=" // = == - "!E=" // != - ".c.E."; // . ... + "!E="; // != // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries STATIC const uint8_t tok_enc_kind[] = { @@ -240,7 +239,6 @@ STATIC const uint8_t tok_enc_kind[] = { MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL, MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL, MP_TOKEN_OP_NOT_EQUAL, - MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS, }; // must have the same order as enum in lexer.h @@ -455,50 +453,55 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs vstr_add_char(&lex->vstr, CUR_CHAR(lex)); } else { n_closing = 0; - if (!is_raw && is_char(lex, '\\')) { + if (is_char(lex, '\\')) { next_char(lex); unichar c = CUR_CHAR(lex); - switch (c) { - case MP_LEXER_CHAR_EOF: break; // TODO a proper error message? - case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it - case '\\': break; - case '\'': break; - case '"': break; - case 'a': c = 0x07; break; - case 'b': c = 0x08; break; - case 't': c = 0x09; break; - case 'n': c = 0x0a; break; - case 'v': c = 0x0b; break; - case 'f': c = 0x0c; break; - case 'r': c = 0x0d; break; - case 'x': - { - uint num = 0; - if (!get_hex(lex, 2, &num)) { - // TODO error message - assert(0); - } - c = num; - break; - } - case 'N': break; // TODO \N{name} only in strings - case 'u': break; // TODO \uxxxx only in strings - case 'U': break; // TODO \Uxxxxxxxx only in strings - default: - if (c >= '0' && c <= '7') { - // Octal sequence, 1-3 chars - int digits = 3; - int num = c - '0'; - while (is_following_odigit(lex) && --digits != 0) { - next_char(lex); - num = num * 8 + (CUR_CHAR(lex) - '0'); + if (is_raw) { + // raw strings allow escaping of quotes, but the backslash is also emitted + vstr_add_char(&lex->vstr, '\\'); + } else { + switch (c) { + case MP_LEXER_CHAR_EOF: break; // TODO a proper error message? + case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it + case '\\': break; + case '\'': break; + case '"': break; + case 'a': c = 0x07; break; + case 'b': c = 0x08; break; + case 't': c = 0x09; break; + case 'n': c = 0x0a; break; + case 'v': c = 0x0b; break; + case 'f': c = 0x0c; break; + case 'r': c = 0x0d; break; + case 'x': + { + uint num = 0; + if (!get_hex(lex, 2, &num)) { + // TODO error message + assert(0); } c = num; - } else { - // unrecognised escape character; CPython lets this through verbatim as '\' and then the character - vstr_add_char(&lex->vstr, '\\'); + break; } - break; + case 'N': break; // TODO \N{name} only in strings + case 'u': break; // TODO \uxxxx only in strings + case 'U': break; // TODO \Uxxxxxxxx only in strings + default: + if (c >= '0' && c <= '7') { + // Octal sequence, 1-3 chars + int digits = 3; + int num = c - '0'; + while (is_following_odigit(lex) && --digits != 0) { + next_char(lex); + num = num * 8 + (CUR_CHAR(lex) - '0'); + } + c = num; + } else { + // unrecognised escape character; CPython lets this through verbatim as '\' and then the character + vstr_add_char(&lex->vstr, '\\'); + } + break; + } } if (c != MP_LEXER_CHAR_EOF) { vstr_add_char(&lex->vstr, c); @@ -555,6 +558,23 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs } } + } else if (is_char(lex, '.')) { + // special handling for . and ... operators, because .. is not a valid operator + + // get first char + vstr_add_char(&lex->vstr, '.'); + next_char(lex); + + if (is_char_and(lex, '.', '.')) { + vstr_add_char(&lex->vstr, '.'); + vstr_add_char(&lex->vstr, '.'); + next_char(lex); + next_char(lex); + tok->kind = MP_TOKEN_ELLIPSIS; + } else { + tok->kind = MP_TOKEN_DEL_PERIOD; + } + } else { // search for encoded delimiter or operator @@ -600,6 +620,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs tok_enc_index = t_index; } else { tok->kind = MP_TOKEN_INVALID; + goto tok_enc_no_match; } break; } @@ -622,6 +643,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs // set token kind tok->kind = tok_enc_kind[tok_enc_index]; + tok_enc_no_match: + // compute bracket level for implicit line joining if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) { lex->nested_bracket_level += 1; |