summary refs log tree commit diff stats homepage
path: root/py/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'py/lexer.c')
-rw-r--r-- py/lexer.c 107
1 files changed, 65 insertions, 42 deletions
diff --git a/py/lexer.c b/py/lexer.c
index 58d54b6980..03605373d0 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -218,8 +218,7 @@ STATIC const char *tok_enc =
"%e=" // % %=
"^e=" // ^ ^=
"=e=" // = ==
- "!E=" // !=
- ".c.E."; // . ...
+ "!E="; // !=
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
@@ -240,7 +239,6 @@ STATIC const uint8_t tok_enc_kind[] = {
MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
MP_TOKEN_OP_NOT_EQUAL,
- MP_TOKEN_DEL_PERIOD, MP_TOKEN_ELLIPSIS,
};
// must have the same order as enum in lexer.h
@@ -455,50 +453,55 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;
- if (!is_raw && is_char(lex, '\\')) {
+ if (is_char(lex, '\\')) {
next_char(lex);
unichar c = CUR_CHAR(lex);
- switch (c) {
- case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
- case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
- case '\\': break;
- case '\'': break;
- case '"': break;
- case 'a': c = 0x07; break;
- case 'b': c = 0x08; break;
- case 't': c = 0x09; break;
- case 'n': c = 0x0a; break;
- case 'v': c = 0x0b; break;
- case 'f': c = 0x0c; break;
- case 'r': c = 0x0d; break;
- case 'x':
- {
- uint num = 0;
- if (!get_hex(lex, 2, &num)) {
- // TODO error message
- assert(0);
- }
- c = num;
- break;
- }
- case 'N': break; // TODO \N{name} only in strings
- case 'u': break; // TODO \uxxxx only in strings
- case 'U': break; // TODO \Uxxxxxxxx only in strings
- default:
- if (c >= '0' && c <= '7') {
- // Octal sequence, 1-3 chars
- int digits = 3;
- int num = c - '0';
- while (is_following_odigit(lex) && --digits != 0) {
- next_char(lex);
- num = num * 8 + (CUR_CHAR(lex) - '0');
+ if (is_raw) {
+ // raw strings allow escaping of quotes, but the backslash is also emitted
+ vstr_add_char(&lex->vstr, '\\');
+ } else {
+ switch (c) {
+ case MP_LEXER_CHAR_EOF: break; // TODO a proper error message?
+ case '\n': c = MP_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore it
+ case '\\': break;
+ case '\'': break;
+ case '"': break;
+ case 'a': c = 0x07; break;
+ case 'b': c = 0x08; break;
+ case 't': c = 0x09; break;
+ case 'n': c = 0x0a; break;
+ case 'v': c = 0x0b; break;
+ case 'f': c = 0x0c; break;
+ case 'r': c = 0x0d; break;
+ case 'x':
+ {
+ uint num = 0;
+ if (!get_hex(lex, 2, &num)) {
+ // TODO error message
+ assert(0);
}
c = num;
- } else {
- // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
- vstr_add_char(&lex->vstr, '\\');
+ break;
}
- break;
+ case 'N': break; // TODO \N{name} only in strings
+ case 'u': break; // TODO \uxxxx only in strings
+ case 'U': break; // TODO \Uxxxxxxxx only in strings
+ default:
+ if (c >= '0' && c <= '7') {
+ // Octal sequence, 1-3 chars
+ int digits = 3;
+ int num = c - '0';
+ while (is_following_odigit(lex) && --digits != 0) {
+ next_char(lex);
+ num = num * 8 + (CUR_CHAR(lex) - '0');
+ }
+ c = num;
+ } else {
+ // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+ vstr_add_char(&lex->vstr, '\\');
+ }
+ break;
+ }
}
if (c != MP_LEXER_CHAR_EOF) {
vstr_add_char(&lex->vstr, c);
@@ -555,6 +558,23 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
}
}
+ } else if (is_char(lex, '.')) {
+ // special handling for . and ... operators, because .. is not a valid operator
+
+ // get first char
+ vstr_add_char(&lex->vstr, '.');
+ next_char(lex);
+
+ if (is_char_and(lex, '.', '.')) {
+ vstr_add_char(&lex->vstr, '.');
+ vstr_add_char(&lex->vstr, '.');
+ next_char(lex);
+ next_char(lex);
+ tok->kind = MP_TOKEN_ELLIPSIS;
+ } else {
+ tok->kind = MP_TOKEN_DEL_PERIOD;
+ }
+
} else {
// search for encoded delimiter or operator
@@ -600,6 +620,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
tok_enc_index = t_index;
} else {
tok->kind = MP_TOKEN_INVALID;
+ goto tok_enc_no_match;
}
break;
}
@@ -622,6 +643,8 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
// set token kind
tok->kind = tok_enc_kind[tok_enc_index];
+ tok_enc_no_match:
+
// compute bracket level for implicit line joining
if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
lex->nested_bracket_level += 1;