Diffstat (limited to 'py/lexer.c')
-rw-r--r--  py/lexer.c  355
1 file changed, 198 insertions(+), 157 deletions(-)
diff --git a/py/lexer.c b/py/lexer.c
index ad4fe3fcb8..329875ab06 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -63,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
-/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
-*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
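Note: is_char_following is un-commented here rather than newly written; the rewritten prefix scan further down calls it (e.g. is_char_following(lex, 'r')) to look one character ahead. A minimal sketch of the lookahead model these predicates wrap, using a toy stand-in struct instead of the real mp_lexer_t (types and names here are illustrative only):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned char byte;

// Toy lexer: chr0 is the current character, chr1 the next one.
// (The real mp_lexer_t also keeps chr2 for two characters of lookahead.)
typedef struct { byte chr0; byte chr1; } toy_lexer_t;

static bool toy_is_char_following(toy_lexer_t *lex, byte c) {
    return lex->chr1 == c;
}

int main(void) {
    toy_lexer_t lex = { 'b', 'r' };                   // positioned at the 'b' of "br'...'"
    printf("%d\n", toy_is_char_following(&lex, 'r')); // 1: could be a raw-bytes prefix
    return 0;
}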
@@ -106,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
+STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
+ return is_char_or(lex, '\'', '\"')
+ || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+ || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
+ && is_char_following_following_or(lex, '\'', '\"'));
+}
+
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
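Note: is_string_or_bytes factors the literal-start test out of mp_lexer_to_next so the new concatenation loop (below) can call it again after each literal. A standalone restatement of the same three-way check over an explicit lookahead window, with toy types (a sketch, not MicroPython's API):

#include <stdbool.h>

typedef unsigned char byte;

// c0 is the current char, c1/c2 the one- and two-char lookahead.
static bool starts_string_or_bytes(byte c0, byte c1, byte c2) {
    return (c0 == '\'' || c0 == '"')                       // '...' or "..."
        || ((c0 == 'r' || c0 == 'u' || c0 == 'b')
            && (c1 == '\'' || c1 == '"'))                  // r'..', u'..', b'..'
        || (((c0 == 'r' && c1 == 'b') || (c0 == 'b' && c1 == 'r'))
            && (c2 == '\'' || c2 == '"'));                 // rb'..' or br'..'
}

Note that ur'..' is deliberately not matched: Python 3 dropped the ur prefix, so the predicate only pairs r with b.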
@@ -272,14 +277,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
-void mp_lexer_to_next(mp_lexer_t *lex) {
- // start new token text
- vstr_reset(&lex->vstr);
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+ // get first quoting character
+ char quote_char = '\'';
+ if (is_char(lex, '\"')) {
+ quote_char = '\"';
+ }
+ next_char(lex);
- // skip white space and comments
+ // work out if it's a single or triple quoted literal
+ size_t num_quotes;
+ if (is_char_and(lex, quote_char, quote_char)) {
+ // triple quotes
+ next_char(lex);
+ next_char(lex);
+ num_quotes = 3;
+ } else {
+ // single quotes
+ num_quotes = 1;
+ }
+
+ size_t n_closing = 0;
+ while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
+ if (is_char(lex, quote_char)) {
+ n_closing += 1;
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+ } else {
+ n_closing = 0;
+ if (is_char(lex, '\\')) {
+ next_char(lex);
+ unichar c = CUR_CHAR(lex);
+ if (is_raw) {
+ // raw strings allow escaping of quotes, but the backslash is also emitted
+ vstr_add_char(&lex->vstr, '\\');
+ } else {
+ switch (c) {
+ // note: "c" can never be MP_LEXER_EOF because next_char
+ // always inserts a newline at the end of the input stream
+ case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
+ case '\\': break;
+ case '\'': break;
+ case '"': break;
+ case 'a': c = 0x07; break;
+ case 'b': c = 0x08; break;
+ case 't': c = 0x09; break;
+ case 'n': c = 0x0a; break;
+ case 'v': c = 0x0b; break;
+ case 'f': c = 0x0c; break;
+ case 'r': c = 0x0d; break;
+ case 'u':
+ case 'U':
+ if (lex->tok_kind == MP_TOKEN_BYTES) {
+ // b'\u1234' == b'\\u1234'
+ vstr_add_char(&lex->vstr, '\\');
+ break;
+ }
+ // Otherwise fall through.
+ case 'x':
+ {
+ mp_uint_t num = 0;
+ if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
+ // not enough hex chars for escape sequence
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ c = num;
+ break;
+ }
+ case 'N':
+ // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
+ // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
+ // 3MB of text; even gzip-compressed and with minimal structure, it'll take
+ // roughly half a meg of storage. This form of Unicode escape may be added
+ // later on, but it's definitely not a priority right now. -- CJA 20140607
+ mp_not_implemented("unicode name escapes");
+ break;
+ default:
+ if (c >= '0' && c <= '7') {
+ // Octal sequence, 1-3 chars
+ mp_uint_t digits = 3;
+ mp_uint_t num = c - '0';
+ while (is_following_odigit(lex) && --digits != 0) {
+ next_char(lex);
+ num = num * 8 + (CUR_CHAR(lex) - '0');
+ }
+ c = num;
+ } else {
+ // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+ vstr_add_char(&lex->vstr, '\\');
+ }
+ break;
+ }
+ }
+ if (c != MP_LEXER_EOF) {
+ if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
+ if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+ vstr_add_char(&lex->vstr, c);
+ } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // unicode character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ } else {
+ // without unicode everything is just added as an 8-bit byte
+ if (c < 0x100) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // 8-bit character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ }
+ }
+ } else {
+ // Add the "character" as a byte so that we remain 8-bit clean.
+ // This way, strings are parsed correctly whether or not they contain utf-8 chars.
+ vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
+ }
+ }
+ next_char(lex);
+ }
+
+ // check we got the required end quotes
+ if (n_closing < num_quotes) {
+ lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
+ }
+
+ // cut off the end quotes from the token text
+ vstr_cut_tail_bytes(&lex->vstr, n_closing);
+}
+
+STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
+ if (stop_at_newline && lex->nested_bracket_level == 0) {
+ break;
+ }
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
@@ -298,6 +433,15 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
break;
}
}
+ return had_physical_newline;
+}
+
+void mp_lexer_to_next(mp_lexer_t *lex) {
+ // start new token text
+ vstr_reset(&lex->vstr);
+
+ // skip white space and comments
+ bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
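Note on the escape handling in parse_string_literal above: the octal branch accepts one to three digits, consuming follow-on digits only while they are in '0'..'7' and the bounded counter allows it. A self-contained sketch of that accumulation, reading from a plain string instead of the lexer stream (function name and setup are illustrative):

#include <stdio.h>

// Decode an octal escape of 1-3 digits; s points at the first digit
// (just after the backslash). Mirrors the bounded `--digits != 0` loop above,
// leaving *s at the last digit consumed, like CUR_CHAR in the lexer.
static unsigned decode_octal_escape(const char **s) {
    unsigned digits = 3;
    unsigned num = (unsigned)(**s - '0');
    while ((*s)[1] >= '0' && (*s)[1] <= '7' && --digits != 0) {
        ++*s;
        num = num * 8 + (unsigned)(**s - '0');
    }
    return num;
}

int main(void) {
    const char *p = "101 rest";
    printf("%u\n", decode_octal_escape(&p)); // 65: '\101' is 'A'
    return 0;
}

The hex escapes work the same way via get_hex, but with a fixed digit count: 2 for \x, 4 for \u, 8 for \U.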
@@ -332,168 +476,65 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
- } else if (is_char_or(lex, '\'', '\"')
- || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
- || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
+ } else if (is_string_or_bytes(lex)) {
// a string or bytes literal
- // parse type codes
- bool is_raw = false;
- bool is_bytes = false;
- if (is_char(lex, 'u')) {
- next_char(lex);
- } else if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- }
- } else if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- }
- }
+ // Python requires adjacent string/bytes literals to be automatically
+ // concatenated. We do it here in the tokeniser to make efficient use of RAM,
+ // because then the lexer's vstr can be used to accumulate the string literal,
+ // in contrast to creating a parse tree of strings and then joining them later
+ // in the compiler. It's also more compact in code size to do it here.
- // set token kind
- if (is_bytes) {
- lex->tok_kind = MP_TOKEN_BYTES;
- } else {
- lex->tok_kind = MP_TOKEN_STRING;
- }
+ // MP_TOKEN_END is used to indicate that this is the first string token
+ lex->tok_kind = MP_TOKEN_END;
- // get first quoting character
- char quote_char = '\'';
- if (is_char(lex, '\"')) {
- quote_char = '\"';
- }
- next_char(lex);
+ // Loop to accumulate string/bytes literals
+ do {
+ // parse type codes
+ bool is_raw = false;
+ mp_token_kind_t kind = MP_TOKEN_STRING;
+ int n_char = 0;
+ if (is_char(lex, 'u')) {
+ n_char = 1;
+ } else if (is_char(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 1;
+ if (is_char_following(lex, 'r')) {
+ is_raw = true;
+ n_char = 2;
+ }
+ } else if (is_char(lex, 'r')) {
+ is_raw = true;
+ n_char = 1;
+ if (is_char_following(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 2;
+ }
+ }
- // work out if it's a single or triple quoted literal
- mp_uint_t num_quotes;
- if (is_char_and(lex, quote_char, quote_char)) {
- // triple quotes
- next_char(lex);
- next_char(lex);
- num_quotes = 3;
- } else {
- // single quotes
- num_quotes = 1;
- }
+ // Set or check token kind
+ if (lex->tok_kind == MP_TOKEN_END) {
+ lex->tok_kind = kind;
+ } else if (lex->tok_kind != kind) {
+ // Can't concatenate string with bytes
+ break;
+ }
- // parse the literal
- mp_uint_t n_closing = 0;
- while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
- if (is_char(lex, quote_char)) {
- n_closing += 1;
- vstr_add_char(&lex->vstr, CUR_CHAR(lex));
- } else {
- n_closing = 0;
- if (is_char(lex, '\\')) {
+ // Skip any type code characters
+ if (n_char != 0) {
+ next_char(lex);
+ if (n_char == 2) {
next_char(lex);
- unichar c = CUR_CHAR(lex);
- if (is_raw) {
- // raw strings allow escaping of quotes, but the backslash is also emitted
- vstr_add_char(&lex->vstr, '\\');
- } else {
- switch (c) {
- // note: "c" can never be MP_LEXER_EOF because next_char
- // always inserts a newline at the end of the input stream
- case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
- case '\\': break;
- case '\'': break;
- case '"': break;
- case 'a': c = 0x07; break;
- case 'b': c = 0x08; break;
- case 't': c = 0x09; break;
- case 'n': c = 0x0a; break;
- case 'v': c = 0x0b; break;
- case 'f': c = 0x0c; break;
- case 'r': c = 0x0d; break;
- case 'u':
- case 'U':
- if (is_bytes) {
- // b'\u1234' == b'\\u1234'
- vstr_add_char(&lex->vstr, '\\');
- break;
- }
- // Otherwise fall through.
- case 'x':
- {
- mp_uint_t num = 0;
- if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
- // not enough hex chars for escape sequence
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- c = num;
- break;
- }
- case 'N':
- // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
- // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
- // 3MB of text; even gzip-compressed and with minimal structure, it'll take
- // roughly half a meg of storage. This form of Unicode escape may be added
- // later on, but it's definitely not a priority right now. -- CJA 20140607
- mp_not_implemented("unicode name escapes");
- break;
- default:
- if (c >= '0' && c <= '7') {
- // Octal sequence, 1-3 chars
- mp_uint_t digits = 3;
- mp_uint_t num = c - '0';
- while (is_following_odigit(lex) && --digits != 0) {
- next_char(lex);
- num = num * 8 + (CUR_CHAR(lex) - '0');
- }
- c = num;
- } else {
- // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
- vstr_add_char(&lex->vstr, '\\');
- }
- break;
- }
- }
- if (c != MP_LEXER_EOF) {
- if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
- if (c < 0x110000 && !is_bytes) {
- vstr_add_char(&lex->vstr, c);
- } else if (c < 0x100 && is_bytes) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // unicode character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- } else {
- // without unicode everything is just added as an 8-bit byte
- if (c < 0x100) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // 8-bit character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- }
- }
- } else {
- // Add the "character" as a byte so that we remain 8-bit clean.
- // This way, strings are parsed correctly whether or not they contain utf-8 chars.
- vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
}
}
- next_char(lex);
- }
- // check we got the required end quotes
- if (n_closing < num_quotes) {
- lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
- }
+ // Parse the literal
+ parse_string_literal(lex, is_raw);
+
+ // Skip whitespace so we can check if there's another string following
+ skip_whitespace(lex, true);
- // cut off the end quotes from the token text
- vstr_cut_tail_bytes(&lex->vstr, n_closing);
+ } while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
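Note: the do/while loop above implements CPython-style implicit concatenation of adjacent literals ('abc' 'def' lexes as one STRING token "abcdef"), using MP_TOKEN_END as a "no literal accumulated yet" sentinel and refusing to mix str with bytes. A distilled, self-contained sketch of just that control flow, with the literals pre-split into a toy array instead of being pulled from the lexer (all names illustrative):

#include <stdio.h>
#include <string.h>

typedef enum { TOK_END, TOK_STRING, TOK_BYTES } tok_kind_t;

// Toy stand-ins for adjacent literals already split out of the source.
typedef struct { tok_kind_t kind; const char *text; } literal_t;

int main(void) {
    // "abc" "def" -> one STRING token; the trailing b"x" stops the loop.
    literal_t lits[] = { { TOK_STRING, "abc" }, { TOK_STRING, "def" }, { TOK_BYTES, "x" } };
    char out[64] = "";
    tok_kind_t kind = TOK_END;  // sentinel: no literal accumulated yet
    for (size_t i = 0; i < sizeof(lits) / sizeof(lits[0]); ++i) {
        if (kind == TOK_END) {
            kind = lits[i].kind;          // first literal fixes the token kind
        } else if (kind != lits[i].kind) {
            break;                        // can't concatenate str with bytes
        }
        strcat(out, lits[i].text);        // the lexer appends to its vstr here
    }
    printf("%s\n", out);                  // prints "abcdef"
    return 0;
}

As the comment in the hunk explains, doing the join in the tokeniser lets the single vstr accumulate the whole result, instead of building per-literal parse nodes and joining them later in the compiler.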