summaryrefslogtreecommitdiffstatshomepage
path: root/py/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'py/lexer.c')
-rw-r--r--py/lexer.c627
1 files changed, 286 insertions, 341 deletions
diff --git a/py/lexer.c b/py/lexer.c
index 458fba0900..abc1f3ebbb 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -25,6 +25,7 @@
*/
#include <stdio.h>
+#include <string.h>
#include <assert.h>
#include "py/mpstate.h"
@@ -39,19 +40,6 @@
// TODO seems that CPython allows NULL byte in the input stream
// don't know if that's intentional or not, but we don't allow it
-// TODO replace with a call to a standard function
-STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
- mp_uint_t i = 0;
-
- while (i < len && *str == *strn) {
- ++i;
- ++str;
- ++strn;
- }
-
- return i == len && *str == 0;
-}
-
#define MP_LEXER_EOF ((unichar)MP_READER_EOF)
#define CUR_CHAR(lex) ((lex)->chr0)
@@ -75,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
}
-/*
STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
return lex->chr1 == c;
}
-*/
STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
return lex->chr1 == c1 || lex->chr1 == c2;
@@ -118,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
+STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
+ return is_char_or(lex, '\'', '\"')
+ || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+ || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
+ && is_char_following_following_or(lex, '\'', '\"'));
+}
+
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
@@ -144,36 +137,30 @@ STATIC void next_char(mp_lexer_t *lex) {
lex->chr1 = lex->chr2;
lex->chr2 = lex->reader.readbyte(lex->reader.data);
- if (lex->chr0 == '\r') {
+ if (lex->chr1 == '\r') {
// CR is a new line, converted to LF
- lex->chr0 = '\n';
- if (lex->chr1 == '\n') {
- // CR LF is a single new line
- lex->chr1 = lex->chr2;
+ lex->chr1 = '\n';
+ if (lex->chr2 == '\n') {
+ // CR LF is a single new line, throw out the extra LF
lex->chr2 = lex->reader.readbyte(lex->reader.data);
}
}
- if (lex->chr2 == MP_LEXER_EOF) {
- // EOF, check if we need to insert a newline at end of file
- if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
- // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
- // otherwise it just inserts a LF
- lex->chr2 = '\n';
- }
+ // check if we need to insert a newline at end of file
+ if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
+ lex->chr2 = '\n';
}
}
-STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
+STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
if (lex->num_indent_level >= lex->alloc_indent_level) {
- // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
}
lex->indent_level[lex->num_indent_level++] = indent;
}
-STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
+STATIC size_t indent_top(mp_lexer_t *lex) {
return lex->indent_level[lex->num_indent_level - 1];
}
@@ -184,7 +171,6 @@ STATIC void indent_pop(mp_lexer_t *lex) {
// some tricky operator encoding:
// <op> = begin with <op>, if this opchar matches then begin here
// e<op> = end with <op>, if this opchar matches then end
-// E<op> = mandatory end with <op>, this opchar must match, then end
// c<op> = continue with <op>, if this opchar matches then continue matching
// this means if the start of two ops are the same then they are equal til the last char
@@ -201,7 +187,7 @@ STATIC const char *const tok_enc =
"%e=" // % %=
"^e=" // ^ ^=
"=e=" // = ==
- "!E="; // !=
+ "!."; // start of special cases: != . ...
// TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
STATIC const uint8_t tok_enc_kind[] = {
@@ -221,14 +207,15 @@ STATIC const uint8_t tok_enc_kind[] = {
MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
- MP_TOKEN_OP_NOT_EQUAL,
};
// must have the same order as enum in lexer.h
+// must be sorted according to strcmp
STATIC const char *const tok_kw[] = {
"False",
"None",
"True",
+ "__debug__",
"and",
"as",
"assert",
@@ -263,13 +250,12 @@ STATIC const char *const tok_kw[] = {
"while",
"with",
"yield",
- "__debug__",
};
// This is called with CUR_CHAR() before first hex digit, and should return with
// it pointing to last hex digit
// num_digits must be greater than zero
-STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
+STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
mp_uint_t num = 0;
while (num_digits-- != 0) {
next_char(lex);
@@ -283,14 +269,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
return true;
}
-STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
- // start new token text
- vstr_reset(&lex->vstr);
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+ // get first quoting character
+ char quote_char = '\'';
+ if (is_char(lex, '\"')) {
+ quote_char = '\"';
+ }
+ next_char(lex);
- // skip white space and comments
+ // work out if it's a single or triple quoted literal
+ size_t num_quotes;
+ if (is_char_and(lex, quote_char, quote_char)) {
+ // triple quotes
+ next_char(lex);
+ next_char(lex);
+ num_quotes = 3;
+ } else {
+ // single quotes
+ num_quotes = 1;
+ }
+
+ size_t n_closing = 0;
+ while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
+ if (is_char(lex, quote_char)) {
+ n_closing += 1;
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+ } else {
+ n_closing = 0;
+ if (is_char(lex, '\\')) {
+ next_char(lex);
+ unichar c = CUR_CHAR(lex);
+ if (is_raw) {
+ // raw strings allow escaping of quotes, but the backslash is also emitted
+ vstr_add_char(&lex->vstr, '\\');
+ } else {
+ switch (c) {
+ // note: "c" can never be MP_LEXER_EOF because next_char
+ // always inserts a newline at the end of the input stream
+ case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
+ case '\\': break;
+ case '\'': break;
+ case '"': break;
+ case 'a': c = 0x07; break;
+ case 'b': c = 0x08; break;
+ case 't': c = 0x09; break;
+ case 'n': c = 0x0a; break;
+ case 'v': c = 0x0b; break;
+ case 'f': c = 0x0c; break;
+ case 'r': c = 0x0d; break;
+ case 'u':
+ case 'U':
+ if (lex->tok_kind == MP_TOKEN_BYTES) {
+ // b'\u1234' == b'\\u1234'
+ vstr_add_char(&lex->vstr, '\\');
+ break;
+ }
+ // Otherwise fall through.
+ case 'x':
+ {
+ mp_uint_t num = 0;
+ if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
+ // not enough hex chars for escape sequence
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ c = num;
+ break;
+ }
+ case 'N':
+ // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
+ // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
+ // 3MB of text; even gzip-compressed and with minimal structure, it'll take
+ // roughly half a meg of storage. This form of Unicode escape may be added
+ // later on, but it's definitely not a priority right now. -- CJA 20140607
+ mp_not_implemented("unicode name escapes");
+ break;
+ default:
+ if (c >= '0' && c <= '7') {
+ // Octal sequence, 1-3 chars
+ size_t digits = 3;
+ mp_uint_t num = c - '0';
+ while (is_following_odigit(lex) && --digits != 0) {
+ next_char(lex);
+ num = num * 8 + (CUR_CHAR(lex) - '0');
+ }
+ c = num;
+ } else {
+ // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+ vstr_add_char(&lex->vstr, '\\');
+ }
+ break;
+ }
+ }
+ if (c != MP_LEXER_EOF) {
+ if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
+ if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+ vstr_add_char(&lex->vstr, c);
+ } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // unicode character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ } else {
+ // without unicode everything is just added as an 8-bit byte
+ if (c < 0x100) {
+ vstr_add_byte(&lex->vstr, c);
+ } else {
+ // 8-bit character out of range
+ // this raises a generic SyntaxError; could provide more info
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+ }
+ }
+ } else {
+ // Add the "character" as a byte so that we remain 8-bit clean.
+ // This way, strings are parsed correctly whether or not they contain utf-8 chars.
+ vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
+ }
+ }
+ next_char(lex);
+ }
+
+ // check we got the required end quotes
+ if (n_closing < num_quotes) {
+ lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
+ }
+
+ // cut off the end quotes from the token text
+ vstr_cut_tail_bytes(&lex->vstr, n_closing);
+}
+
+STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
+ if (stop_at_newline && lex->nested_bracket_level == 0) {
+ break;
+ }
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
@@ -301,35 +417,29 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
next_char(lex);
}
// had_physical_newline will be set on next loop
- } else if (is_char(lex, '\\')) {
- // backslash (outside string literals) must appear just before a physical newline
+ } else if (is_char_and(lex, '\\', '\n')) {
+ // line-continuation, so don't set had_physical_newline
+ next_char(lex);
next_char(lex);
- if (!is_physical_newline(lex)) {
- // SyntaxError: unexpected character after line continuation character
- lex->tok_line = lex->line;
- lex->tok_column = lex->column;
- lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
- return;
- } else {
- next_char(lex);
- }
} else {
break;
}
}
+ return had_physical_newline;
+}
+
+void mp_lexer_to_next(mp_lexer_t *lex) {
+ // start new token text
+ vstr_reset(&lex->vstr);
+
+ // skip white space and comments
+ bool had_physical_newline = skip_whitespace(lex, false);
// set token source information
lex->tok_line = lex->line;
lex->tok_column = lex->column;
- if (first_token && lex->line == 1 && lex->column != 1) {
- // check that the first token is in the first column
- // if first token is not on first line, we get a physical newline and
- // this check is done as part of normal indent/dedent checking below
- // (done to get equivalence with CPython)
- lex->tok_kind = MP_TOKEN_INDENT;
-
- } else if (lex->emit_dent < 0) {
+ if (lex->emit_dent < 0) {
lex->tok_kind = MP_TOKEN_DEDENT;
lex->emit_dent += 1;
@@ -340,7 +450,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
lex->tok_kind = MP_TOKEN_NEWLINE;
- mp_uint_t num_spaces = lex->column - 1;
+ size_t num_spaces = lex->column - 1;
if (num_spaces == indent_top(lex)) {
} else if (num_spaces > indent_top(lex)) {
indent_push(lex, num_spaces);
@@ -358,168 +468,65 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
} else if (is_end(lex)) {
lex->tok_kind = MP_TOKEN_END;
- } else if (is_char_or(lex, '\'', '\"')
- || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
- || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
+ } else if (is_string_or_bytes(lex)) {
// a string or bytes literal
- // parse type codes
- bool is_raw = false;
- bool is_bytes = false;
- if (is_char(lex, 'u')) {
- next_char(lex);
- } else if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- }
- } else if (is_char(lex, 'r')) {
- is_raw = true;
- next_char(lex);
- if (is_char(lex, 'b')) {
- is_bytes = true;
- next_char(lex);
- }
- }
+ // Python requires adjacent string/bytes literals to be automatically
+ // concatenated. We do it here in the tokeniser to make efficient use of RAM,
+ // because then the lexer's vstr can be used to accumulate the string literal,
+ // in contrast to creating a parse tree of strings and then joining them later
+ // in the compiler. It's also more compact in code size to do it here.
- // set token kind
- if (is_bytes) {
- lex->tok_kind = MP_TOKEN_BYTES;
- } else {
- lex->tok_kind = MP_TOKEN_STRING;
- }
+ // MP_TOKEN_END is used to indicate that this is the first string token
+ lex->tok_kind = MP_TOKEN_END;
- // get first quoting character
- char quote_char = '\'';
- if (is_char(lex, '\"')) {
- quote_char = '\"';
- }
- next_char(lex);
+ // Loop to accumulate string/bytes literals
+ do {
+ // parse type codes
+ bool is_raw = false;
+ mp_token_kind_t kind = MP_TOKEN_STRING;
+ int n_char = 0;
+ if (is_char(lex, 'u')) {
+ n_char = 1;
+ } else if (is_char(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 1;
+ if (is_char_following(lex, 'r')) {
+ is_raw = true;
+ n_char = 2;
+ }
+ } else if (is_char(lex, 'r')) {
+ is_raw = true;
+ n_char = 1;
+ if (is_char_following(lex, 'b')) {
+ kind = MP_TOKEN_BYTES;
+ n_char = 2;
+ }
+ }
- // work out if it's a single or triple quoted literal
- mp_uint_t num_quotes;
- if (is_char_and(lex, quote_char, quote_char)) {
- // triple quotes
- next_char(lex);
- next_char(lex);
- num_quotes = 3;
- } else {
- // single quotes
- num_quotes = 1;
- }
+ // Set or check token kind
+ if (lex->tok_kind == MP_TOKEN_END) {
+ lex->tok_kind = kind;
+ } else if (lex->tok_kind != kind) {
+ // Can't concatenate string with bytes
+ break;
+ }
- // parse the literal
- mp_uint_t n_closing = 0;
- while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
- if (is_char(lex, quote_char)) {
- n_closing += 1;
- vstr_add_char(&lex->vstr, CUR_CHAR(lex));
- } else {
- n_closing = 0;
- if (is_char(lex, '\\')) {
+ // Skip any type code characters
+ if (n_char != 0) {
+ next_char(lex);
+ if (n_char == 2) {
next_char(lex);
- unichar c = CUR_CHAR(lex);
- if (is_raw) {
- // raw strings allow escaping of quotes, but the backslash is also emitted
- vstr_add_char(&lex->vstr, '\\');
- } else {
- switch (c) {
- // note: "c" can never be MP_LEXER_EOF because next_char
- // always inserts a newline at the end of the input stream
- case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
- case '\\': break;
- case '\'': break;
- case '"': break;
- case 'a': c = 0x07; break;
- case 'b': c = 0x08; break;
- case 't': c = 0x09; break;
- case 'n': c = 0x0a; break;
- case 'v': c = 0x0b; break;
- case 'f': c = 0x0c; break;
- case 'r': c = 0x0d; break;
- case 'u':
- case 'U':
- if (is_bytes) {
- // b'\u1234' == b'\\u1234'
- vstr_add_char(&lex->vstr, '\\');
- break;
- }
- // Otherwise fall through.
- case 'x':
- {
- mp_uint_t num = 0;
- if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
- // not enough hex chars for escape sequence
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- c = num;
- break;
- }
- case 'N':
- // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
- // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
- // 3MB of text; even gzip-compressed and with minimal structure, it'll take
- // roughly half a meg of storage. This form of Unicode escape may be added
- // later on, but it's definitely not a priority right now. -- CJA 20140607
- mp_not_implemented("unicode name escapes");
- break;
- default:
- if (c >= '0' && c <= '7') {
- // Octal sequence, 1-3 chars
- mp_uint_t digits = 3;
- mp_uint_t num = c - '0';
- while (is_following_odigit(lex) && --digits != 0) {
- next_char(lex);
- num = num * 8 + (CUR_CHAR(lex) - '0');
- }
- c = num;
- } else {
- // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
- vstr_add_char(&lex->vstr, '\\');
- }
- break;
- }
- }
- if (c != MP_LEXER_EOF) {
- if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
- if (c < 0x110000 && !is_bytes) {
- vstr_add_char(&lex->vstr, c);
- } else if (c < 0x100 && is_bytes) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // unicode character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- } else {
- // without unicode everything is just added as an 8-bit byte
- if (c < 0x100) {
- vstr_add_byte(&lex->vstr, c);
- } else {
- // 8-bit character out of range
- // this raises a generic SyntaxError; could provide more info
- lex->tok_kind = MP_TOKEN_INVALID;
- }
- }
- }
- } else {
- // Add the "character" as a byte so that we remain 8-bit clean.
- // This way, strings are parsed correctly whether or not they contain utf-8 chars.
- vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
}
}
- next_char(lex);
- }
- // check we got the required end quotes
- if (n_closing < num_quotes) {
- lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
- }
+ // Parse the literal
+ parse_string_literal(lex, is_raw);
+
+ // Skip whitespace so we can check if there's another string following
+ skip_whitespace(lex, true);
- // cut off the end quotes from the token text
- vstr_cut_tail_bytes(&lex->vstr, n_closing);
+ } while (is_string_or_bytes(lex));
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
@@ -534,6 +541,25 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
next_char(lex);
}
+ // Check if the name is a keyword.
+ // We also check for __debug__ here and convert it to its value. This is
+ // so the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
+ // need to check for this special token in many places in the compiler.
+ const char *s = vstr_null_terminated_str(&lex->vstr);
+ for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
+ int cmp = strcmp(s, tok_kw[i]);
+ if (cmp == 0) {
+ lex->tok_kind = MP_TOKEN_KW_FALSE + i;
+ if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
+ lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
+ }
+ break;
+ } else if (cmp < 0) {
+ // Table is sorted and comparison was less-than, so stop searching
+ break;
+ }
+ }
+
} else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
bool forced_integer = false;
if (is_char(lex, '.')) {
@@ -570,34 +596,14 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
}
}
- } else if (is_char(lex, '.')) {
- // special handling for . and ... operators, because .. is not a valid operator
-
- // get first char
- vstr_add_char(&lex->vstr, '.');
- next_char(lex);
-
- if (is_char_and(lex, '.', '.')) {
- vstr_add_char(&lex->vstr, '.');
- vstr_add_char(&lex->vstr, '.');
- next_char(lex);
- next_char(lex);
- lex->tok_kind = MP_TOKEN_ELLIPSIS;
- } else {
- lex->tok_kind = MP_TOKEN_DEL_PERIOD;
- }
-
} else {
// search for encoded delimiter or operator
const char *t = tok_enc;
- mp_uint_t tok_enc_index = 0;
+ size_t tok_enc_index = 0;
for (; *t != 0 && !is_char(lex, *t); t += 1) {
if (*t == 'e' || *t == 'c') {
t += 1;
- } else if (*t == 'E') {
- tok_enc_index -= 1;
- t += 1;
}
tok_enc_index += 1;
}
@@ -608,55 +614,48 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
// didn't match any delimiter or operator characters
lex->tok_kind = MP_TOKEN_INVALID;
+ } else if (*t == '!') {
+ // "!=" is a special case because "!" is not a valid operator
+ if (is_char(lex, '=')) {
+ next_char(lex);
+ lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
+ } else {
+ lex->tok_kind = MP_TOKEN_INVALID;
+ }
+
+ } else if (*t == '.') {
+ // "." and "..." are special cases because ".." is not a valid operator
+ if (is_char_and(lex, '.', '.')) {
+ next_char(lex);
+ next_char(lex);
+ lex->tok_kind = MP_TOKEN_ELLIPSIS;
+ } else {
+ lex->tok_kind = MP_TOKEN_DEL_PERIOD;
+ }
+
} else {
// matched a delimiter or operator character
// get the maximum characters for a valid token
t += 1;
- mp_uint_t t_index = tok_enc_index;
- for (;;) {
- for (; *t == 'e'; t += 1) {
- t += 1;
- t_index += 1;
- if (is_char(lex, *t)) {
- next_char(lex);
- tok_enc_index = t_index;
- break;
- }
- }
-
- if (*t == 'E') {
- t += 1;
- if (is_char(lex, *t)) {
- next_char(lex);
- tok_enc_index = t_index;
- } else {
- lex->tok_kind = MP_TOKEN_INVALID;
- goto tok_enc_no_match;
- }
- break;
- }
-
- if (*t == 'c') {
- t += 1;
- t_index += 1;
- if (is_char(lex, *t)) {
- next_char(lex);
- tok_enc_index = t_index;
- t += 1;
- } else {
+ size_t t_index = tok_enc_index;
+ while (*t == 'c' || *t == 'e') {
+ t_index += 1;
+ if (is_char(lex, t[1])) {
+ next_char(lex);
+ tok_enc_index = t_index;
+ if (*t == 'e') {
break;
}
- } else {
+ } else if (*t == 'c') {
break;
}
+ t += 2;
}
// set token kind
lex->tok_kind = tok_enc_kind[tok_enc_index];
- tok_enc_no_match:
-
// compute bracket level for implicit line joining
if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
lex->nested_bracket_level += 1;
@@ -665,102 +664,55 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
}
}
}
-
- // check for keywords
- if (lex->tok_kind == MP_TOKEN_NAME) {
- // We check for __debug__ here and convert it to its value. This is so
- // the parser gives a syntax error on, eg, x.__debug__. Otherwise, we
- // need to check for this special token in many places in the compiler.
- // TODO improve speed of these string comparisons
- //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
- for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
- if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
- if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
- // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
- lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
- } else {
- lex->tok_kind = MP_TOKEN_KW_FALSE + i;
- }
- break;
- }
- }
- }
}
mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
- mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
-
- // check for memory allocation error
- if (lex == NULL) {
- reader.close(reader.data);
- return NULL;
- }
+ mp_lexer_t *lex = m_new_obj(mp_lexer_t);
lex->source_name = src_name;
lex->reader = reader;
lex->line = 1;
- lex->column = 1;
+ lex->column = -2; // account for 3 dummy bytes
lex->emit_dent = 0;
lex->nested_bracket_level = 0;
lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
lex->num_indent_level = 1;
- lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
+ lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
vstr_init(&lex->vstr, 32);
- // check for memory allocation error
- // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
- if (lex->indent_level == NULL) {
- mp_lexer_free(lex);
- return NULL;
- }
-
// store sentinel for first indentation level
lex->indent_level[0] = 0;
- // preload characters
- lex->chr0 = reader.readbyte(reader.data);
- lex->chr1 = reader.readbyte(reader.data);
- lex->chr2 = reader.readbyte(reader.data);
-
- // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
- if (lex->chr0 == MP_LEXER_EOF) {
- lex->chr0 = '\n';
- } else if (lex->chr1 == MP_LEXER_EOF) {
- if (lex->chr0 == '\r') {
- lex->chr0 = '\n';
- } else if (lex->chr0 != '\n') {
- lex->chr1 = '\n';
- }
- } else if (lex->chr2 == MP_LEXER_EOF) {
- if (lex->chr1 == '\r') {
- lex->chr1 = '\n';
- } else if (lex->chr1 != '\n') {
- lex->chr2 = '\n';
- }
- }
+ // load lexer with start of file, advancing lex->column to 1
+ // start with dummy bytes and use next_char() for proper EOL/EOF handling
+ lex->chr0 = lex->chr1 = lex->chr2 = 0;
+ next_char(lex);
+ next_char(lex);
+ next_char(lex);
// preload first token
- mp_lexer_next_token_into(lex, true);
+ mp_lexer_to_next(lex);
+
+ // Check that the first token is in the first column. If it's not then we
+ // convert the token kind to INDENT so that the parser gives a syntax error.
+ if (lex->tok_column != 1) {
+ lex->tok_kind = MP_TOKEN_INDENT;
+ }
return lex;
}
-mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len) {
+mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
mp_reader_t reader;
- if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
- return NULL;
- }
+ mp_reader_new_mem(&reader, (const byte*)str, len, free_len);
return mp_lexer_new(src_name, reader);
}
-#if MICROPY_READER_POSIX || MICROPY_READER_FATFS
+#if MICROPY_READER_POSIX || MICROPY_READER_VFS
mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
mp_reader_t reader;
- int ret = mp_reader_new_file(&reader, filename);
- if (ret != 0) {
- return NULL;
- }
+ mp_reader_new_file(&reader, filename);
return mp_lexer_new(qstr_from_str(filename), reader);
}
@@ -768,10 +720,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
mp_reader_t reader;
- int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
- if (ret != 0) {
- return NULL;
- }
+ mp_reader_new_file_from_fd(&reader, fd, close_fd);
return mp_lexer_new(filename, reader);
}
@@ -788,10 +737,6 @@ void mp_lexer_free(mp_lexer_t *lex) {
}
}
-void mp_lexer_to_next(mp_lexer_t *lex) {
- mp_lexer_next_token_into(lex, false);
-}
-
#if 0
// This function is used to print the current token and should only be
// needed to debug the lexer, so it's not available via a config option.