diff options
author | Damien George <damien.p.george@gmail.com> | 2017-03-29 10:55:36 +1100 |
---|---|---|
committer | Damien George <damien.p.george@gmail.com> | 2017-03-29 10:56:52 +1100 |
commit | 5010d1958feee84fe49f887ff6e4b1d37d211152 (patch) | |
tree | 57ae2f6a7f51b9fdfc4ee8371df0643b2b5f12a0 /py | |
parent | e93c4ca18101e97ab5d427b97d4e0b6a72e31deb (diff) | |
download | micropython-5010d1958feee84fe49f887ff6e4b1d37d211152.tar.gz micropython-5010d1958feee84fe49f887ff6e4b1d37d211152.zip |
py/lexer: Simplify and reduce code size for operator tokenising.
By removing the 'E' code from the operator token encoding mini-language, the
tokenising can be simplified. The 'E' code was only used for the "!="
operator, which is now handled as a special case; the optimisations for the
general case more than make up for the addition of this single special
case. Furthermore, the "." and "..." operators can be handled in the same way
as "!=", which reduces the code size a little further.
This simplification also removes a "goto".
Changes in code size for this patch are (measured in bytes):
bare-arm: -48
minimal x86: -64
unix x86-64: -112
unix nanbox: -64
stmhal: -48
cc3200: -48
esp8266: -76
Diffstat (limited to 'py')
-rw-r--r-- | py/lexer.c | 82 |
1 file changed, 28 insertions, 54 deletions
diff --git a/py/lexer.c b/py/lexer.c index a91f5c9c8b..05651abecf 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -176,7 +176,6 @@ STATIC void indent_pop(mp_lexer_t *lex) { // some tricky operator encoding: // <op> = begin with <op>, if this opchar matches then begin here // e<op> = end with <op>, if this opchar matches then end -// E<op> = mandatory end with <op>, this opchar must match, then end // c<op> = continue with <op>, if this opchar matches then continue matching // this means if the start of two ops are the same then they are equal til the last char @@ -193,7 +192,7 @@ STATIC const char *const tok_enc = "%e=" // % %= "^e=" // ^ ^= "=e=" // = == - "!E="; // != + "!."; // start of special cases: != . ... // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries STATIC const uint8_t tok_enc_kind[] = { @@ -213,7 +212,6 @@ STATIC const uint8_t tok_enc_kind[] = { MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL, MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL, MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL, - MP_TOKEN_OP_NOT_EQUAL, }; // must have the same order as enum in lexer.h @@ -603,20 +601,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) { } } - } else if (is_char(lex, '.')) { - // special handling for . and ... operators, because .. 
is not a valid operator - - // get first char - next_char(lex); - - if (is_char_and(lex, '.', '.')) { - next_char(lex); - next_char(lex); - lex->tok_kind = MP_TOKEN_ELLIPSIS; - } else { - lex->tok_kind = MP_TOKEN_DEL_PERIOD; - } - } else { // search for encoded delimiter or operator @@ -625,9 +609,6 @@ void mp_lexer_to_next(mp_lexer_t *lex) { for (; *t != 0 && !is_char(lex, *t); t += 1) { if (*t == 'e' || *t == 'c') { t += 1; - } else if (*t == 'E') { - tok_enc_index -= 1; - t += 1; } tok_enc_index += 1; } @@ -638,55 +619,48 @@ void mp_lexer_to_next(mp_lexer_t *lex) { // didn't match any delimiter or operator characters lex->tok_kind = MP_TOKEN_INVALID; + } else if (*t == '!') { + // "!=" is a special case because "!" is not a valid operator + if (is_char(lex, '=')) { + next_char(lex); + lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL; + } else { + lex->tok_kind = MP_TOKEN_INVALID; + } + + } else if (*t == '.') { + // "." and "..." are special cases because ".." is not a valid operator + if (is_char_and(lex, '.', '.')) { + next_char(lex); + next_char(lex); + lex->tok_kind = MP_TOKEN_ELLIPSIS; + } else { + lex->tok_kind = MP_TOKEN_DEL_PERIOD; + } + } else { // matched a delimiter or operator character // get the maximum characters for a valid token t += 1; size_t t_index = tok_enc_index; - for (;;) { - for (; *t == 'e'; t += 1) { - t += 1; - t_index += 1; - if (is_char(lex, *t)) { - next_char(lex); - tok_enc_index = t_index; - break; - } - } - - if (*t == 'E') { - t += 1; - if (is_char(lex, *t)) { - next_char(lex); - tok_enc_index = t_index; - } else { - lex->tok_kind = MP_TOKEN_INVALID; - goto tok_enc_no_match; - } - break; - } - - if (*t == 'c') { - t += 1; - t_index += 1; - if (is_char(lex, *t)) { - next_char(lex); - tok_enc_index = t_index; - t += 1; - } else { + while (*t == 'c' || *t == 'e') { + t_index += 1; + if (is_char(lex, t[1])) { + next_char(lex); + tok_enc_index = t_index; + if (*t == 'e') { break; } - } else { + } else if (*t == 'c') { break; } + t += 2; } // 
set token kind lex->tok_kind = tok_enc_kind[tok_enc_index]; - tok_enc_no_match: - // compute bracket level for implicit line joining if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) { lex->nested_bracket_level += 1; |