1 files changed, 286 insertions, 341 deletions
diff --git a/py/lexer.c b/py/lexer.c
index 458fba0900..abc1f3ebbb 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -25,6 +25,7 @@
  */
 
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 
 #include "py/mpstate.h"
@@ -39,19 +40,6 @@
 // TODO seems that CPython allows NULL byte in the input stream
 // don't know if that's intentional or not, but we don't allow it
 
-// TODO replace with a call to a standard function
-STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
-    mp_uint_t i = 0;
-
-    while (i < len && *str == *strn) {
-        ++i;
-        ++str;
-        ++strn;
-    }
-
-    return i == len && *str == 0;
-}
-
 #define MP_LEXER_EOF ((unichar)MP_READER_EOF)
 #define CUR_CHAR(lex) ((lex)->chr0)
 
@@ -75,11 +63,9 @@ STATIC bool is_char_or3(mp_lexer_t *lex, byte c1, byte c2, byte c3) {
     return lex->chr0 == c1 || lex->chr0 == c2 || lex->chr0 == c3;
 }
 
-/*
 STATIC bool is_char_following(mp_lexer_t *lex, byte c) {
     return lex->chr1 == c;
 }
-*/
 
 STATIC bool is_char_following_or(mp_lexer_t *lex, byte c1, byte c2) {
     return lex->chr1 == c1 || lex->chr1 == c2;
@@ -118,6 +104,13 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
     return lex->chr1 >= '0' && lex->chr1 <= '7';
 }
 
+STATIC bool is_string_or_bytes(mp_lexer_t *lex) {
+    return is_char_or(lex, '\'', '\"')
+        || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
+        || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r'))
+            && is_char_following_following_or(lex, '\'', '\"'));
+}
+
 // to easily parse utf-8 identifiers we allow any raw byte with high bit set
 STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
     return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
@@ -144,36 +137,30 @@ STATIC void next_char(mp_lexer_t *lex) {
     lex->chr1 = lex->chr2;
     lex->chr2 = lex->reader.readbyte(lex->reader.data);
 
-    if (lex->chr0 == '\r') {
+    if (lex->chr1 == '\r') {
         // CR is a new line, converted to LF
-        lex->chr0 = '\n';
-        if (lex->chr1 == '\n') {
-            // CR LF is a single new line
-            lex->chr1 = lex->chr2;
+        lex->chr1 = '\n';
+        if (lex->chr2 == '\n') {
+            // CR LF is a single new line, throw out the extra LF
             lex->chr2 = lex->reader.readbyte(lex->reader.data);
         }
     }
 
-    if (lex->chr2 == MP_LEXER_EOF) {
-        // EOF, check if we need to insert a newline at end of file
-        if (lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
-            // if lex->chr1 == '\r' then this makes a CR LF which will be converted to LF above
-            // otherwise it just inserts a LF
-            lex->chr2 = '\n';
-        }
+    // check if we need to insert a newline at end of file
+    if (lex->chr2 == MP_LEXER_EOF && lex->chr1 != MP_LEXER_EOF && lex->chr1 != '\n') {
+        lex->chr2 = '\n';
     }
 }
 
-STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
+STATIC void indent_push(mp_lexer_t *lex, size_t indent) {
     if (lex->num_indent_level >= lex->alloc_indent_level) {
-        // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
         lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
         lex->alloc_indent_level += MICROPY_ALLOC_LEXEL_INDENT_INC;
     }
     lex->indent_level[lex->num_indent_level++] = indent;
 }
 
-STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
+STATIC size_t indent_top(mp_lexer_t *lex) {
     return lex->indent_level[lex->num_indent_level - 1];
 }
 
@@ -184,7 +171,6 @@ STATIC void indent_pop(mp_lexer_t *lex) {
 // some tricky operator encoding:
 //     <op>  = begin with <op>, if this opchar matches then begin here
 //     e<op> = end with <op>, if this opchar matches then end
-//     E<op> = mandatory end with <op>, this opchar must match, then end
 //     c<op> = continue with <op>, if this opchar matches then continue matching
 // this means if the start of two ops are the same then they are equal til the last char
 
@@ -201,7 +187,7 @@ STATIC const char *const tok_enc =
     "%e="         // % %=
     "^e="         // ^ ^=
     "=e="         // = ==
-    "!E=";        // !=
+    "!.";         // start of special cases: != . ...
 
 // TODO static assert that number of tokens is less than 256 so we can safely make this table with byte sized entries
 STATIC const uint8_t tok_enc_kind[] = {
@@ -221,14 +207,15 @@ STATIC const uint8_t tok_enc_kind[] = {
     MP_TOKEN_OP_PERCENT, MP_TOKEN_DEL_PERCENT_EQUAL,
     MP_TOKEN_OP_CARET, MP_TOKEN_DEL_CARET_EQUAL,
     MP_TOKEN_DEL_EQUAL, MP_TOKEN_OP_DBL_EQUAL,
-    MP_TOKEN_OP_NOT_EQUAL,
 };
 
 // must have the same order as enum in lexer.h
+// must be sorted according to strcmp
 STATIC const char *const tok_kw[] = {
     "False",
     "None",
     "True",
+    "__debug__",
     "and",
     "as",
     "assert",
@@ -263,13 +250,12 @@ STATIC const char *const tok_kw[] = {
     "while",
     "with",
     "yield",
-    "__debug__",
 };
 
 // This is called with CUR_CHAR() before first hex digit, and should return with
 // it pointing to last hex digit
 // num_digits must be greater than zero
-STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
+STATIC bool get_hex(mp_lexer_t *lex, size_t num_digits, mp_uint_t *result) {
     mp_uint_t num = 0;
     while (num_digits-- != 0) {
         next_char(lex);
@@ -283,14 +269,144 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
     return true;
 }
 
-STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
-    // start new token text
-    vstr_reset(&lex->vstr);
+STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw) {
+    // get first quoting character
+    char quote_char = '\'';
+    if (is_char(lex, '\"')) {
+        quote_char = '\"';
+    }
+    next_char(lex);
 
-    // skip white space and comments
+    // work out if it's a single or triple quoted literal
+    size_t num_quotes;
+    if (is_char_and(lex, quote_char, quote_char)) {
+        // triple quotes
+        next_char(lex);
+        next_char(lex);
+        num_quotes = 3;
+    } else {
+        // single quotes
+        num_quotes = 1;
+    }
+
+    size_t n_closing = 0;
+    while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
+        if (is_char(lex, quote_char)) {
+            n_closing += 1;
+            vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+        } else {
+            n_closing = 0;
+            if (is_char(lex, '\\')) {
+                next_char(lex);
+                unichar c = CUR_CHAR(lex);
+                if (is_raw) {
+                    // raw strings allow escaping of quotes, but the backslash is also emitted
+                    vstr_add_char(&lex->vstr, '\\');
+                } else {
+                    switch (c) {
+                        // note: "c" can never be MP_LEXER_EOF because next_char
+                        // always inserts a newline at the end of the input stream
+                        case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
+                        case '\\': break;
+                        case '\'': break;
+                        case '"': break;
+                        case 'a': c = 0x07; break;
+                        case 'b': c = 0x08; break;
+                        case 't': c = 0x09; break;
+                        case 'n': c = 0x0a; break;
+                        case 'v': c = 0x0b; break;
+                        case 'f': c = 0x0c; break;
+                        case 'r': c = 0x0d; break;
+                        case 'u':
+                        case 'U':
+                            if (lex->tok_kind == MP_TOKEN_BYTES) {
+                                // b'\u1234' == b'\\u1234'
+                                vstr_add_char(&lex->vstr, '\\');
+                                break;
+                            }
+                            // Otherwise fall through.
+                        case 'x':
+                        {
+                            mp_uint_t num = 0;
+                            if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
+                                // not enough hex chars for escape sequence
+                                lex->tok_kind = MP_TOKEN_INVALID;
+                            }
+                            c = num;
+                            break;
+                        }
+                        case 'N':
+                            // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
+                            // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
+                            // 3MB of text; even gzip-compressed and with minimal structure, it'll take
+                            // roughly half a meg of storage. This form of Unicode escape may be added
+                            // later on, but it's definitely not a priority right now. -- CJA 20140607
+                            mp_not_implemented("unicode name escapes");
+                            break;
+                        default:
+                            if (c >= '0' && c <= '7') {
+                                // Octal sequence, 1-3 chars
+                                size_t digits = 3;
+                                mp_uint_t num = c - '0';
+                                while (is_following_odigit(lex) && --digits != 0) {
+                                    next_char(lex);
+                                    num = num * 8 + (CUR_CHAR(lex) - '0');
+                                }
+                                c = num;
+                            } else {
+                                // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+                                vstr_add_char(&lex->vstr, '\\');
+                            }
+                            break;
+                    }
+                }
+                if (c != MP_LEXER_EOF) {
+                    if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
+                        if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+                            vstr_add_char(&lex->vstr, c);
+                        } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+                            vstr_add_byte(&lex->vstr, c);
+                        } else {
+                            // unicode character out of range
+                            // this raises a generic SyntaxError; could provide more info
+                            lex->tok_kind = MP_TOKEN_INVALID;
+                        }
+                    } else {
+                        // without unicode everything is just added as an 8-bit byte
+                        if (c < 0x100) {
+                            vstr_add_byte(&lex->vstr, c);
+                        } else {
+                            // 8-bit character out of range
+                            // this raises a generic SyntaxError; could provide more info
+                            lex->tok_kind = MP_TOKEN_INVALID;
+                        }
+                    }
+                }
+            } else {
+                // Add the "character" as a byte so that we remain 8-bit clean.
+                // This way, strings are parsed correctly whether or not they contain utf-8 chars.
+                vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
+            }
+        }
+        next_char(lex);
+    }
+
+    // check we got the required end quotes
+    if (n_closing < num_quotes) {
+        lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
+    }
+
+    // cut off the end quotes from the token text
+    vstr_cut_tail_bytes(&lex->vstr, n_closing);
+}
+
+STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
     bool had_physical_newline = false;
     while (!is_end(lex)) {
         if (is_physical_newline(lex)) {
+            if (stop_at_newline && lex->nested_bracket_level == 0) {
+                break;
+            }
             had_physical_newline = true;
             next_char(lex);
         } else if (is_whitespace(lex)) {
@@ -301,35 +417,29 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
                 next_char(lex);
             }
             // had_physical_newline will be set on next loop
-        } else if (is_char(lex, '\\')) {
-            // backslash (outside string literals) must appear just before a physical newline
+        } else if (is_char_and(lex, '\\', '\n')) {
+            // line-continuation, so don't set had_physical_newline
+            next_char(lex);
             next_char(lex);
-            if (!is_physical_newline(lex)) {
-                // SyntaxError: unexpected character after line continuation character
-                lex->tok_line = lex->line;
-                lex->tok_column = lex->column;
-                lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
-                return;
-            } else {
-                next_char(lex);
-            }
         } else {
             break;
         }
     }
+    return had_physical_newline;
+}
+
+void mp_lexer_to_next(mp_lexer_t *lex) {
+    // start new token text
+    vstr_reset(&lex->vstr);
+
+    // skip white space and comments
+    bool had_physical_newline = skip_whitespace(lex, false);
 
     // set token source information
     lex->tok_line = lex->line;
     lex->tok_column = lex->column;
 
-    if (first_token && lex->line == 1 && lex->column != 1) {
-        // check that the first token is in the first column
-        // if first token is not on first line, we get a physical newline and
-        // this check is done as part of normal indent/dedent checking below
-        // (done to get equivalence with CPython)
-        lex->tok_kind = MP_TOKEN_INDENT;
-
-    } else if (lex->emit_dent < 0) {
+    if (lex->emit_dent < 0) {
         lex->tok_kind = MP_TOKEN_DEDENT;
         lex->emit_dent += 1;
 
@@ -340,7 +450,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
     } else if (had_physical_newline && lex->nested_bracket_level == 0) {
         lex->tok_kind = MP_TOKEN_NEWLINE;
 
-        mp_uint_t num_spaces = lex->column - 1;
+        size_t num_spaces = lex->column - 1;
         if (num_spaces == indent_top(lex)) {
         } else if (num_spaces > indent_top(lex)) {
             indent_push(lex, num_spaces);
@@ -358,168 +468,65 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
     } else if (is_end(lex)) {
         lex->tok_kind = MP_TOKEN_END;
 
-    } else if (is_char_or(lex, '\'', '\"')
-               || (is_char_or3(lex, 'r', 'u', 'b') && is_char_following_or(lex, '\'', '\"'))
-               || ((is_char_and(lex, 'r', 'b') || is_char_and(lex, 'b', 'r')) && is_char_following_following_or(lex, '\'', '\"'))) {
+    } else if (is_string_or_bytes(lex)) {
         // a string or bytes literal
 
-        // parse type codes
-        bool is_raw = false;
-        bool is_bytes = false;
-        if (is_char(lex, 'u')) {
-            next_char(lex);
-        } else if (is_char(lex, 'b')) {
-            is_bytes = true;
-            next_char(lex);
-            if (is_char(lex, 'r')) {
-                is_raw = true;
-                next_char(lex);
-            }
-        } else if (is_char(lex, 'r')) {
-            is_raw = true;
-            next_char(lex);
-            if (is_char(lex, 'b')) {
-                is_bytes = true;
-                next_char(lex);
-            }
-        }
+        // Python requires adjacent string/bytes literals to be automatically
+        // concatenated.  We do it here in the tokeniser to make efficient use of RAM,
+        // because then the lexer's vstr can be used to accumulate the string literal,
+        // in contrast to creating a parse tree of strings and then joining them later
+        // in the compiler.  It's also more compact in code size to do it here.
 
-        // set token kind
-        if (is_bytes) {
-            lex->tok_kind = MP_TOKEN_BYTES;
-        } else {
-            lex->tok_kind = MP_TOKEN_STRING;
-        }
+        // MP_TOKEN_END is used to indicate that this is the first string token
+        lex->tok_kind = MP_TOKEN_END;
 
-        // get first quoting character
-        char quote_char = '\'';
-        if (is_char(lex, '\"')) {
-            quote_char = '\"';
-        }
-        next_char(lex);
+        // Loop to accumulate string/bytes literals
+        do {
+            // parse type codes
+            bool is_raw = false;
+            mp_token_kind_t kind = MP_TOKEN_STRING;
+            int n_char = 0;
+            if (is_char(lex, 'u')) {
+                n_char = 1;
+            } else if (is_char(lex, 'b')) {
+                kind = MP_TOKEN_BYTES;
+                n_char = 1;
+                if (is_char_following(lex, 'r')) {
+                    is_raw = true;
+                    n_char = 2;
+                }
+            } else if (is_char(lex, 'r')) {
+                is_raw = true;
+                n_char = 1;
+                if (is_char_following(lex, 'b')) {
+                    kind = MP_TOKEN_BYTES;
+                    n_char = 2;
+                }
+            }
 
-        // work out if it's a single or triple quoted literal
-        mp_uint_t num_quotes;
-        if (is_char_and(lex, quote_char, quote_char)) {
-            // triple quotes
-            next_char(lex);
-            next_char(lex);
-            num_quotes = 3;
-        } else {
-            // single quotes
-            num_quotes = 1;
-        }
+            // Set or check token kind
+            if (lex->tok_kind == MP_TOKEN_END) {
+                lex->tok_kind = kind;
+            } else if (lex->tok_kind != kind) {
+                // Can't concatenate string with bytes
+                break;
+            }
 
-        // parse the literal
-        mp_uint_t n_closing = 0;
-        while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
-            if (is_char(lex, quote_char)) {
-                n_closing += 1;
-                vstr_add_char(&lex->vstr, CUR_CHAR(lex));
-            } else {
-                n_closing = 0;
-                if (is_char(lex, '\\')) {
+            // Skip any type code characters
+            if (n_char != 0) {
+                next_char(lex);
+                if (n_char == 2) {
                     next_char(lex);
-                    unichar c = CUR_CHAR(lex);
-                    if (is_raw) {
-                        // raw strings allow escaping of quotes, but the backslash is also emitted
-                        vstr_add_char(&lex->vstr, '\\');
-                    } else {
-                        switch (c) {
-                            // note: "c" can never be MP_LEXER_EOF because next_char
-                            // always inserts a newline at the end of the input stream
-                            case '\n': c = MP_LEXER_EOF; break; // backslash escape the newline, just ignore it
-                            case '\\': break;
-                            case '\'': break;
-                            case '"': break;
-                            case 'a': c = 0x07; break;
-                            case 'b': c = 0x08; break;
-                            case 't': c = 0x09; break;
-                            case 'n': c = 0x0a; break;
-                            case 'v': c = 0x0b; break;
-                            case 'f': c = 0x0c; break;
-                            case 'r': c = 0x0d; break;
-                            case 'u':
-                            case 'U':
-                                if (is_bytes) {
-                                    // b'\u1234' == b'\\u1234'
-                                    vstr_add_char(&lex->vstr, '\\');
-                                    break;
-                                }
-                                // Otherwise fall through.
-                            case 'x':
-                            {
-                                mp_uint_t num = 0;
-                                if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) {
-                                    // not enough hex chars for escape sequence
-                                    lex->tok_kind = MP_TOKEN_INVALID;
-                                }
-                                c = num;
-                                break;
-                            }
-                            case 'N':
-                                // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the
-                                // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly
-                                // 3MB of text; even gzip-compressed and with minimal structure, it'll take
-                                // roughly half a meg of storage. This form of Unicode escape may be added
-                                // later on, but it's definitely not a priority right now. -- CJA 20140607
-                                mp_not_implemented("unicode name escapes");
-                                break;
-                            default:
-                                if (c >= '0' && c <= '7') {
-                                    // Octal sequence, 1-3 chars
-                                    mp_uint_t digits = 3;
-                                    mp_uint_t num = c - '0';
-                                    while (is_following_odigit(lex) && --digits != 0) {
-                                        next_char(lex);
-                                        num = num * 8 + (CUR_CHAR(lex) - '0');
-                                    }
-                                    c = num;
-                                } else {
-                                    // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
-                                    vstr_add_char(&lex->vstr, '\\');
-                                }
-                                break;
-                        }
-                    }
-                    if (c != MP_LEXER_EOF) {
-                        if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
-                            if (c < 0x110000 && !is_bytes) {
-                                vstr_add_char(&lex->vstr, c);
-                            } else if (c < 0x100 && is_bytes) {
-                                vstr_add_byte(&lex->vstr, c);
-                            } else {
-                                // unicode character out of range
-                                // this raises a generic SyntaxError; could provide more info
-                                lex->tok_kind = MP_TOKEN_INVALID;
-                            }
-                        } else {
-                            // without unicode everything is just added as an 8-bit byte
-                            if (c < 0x100) {
-                                vstr_add_byte(&lex->vstr, c);
-                            } else {
-                                // 8-bit character out of range
-                                // this raises a generic SyntaxError; could provide more info
-                                lex->tok_kind = MP_TOKEN_INVALID;
-                            }
-                        }
-                    }
-                } else {
-                    // Add the "character" as a byte so that we remain 8-bit clean.
-                    // This way, strings are parsed correctly whether or not they contain utf-8 chars.
-                    vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
                 }
             }
-            next_char(lex);
-        }
 
-        // check we got the required end quotes
-        if (n_closing < num_quotes) {
-            lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
-        }
+            // Parse the literal
+            parse_string_literal(lex, is_raw);
+
+            // Skip whitespace so we can check if there's another string following
+            skip_whitespace(lex, true);
 
-        // cut off the end quotes from the token text
-        vstr_cut_tail_bytes(&lex->vstr, n_closing);
+        } while (is_string_or_bytes(lex));
 
     } else if (is_head_of_identifier(lex)) {
         lex->tok_kind = MP_TOKEN_NAME;
@@ -534,6 +541,25 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
             next_char(lex);
         }
 
+        // Check if the name is a keyword.
+        // We also check for __debug__ here and convert it to its value.  This is
+        // so the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
+        // need to check for this special token in many places in the compiler.
+        const char *s = vstr_null_terminated_str(&lex->vstr);
+        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
+            int cmp = strcmp(s, tok_kw[i]);
+            if (cmp == 0) {
+                lex->tok_kind = MP_TOKEN_KW_FALSE + i;
+                if (lex->tok_kind == MP_TOKEN_KW___DEBUG__) {
+                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
+                }
+                break;
+            } else if (cmp < 0) {
+                // Table is sorted and comparison was less-than, so stop searching
+                break;
+            }
+        }
+
     } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
         bool forced_integer = false;
         if (is_char(lex, '.')) {
@@ -570,34 +596,14 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
             }
         }
 
-    } else if (is_char(lex, '.')) {
-        // special handling for . and ... operators, because .. is not a valid operator
-
-        // get first char
-        vstr_add_char(&lex->vstr, '.');
-        next_char(lex);
-
-        if (is_char_and(lex, '.', '.')) {
-            vstr_add_char(&lex->vstr, '.');
-            vstr_add_char(&lex->vstr, '.');
-            next_char(lex);
-            next_char(lex);
-            lex->tok_kind = MP_TOKEN_ELLIPSIS;
-        } else {
-            lex->tok_kind = MP_TOKEN_DEL_PERIOD;
-        }
-
     } else {
         // search for encoded delimiter or operator
 
         const char *t = tok_enc;
-        mp_uint_t tok_enc_index = 0;
+        size_t tok_enc_index = 0;
         for (; *t != 0 && !is_char(lex, *t); t += 1) {
             if (*t == 'e' || *t == 'c') {
                 t += 1;
-            } else if (*t == 'E') {
-                tok_enc_index -= 1;
-                t += 1;
             }
             tok_enc_index += 1;
         }
@@ -608,55 +614,48 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
             // didn't match any delimiter or operator characters
             lex->tok_kind = MP_TOKEN_INVALID;
 
+        } else if (*t == '!') {
+            // "!=" is a special case because "!" is not a valid operator
+            if (is_char(lex, '=')) {
+                next_char(lex);
+                lex->tok_kind = MP_TOKEN_OP_NOT_EQUAL;
+            } else {
+                lex->tok_kind = MP_TOKEN_INVALID;
+            }
+
+        } else if (*t == '.') {
+            // "." and "..." are special cases because ".." is not a valid operator
+            if (is_char_and(lex, '.', '.')) {
+                next_char(lex);
+                next_char(lex);
+                lex->tok_kind = MP_TOKEN_ELLIPSIS;
+            } else {
+                lex->tok_kind = MP_TOKEN_DEL_PERIOD;
+            }
+
         } else {
             // matched a delimiter or operator character
 
             // get the maximum characters for a valid token
             t += 1;
-            mp_uint_t t_index = tok_enc_index;
-            for (;;) {
-                for (; *t == 'e'; t += 1) {
-                    t += 1;
-                    t_index += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
-                        break;
-                    }
-                }
-
-                if (*t == 'E') {
-                    t += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
-                    } else {
-                        lex->tok_kind = MP_TOKEN_INVALID;
-                        goto tok_enc_no_match;
-                    }
-                    break;
-                }
-
-                if (*t == 'c') {
-                    t += 1;
-                    t_index += 1;
-                    if (is_char(lex, *t)) {
-                        next_char(lex);
-                        tok_enc_index = t_index;
-                        t += 1;
-                    } else {
+            size_t t_index = tok_enc_index;
+            while (*t == 'c' || *t == 'e') {
+                t_index += 1;
+                if (is_char(lex, t[1])) {
+                    next_char(lex);
+                    tok_enc_index = t_index;
+                    if (*t == 'e') {
                         break;
                     }
-                } else {
+                } else if (*t == 'c') {
                     break;
                 }
+                t += 2;
             }
 
             // set token kind
             lex->tok_kind = tok_enc_kind[tok_enc_index];
 
-            tok_enc_no_match:
-
             // compute bracket level for implicit line joining
             if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                 lex->nested_bracket_level += 1;
@@ -665,102 +664,55 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
             }
         }
     }
-
-    // check for keywords
-    if (lex->tok_kind == MP_TOKEN_NAME) {
-        // We check for __debug__ here and convert it to its value.  This is so
-        // the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
-        // need to check for this special token in many places in the compiler.
-        // TODO improve speed of these string comparisons
-        //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
-        for (size_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
-            if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
-                if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
-                    // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
-                    lex->tok_kind = (MP_STATE_VM(mp_optimise_value) == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
-                } else {
-                    lex->tok_kind = MP_TOKEN_KW_FALSE + i;
-                }
-                break;
-            }
-        }
-    }
 }
 
 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
-    mp_lexer_t *lex = m_new_obj_maybe(mp_lexer_t);
-
-    // check for memory allocation error
-    if (lex == NULL) {
-        reader.close(reader.data);
-        return NULL;
-    }
+    mp_lexer_t *lex = m_new_obj(mp_lexer_t);
 
     lex->source_name = src_name;
     lex->reader = reader;
     lex->line = 1;
-    lex->column = 1;
+    lex->column = -2;   // account for 3 dummy bytes
     lex->emit_dent = 0;
     lex->nested_bracket_level = 0;
     lex->alloc_indent_level = MICROPY_ALLOC_LEXER_INDENT_INIT;
     lex->num_indent_level = 1;
-    lex->indent_level = m_new_maybe(uint16_t, lex->alloc_indent_level);
+    lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
     vstr_init(&lex->vstr, 32);
 
-    // check for memory allocation error
-    // note: vstr_init above may fail on malloc, but so may mp_lexer_next_token_into below
-    if (lex->indent_level == NULL) {
-        mp_lexer_free(lex);
-        return NULL;
-    }
-
     // store sentinel for first indentation level
     lex->indent_level[0] = 0;
 
-    // preload characters
-    lex->chr0 = reader.readbyte(reader.data);
-    lex->chr1 = reader.readbyte(reader.data);
-    lex->chr2 = reader.readbyte(reader.data);
-
-    // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
-    if (lex->chr0 == MP_LEXER_EOF) {
-        lex->chr0 = '\n';
-    } else if (lex->chr1 == MP_LEXER_EOF) {
-        if (lex->chr0 == '\r') {
-            lex->chr0 = '\n';
-        } else if (lex->chr0 != '\n') {
-            lex->chr1 = '\n';
-        }
-    } else if (lex->chr2 == MP_LEXER_EOF) {
-        if (lex->chr1 == '\r') {
-            lex->chr1 = '\n';
-        } else if (lex->chr1 != '\n') {
-            lex->chr2 = '\n';
-        }
-    }
+    // load lexer with start of file, advancing lex->column to 1
+    // start with dummy bytes and use next_char() for proper EOL/EOF handling
+    lex->chr0 = lex->chr1 = lex->chr2 = 0;
+    next_char(lex);
+    next_char(lex);
+    next_char(lex);
 
     // preload first token
-    mp_lexer_next_token_into(lex, true);
+    mp_lexer_to_next(lex);
+
+    // Check that the first token is in the first column.  If it's not then we
+    // convert the token kind to INDENT so that the parser gives a syntax error.
+    if (lex->tok_column != 1) {
+        lex->tok_kind = MP_TOKEN_INDENT;
+    }
 
     return lex;
 }
 
-mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len) {
+mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len) {
     mp_reader_t reader;
-    if (!mp_reader_new_mem(&reader, (const byte*)str, len, free_len)) {
-        return NULL;
-    }
+    mp_reader_new_mem(&reader, (const byte*)str, len, free_len);
     return mp_lexer_new(src_name, reader);
 }
 
-#if MICROPY_READER_POSIX || MICROPY_READER_FATFS
+#if MICROPY_READER_POSIX || MICROPY_READER_VFS
 
 mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
     mp_reader_t reader;
-    int ret = mp_reader_new_file(&reader, filename);
-    if (ret != 0) {
-        return NULL;
-    }
+    mp_reader_new_file(&reader, filename);
     return mp_lexer_new(qstr_from_str(filename), reader);
 }
 
@@ -768,10 +720,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
 
 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd) {
     mp_reader_t reader;
-    int ret = mp_reader_new_file_from_fd(&reader, fd, close_fd);
-    if (ret != 0) {
-        return NULL;
-    }
+    mp_reader_new_file_from_fd(&reader, fd, close_fd);
     return mp_lexer_new(filename, reader);
 }
 
@@ -788,10 +737,6 @@ void mp_lexer_free(mp_lexer_t *lex) {
     }
 }
 
-void mp_lexer_to_next(mp_lexer_t *lex) {
-    mp_lexer_next_token_into(lex, false);
-}
-
 #if 0
 // This function is used to print the current token and should only be
 // needed to debug the lexer, so it's not available via a config option.