14 files changed, 129 insertions, 68 deletions
diff --git a/py/builtineval.c b/py/builtineval.c
index 0e8f9e31d2..49d2bf16a2 100644
--- a/py/builtineval.c
+++ b/py/builtineval.c
@@ -24,13 +24,13 @@ static mp_obj_t mp_builtin_eval(mp_obj_t o_in) {
     const byte *str = mp_obj_str_get_data(o_in, &str_len);
 
     // create the lexer
-    mp_lexer_t *lex = mp_lexer_new_from_str_len("<string>", (const char*)str, str_len, 0);
+    mp_lexer_t *lex = mp_lexer_new_from_str_len(MP_QSTR__lt_string_gt_, (const char*)str, str_len, 0);
+    qstr source_name = mp_lexer_source_name(lex);
 
     // parse the string
     qstr parse_exc_id;
     const char *parse_exc_msg;
     mp_parse_node_t pn = mp_parse(lex, MP_PARSE_EVAL_INPUT, &parse_exc_id, &parse_exc_msg);
-    qstr source_name = mp_lexer_source_name(lex);
     mp_lexer_free(lex);
 
     if (pn == MP_PARSE_NODE_NULL) {
@@ -40,6 +40,7 @@ static mp_obj_t mp_builtin_eval(mp_obj_t o_in) {
 
     // compile the string
     mp_obj_t module_fun = mp_compile(pn, source_name, false);
+    mp_parse_node_free(pn);
 
     if (module_fun == mp_const_none) {
         // TODO handle compile error correctly
diff --git a/py/builtinimport.c b/py/builtinimport.c
index 3cfd64e887..35e7dcbb88 100644
--- a/py/builtinimport.c
+++ b/py/builtinimport.c
@@ -29,9 +29,7 @@ mp_obj_t mp_builtin___import__(int n_args, mp_obj_t *args) {
     }
     */
 
-    uint mod_name_l;
-    const byte *mod_name_s = mp_obj_str_get_data(args[0], &mod_name_l);
-    qstr mod_name = qstr_from_strn((const char*)mod_name_s, mod_name_l);
+    qstr mod_name = mp_obj_str_get_qstr(args[0]);
 
     mp_obj_t loaded = mp_obj_module_get(mod_name);
     if (loaded != MP_OBJ_NULL) {
@@ -44,6 +42,7 @@ mp_obj_t mp_builtin___import__(int n_args, mp_obj_t *args) {
         // TODO handle lexer error correctly
         return mp_const_none;
     }
+    qstr source_name = mp_lexer_source_name(lex);
 
     // create a new module object
     mp_obj_t module_obj = mp_obj_new_module(mod_name);
@@ -60,7 +59,6 @@ mp_obj_t mp_builtin___import__(int n_args, mp_obj_t *args) {
     qstr parse_exc_id;
     const char *parse_exc_msg;
     mp_parse_node_t pn = mp_parse(lex, MP_PARSE_FILE_INPUT, &parse_exc_id, &parse_exc_msg);
-    qstr source_name = mp_lexer_source_name(lex);
     mp_lexer_free(lex);
 
     if (pn == MP_PARSE_NODE_NULL) {
@@ -72,6 +70,7 @@ mp_obj_t mp_builtin___import__(int n_args, mp_obj_t *args) {
 
     // compile the imported script
     mp_obj_t module_fun = mp_compile(pn, source_name, false);
+    mp_parse_node_free(pn);
 
     if (module_fun == mp_const_none) {
         // TODO handle compile error correctly
diff --git a/py/emitbc.c b/py/emitbc.c
index 10a95fbcfa..9fa2880ecb 100644
--- a/py/emitbc.c
+++ b/py/emitbc.c
@@ -71,10 +71,14 @@ static void emit_write_code_info_qstr(emit_t* emit, qstr qstr) {
     c[3] = (qstr >> 24) & 0xff;
 }
 
-static void emit_write_code_info_byte_byte(emit_t* emit, byte b1, uint b2) {
-    byte* c = emit_get_cur_to_write_code_info(emit, 2);
-    c[0] = b1;
-    c[1] = b2;
+static void emit_write_code_info_bytes_lines(emit_t* emit, uint bytes_to_skip, uint lines_to_skip) { // one byte per entry: low 5 bits = byte-code bytes to skip, high 3 bits = source lines to skip
+    do { // always writes at least one entry, so (0, 0) produces the single zero byte used as the end marker
+        uint b = bytes_to_skip > 31 ? 31 : bytes_to_skip;
+        uint l = bytes_to_skip > 31 ? 0 : (lines_to_skip > 7 ? 7 : lines_to_skip); // credit lines only once the byte skip is complete; zero-byte entries written earlier would be consumed too soon by a reader
+        *emit_get_cur_to_write_code_info(emit, 1) = b | (l << 5);
+        bytes_to_skip -= b;
+        lines_to_skip -= l;
+    } while (bytes_to_skip > 0 || lines_to_skip > 0);
 }
 
 // all functions must go through this one to emit byte code
@@ -218,7 +222,7 @@ static void emit_bc_end_pass(emit_t *emit) {
         printf("ERROR: stack size not back to zero; got %d\n", emit->stack_size);
     }
 
-    emit_write_code_info_byte_byte(emit, 0, 0); // end of line number info
+    emit_write_code_info_bytes_lines(emit, 0, 0); // end of line number info
 
     if (emit->pass == PASS_2) {
         // calculate size of code in bytes
@@ -246,15 +250,9 @@ static void emit_bc_set_stack_size(emit_t *emit, int size) {
 static void emit_bc_set_source_line(emit_t *emit, int source_line) {
     //printf("source: line %d -> %d  offset %d -> %d\n", emit->last_source_line, source_line, emit->last_source_line_offset, emit->byte_code_offset);
     if (source_line > emit->last_source_line) {
-        int bytes_to_skip = emit->byte_code_offset - emit->last_source_line_offset;
-        for (; bytes_to_skip > 255; bytes_to_skip -= 255) {
-            emit_write_code_info_byte_byte(emit, 255, 0);
-        }
-        int lines_to_skip = source_line - emit->last_source_line;
-        for (; lines_to_skip > 255; lines_to_skip -= 255) {
-            emit_write_code_info_byte_byte(emit, 0, 255);
-        }
-        emit_write_code_info_byte_byte(emit, bytes_to_skip, lines_to_skip);
+        uint bytes_to_skip = emit->byte_code_offset - emit->last_source_line_offset;
+        uint lines_to_skip = source_line - emit->last_source_line;
+        emit_write_code_info_bytes_lines(emit, bytes_to_skip, lines_to_skip);
         //printf("  %d %d\n", bytes_to_skip, lines_to_skip);
         emit->last_source_line_offset = emit->byte_code_offset;
         emit->last_source_line = source_line;
diff --git a/py/emitcpy.c b/py/emitcpy.c
index de2a5784db..71861c918d 100644
--- a/py/emitcpy.c
+++ b/py/emitcpy.c
@@ -192,29 +192,26 @@ static void print_quoted_str(qstr qstr, bool bytes) {
     if (bytes) {
         printf("b");
     }
-    bool quote_single = false;
+    int quote_char = '\'';
     if (has_single_quote && !has_double_quote) {
-        printf("\"");
-    } else {
-        quote_single = true;
-        printf("'");
+        quote_char = '"';
     }
-    for (int i = 0; i < len; i++) {
-        if (str[i] == '\n') {
-            printf("\\n");
-        } else if (str[i] == '\\') {
+    printf("%c", quote_char);
+    for (const char *s = str, *top = str + len; s < top; s++) {
+        if (*s == quote_char) {
+            printf("\\%c", quote_char);
+        } else if (*s == '\\') {
             printf("\\\\");
-        } else if (str[i] == '\'' && quote_single) {
-            printf("\\'");
+        } else if (32 <= *s && *s <= 126) {
+            printf("%c", *s);
+        } else if (*s == '\n') {
+            printf("\\n");
+        // TODO add more escape codes here
         } else {
-            printf("%c", str[i]);
+            printf("\\x%02x", (*s) & 0xff);
         }
     }
-    if (has_single_quote && !has_double_quote) {
-        printf("\"");
-    } else {
-        printf("'");
-    }
+    printf("%c", quote_char);
 }
 
 static void emit_cpy_load_const_str(emit_t *emit, qstr qstr, bool bytes) {
diff --git a/py/lexer.c b/py/lexer.c
index 9911da33d9..f71e355476 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -493,8 +493,8 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                                 }
                                 c = num;
                             } else {
-                                // TODO error message
-                                assert(0);
+                                // unrecognised escape character; CPython lets this through verbatim as '\' and then the character
+                                vstr_add_char(&lex->vstr, '\\');
                             }
                             break;
                     }
@@ -644,10 +644,10 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
     }
 }
 
-mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
+mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close) {
     mp_lexer_t *lex = m_new(mp_lexer_t, 1);
 
-    lex->source_name = qstr_from_str(src_name);
+    lex->source_name = src_name;
     lex->stream_data = stream_data;
     lex->stream_next_char = stream_next_char;
     lex->stream_close = stream_close;
diff --git a/py/lexer.h b/py/lexer.h
index 69e97329b6..13fbfb5d33 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -124,8 +124,8 @@ typedef struct _mp_lexer_t mp_lexer_t;
 
 void mp_token_show(const mp_token_t *tok);
 
-mp_lexer_t *mp_lexer_new(const char *src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close);
-mp_lexer_t *mp_lexer_new_from_str_len(const char *src_name, const char *str, uint len, uint free_len);
+mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_char_t stream_next_char, mp_lexer_stream_close_t stream_close);
+mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, uint len, uint free_len);
 
 void mp_lexer_free(mp_lexer_t *lex);
 qstr mp_lexer_source_name(mp_lexer_t *lex);
diff --git a/py/lexerstr.c b/py/lexerstr.c
index 1e105d8645..d53a47d0c9 100644
--- a/py/lexerstr.c
+++ b/py/lexerstr.c
@@ -28,7 +28,7 @@ static void str_buf_free(mp_lexer_str_buf_t *sb) {
     m_del_obj(mp_lexer_str_buf_t, sb);
 }
 
-mp_lexer_t *mp_lexer_new_from_str_len(const char *src_name, const char *str, uint len, uint free_len) {
+mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, uint len, uint free_len) {
     mp_lexer_str_buf_t *sb = m_new_obj(mp_lexer_str_buf_t);
     sb->free_len = free_len;
     sb->src_beg = str;
diff --git a/py/lexerunix.c b/py/lexerunix.c
index 7846120a4a..5d96c468f8 100644
--- a/py/lexerunix.c
+++ b/py/lexerunix.c
@@ -28,7 +28,7 @@ mp_lexer_t *mp_lexer_new_from_file(const char *filename) {
         return NULL;
     }
 
-    return mp_lexer_new_from_str_len(filename, data, size, size);
+    return mp_lexer_new_from_str_len(qstr_from_str(filename), data, size, size);
 }
 
 /******************************************************************************/
diff --git a/py/obj.h b/py/obj.h
index e122f5a2bf..b33e3c5981 100644
--- a/py/obj.h
+++ b/py/obj.h
@@ -287,6 +287,7 @@ mp_obj_t mp_obj_str_builder_end(mp_obj_t o_in);
 bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2);
 uint mp_obj_str_get_hash(mp_obj_t self_in);
 uint mp_obj_str_get_len(mp_obj_t self_in);
+qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you are going to convert the string to a qstr anyway
 const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
 const byte *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
 
diff --git a/py/objstr.c b/py/objstr.c
index 3a4d69cfcc..84ac74bab9 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -40,11 +40,39 @@ void str_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj
     if (kind == PRINT_STR && !is_bytes) {
         print(env, "%.*s", str_len, str_data);
     } else {
+        // this escapes characters, but it will be very slow to print (calling print many times)
+        bool has_single_quote = false;
+        bool has_double_quote = false;
+        for (const byte *s = str_data, *top = str_data + str_len; (!has_single_quote || !has_double_quote) && s < top; s++) {
+            if (*s == '\'') {
+                has_single_quote = true;
+            } else if (*s == '"') {
+                has_double_quote = true;
+            }
+        }
         if (is_bytes) {
             print(env, "b");
         }
-        // TODO need to escape chars etc
-        print(env, "'%.*s'", str_len, str_data);
+        int quote_char = '\'';
+        if (has_single_quote && !has_double_quote) {
+            quote_char = '"';
+        }
+        print(env, "%c", quote_char);
+        for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
+            if (*s == quote_char) {
+                print(env, "\\%c", quote_char);
+            } else if (*s == '\\') {
+                print(env, "\\\\");
+            } else if (32 <= *s && *s <= 126) {
+                print(env, "%c", *s);
+            } else if (*s == '\n') {
+                print(env, "\\n");
+            // TODO add more escape codes here if we want to match CPython
+            } else {
+                print(env, "\\x%02x", *s);
+            }
+        }
+        print(env, "%c", quote_char);
     }
 }
 
@@ -474,13 +502,17 @@ bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
     }
 }
 
+static void bad_implicit_conversion(mp_obj_t self_in) __attribute__((noreturn)); // file-local helper; declared noreturn so callers need no return after it
+static void bad_implicit_conversion(mp_obj_t self_in) { // raises TypeError naming the type of self_in
+    nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly", mp_obj_get_type_str(self_in)));
+}
+
 uint mp_obj_str_get_hash(mp_obj_t self_in) {
     if (MP_OBJ_IS_STR(self_in)) {
         GET_STR_HASH(self_in, h);
         return h;
     } else {
-        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
-                 mp_obj_get_type_str(self_in)));
+        bad_implicit_conversion(self_in);
     }
 }
 
@@ -489,8 +521,20 @@ uint mp_obj_str_get_len(mp_obj_t self_in) {
         GET_STR_LEN(self_in, l);
         return l;
     } else {
-        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
-                 mp_obj_get_type_str(self_in)));
+        bad_implicit_conversion(self_in);
+    }
+}
+
+// use this if you are going to convert the string to a qstr anyway;
+// it is more efficient when the object already is a qstr (its value is returned directly)
+qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
+    if (MP_OBJ_IS_QSTR(self_in)) {
+        return MP_OBJ_QSTR_VALUE(self_in);
+    } else if (MP_OBJ_IS_TYPE(self_in, &str_type)) {
+        mp_obj_str_t *self = self_in;
+        return qstr_from_strn((char*)self->data, self->len); // build the qstr from the str object's data and length
+    } else {
+        bad_implicit_conversion(self_in); // neither a qstr nor a str object: raises TypeError and does not return
     }
 }
 
@@ -502,8 +546,7 @@ const char *mp_obj_str_get_str(mp_obj_t self_in) {
         (void)l; // len unused
         return (const char*)s;
     } else {
-        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
-                 mp_obj_get_type_str(self_in)));
+        bad_implicit_conversion(self_in);
     }
 }
 
@@ -513,8 +556,7 @@ const byte *mp_obj_str_get_data(mp_obj_t self_in, uint *len) {
         *len = l;
         return s;
     } else {
-        nlr_jump(mp_obj_new_exception_msg_varg(MP_QSTR_TypeError, "Can't convert '%s' object to str implicitly",
-                 mp_obj_get_type_str(self_in)));
+        bad_implicit_conversion(self_in);
     }
 }
 
diff --git a/py/parse.c b/py/parse.c
index 3cf909d752..d9969d6785 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -26,6 +26,8 @@
 #define RULE_ARG_OPT_TOK        (0x3000)
 #define RULE_ARG_OPT_RULE       (0x4000)
 
+#define ADD_BLANK_NODE(rule_id) ((rule_id) == RULE_funcdef || (rule_id) == RULE_classdef || (rule_id) == RULE_comp_for || (rule_id) == RULE_lambdef || (rule_id) == RULE_lambdef_nocond) // true for the rules whose parse node gets an extra blank node at the end; note rule_id is evaluated more than once
+
 // (un)comment to use rule names; for debugging
 //#define USE_RULE_NAME (1)
 
@@ -135,6 +137,28 @@ mp_parse_node_struct_t *parse_node_new_struct(int src_line, int rule_id, int num
     return pn;
 }
 
+uint mp_parse_node_free(mp_parse_node_t pn) { // recursively frees the struct nodes of the tree rooted at pn; returns the number of structs freed
+    uint cnt = 0;
+    if (MP_PARSE_NODE_IS_STRUCT(pn)) { // non-struct nodes are not freed here and add nothing to the count
+        mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
+        uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
+        uint rule_id = MP_PARSE_NODE_STRUCT_KIND(pns);
+        bool adjust = ADD_BLANK_NODE(rule_id);
+        if (adjust) {
+            n--; // skip the extra blank node at the end: it is a slot for compiler data, so do not recurse into it
+        }
+        for (uint i = 0; i < n; i++) {
+            cnt += mp_parse_node_free(pns->nodes[i]); // free children before the struct that holds them
+        }
+        if (adjust) {
+            n++; // restore the full node count so the struct is released with its real size
+        }
+        m_del_var(mp_parse_node_struct_t, mp_parse_node_t, n, pns);
+        cnt++; // count this struct itself
+    }
+    return cnt;
+}
+
 #if MICROPY_DEBUG_PRINTERS
 void mp_parse_node_print(mp_parse_node_t pn, int indent) {
     if (MP_PARSE_NODE_IS_STRUCT(pn)) {
@@ -160,15 +184,15 @@ void mp_parse_node_print(mp_parse_node_t pn, int indent) {
             default: assert(0);
         }
     } else {
-        mp_parse_node_struct_t *pns2 = (mp_parse_node_struct_t*)pn;
-        int n = pns2->kind_num_nodes >> 8;
+        mp_parse_node_struct_t *pns = (mp_parse_node_struct_t*)pn;
+        uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
 #ifdef USE_RULE_NAME
-        printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns2)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns2), n);
+        printf("%s(%d) (n=%d)\n", rules[MP_PARSE_NODE_STRUCT_KIND(pns)]->rule_name, MP_PARSE_NODE_STRUCT_KIND(pns), n);
 #else
-        printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns2), n);
+        printf("rule(%u) (n=%d)\n", (uint)MP_PARSE_NODE_STRUCT_KIND(pns), n);
 #endif
-        for (int i = 0; i < n; i++) {
-            mp_parse_node_print(pns2->nodes[i], indent + 2);
+        for (uint i = 0; i < n; i++) {
+            mp_parse_node_print(pns->nodes[i], indent + 2);
         }
     }
 }
@@ -458,7 +482,7 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, qstr
                 }
 
                 // always emit these rules, and add an extra blank node at the end (to be used by the compiler to store data)
-                if (rule->rule_id == RULE_funcdef || rule->rule_id == RULE_classdef || rule->rule_id == RULE_comp_for || rule->rule_id == RULE_lambdef || rule->rule_id == RULE_lambdef_nocond) {
+                if (ADD_BLANK_NODE(rule->rule_id)) {
                     emit_rule = true;
                     push_result_node(parser, MP_PARSE_NODE_NULL);
                     i += 1;
diff --git a/py/parse.h b/py/parse.h
index 2801f414ee..9797873d1b 100644
--- a/py/parse.h
+++ b/py/parse.h
@@ -53,6 +53,7 @@ typedef struct _mp_parse_node_struct_t {
 #define MP_PARSE_NODE_STRUCT_NUM_NODES(pns) ((pns)->kind_num_nodes >> 8)
 
 mp_parse_node_t mp_parse_node_new_leaf(machine_int_t kind, machine_int_t arg);
+uint mp_parse_node_free(mp_parse_node_t pn);
 
 void mp_parse_node_print(mp_parse_node_t pn, int indent);
 
diff --git a/py/qstrdefs.h b/py/qstrdefs.h
index e76efaf0e0..81706841cd 100644
--- a/py/qstrdefs.h
+++ b/py/qstrdefs.h
@@ -91,4 +91,5 @@ Q(<listcomp>)
 Q(<dictcomp>)
 Q(<setcomp>)
 Q(<genexpr>)
+Q(<string>)
 Q(<stdin>)
diff --git a/py/vm.c b/py/vm.c
index affa5943bd..82a9f893f3 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -550,12 +550,9 @@ bool mp_execute_byte_code_2(const byte *code_info, const byte **ip_in_out, mp_ob
                 machine_uint_t source_line = 1;
                 machine_uint_t bc = save_ip - code_info - code_info_size;
                 //printf("find %lu %d %d\n", bc, code_info[12], code_info[13]);
-                for (const byte* ci = code_info + 12; bc >= ci[0]; ci += 2) {
-                    bc -= ci[0];
-                    source_line += ci[1];
-                    if (ci[0] == 0 && ci[1] == 0) {
-                        break;
-                    }
+                for (const byte* ci = code_info + 12; *ci && bc >= ((*ci) & 31); ci++) {
+                    bc -= *ci & 31;
+                    source_line += *ci >> 5;
                 }
                 mp_obj_exception_add_traceback(nlr.ret_val, source_file, source_line, block_name);
             }