diffstat:

 py/makecompresseddata.py | 200
 py/makeqstrdefs.py       |  47
 py/misc.h                |  54
 py/mkrules.mk            |  38
 py/py.mk                 |   4
 py/qstr.c                |  75
 py/qstr.h                |   5
 7 files changed, 403 insertions(+), 20 deletions(-)
diff --git a/py/makecompresseddata.py b/py/makecompresseddata.py
new file mode 100644
index 0000000000..26d6703a9e
--- /dev/null
+++ b/py/makecompresseddata.py
@@ -0,0 +1,200 @@
+from __future__ import print_function
+
+import collections
+import re
+import sys
+
+import gzip
+import zlib
+
+
+_COMPRESSED_MARKER = 0xFF
+
+
+def check_non_ascii(msg):
+    for c in msg:
+        if ord(c) >= 0x80:
+            print(
+                'Unable to generate compressed data: message "{}" contains a non-ascii character "{}".'.format(
+                    msg, c
+                ),
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+
+# Replace <char><space> with <char | 0x80>.
+# Trivial scheme to demo/test.
+def space_compression(error_strings):
+    for line in error_strings:
+        check_non_ascii(line)
+        result = ""
+        for i in range(len(line)):
+            if i > 0 and line[i] == " ":
+                result = result[:-1]
+                result += "\\{:03o}".format(ord(line[i - 1]))
+            else:
+                result += line[i]
+        error_strings[line] = result
+    return None
+
+
+# Replace common words with <0x80 | index>.
+# Index is into a table of words stored as aaaaa<0x80|a>bbb<0x80|b>...
+# Replaced words are assumed to have spaces either side to avoid having to store the spaces in the compressed strings.
+def word_compression(error_strings):
+    topn = collections.Counter()
+
+    for line in error_strings.keys():
+        check_non_ascii(line)
+        for word in line.split(" "):
+            topn[word] += 1
+
+    # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
+    def bytes_saved(item):
+        w, n = item
+        return -((len(w) + 1) * (n - 1))
+
+    top128 = sorted(topn.items(), key=bytes_saved)[:128]
+
+    index = [w for w, _ in top128]
+    index_lookup = {w: i for i, w in enumerate(index)}
+
+    for line in error_strings.keys():
+        result = ""
+        need_space = False
+        for word in line.split(" "):
+            if word in index_lookup:
+                result += "\\{:03o}".format(0b10000000 | index_lookup[word])
+                need_space = False
+            else:
+                if need_space:
+                    result += " "
+                need_space = True
+                result += word
+        error_strings[line] = result.strip()
+
+    return "".join(w[:-1] + "\\{:03o}".format(0b10000000 | ord(w[-1])) for w in index)
+
+
+# Replace chars in text with variable length bit sequence.
+# For comparison only (the table is not emitted).
+def huffman_compression(error_strings):
+    # https://github.com/tannewt/huffman
+    import huffman
+
+    all_strings = "".join(error_strings)
+    cb = huffman.codebook(collections.Counter(all_strings).items())
+
+    for line in error_strings:
+        b = "1"
+        for c in line:
+            b += cb[c]
+        n = len(b)
+        if n % 8 != 0:
+            n += 8 - (n % 8)
+        result = ""
+        for i in range(0, n, 8):
+            result += "\\{:03o}".format(int(b[i : i + 8], 2))
+        if len(result) > len(line) * 4:
+            result = line
+        error_strings[line] = result
+
+    # TODO: This would be the prefix lengths and the table ordering.
+    return "_" * (10 + len(cb))
+
+
+# Replace common N-letter sequences with <0x80 | index>, where
+# the common sequences are stored in a separate table.
+# This isn't very useful; need a smarter way to find top-ngrams.
+def ngram_compression(error_strings):
+    topn = collections.Counter()
+    N = 2
+
+    for line in error_strings.keys():
+        check_non_ascii(line)
+        if len(line) < N:
+            continue
+        for i in range(0, len(line) - N, N):
+            topn[line[i : i + N]] += 1
+
+    def bytes_saved(item):
+        w, n = item
+        return -(len(w) * (n - 1))
+
+    top128 = sorted(topn.items(), key=bytes_saved)[:128]
+
+    index = [w for w, _ in top128]
+    index_lookup = {w: i for i, w in enumerate(index)}
+
+    for line in error_strings.keys():
+        result = ""
+        for i in range(0, len(line) - N + 1, N):
+            word = line[i : i + N]
+            if word in index_lookup:
+                result += "\\{:03o}".format(0b10000000 | index_lookup[word])
+            else:
+                result += word
+        if len(line) % N != 0:
+            result += line[len(line) - len(line) % N :]
+        error_strings[line] = result.strip()
+
+    return "".join(index)
+
+
+def main(collected_path, fn):
+    error_strings = {}
+    max_uncompressed_len = 0
+    num_uses = 0
+
+    # Read in all MP_ERROR_TEXT strings.
+    with open(collected_path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            num_uses += 1
+            error_strings[line] = None
+            max_uncompressed_len = max(max_uncompressed_len, len(line))
+
+    # So that objexcept.c can figure out how big the buffer needs to be.
+    print("#define MP_MAX_UNCOMPRESSED_TEXT_LEN ({})".format(max_uncompressed_len))
+
+    # Run the compression.
+    compressed_data = fn(error_strings)
+
+    # Print the data table.
+    print('MP_COMPRESSED_DATA("{}")'.format(compressed_data))
+
+    # Print the replacements.
+    for uncomp, comp in error_strings.items():
+        print('MP_MATCH_COMPRESSED("{}", "\\{:03o}{}")'.format(uncomp, _COMPRESSED_MARKER, comp))
+
+    # Used to calculate the "true" length of the (escaped) compressed strings.
+    def unescape(s):
+        return re.sub(r"\\\d\d\d", "!", s)
+
+    # Stats. Note this doesn't include the cost of the decompressor code.
+    uncomp_len = sum(len(s) + 1 for s in error_strings.keys())
+    comp_len = sum(1 + len(unescape(s)) + 1 for s in error_strings.values())
+    data_len = len(compressed_data) + 1 if compressed_data else 0
+    print("// Total input length: {}".format(uncomp_len))
+    print("// Total compressed length: {}".format(comp_len))
+    print("// Total data length: {}".format(data_len))
+    print("// Predicted saving: {}".format(uncomp_len - comp_len - data_len))
+
+    # Somewhat meaningless comparison to zlib/gzip.
+    all_input_bytes = "\\0".join(error_strings.keys()).encode()
+    print()
+    if hasattr(gzip, "compress"):
+        gzip_len = len(gzip.compress(all_input_bytes)) + num_uses * 4
+        print("// gzip length: {}".format(gzip_len))
+        print("// Percentage of gzip: {:.1f}%".format(100 * (comp_len + data_len) / gzip_len))
+    if hasattr(zlib, "compress"):
+        zlib_len = len(zlib.compress(all_input_bytes)) + num_uses * 4
+        print("// zlib length: {}".format(zlib_len))
+        print("// Percentage of zlib: {:.1f}%".format(100 * (comp_len + data_len) / zlib_len))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1], word_compression)
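For a concrete picture of the output: the build rules added below (py/mkrules.mk and py/py.mk) run this script over $(HEADER_BUILD)/compressed.collected and save its stdout as genhdr/compressed.data.h. Below is a minimal hypothetical excerpt of such a header, assuming an invented word table of "argument", "index", "out", "of", "range" in slots 0-4; the final char of each table word has its high bit set, escapes are octal, and each compressed string begins with the 0xff marker:

// Hypothetical generated genhdr/compressed.data.h (illustrative values only).
#define MP_MAX_UNCOMPRESSED_TEXT_LEN (21)
MP_COMPRESSED_DATA("argumen\364inde\370ou\364o\346rang\345")
MP_MATCH_COMPRESSED("index out of range", "\377\201\202\203\204")
MP_MATCH_COMPRESSED("argument out of range", "\377\200\202\203\204")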
diff --git a/py/makeqstrdefs.py b/py/makeqstrdefs.py
index 03ea1afc79..9449a46ee9 100644
--- a/py/makeqstrdefs.py
+++ b/py/makeqstrdefs.py
@@ -13,17 +13,27 @@ import io
 import os
 
+# Extract MP_QSTR_FOO macros.
+_MODE_QSTR = "qstr"
+
+# Extract MP_COMPRESSED_ROM_TEXT("") macros (which come from MP_ERROR_TEXT).
+_MODE_COMPRESS = "compress"
+
+
 def write_out(fname, output):
     if output:
         for m, r in [("/", "__"), ("\\", "__"), (":", "@"), ("..", "@@")]:
             fname = fname.replace(m, r)
-        with open(args.output_dir + "/" + fname + ".qstr", "w") as f:
+        with open(args.output_dir + "/" + fname + "." + args.mode, "w") as f:
             f.write("\n".join(output) + "\n")
 
 
 def process_file(f):
     re_line = re.compile(r"#[line]*\s\d+\s\"([^\"]+)\"")
-    re_qstr = re.compile(r"MP_QSTR_[_a-zA-Z0-9]+")
+    if args.mode == _MODE_QSTR:
+        re_match = re.compile(r"MP_QSTR_[_a-zA-Z0-9]+")
+    elif args.mode == _MODE_COMPRESS:
+        re_match = re.compile(r'MP_COMPRESSED_ROM_TEXT\("([^"]*)"\)')
     output = []
     last_fname = None
     for line in f:
@@ -41,9 +51,12 @@ def process_file(f):
             output = []
             last_fname = fname
             continue
-        for match in re_qstr.findall(line):
-            name = match.replace("MP_QSTR_", "")
-            output.append("Q(" + name + ")")
+        for match in re_match.findall(line):
+            if args.mode == _MODE_QSTR:
+                name = match.replace("MP_QSTR_", "")
+                output.append("Q(" + name + ")")
+            elif args.mode == _MODE_COMPRESS:
+                output.append(match)
 
     write_out(last_fname, output)
     return ""
@@ -56,7 +69,7 @@ def cat_together():
     hasher = hashlib.md5()
     all_lines = []
     outf = open(args.output_dir + "/out", "wb")
-    for fname in glob.glob(args.output_dir + "/*.qstr"):
+    for fname in glob.glob(args.output_dir + "/*." + args.mode):
         with open(fname, "rb") as f:
             lines = f.readlines()
             all_lines += lines
@@ -73,8 +86,11 @@ def cat_together():
             old_hash = f.read()
     except IOError:
         pass
+    mode_full = "QSTR"
+    if args.mode == _MODE_COMPRESS:
+        mode_full = "Compressed data"
     if old_hash != new_hash:
-        print("QSTR updated")
+        print(mode_full, "updated")
         try:
             # rename below might fail if file exists
             os.remove(args.output_file)
@@ -84,12 +100,12 @@
         with open(args.output_file + ".hash", "w") as f:
             f.write(new_hash)
     else:
-        print("QSTR not updated")
+        print(mode_full, "not updated")
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 5:
-        print("usage: %s command input_filename output_dir output_file" % sys.argv[0])
+    if len(sys.argv) != 6:
+        print("usage: %s command mode input_filename output_dir output_file" % sys.argv[0])
         sys.exit(2)
 
     class Args:
@@ -97,9 +113,14 @@ if __name__ == "__main__":
 
     args = Args()
     args.command = sys.argv[1]
-    args.input_filename = sys.argv[2]
-    args.output_dir = sys.argv[3]
-    args.output_file = sys.argv[4]
+    args.mode = sys.argv[2]
+    args.input_filename = sys.argv[3]  # Unused for command=cat
+    args.output_dir = sys.argv[4]
+    args.output_file = None if len(sys.argv) == 5 else sys.argv[5]  # Unused for command=split
+
+    if args.mode not in (_MODE_QSTR, _MODE_COMPRESS):
+        print("error: mode %s unrecognised" % sys.argv[2])
+        sys.exit(2)
 
     try:
         os.makedirs(args.output_dir)
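To illustrate what the new "compress" mode scans: in the NO_QSTR preprocessing pass the MP_ERROR_TEXT macro (added to py/misc.h below) leaves MP_COMPRESSED_ROM_TEXT(...) intact, so $(HEADER_BUILD)/qstr.i.last contains fragments like the hypothetical one below (file name and message invented). re_line picks up the preprocessor line markers to attribute output to source files, and re_match captures the string literal:

# 1 "py/objstr.c"
/* ...preprocessed C source... */
mp_raise_ValueError((mp_rom_error_text_t)MP_COMPRESSED_ROM_TEXT("index out of range"));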
diff --git a/py/misc.h b/py/misc.h
--- a/py/misc.h
+++ b/py/misc.h
@@ -257,4 +257,58 @@ typedef union _mp_float_union_t {
 
 #endif // MICROPY_PY_BUILTINS_FLOAT
 
+/** ROM string compression *************/
+
+#ifdef NO_QSTR
+
+// QSTR extraction sets NO_QSTR.
+// So leave MP_COMPRESSED_ROM_TEXT in place for makeqstrdefs.py / makecompresseddata.py to find them.
+
+// However, dynamic native modules also set NO_QSTR, so provide a dummy implementation.
+#if MICROPY_ENABLE_DYNRUNTIME
+typedef const char *mp_rom_error_text_t;
+#define MP_COMPRESSED_ROM_TEXT(x) x
+#endif
+
+#else
+
+#if MICROPY_ROM_TEXT_COMPRESSION
+
+// Force usage of the MP_ERROR_TEXT macro by requiring an opaque type.
+typedef struct {} *mp_rom_error_text_t;
+
+// Regular build -- map MP_COMPRESSED_ROM_TEXT to the compressed strings.
+
+#include <string.h>
+
+inline __attribute__((always_inline)) const char *MP_COMPRESSED_ROM_TEXT(const char *msg) {
+    // "genhdr/compressed.data.h" contains an invocation of the MP_MATCH_COMPRESSED macro for each compressed string.
+    // The giant if(strcmp) tree is optimized by the compiler, which turns this into a direct return of the compressed data.
+    #define MP_MATCH_COMPRESSED(a, b) if (strcmp(msg, a) == 0) { return b; } else
+
+    // It also contains a single invocation of the MP_COMPRESSED_DATA macro, which we don't need here.
+    #define MP_COMPRESSED_DATA(x)
+
+    #include "genhdr/compressed.data.h"
+
+    #undef MP_COMPRESSED_DATA
+    #undef MP_MATCH_COMPRESSED
+
+    return msg;
+}
+
+#else
+
+// Compression not enabled, just make it a no-op.
+typedef const char *mp_rom_error_text_t;
+#define MP_COMPRESSED_ROM_TEXT(x) x
+
+#endif // MICROPY_ROM_TEXT_COMPRESSION
+
+#endif // NO_QSTR
+
+// Might add more types of compressed text in the future.
+// For now, forward directly to MP_COMPRESSED_ROM_TEXT.
+#define MP_ERROR_TEXT(x) (mp_rom_error_text_t)MP_COMPRESSED_ROM_TEXT(x)
+
 #endif // MICROPY_INCLUDED_PY_MISC_H
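The opaque typedef means a bare string literal no longer type-checks where an mp_rom_error_text_t is expected, which forces every error message through MP_ERROR_TEXT. A minimal sketch of a call site (the helper and message are invented, and this assumes the related changes that make mp_raise_ValueError take an mp_rom_error_text_t):

#include "py/runtime.h"

// Hypothetical bounds check; the message literal is illustrative only.
static void example_check(size_t idx, size_t len) {
    if (idx >= len) {
        // In a regular build this resolves to a pointer to the compressed
        // form; in the NO_QSTR pass the macro survives for extraction.
        mp_raise_ValueError(MP_ERROR_TEXT("index out of range"));
    }
}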
diff --git a/py/mkrules.mk b/py/mkrules.mk
index 68da3e793c..6c7d3701f4 100644
--- a/py/mkrules.mk
+++ b/py/mkrules.mk
@@ -4,6 +4,23 @@ THIS_MAKEFILE = $(lastword $(MAKEFILE_LIST))
 include $(dir $(THIS_MAKEFILE))mkenv.mk
 endif
 
+# Extra deps that need to happen before object compilation.
+OBJ_EXTRA_ORDER_DEPS =
+
+# QSTR generation uses the same CFLAGS, with these modifications.
+# Note: := to force evaluation immediately.
+QSTR_GEN_CFLAGS := $(CFLAGS)
+QSTR_GEN_CFLAGS += -DNO_QSTR
+QSTR_GEN_CFLAGS += -I$(BUILD)/tmp
+
+ifeq ($(MICROPY_ROM_TEXT_COMPRESSION),1)
+# If compression is enabled, trigger the build of compressed.data.h...
+OBJ_EXTRA_ORDER_DEPS += $(HEADER_BUILD)/compressed.data.h
+# ...and enable the MP_COMPRESSED_ROM_TEXT macro (used by MP_ERROR_TEXT).
+# Note, this doesn't get added to the QSTR_GEN_CFLAGS.
+CFLAGS += -DMICROPY_ROM_TEXT_COMPRESSION=1
+endif
+
 # This file expects that OBJ contains a list of all of the object files.
 # The directory portion of each object file is used to locate the source
 # and should not contain any ..'s but rather be relative to the top of the
@@ -46,9 +63,6 @@ vpath %.c . $(TOP) $(USER_C_MODULES)
 $(BUILD)/%.o: %.c
 	$(call compile_c)
 
-QSTR_GEN_EXTRA_CFLAGS += -DNO_QSTR
-QSTR_GEN_EXTRA_CFLAGS += -I$(BUILD)/tmp
-
 vpath %.c . $(TOP) $(USER_C_MODULES)
 
 $(BUILD)/%.pp: %.c
@@ -64,7 +78,7 @@ $(BUILD)/%.pp: %.c
 # the right .o's to get recompiled if the generated.h file changes. Adding
 # an order-only dependency to all of the .o's will cause the generated .h
 # to get built before we try to compile any of them.
-$(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h
+$(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h $(OBJ_EXTRA_ORDER_DEPS)
 
 # The logic for qstr regeneration is:
 # - if anything in QSTR_GLOBAL_DEPENDENCIES is newer, then process all source files ($^)
@@ -73,16 +87,26 @@ $(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h
 # See more information about this process in docs/develop/qstr.rst.
 $(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(QSTR_GLOBAL_REQUIREMENTS)
 	$(ECHO) "GEN $@"
-	$(Q)$(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) >$(HEADER_BUILD)/qstr.i.last
+	$(Q)$(CPP) $(QSTR_GEN_CFLAGS) $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) >$(HEADER_BUILD)/qstr.i.last
 
 $(HEADER_BUILD)/qstr.split: $(HEADER_BUILD)/qstr.i.last
 	$(ECHO) "GEN $@"
-	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED)
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split qstr $< $(HEADER_BUILD)/qstr _
 	$(Q)$(TOUCH) $@
 
 $(QSTR_DEFS_COLLECTED): $(HEADER_BUILD)/qstr.split
 	$(ECHO) "GEN $@"
-	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED)
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat qstr _ $(HEADER_BUILD)/qstr $@
+
+# Compressed error strings.
+$(HEADER_BUILD)/compressed.split: $(HEADER_BUILD)/qstr.i.last
+	$(ECHO) "GEN $@"
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split compress $< $(HEADER_BUILD)/compress _
+	$(Q)$(TOUCH) $@
+
+$(HEADER_BUILD)/compressed.collected: $(HEADER_BUILD)/compressed.split
+	$(ECHO) "GEN $@"
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat compress _ $(HEADER_BUILD)/compress $@
 
 # $(sort $(var)) removes duplicates
 #
diff --git a/py/py.mk b/py/py.mk
--- a/py/py.mk
+++ b/py/py.mk
@@ -255,6 +255,10 @@ $(HEADER_BUILD)/qstrdefs.generated.h: $(PY_QSTR_DEFS) $(QSTR_DEFS) $(QSTR_DEFS_C
 	$(Q)$(CAT) $(PY_QSTR_DEFS) $(QSTR_DEFS) $(QSTR_DEFS_COLLECTED) | $(SED) 's/^Q(.*)/"&"/' | $(CPP) $(CFLAGS) - | $(SED) 's/^\"\(Q(.*)\)\"/\1/' > $(HEADER_BUILD)/qstrdefs.preprocessed.h
 	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdata.py $(HEADER_BUILD)/qstrdefs.preprocessed.h > $@
 
+$(HEADER_BUILD)/compressed.data.h: $(HEADER_BUILD)/compressed.collected
+	$(ECHO) "GEN $@"
+	$(Q)$(PYTHON) $(PY_SRC)/makecompresseddata.py $< > $@
+
 # build a list of registered modules for py/objmodule.c.
 $(HEADER_BUILD)/moduledefs.h: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h
 	@$(ECHO) "GEN $@"
diff --git a/py/qstr.c b/py/qstr.c
--- a/py/qstr.c
+++ b/py/qstr.c
@@ -310,3 +310,78 @@ void qstr_dump_data(void) {
     QSTR_EXIT();
 }
 #endif
+
+#if MICROPY_ROM_TEXT_COMPRESSION
+
+#ifdef NO_QSTR
+
+// If NO_QSTR is set, it means we're doing QSTR extraction.
+// So we won't yet have "genhdr/compressed.data.h"
+
+#else
+
+// Emit the compressed_string_data string.
+#define MP_COMPRESSED_DATA(x) STATIC const char *compressed_string_data = x;
+#define MP_MATCH_COMPRESSED(a, b)
+#include "genhdr/compressed.data.h"
+#undef MP_COMPRESSED_DATA
+#undef MP_MATCH_COMPRESSED
+
+#endif // NO_QSTR
+
+// This implements the "common word" compression scheme (see makecompresseddata.py) where the most
+// common 128 words in error messages are replaced by their index into the list of common words.
+
+// The compressed string data is delimited by setting the high bit in the final char of each word.
+// e.g. aaaa<0x80|a>bbbbbb<0x80|b>....
+// This method finds the n'th string.
+STATIC const byte *find_uncompressed_string(uint8_t n) {
+    const byte *c = (byte *)compressed_string_data;
+    while (n > 0) {
+        while ((*c & 0x80) == 0) {
+            ++c;
+        }
+        ++c;
+        --n;
+    }
+    return c;
+}
+
+// Given a compressed string in src, decompresses it into dst.
+// dst must be large enough (use MP_MAX_UNCOMPRESSED_TEXT_LEN+1).
+void mp_decompress_rom_string(byte *dst, const mp_rom_error_text_t src_chr) {
+    // Skip past the 0xff marker.
+    const byte *src = (byte *)src_chr + 1;
+    // Need to add spaces around compressed words, except for the first (i.e. transition from 1<->2).
+    // 0 = start, 1 = compressed, 2 = regular.
+    int state = 0;
+    while (*src) {
+        if ((byte) * src >= 128) {
+            if (state != 0) {
+                *dst++ = ' ';
+            }
+            state = 1;
+
+            // High bit set, replace with common word.
+            const byte *word = find_uncompressed_string(*src & 0x7f);
+            // The word is terminated by the final char having its high bit set.
+            while ((*word & 0x80) == 0) {
+                *dst++ = *word++;
+            }
+            *dst++ = (*word & 0x7f);
+        } else {
+            // Otherwise just copy one char.
+            if (state == 1) {
+                *dst++ = ' ';
+            }
+            state = 2;
+
+            *dst++ = *src;
+        }
+        ++src;
+    }
+    // Add null-terminator.
+    *dst = 0;
+}
+
+#endif // MICROPY_ROM_TEXT_COMPRESSION
diff --git a/py/qstr.h b/py/qstr.h
--- a/py/qstr.h
+++ b/py/qstr.h
@@ -74,4 +74,9 @@ const byte *qstr_data(qstr q, size_t *len);
 void qstr_pool_info(size_t *n_pool, size_t *n_qstr, size_t *n_str_data_bytes, size_t *n_total_bytes);
 void qstr_dump_data(void);
 
+#if MICROPY_ROM_TEXT_COMPRESSION
+void mp_decompress_rom_string(byte *dst, mp_rom_error_text_t src);
+#define MP_IS_COMPRESSED_ROM_STRING(s) (*(byte *)(s) == 0xff)
+#endif
+
 #endif // MICROPY_INCLUDED_PY_QSTR_H
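Finally, a sketch of the consuming side, modelled on the decompression step in py/objexcept.c (the helper is invented; real call sites differ). Only strings that matched a compressed form begin with the 0xff marker, hence the MP_IS_COMPRESSED_ROM_STRING check:

#include <stdio.h>
#include "py/misc.h"
#include "py/qstr.h"

// Hypothetical helper showing the intended calling pattern.
void example_print_error(mp_rom_error_text_t msg) {
    #if MICROPY_ROM_TEXT_COMPRESSION
    if (MP_IS_COMPRESSED_ROM_STRING(msg)) {
        // MP_MAX_UNCOMPRESSED_TEXT_LEN is emitted into the generated
        // compressed.data.h by makecompresseddata.py.
        byte buf[MP_MAX_UNCOMPRESSED_TEXT_LEN + 1];
        mp_decompress_rom_string(buf, msg);
        printf("%s\n", (const char *)buf);
        return;
    }
    #endif
    // Not compressed (or compression disabled): the bytes are plain text.
    printf("%s\n", (const char *)msg);
}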