summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--py/makecompresseddata.py200
-rw-r--r--py/makeqstrdefs.py47
-rw-r--r--py/misc.h54
-rw-r--r--py/mkrules.mk38
-rw-r--r--py/py.mk4
-rw-r--r--py/qstr.c75
-rw-r--r--py/qstr.h5
7 files changed, 403 insertions, 20 deletions
diff --git a/py/makecompresseddata.py b/py/makecompresseddata.py
new file mode 100644
index 0000000000..26d6703a9e
--- /dev/null
+++ b/py/makecompresseddata.py
@@ -0,0 +1,200 @@
+from __future__ import print_function
+
+import collections
+import re
+import sys
+
+import gzip
+import zlib
+
+
+_COMPRESSED_MARKER = 0xFF
+
+
+def check_non_ascii(msg):
+ for c in msg:
+ if ord(c) >= 0x80:
+ print(
+ 'Unable to generate compressed data: message "{}" contains a non-ascii character "{}".'.format(
+ msg, c
+ ),
+ file=sys.stderr,
+ )
+ sys.exit(1)
+
+
+# Replace <char><space> with <char | 0x80>.
+# Trival scheme to demo/test.
+def space_compression(error_strings):
+ for line in error_strings:
+ check_non_ascii(line)
+ result = ""
+ for i in range(len(line)):
+ if i > 0 and line[i] == " ":
+ result = result[:-1]
+ result += "\\{:03o}".format(ord(line[i - 1]))
+ else:
+ result += line[i]
+ error_strings[line] = result
+ return None
+
+
+# Replace common words with <0x80 | index>.
+# Index is into a table of words stored as aaaaa<0x80|a>bbb<0x80|b>...
+# Replaced words are assumed to have spaces either side to avoid having to store the spaces in the compressed strings.
+def word_compression(error_strings):
+ topn = collections.Counter()
+
+ for line in error_strings.keys():
+ check_non_ascii(line)
+ for word in line.split(" "):
+ topn[word] += 1
+
+ # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
+ def bytes_saved(item):
+ w, n = item
+ return -((len(w) + 1) * (n - 1))
+
+ top128 = sorted(topn.items(), key=bytes_saved)[:128]
+
+ index = [w for w, _ in top128]
+ index_lookup = {w: i for i, w in enumerate(index)}
+
+ for line in error_strings.keys():
+ result = ""
+ need_space = False
+ for word in line.split(" "):
+ if word in index_lookup:
+ result += "\\{:03o}".format(0b10000000 | index_lookup[word])
+ need_space = False
+ else:
+ if need_space:
+ result += " "
+ need_space = True
+ result += word
+ error_strings[line] = result.strip()
+
+ return "".join(w[:-1] + "\\{:03o}".format(0b10000000 | ord(w[-1])) for w in index)
+
+
+# Replace chars in text with variable length bit sequence.
+# For comparison only (the table is not emitted).
+def huffman_compression(error_strings):
+ # https://github.com/tannewt/huffman
+ import huffman
+
+ all_strings = "".join(error_strings)
+ cb = huffman.codebook(collections.Counter(all_strings).items())
+
+ for line in error_strings:
+ b = "1"
+ for c in line:
+ b += cb[c]
+ n = len(b)
+ if n % 8 != 0:
+ n += 8 - (n % 8)
+ result = ""
+ for i in range(0, n, 8):
+ result += "\\{:03o}".format(int(b[i : i + 8], 2))
+ if len(result) > len(line) * 4:
+ result = line
+ error_strings[line] = result
+
+ # TODO: This would be the prefix lengths and the table ordering.
+ return "_" * (10 + len(cb))
+
+
+# Replace common N-letter sequences with <0x80 | index>, where
+# the common sequences are stored in a separate table.
+# This isn't very useful, need a smarter way to find top-ngrams.
+def ngram_compression(error_strings):
+ topn = collections.Counter()
+ N = 2
+
+ for line in error_strings.keys():
+ check_non_ascii(line)
+ if len(line) < N:
+ continue
+ for i in range(0, len(line) - N, N):
+ topn[line[i : i + N]] += 1
+
+ def bytes_saved(item):
+ w, n = item
+ return -(len(w) * (n - 1))
+
+ top128 = sorted(topn.items(), key=bytes_saved)[:128]
+
+ index = [w for w, _ in top128]
+ index_lookup = {w: i for i, w in enumerate(index)}
+
+ for line in error_strings.keys():
+ result = ""
+ for i in range(0, len(line) - N + 1, N):
+ word = line[i : i + N]
+ if word in index_lookup:
+ result += "\\{:03o}".format(0b10000000 | index_lookup[word])
+ else:
+ result += word
+ if len(line) % N != 0:
+ result += line[len(line) - len(line) % N :]
+ error_strings[line] = result.strip()
+
+ return "".join(index)
+
+
+def main(collected_path, fn):
+ error_strings = {}
+ max_uncompressed_len = 0
+ num_uses = 0
+
+ # Read in all MP_ERROR_TEXT strings.
+ with open(collected_path, "r") as f:
+ for line in f:
+ line = line.strip()
+ if not line:
+ continue
+ num_uses += 1
+ error_strings[line] = None
+ max_uncompressed_len = max(max_uncompressed_len, len(line))
+
+ # So that objexcept.c can figure out how big the buffer needs to be.
+ print("#define MP_MAX_UNCOMPRESSED_TEXT_LEN ({})".format(max_uncompressed_len))
+
+ # Run the compression.
+ compressed_data = fn(error_strings)
+
+ # Print the data table.
+ print('MP_COMPRESSED_DATA("{}")'.format(compressed_data))
+
+ # Print the replacements.
+ for uncomp, comp in error_strings.items():
+ print('MP_MATCH_COMPRESSED("{}", "\\{:03o}{}")'.format(uncomp, _COMPRESSED_MARKER, comp))
+
+ # Used to calculate the "true" length of the (escaped) compressed strings.
+ def unescape(s):
+ return re.sub(r"\\\d\d\d", "!", s)
+
+ # Stats. Note this doesn't include the cost of the decompressor code.
+ uncomp_len = sum(len(s) + 1 for s in error_strings.keys())
+ comp_len = sum(1 + len(unescape(s)) + 1 for s in error_strings.values())
+ data_len = len(compressed_data) + 1 if compressed_data else 0
+ print("// Total input length: {}".format(uncomp_len))
+ print("// Total compressed length: {}".format(comp_len))
+ print("// Total data length: {}".format(data_len))
+ print("// Predicted saving: {}".format(uncomp_len - comp_len - data_len))
+
+ # Somewhat meaningless comparison to zlib/gzip.
+ all_input_bytes = "\\0".join(error_strings.keys()).encode()
+ print()
+ if hasattr(gzip, "compress"):
+ gzip_len = len(gzip.compress(all_input_bytes)) + num_uses * 4
+ print("// gzip length: {}".format(gzip_len))
+ print("// Percentage of gzip: {:.1f}%".format(100 * (comp_len + data_len) / gzip_len))
+ if hasattr(zlib, "compress"):
+ zlib_len = len(zlib.compress(all_input_bytes)) + num_uses * 4
+ print("// zlib length: {}".format(zlib_len))
+ print("// Percentage of zlib: {:.1f}%".format(100 * (comp_len + data_len) / zlib_len))
+
+
+if __name__ == "__main__":
+ main(sys.argv[1], word_compression)
diff --git a/py/makeqstrdefs.py b/py/makeqstrdefs.py
index 03ea1afc79..9449a46ee9 100644
--- a/py/makeqstrdefs.py
+++ b/py/makeqstrdefs.py
@@ -13,17 +13,27 @@ import io
import os
+# Extract MP_QSTR_FOO macros.
+_MODE_QSTR = "qstr"
+
+# Extract MP_COMPRESSED_ROM_TEXT("") macros. (Which come from MP_ERROR_TEXT)
+_MODE_COMPRESS = "compress"
+
+
def write_out(fname, output):
if output:
for m, r in [("/", "__"), ("\\", "__"), (":", "@"), ("..", "@@")]:
fname = fname.replace(m, r)
- with open(args.output_dir + "/" + fname + ".qstr", "w") as f:
+ with open(args.output_dir + "/" + fname + "." + args.mode, "w") as f:
f.write("\n".join(output) + "\n")
def process_file(f):
re_line = re.compile(r"#[line]*\s\d+\s\"([^\"]+)\"")
- re_qstr = re.compile(r"MP_QSTR_[_a-zA-Z0-9]+")
+ if args.mode == _MODE_QSTR:
+ re_match = re.compile(r"MP_QSTR_[_a-zA-Z0-9]+")
+ elif args.mode == _MODE_COMPRESS:
+ re_match = re.compile(r'MP_COMPRESSED_ROM_TEXT\("([^"]*)"\)')
output = []
last_fname = None
for line in f:
@@ -41,9 +51,12 @@ def process_file(f):
output = []
last_fname = fname
continue
- for match in re_qstr.findall(line):
- name = match.replace("MP_QSTR_", "")
- output.append("Q(" + name + ")")
+ for match in re_match.findall(line):
+ if args.mode == _MODE_QSTR:
+ name = match.replace("MP_QSTR_", "")
+ output.append("Q(" + name + ")")
+ elif args.mode == _MODE_COMPRESS:
+ output.append(match)
write_out(last_fname, output)
return ""
@@ -56,7 +69,7 @@ def cat_together():
hasher = hashlib.md5()
all_lines = []
outf = open(args.output_dir + "/out", "wb")
- for fname in glob.glob(args.output_dir + "/*.qstr"):
+ for fname in glob.glob(args.output_dir + "/*." + args.mode):
with open(fname, "rb") as f:
lines = f.readlines()
all_lines += lines
@@ -73,8 +86,11 @@ def cat_together():
old_hash = f.read()
except IOError:
pass
+ mode_full = "QSTR"
+ if args.mode == _MODE_COMPRESS:
+ mode_full = "Compressed data"
if old_hash != new_hash:
- print("QSTR updated")
+ print(mode_full, "updated")
try:
# rename below might fail if file exists
os.remove(args.output_file)
@@ -84,12 +100,12 @@ def cat_together():
with open(args.output_file + ".hash", "w") as f:
f.write(new_hash)
else:
- print("QSTR not updated")
+ print(mode_full, "not updated")
if __name__ == "__main__":
- if len(sys.argv) != 5:
- print("usage: %s command input_filename output_dir output_file" % sys.argv[0])
+ if len(sys.argv) != 6:
+ print("usage: %s command mode input_filename output_dir output_file" % sys.argv[0])
sys.exit(2)
class Args:
@@ -97,9 +113,14 @@ if __name__ == "__main__":
args = Args()
args.command = sys.argv[1]
- args.input_filename = sys.argv[2]
- args.output_dir = sys.argv[3]
- args.output_file = sys.argv[4]
+ args.mode = sys.argv[2]
+ args.input_filename = sys.argv[3] # Unused for command=cat
+ args.output_dir = sys.argv[4]
+ args.output_file = None if len(sys.argv) == 5 else sys.argv[5] # Unused for command=split
+
+ if args.mode not in (_MODE_QSTR, _MODE_COMPRESS):
+ print("error: mode %s unrecognised" % sys.argv[2])
+ sys.exit(2)
try:
os.makedirs(args.output_dir)
diff --git a/py/misc.h b/py/misc.h
index eafcef9667..d1f8a65177 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -257,4 +257,58 @@ typedef union _mp_float_union_t {
#endif // MICROPY_PY_BUILTINS_FLOAT
+/** ROM string compression *************/
+
+#ifdef NO_QSTR
+
+// QSTR extraction sets NO_QSTR.
+// So leave MP_COMPRESSED_ROM_TEXT in place for makeqstrdefs.py / makecompresseddata.py to find them.
+
+// However, dynamic native modules also set NO_QSTR, so provide a dummy implementation.
+#if MICROPY_ENABLE_DYNRUNTIME
+typedef const char *mp_rom_error_text_t;
+#define MP_COMPRESSED_ROM_TEXT(x) x
+#endif
+
+#else
+
+#if MICROPY_ROM_TEXT_COMPRESSION
+
+// Force usage of the MP_ERROR_TEXT macro by requiring an opaque type.
+typedef struct {} *mp_rom_error_text_t;
+
+// Regular build -- map MP_COMPRESSED_ROM_TEXT to the compressed strings.
+
+#include <string.h>
+
+inline __attribute__((always_inline)) const char *MP_COMPRESSED_ROM_TEXT(const char *msg) {
+ // "genhdr/compressed.data.h" contains an invocation of the MP_MATCH_COMPRESSED macro for each compressed string.
+ // The giant if(strcmp) tree is optimized by the compiler, which turns this into a direct return of the compressed data.
+ #define MP_MATCH_COMPRESSED(a, b) if (strcmp(msg, a) == 0) { return b; } else
+
+ // It also contains a single invocation of the MP_COMPRESSED_DATA macro, we don't need that here.
+ #define MP_COMPRESSED_DATA(x)
+
+ #include "genhdr/compressed.data.h"
+
+#undef MP_COMPRESSED_DATA
+#undef MP_MATCH_COMPRESSED
+
+ return msg;
+}
+
+#else
+
+// Compression not enabled, just make it a no-op.
+typedef const char *mp_rom_error_text_t;
+#define MP_COMPRESSED_ROM_TEXT(x) x
+
+#endif // MICROPY_ROM_TEXT_COMPRESSION
+
+#endif // NO_QSTR
+
+// Might add more types of compressed text in the future.
+// For now, forward directly to MP_COMPRESSED_ROM_TEXT.
+#define MP_ERROR_TEXT(x) (mp_rom_error_text_t)MP_COMPRESSED_ROM_TEXT(x)
+
#endif // MICROPY_INCLUDED_PY_MISC_H
diff --git a/py/mkrules.mk b/py/mkrules.mk
index 68da3e793c..6c7d3701f4 100644
--- a/py/mkrules.mk
+++ b/py/mkrules.mk
@@ -4,6 +4,23 @@ THIS_MAKEFILE = $(lastword $(MAKEFILE_LIST))
include $(dir $(THIS_MAKEFILE))mkenv.mk
endif
+# Extra deps that need to happen before object compilation.
+OBJ_EXTRA_ORDER_DEPS =
+
+# QSTR generation uses the same CFLAGS, with these modifications.
+# Note: := to force evalulation immediately.
+QSTR_GEN_CFLAGS := $(CFLAGS)
+QSTR_GEN_CFLAGS += -DNO_QSTR
+QSTR_GEN_CFLAGS += -I$(BUILD)/tmp
+
+ifeq ($(MICROPY_ROM_TEXT_COMPRESSION),1)
+# If compression is enabled, trigger the build of compressed.data.h...
+OBJ_EXTRA_ORDER_DEPS += $(HEADER_BUILD)/compressed.data.h
+# ...and enable the MP_COMPRESSED_ROM_TEXT macro (used by MP_ERROR_TEXT).
+# Note, this doesn't get added to the QSTR_GEN_CFLAGS.
+CFLAGS += -DMICROPY_ROM_TEXT_COMPRESSION=1
+endif
+
# This file expects that OBJ contains a list of all of the object files.
# The directory portion of each object file is used to locate the source
# and should not contain any ..'s but rather be relative to the top of the
@@ -46,9 +63,6 @@ vpath %.c . $(TOP) $(USER_C_MODULES)
$(BUILD)/%.o: %.c
$(call compile_c)
-QSTR_GEN_EXTRA_CFLAGS += -DNO_QSTR
-QSTR_GEN_EXTRA_CFLAGS += -I$(BUILD)/tmp
-
vpath %.c . $(TOP) $(USER_C_MODULES)
$(BUILD)/%.pp: %.c
@@ -64,7 +78,7 @@ $(BUILD)/%.pp: %.c
# the right .o's to get recompiled if the generated.h file changes. Adding
# an order-only dependency to all of the .o's will cause the generated .h
# to get built before we try to compile any of them.
-$(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h
+$(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h $(OBJ_EXTRA_ORDER_DEPS)
# The logic for qstr regeneration is:
# - if anything in QSTR_GLOBAL_DEPENDENCIES is newer, then process all source files ($^)
@@ -73,16 +87,26 @@ $(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h
# See more information about this process in docs/develop/qstr.rst.
$(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(QSTR_GLOBAL_REQUIREMENTS)
$(ECHO) "GEN $@"
- $(Q)$(CPP) $(QSTR_GEN_EXTRA_CFLAGS) $(CFLAGS) $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) >$(HEADER_BUILD)/qstr.i.last
+ $(Q)$(CPP) $(QSTR_GEN_CFLAGS) $(if $(filter $?,$(QSTR_GLOBAL_DEPENDENCIES)),$^,$(if $?,$?,$^)) >$(HEADER_BUILD)/qstr.i.last
$(HEADER_BUILD)/qstr.split: $(HEADER_BUILD)/qstr.i.last
$(ECHO) "GEN $@"
- $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED)
+ $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split qstr $< $(HEADER_BUILD)/qstr _
$(Q)$(TOUCH) $@
$(QSTR_DEFS_COLLECTED): $(HEADER_BUILD)/qstr.split
$(ECHO) "GEN $@"
- $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat $(HEADER_BUILD)/qstr.i.last $(HEADER_BUILD)/qstr $(QSTR_DEFS_COLLECTED)
+ $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat qstr _ $(HEADER_BUILD)/qstr $@
+
+# Compressed error strings.
+$(HEADER_BUILD)/compressed.split: $(HEADER_BUILD)/qstr.i.last
+ $(ECHO) "GEN $@"
+ $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split compress $< $(HEADER_BUILD)/compress _
+ $(Q)$(TOUCH) $@
+
+$(HEADER_BUILD)/compressed.collected: $(HEADER_BUILD)/compressed.split
+ $(ECHO) "GEN $@"
+ $(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat compress _ $(HEADER_BUILD)/compress $@
# $(sort $(var)) removes duplicates
#
diff --git a/py/py.mk b/py/py.mk
index 8c90beff71..0a98ccc00c 100644
--- a/py/py.mk
+++ b/py/py.mk
@@ -255,6 +255,10 @@ $(HEADER_BUILD)/qstrdefs.generated.h: $(PY_QSTR_DEFS) $(QSTR_DEFS) $(QSTR_DEFS_C
$(Q)$(CAT) $(PY_QSTR_DEFS) $(QSTR_DEFS) $(QSTR_DEFS_COLLECTED) | $(SED) 's/^Q(.*)/"&"/' | $(CPP) $(CFLAGS) - | $(SED) 's/^\"\(Q(.*)\)\"/\1/' > $(HEADER_BUILD)/qstrdefs.preprocessed.h
$(Q)$(PYTHON) $(PY_SRC)/makeqstrdata.py $(HEADER_BUILD)/qstrdefs.preprocessed.h > $@
+$(HEADER_BUILD)/compressed.data.h: $(HEADER_BUILD)/compressed.collected
+ $(ECHO) "GEN $@"
+ $(Q)$(PYTHON) $(PY_SRC)/makecompresseddata.py $< > $@
+
# build a list of registered modules for py/objmodule.c.
$(HEADER_BUILD)/moduledefs.h: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h
@$(ECHO) "GEN $@"
diff --git a/py/qstr.c b/py/qstr.c
index c3d78bfdab..2b1cea405b 100644
--- a/py/qstr.c
+++ b/py/qstr.c
@@ -310,3 +310,78 @@ void qstr_dump_data(void) {
QSTR_EXIT();
}
#endif
+
+#if MICROPY_ROM_TEXT_COMPRESSION
+
+#ifdef NO_QSTR
+
+// If NO_QSTR is set, it means we're doing QSTR extraction.
+// So we won't yet have "genhdr/compressed.data.h"
+
+#else
+
+// Emit the compressed_string_data string.
+#define MP_COMPRESSED_DATA(x) STATIC const char *compressed_string_data = x;
+#define MP_MATCH_COMPRESSED(a, b)
+#include "genhdr/compressed.data.h"
+#undef MP_COMPRESSED_DATA
+#undef MP_MATCH_COMPRESSED
+
+#endif // NO_QSTR
+
+// This implements the "common word" compression scheme (see makecompresseddata.py) where the most
+// common 128 words in error messages are replaced by their index into the list of common words.
+
+// The compressed string data is delimited by setting high bit in the final char of each word.
+// e.g. aaaa<0x80|a>bbbbbb<0x80|b>....
+// This method finds the n'th string.
+STATIC const byte *find_uncompressed_string(uint8_t n) {
+ const byte *c = (byte *)compressed_string_data;
+ while (n > 0) {
+ while ((*c & 0x80) == 0) {
+ ++c;
+ }
+ ++c;
+ --n;
+ }
+ return c;
+}
+
+// Given a compressed string in src, decompresses it into dst.
+// dst must be large enough (use MP_MAX_UNCOMPRESSED_TEXT_LEN+1).
+void mp_decompress_rom_string(byte *dst, const mp_rom_error_text_t src_chr) {
+ // Skip past the 0xff marker.
+ const byte *src = (byte *)src_chr + 1;
+ // Need to add spaces around compressed words, except for the first (i.e. transition from 1<->2).
+ // 0 = start, 1 = compressed, 2 = regular.
+ int state = 0;
+ while (*src) {
+ if ((byte) * src >= 128) {
+ if (state != 0) {
+ *dst++ = ' ';
+ }
+ state = 1;
+
+ // High bit set, replace with common word.
+ const byte *word = find_uncompressed_string(*src & 0x7f);
+ // The word is terminated by the final char having its high bit set.
+ while ((*word & 0x80) == 0) {
+ *dst++ = *word++;
+ }
+ *dst++ = (*word & 0x7f);
+ } else {
+ // Otherwise just copy one char.
+ if (state == 1) {
+ *dst++ = ' ';
+ }
+ state = 2;
+
+ *dst++ = *src;
+ }
+ ++src;
+ }
+ // Add null-terminator.
+ *dst = 0;
+}
+
+#endif // MICROPY_ROM_TEXT_COMPRESSION
diff --git a/py/qstr.h b/py/qstr.h
index 7dd3024c36..df87217ff3 100644
--- a/py/qstr.h
+++ b/py/qstr.h
@@ -74,4 +74,9 @@ const byte *qstr_data(qstr q, size_t *len);
void qstr_pool_info(size_t *n_pool, size_t *n_qstr, size_t *n_str_data_bytes, size_t *n_total_bytes);
void qstr_dump_data(void);
+#if MICROPY_ROM_TEXT_COMPRESSION
+void mp_decompress_rom_string(byte *dst, mp_rom_error_text_t src);
+#define MP_IS_COMPRESSED_ROM_STRING(s) (*(byte *)(s) == 0xff)
+#endif
+
#endif // MICROPY_INCLUDED_PY_QSTR_H