summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorPaul Sokolovsky <pfalcon@users.sourceforge.net>2014-06-13 21:23:00 +0300
committerPaul Sokolovsky <pfalcon@users.sourceforge.net>2014-06-14 01:21:13 +0300
commit2ec38a17d4e357f8f12ee6a2643e2dd2ff7a426e (patch)
tree5bc282117e32cb70604b617d5692900529cc9ded
parente9036c295ca1240946c122044e86ba8b569184e1 (diff)
downloadmicropython-2ec38a17d4e357f8f12ee6a2643e2dd2ff7a426e.tar.gz
micropython-2ec38a17d4e357f8f12ee6a2643e2dd2ff7a426e.zip
objstr: Be 8-bit clean even for repr().
This will allow roughly the same behavior as Python3 for non-ASCII strings. For example, print("<phrase in non-Latin script>".split()) will print a list of words, not a weird hex dump (as Python2 does). (Of course, it will print a list of words only if there are "words" in that phrase at all, separated by ASCII-compatible whitespace; that surely won't apply to every human language in existence.)
-rw-r--r--py/obj.h2
-rw-r--r--py/objarray.c2
-rw-r--r--py/objstr.c10
-rw-r--r--tests/basics/string-repr.py3
4 files changed, 11 insertions, 6 deletions
diff --git a/py/obj.h b/py/obj.h
index d62bc7b341..1f5a83f919 100644
--- a/py/obj.h
+++ b/py/obj.h
@@ -469,7 +469,7 @@ qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway conve
const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
mp_obj_t mp_obj_str_intern(mp_obj_t str);
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len);
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes);
#if MICROPY_PY_BUILTINS_FLOAT
// float
diff --git a/py/objarray.c b/py/objarray.c
index edf3ee8121..05821e8de4 100644
--- a/py/objarray.c
+++ b/py/objarray.c
@@ -58,7 +58,7 @@ STATIC void array_print(void (*print)(void *env, const char *fmt, ...), void *en
mp_obj_array_t *o = o_in;
if (o->typecode == BYTEARRAY_TYPECODE) {
print(env, "bytearray(b", o->typecode);
- mp_str_print_quoted(print, env, o->items, o->len);
+ mp_str_print_quoted(print, env, o->items, o->len, true);
} else {
print(env, "array('%c'", o->typecode);
if (o->len > 0) {
diff --git a/py/objstr.c b/py/objstr.c
index 6656090c84..f9cc273447 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -64,7 +64,8 @@ STATIC bool is_str_or_bytes(mp_obj_t o) {
/******************************************************************************/
/* str */
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env,
+ const byte *str_data, uint str_len, bool is_bytes) {
// this escapes characters, but it will be very slow to print (calling print many times)
bool has_single_quote = false;
bool has_double_quote = false;
@@ -85,7 +86,10 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
print(env, "\\%c", quote_char);
} else if (*s == '\\') {
print(env, "\\\\");
- } else if (32 <= *s && *s <= 126) {
+ } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) {
+ // In strings, anything which is not an ASCII control character
+ // is printed as is; this includes characters in the range 0x80-0xff
+ // (which can be non-Latin letters, etc.)
print(env, "%c", *s);
} else if (*s == '\n') {
print(env, "\\n");
@@ -109,7 +113,7 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env,
if (is_bytes) {
print(env, "b");
}
- mp_str_print_quoted(print, env, str_data, str_len);
+ mp_str_print_quoted(print, env, str_data, str_len, is_bytes);
}
}
diff --git a/tests/basics/string-repr.py b/tests/basics/string-repr.py
index 34da483a57..2a3ef2527c 100644
--- a/tests/basics/string-repr.py
+++ b/tests/basics/string-repr.py
@@ -1,3 +1,4 @@
# anything above 0xa0 is printed as Unicode by CPython
-for c in range(0xa1):
+# the above is a CPython implementation detail, stick to ASCII
+for c in range(0x80):
print("0x%02x: %s" % (c, repr(chr(c))))