9 files changed, 57 insertions, 40 deletions
diff --git a/py/gc.c b/py/gc.c
index 7fab0409a9..30bae5054a 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -120,7 +120,7 @@ void gc_init(void *start, void *end) {
     //     F = A * BLOCKS_PER_ATB / BLOCKS_PER_FTB
     //     P = A * BLOCKS_PER_ATB * BYTES_PER_BLOCK
     // => T = A * (1 + BLOCKS_PER_ATB / BLOCKS_PER_FTB + BLOCKS_PER_ATB * BYTES_PER_BLOCK)
-    machine_uint_t total_byte_len = end - start;
+    machine_uint_t total_byte_len = (byte*)end - (byte*)start;
 #if MICROPY_ENABLE_FINALISER
     gc_alloc_table_byte_len = total_byte_len * BITS_PER_BYTE / (BITS_PER_BYTE + BITS_PER_BYTE * BLOCKS_PER_ATB / BLOCKS_PER_FTB + BITS_PER_BYTE * BLOCKS_PER_ATB * BYTES_PER_BLOCK);
 #else
@@ -136,8 +136,8 @@ void gc_init(void *start, void *end) {
 #endif
 
     machine_uint_t gc_pool_block_len = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
-    gc_pool_start = end - gc_pool_block_len * BYTES_PER_BLOCK;
-    gc_pool_end = end;
+    gc_pool_start = (machine_uint_t*)((byte*)end - gc_pool_block_len * BYTES_PER_BLOCK);
+    gc_pool_end = (machine_uint_t*)end;
 
     // clear ATBs
     memset(gc_alloc_table_start, 0, gc_alloc_table_byte_len);
@@ -407,7 +407,7 @@ found:
     // to the heap and will not be set to something else if the caller
     // doesn't actually use the entire block.  As such they will continue
     // to point to the heap and may prevent other blocks from being reclaimed.
-    memset(ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes);
+    memset((byte*)ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes);
 
 #if MICROPY_ENABLE_FINALISER
     if (has_finaliser) {
@@ -571,7 +571,7 @@ void *gc_realloc(void *ptr_in, machine_uint_t n_bytes) {
         }
 
         // zero out the additional bytes of the newly allocated blocks (see comment above in gc_alloc)
-        memset(ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes);
+        memset((byte*)ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes);
 
         return ptr_in;
     }
diff --git a/py/lexer.c b/py/lexer.c
index 26993922eb..a65df54ba6 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -83,8 +83,8 @@ bool str_strn_equal(const char *str, const char *strn, int len) {
 void mp_token_show(const mp_token_t *tok) {
     printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
     if (tok->str != NULL && tok->len > 0) {
-        const char *i = tok->str;
-        const char *j = i + tok->len;
+        const byte *i = (const byte *)tok->str;
+        const byte *j = (const byte *)i + tok->len;
         printf(" ");
         while (i < j) {
             unichar c = utf8_get_char(i);
diff --git a/py/misc.h b/py/misc.h
index fd54147efd..044fef6236 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -88,8 +88,8 @@ int m_get_peak_bytes_allocated(void);
 
 typedef int unichar; // TODO
 
-unichar utf8_get_char(const char *s);
-char *utf8_next_char(const char *s);
+unichar utf8_get_char(const byte *s);
+const byte *utf8_next_char(const byte *s);
 
 bool unichar_isspace(unichar c);
 bool unichar_isalpha(unichar c);
@@ -100,6 +100,7 @@ bool unichar_isupper(unichar c);
 bool unichar_islower(unichar c);
 unichar unichar_tolower(unichar c);
 unichar unichar_toupper(unichar c);
+#define unichar_charlen(s, bytelen) (bytelen)
 
 /** variable string *********************************************/
 
diff --git a/py/obj.h b/py/obj.h
index d62bc7b341..1f5a83f919 100644
--- a/py/obj.h
+++ b/py/obj.h
@@ -469,7 +469,7 @@ qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway conve
 const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
 const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
 mp_obj_t mp_obj_str_intern(mp_obj_t str);
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len);
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes);
 
 #if MICROPY_PY_BUILTINS_FLOAT
 // float
diff --git a/py/objarray.c b/py/objarray.c
index edf3ee8121..05821e8de4 100644
--- a/py/objarray.c
+++ b/py/objarray.c
@@ -58,7 +58,7 @@ STATIC void array_print(void (*print)(void *env, const char *fmt, ...), void *en
     mp_obj_array_t *o = o_in;
     if (o->typecode == BYTEARRAY_TYPECODE) {
         print(env, "bytearray(b", o->typecode);
-        mp_str_print_quoted(print, env, o->items, o->len);
+        mp_str_print_quoted(print, env, o->items, o->len, true);
     } else {
         print(env, "array('%c'", o->typecode);
         if (o->len > 0) {
diff --git a/py/objstr.c b/py/objstr.c
index 6656090c84..c84d7c900d 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -64,7 +64,8 @@ STATIC bool is_str_or_bytes(mp_obj_t o) {
 /******************************************************************************/
 /* str                                                                        */
 
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env,
+                         const byte *str_data, uint str_len, bool is_bytes) {
     // this escapes characters, but it will be very slow to print (calling print many times)
     bool has_single_quote = false;
     bool has_double_quote = false;
@@ -85,7 +86,10 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
             print(env, "\\%c", quote_char);
         } else if (*s == '\\') {
             print(env, "\\\\");
-        } else if (32 <= *s && *s <= 126) {
+        } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) {
+            // In strings, anything which is not an ASCII control character
+            // is printed as is; this includes characters in range 0x80-0xff
+            // (which can be non-Latin letters, etc.)
             print(env, "%c", *s);
         } else if (*s == '\n') {
             print(env, "\\n");
@@ -109,7 +113,7 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env,
         if (is_bytes) {
             print(env, "b");
         }
-        mp_str_print_quoted(print, env, str_data, str_len);
+        mp_str_print_quoted(print, env, str_data, str_len, is_bytes);
     }
 }
 
@@ -348,6 +352,12 @@ uncomparable:
     return MP_OBJ_NULL; // op not supported
 }
 
+const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len,
+                             mp_obj_t index, bool is_slice) {
+    machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice);
+    return self_data + index_val;
+}
+
 STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
     mp_obj_type_t *type = mp_obj_get_type(self_in);
     GET_STR_DATA_LEN(self_in, self_data, self_len);
@@ -363,11 +373,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
             return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
         }
 #endif
-        uint index_val = mp_get_index(type, self_len, index, false);
+        const byte *p = str_index_to_ptr(type, self_data, self_len, index, false);
         if (type == &mp_type_bytes) {
-            return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
+            return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)*p);
         } else {
-            return mp_obj_new_str((char*)self_data + index_val, 1, true);
+            return mp_obj_new_str((char*)p, 1, true);
         }
     } else {
         return MP_OBJ_NULL; // op not supported
@@ -563,6 +573,7 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) {
 
 
 STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
     assert(2 <= n_args && n_args <= 4);
     assert(MP_OBJ_IS_STR(args[0]));
     assert(MP_OBJ_IS_STR(args[1]));
@@ -570,16 +581,16 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire
     GET_STR_DATA_LEN(args[0], haystack, haystack_len);
     GET_STR_DATA_LEN(args[1], needle, needle_len);
 
-    machine_uint_t start = 0;
-    machine_uint_t end = haystack_len;
+    const byte *start = haystack;
+    const byte *end = haystack + haystack_len;
     if (n_args >= 3 && args[2] != mp_const_none) {
-        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
     }
     if (n_args >= 4 && args[3] != mp_const_none) {
-        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
     }
 
-    const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
+    const byte *p = find_subbytes(start, end - start, needle, needle_len, direction);
     if (p == NULL) {
         // not found
         if (is_index) {
@@ -611,16 +622,17 @@ STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) {
 
 // TODO: (Much) more variety in args
 STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
     GET_STR_DATA_LEN(args[0], str, str_len);
     GET_STR_DATA_LEN(args[1], prefix, prefix_len);
-    uint index_val = 0;
+    const byte *start = str;
     if (n_args > 2) {
-        index_val = mp_get_index(&mp_type_str, str_len, args[2], true);
+        start = str_index_to_ptr(self_type, str, str_len, args[2], true);
     }
-    if (prefix_len + index_val > str_len) {
+    if (prefix_len + (start - str) > str_len) {
         return mp_const_false;
     }
-    return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0);
+    return MP_BOOL(memcmp(start, prefix, prefix_len) == 0);
 }
 
 STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) {
@@ -1418,6 +1430,7 @@ STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
 }
 
 STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
+    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
     assert(2 <= n_args && n_args <= 4);
     assert(MP_OBJ_IS_STR(args[0]));
     assert(MP_OBJ_IS_STR(args[1]));
@@ -1425,26 +1438,28 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
     GET_STR_DATA_LEN(args[0], haystack, haystack_len);
     GET_STR_DATA_LEN(args[1], needle, needle_len);
 
-    machine_uint_t start = 0;
-    machine_uint_t end = haystack_len;
+    const byte *start = haystack;
+    const byte *end = haystack + haystack_len;
     if (n_args >= 3 && args[2] != mp_const_none) {
-        start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+        start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
     }
     if (n_args >= 4 && args[3] != mp_const_none) {
-        end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+        end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
     }
 
     // if needle_len is zero then we count each gap between characters as an occurrence
     if (needle_len == 0) {
-        return MP_OBJ_NEW_SMALL_INT(end - start + 1);
+        return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1);
     }
 
     // count the occurrences
     machine_int_t num_occurrences = 0;
-    for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
-        if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
+    for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) {
+        if (memcmp(haystack_ptr, needle, needle_len) == 0) {
             num_occurrences++;
-            haystack_index += needle_len - 1;
+            haystack_ptr += needle_len;
+        } else {
+            haystack_ptr = utf8_next_char(haystack_ptr);
         }
     }
 
diff --git a/py/stream.c b/py/stream.c
index 71aaa4e99a..07a79248ab 100644
--- a/py/stream.c
+++ b/py/stream.c
@@ -217,7 +217,7 @@ STATIC mp_obj_t stream_unbuffered_readlines(mp_obj_t self) {
     mp_obj_t lines = mp_obj_new_list(0, NULL);
     for (;;) {
         mp_obj_t line = stream_unbuffered_readline(1, &self);
-        if (mp_obj_str_get_len(line) == 0) {
+        if (!mp_obj_is_true(line)) {
             break;
         }
         mp_obj_list_append(lines, line);
@@ -228,7 +228,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_stream_unbuffered_readlines_obj, stream_unbuffered_
 
 mp_obj_t mp_stream_unbuffered_iter(mp_obj_t self) {
     mp_obj_t l_in = stream_unbuffered_readline(1, &self);
-    if (mp_obj_str_get_len(l_in) != 0) {
+    if (mp_obj_is_true(l_in)) {
         return l_in;
     }
     return MP_OBJ_STOP_ITERATION;
diff --git a/py/unicode.c b/py/unicode.c
index 131ddc8108..c8faa57009 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -65,12 +65,12 @@ STATIC const uint8_t attr[] = {
     AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
 };
 
-unichar utf8_get_char(const char *s) {
+unichar utf8_get_char(const byte *s) {
     return *s;
 }
 
-char *utf8_next_char(const char *s) {
-    return (char*)(s + 1);
+const byte *utf8_next_char(const byte *s) {
+    return s + 1;
 }
 
 bool unichar_isspace(unichar c) {
diff --git a/tests/basics/string-repr.py b/tests/basics/string-repr.py
index 34da483a57..2a3ef2527c 100644
--- a/tests/basics/string-repr.py
+++ b/tests/basics/string-repr.py
@@ -1,3 +1,4 @@
 # anything above 0xa0 is printed as Unicode by CPython
-for c in range(0xa1):
+# the above is a CPython implementation detail; stick to ASCII
+for c in range(0x80):
     print("0x%02x: %s" % (c, repr(chr(c))))