diff options
-rw-r--r-- | py/gc.c | 10 | ||||
-rw-r--r-- | py/lexer.c | 4 | ||||
-rw-r--r-- | py/misc.h | 5 | ||||
-rw-r--r-- | py/obj.h | 2 | ||||
-rw-r--r-- | py/objarray.c | 2 | ||||
-rw-r--r-- | py/objstr.c | 61 | ||||
-rw-r--r-- | py/stream.c | 4 | ||||
-rw-r--r-- | py/unicode.c | 6 | ||||
-rw-r--r-- | tests/basics/string-repr.py | 3 |
9 files changed, 57 insertions, 40 deletions
@@ -120,7 +120,7 @@ void gc_init(void *start, void *end) { // F = A * BLOCKS_PER_ATB / BLOCKS_PER_FTB // P = A * BLOCKS_PER_ATB * BYTES_PER_BLOCK // => T = A * (1 + BLOCKS_PER_ATB / BLOCKS_PER_FTB + BLOCKS_PER_ATB * BYTES_PER_BLOCK) - machine_uint_t total_byte_len = end - start; + machine_uint_t total_byte_len = (byte*)end - (byte*)start; #if MICROPY_ENABLE_FINALISER gc_alloc_table_byte_len = total_byte_len * BITS_PER_BYTE / (BITS_PER_BYTE + BITS_PER_BYTE * BLOCKS_PER_ATB / BLOCKS_PER_FTB + BITS_PER_BYTE * BLOCKS_PER_ATB * BYTES_PER_BLOCK); #else @@ -136,8 +136,8 @@ void gc_init(void *start, void *end) { #endif machine_uint_t gc_pool_block_len = gc_alloc_table_byte_len * BLOCKS_PER_ATB; - gc_pool_start = end - gc_pool_block_len * BYTES_PER_BLOCK; - gc_pool_end = end; + gc_pool_start = (machine_uint_t*)((byte*)end - gc_pool_block_len * BYTES_PER_BLOCK); + gc_pool_end = (machine_uint_t*)end; // clear ATBs memset(gc_alloc_table_start, 0, gc_alloc_table_byte_len); @@ -407,7 +407,7 @@ found: // to the heap and will not be set to something else if the caller // doesn't actually use the entire block. As such they will continue // to point to the heap and may prevent other blocks from being reclaimed. 
- memset(ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes); + memset((byte*)ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes); #if MICROPY_ENABLE_FINALISER if (has_finaliser) { @@ -571,7 +571,7 @@ void *gc_realloc(void *ptr_in, machine_uint_t n_bytes) { } // zero out the additional bytes of the newly allocated blocks (see comment above in gc_alloc) - memset(ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes); + memset((byte*)ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes); return ptr_in; } diff --git a/py/lexer.c b/py/lexer.c index 26993922eb..a65df54ba6 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -83,8 +83,8 @@ bool str_strn_equal(const char *str, const char *strn, int len) { void mp_token_show(const mp_token_t *tok) { printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len); if (tok->str != NULL && tok->len > 0) { - const char *i = tok->str; - const char *j = i + tok->len; + const byte *i = (const byte *)tok->str; + const byte *j = (const byte *)i + tok->len; printf(" "); while (i < j) { unichar c = utf8_get_char(i); @@ -88,8 +88,8 @@ int m_get_peak_bytes_allocated(void); typedef int unichar; // TODO -unichar utf8_get_char(const char *s); -char *utf8_next_char(const char *s); +unichar utf8_get_char(const byte *s); +const byte *utf8_next_char(const byte *s); bool unichar_isspace(unichar c); bool unichar_isalpha(unichar c); @@ -100,6 +100,7 @@ bool unichar_isupper(unichar c); bool unichar_islower(unichar c); unichar unichar_tolower(unichar c); unichar unichar_toupper(unichar c); +#define unichar_charlen(s, bytelen) (bytelen) /** variable string *********************************************/ @@ -469,7 +469,7 @@ qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway conve const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated const char 
*mp_obj_str_get_data(mp_obj_t self_in, uint *len); mp_obj_t mp_obj_str_intern(mp_obj_t str); -void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len); +void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes); #if MICROPY_PY_BUILTINS_FLOAT // float diff --git a/py/objarray.c b/py/objarray.c index edf3ee8121..05821e8de4 100644 --- a/py/objarray.c +++ b/py/objarray.c @@ -58,7 +58,7 @@ STATIC void array_print(void (*print)(void *env, const char *fmt, ...), void *en mp_obj_array_t *o = o_in; if (o->typecode == BYTEARRAY_TYPECODE) { print(env, "bytearray(b", o->typecode); - mp_str_print_quoted(print, env, o->items, o->len); + mp_str_print_quoted(print, env, o->items, o->len, true); } else { print(env, "array('%c'", o->typecode); if (o->len > 0) { diff --git a/py/objstr.c b/py/objstr.c index 6656090c84..c84d7c900d 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -64,7 +64,8 @@ STATIC bool is_str_or_bytes(mp_obj_t o) { /******************************************************************************/ /* str */ -void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) { +void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, + const byte *str_data, uint str_len, bool is_bytes) { // this escapes characters, but it will be very slow to print (calling print many times) bool has_single_quote = false; bool has_double_quote = false; @@ -85,7 +86,10 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e print(env, "\\%c", quote_char); } else if (*s == '\\') { print(env, "\\\\"); - } else if (32 <= *s && *s <= 126) { + } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) { + // In strings, anything which is not ascii control character + // is printed as is, this includes characters in range 0x80-0xff + // (which can 
be non-Latin letters, etc.) print(env, "%c", *s); } else if (*s == '\n') { print(env, "\\n"); @@ -109,7 +113,7 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env, if (is_bytes) { print(env, "b"); } - mp_str_print_quoted(print, env, str_data, str_len); + mp_str_print_quoted(print, env, str_data, str_len, is_bytes); } } @@ -348,6 +352,12 @@ uncomparable: return MP_OBJ_NULL; // op not supported } +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice) { + machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice); + return self_data + index_val; +} + STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { mp_obj_type_t *type = mp_obj_get_type(self_in); GET_STR_DATA_LEN(self_in, self_data, self_len); @@ -363,11 +373,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start); } #endif - uint index_val = mp_get_index(type, self_len, index, false); + const byte *p = str_index_to_ptr(type, self_data, self_len, index, false); if (type == &mp_type_bytes) { - return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]); + return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)*p); } else { - return mp_obj_new_str((char*)self_data + index_val, 1, true); + return mp_obj_new_str((char*)p, 1, true); } } else { return MP_OBJ_NULL; // op not supported @@ -563,6 +573,7 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) { STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); assert(MP_OBJ_IS_STR(args[0])); assert(MP_OBJ_IS_STR(args[1])); @@ -570,16 +581,16 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire GET_STR_DATA_LEN(args[0], haystack, 
haystack_len); GET_STR_DATA_LEN(args[1], needle, needle_len); - machine_uint_t start = 0; - machine_uint_t end = haystack_len; + const byte *start = haystack; + const byte *end = haystack + haystack_len; if (n_args >= 3 && args[2] != mp_const_none) { - start = mp_get_index(&mp_type_str, haystack_len, args[2], true); + start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); } if (n_args >= 4 && args[3] != mp_const_none) { - end = mp_get_index(&mp_type_str, haystack_len, args[3], true); + end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); } - const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction); + const byte *p = find_subbytes(start, end - start, needle, needle_len, direction); if (p == NULL) { // not found if (is_index) { @@ -611,16 +622,17 @@ STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) { // TODO: (Much) more variety in args STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); GET_STR_DATA_LEN(args[0], str, str_len); GET_STR_DATA_LEN(args[1], prefix, prefix_len); - uint index_val = 0; + const byte *start = str; if (n_args > 2) { - index_val = mp_get_index(&mp_type_str, str_len, args[2], true); + start = str_index_to_ptr(self_type, str, str_len, args[2], true); } - if (prefix_len + index_val > str_len) { + if (prefix_len + (start - str) > str_len) { return mp_const_false; } - return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0); + return MP_BOOL(memcmp(start, prefix, prefix_len) == 0); } STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) { @@ -1418,6 +1430,7 @@ STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) { } STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) { + const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); assert(MP_OBJ_IS_STR(args[0])); assert(MP_OBJ_IS_STR(args[1])); @@ -1425,26 +1438,28 @@ 
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) { GET_STR_DATA_LEN(args[0], haystack, haystack_len); GET_STR_DATA_LEN(args[1], needle, needle_len); - machine_uint_t start = 0; - machine_uint_t end = haystack_len; + const byte *start = haystack; + const byte *end = haystack + haystack_len; if (n_args >= 3 && args[2] != mp_const_none) { - start = mp_get_index(&mp_type_str, haystack_len, args[2], true); + start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); } if (n_args >= 4 && args[3] != mp_const_none) { - end = mp_get_index(&mp_type_str, haystack_len, args[3], true); + end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); } // if needle_len is zero then we count each gap between characters as an occurrence if (needle_len == 0) { - return MP_OBJ_NEW_SMALL_INT(end - start + 1); + return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1); } // count the occurrences machine_int_t num_occurrences = 0; - for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) { - if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) { + for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) { + if (memcmp(haystack_ptr, needle, needle_len) == 0) { num_occurrences++; - haystack_index += needle_len - 1; + haystack_ptr += needle_len; + } else { + haystack_ptr = utf8_next_char(haystack_ptr); } } diff --git a/py/stream.c b/py/stream.c index 71aaa4e99a..07a79248ab 100644 --- a/py/stream.c +++ b/py/stream.c @@ -217,7 +217,7 @@ STATIC mp_obj_t stream_unbuffered_readlines(mp_obj_t self) { mp_obj_t lines = mp_obj_new_list(0, NULL); for (;;) { mp_obj_t line = stream_unbuffered_readline(1, &self); - if (mp_obj_str_get_len(line) == 0) { + if (!mp_obj_is_true(line)) { break; } mp_obj_list_append(lines, line); @@ -228,7 +228,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_stream_unbuffered_readlines_obj, stream_unbuffered_ mp_obj_t mp_stream_unbuffered_iter(mp_obj_t 
self) { mp_obj_t l_in = stream_unbuffered_readline(1, &self); - if (mp_obj_str_get_len(l_in) != 0) { + if (mp_obj_is_true(l_in)) { return l_in; } return MP_OBJ_STOP_ITERATION; diff --git a/py/unicode.c b/py/unicode.c index 131ddc8108..c8faa57009 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -65,12 +65,12 @@ STATIC const uint8_t attr[] = { AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0 }; -unichar utf8_get_char(const char *s) { +unichar utf8_get_char(const byte *s) { return *s; } -char *utf8_next_char(const char *s) { - return (char*)(s + 1); +const byte *utf8_next_char(const byte *s) { + return s + 1; } bool unichar_isspace(unichar c) { diff --git a/tests/basics/string-repr.py b/tests/basics/string-repr.py index 34da483a57..2a3ef2527c 100644 --- a/tests/basics/string-repr.py +++ b/tests/basics/string-repr.py @@ -1,3 +1,4 @@ # anything above 0xa0 is printed as Unicode by CPython -for c in range(0xa1): +# the above is a CPython implementation detail, stick to ASCII +for c in range(0x80): print("0x%02x: %s" % (c, repr(chr(c)))) |