summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--py/gc.c10
-rw-r--r--py/lexer.c4
-rw-r--r--py/misc.h5
-rw-r--r--py/obj.h2
-rw-r--r--py/objarray.c2
-rw-r--r--py/objstr.c61
-rw-r--r--py/stream.c4
-rw-r--r--py/unicode.c6
-rw-r--r--tests/basics/string-repr.py3
9 files changed, 57 insertions, 40 deletions
diff --git a/py/gc.c b/py/gc.c
index 7fab0409a9..30bae5054a 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -120,7 +120,7 @@ void gc_init(void *start, void *end) {
// F = A * BLOCKS_PER_ATB / BLOCKS_PER_FTB
// P = A * BLOCKS_PER_ATB * BYTES_PER_BLOCK
// => T = A * (1 + BLOCKS_PER_ATB / BLOCKS_PER_FTB + BLOCKS_PER_ATB * BYTES_PER_BLOCK)
- machine_uint_t total_byte_len = end - start;
+ machine_uint_t total_byte_len = (byte*)end - (byte*)start;
#if MICROPY_ENABLE_FINALISER
gc_alloc_table_byte_len = total_byte_len * BITS_PER_BYTE / (BITS_PER_BYTE + BITS_PER_BYTE * BLOCKS_PER_ATB / BLOCKS_PER_FTB + BITS_PER_BYTE * BLOCKS_PER_ATB * BYTES_PER_BLOCK);
#else
@@ -136,8 +136,8 @@ void gc_init(void *start, void *end) {
#endif
machine_uint_t gc_pool_block_len = gc_alloc_table_byte_len * BLOCKS_PER_ATB;
- gc_pool_start = end - gc_pool_block_len * BYTES_PER_BLOCK;
- gc_pool_end = end;
+ gc_pool_start = (machine_uint_t*)((byte*)end - gc_pool_block_len * BYTES_PER_BLOCK);
+ gc_pool_end = (machine_uint_t*)end;
// clear ATBs
memset(gc_alloc_table_start, 0, gc_alloc_table_byte_len);
@@ -407,7 +407,7 @@ found:
// to the heap and will not be set to something else if the caller
// doesn't actually use the entire block. As such they will continue
// to point to the heap and may prevent other blocks from being reclaimed.
- memset(ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes);
+ memset((byte*)ret_ptr + n_bytes, 0, (end_block - start_block + 1) * BYTES_PER_BLOCK - n_bytes);
#if MICROPY_ENABLE_FINALISER
if (has_finaliser) {
@@ -571,7 +571,7 @@ void *gc_realloc(void *ptr_in, machine_uint_t n_bytes) {
}
// zero out the additional bytes of the newly allocated blocks (see comment above in gc_alloc)
- memset(ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes);
+ memset((byte*)ptr_in + n_bytes, 0, new_blocks * BYTES_PER_BLOCK - n_bytes);
return ptr_in;
}
diff --git a/py/lexer.c b/py/lexer.c
index 26993922eb..a65df54ba6 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -83,8 +83,8 @@ bool str_strn_equal(const char *str, const char *strn, int len) {
void mp_token_show(const mp_token_t *tok) {
printf("(%d:%d) kind:%d str:%p len:%d", tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
if (tok->str != NULL && tok->len > 0) {
- const char *i = tok->str;
- const char *j = i + tok->len;
+ const byte *i = (const byte *)tok->str;
+ const byte *j = (const byte *)i + tok->len;
printf(" ");
while (i < j) {
unichar c = utf8_get_char(i);
diff --git a/py/misc.h b/py/misc.h
index fd54147efd..044fef6236 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -88,8 +88,8 @@ int m_get_peak_bytes_allocated(void);
typedef int unichar; // TODO
-unichar utf8_get_char(const char *s);
-char *utf8_next_char(const char *s);
+unichar utf8_get_char(const byte *s);
+const byte *utf8_next_char(const byte *s);
bool unichar_isspace(unichar c);
bool unichar_isalpha(unichar c);
@@ -100,6 +100,7 @@ bool unichar_isupper(unichar c);
bool unichar_islower(unichar c);
unichar unichar_tolower(unichar c);
unichar unichar_toupper(unichar c);
+#define unichar_charlen(s, bytelen) (bytelen)
/** variable string *********************************************/
diff --git a/py/obj.h b/py/obj.h
index d62bc7b341..1f5a83f919 100644
--- a/py/obj.h
+++ b/py/obj.h
@@ -469,7 +469,7 @@ qstr mp_obj_str_get_qstr(mp_obj_t self_in); // use this if you will anyway conve
const char *mp_obj_str_get_str(mp_obj_t self_in); // use this only if you need the string to be null terminated
const char *mp_obj_str_get_data(mp_obj_t self_in, uint *len);
mp_obj_t mp_obj_str_intern(mp_obj_t str);
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len);
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len, bool is_bytes);
#if MICROPY_PY_BUILTINS_FLOAT
// float
diff --git a/py/objarray.c b/py/objarray.c
index edf3ee8121..05821e8de4 100644
--- a/py/objarray.c
+++ b/py/objarray.c
@@ -58,7 +58,7 @@ STATIC void array_print(void (*print)(void *env, const char *fmt, ...), void *en
mp_obj_array_t *o = o_in;
if (o->typecode == BYTEARRAY_TYPECODE) {
print(env, "bytearray(b", o->typecode);
- mp_str_print_quoted(print, env, o->items, o->len);
+ mp_str_print_quoted(print, env, o->items, o->len, true);
} else {
print(env, "array('%c'", o->typecode);
if (o->len > 0) {
diff --git a/py/objstr.c b/py/objstr.c
index 6656090c84..c84d7c900d 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -64,7 +64,8 @@ STATIC bool is_str_or_bytes(mp_obj_t o) {
/******************************************************************************/
/* str */
-void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) {
+void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env,
+ const byte *str_data, uint str_len, bool is_bytes) {
// this escapes characters, but it will be very slow to print (calling print many times)
bool has_single_quote = false;
bool has_double_quote = false;
@@ -85,7 +86,10 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
print(env, "\\%c", quote_char);
} else if (*s == '\\') {
print(env, "\\\\");
- } else if (32 <= *s && *s <= 126) {
+ } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) {
+ // In strings, anything which is not an ASCII control character
+ // is printed as is; this includes characters in the range 0x80-0xff
+ // (which can be non-Latin letters, etc.)
print(env, "%c", *s);
} else if (*s == '\n') {
print(env, "\\n");
@@ -109,7 +113,7 @@ STATIC void str_print(void (*print)(void *env, const char *fmt, ...), void *env,
if (is_bytes) {
print(env, "b");
}
- mp_str_print_quoted(print, env, str_data, str_len);
+ mp_str_print_quoted(print, env, str_data, str_len, is_bytes);
}
}
@@ -348,6 +352,12 @@ uncomparable:
return MP_OBJ_NULL; // op not supported
}
+const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len,
+ mp_obj_t index, bool is_slice) {
+ machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice);
+ return self_data + index_val;
+}
+
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
mp_obj_type_t *type = mp_obj_get_type(self_in);
GET_STR_DATA_LEN(self_in, self_data, self_len);
@@ -363,11 +373,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
}
#endif
- uint index_val = mp_get_index(type, self_len, index, false);
+ const byte *p = str_index_to_ptr(type, self_data, self_len, index, false);
if (type == &mp_type_bytes) {
- return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
+ return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)*p);
} else {
- return mp_obj_new_str((char*)self_data + index_val, 1, true);
+ return mp_obj_new_str((char*)p, 1, true);
}
} else {
return MP_OBJ_NULL; // op not supported
@@ -563,6 +573,7 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) {
STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) {
+ const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
assert(2 <= n_args && n_args <= 4);
assert(MP_OBJ_IS_STR(args[0]));
assert(MP_OBJ_IS_STR(args[1]));
@@ -570,16 +581,16 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire
GET_STR_DATA_LEN(args[0], haystack, haystack_len);
GET_STR_DATA_LEN(args[1], needle, needle_len);
- machine_uint_t start = 0;
- machine_uint_t end = haystack_len;
+ const byte *start = haystack;
+ const byte *end = haystack + haystack_len;
if (n_args >= 3 && args[2] != mp_const_none) {
- start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+ start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
}
if (n_args >= 4 && args[3] != mp_const_none) {
- end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+ end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
}
- const byte *p = find_subbytes(haystack + start, end - start, needle, needle_len, direction);
+ const byte *p = find_subbytes(start, end - start, needle, needle_len, direction);
if (p == NULL) {
// not found
if (is_index) {
@@ -611,16 +622,17 @@ STATIC mp_obj_t str_rindex(uint n_args, const mp_obj_t *args) {
// TODO: (Much) more variety in args
STATIC mp_obj_t str_startswith(uint n_args, const mp_obj_t *args) {
+ const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
GET_STR_DATA_LEN(args[0], str, str_len);
GET_STR_DATA_LEN(args[1], prefix, prefix_len);
- uint index_val = 0;
+ const byte *start = str;
if (n_args > 2) {
- index_val = mp_get_index(&mp_type_str, str_len, args[2], true);
+ start = str_index_to_ptr(self_type, str, str_len, args[2], true);
}
- if (prefix_len + index_val > str_len) {
+ if (prefix_len + (start - str) > str_len) {
return mp_const_false;
}
- return MP_BOOL(memcmp(str + index_val, prefix, prefix_len) == 0);
+ return MP_BOOL(memcmp(start, prefix, prefix_len) == 0);
}
STATIC mp_obj_t str_endswith(uint n_args, const mp_obj_t *args) {
@@ -1418,6 +1430,7 @@ STATIC mp_obj_t str_replace(uint n_args, const mp_obj_t *args) {
}
STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
+ const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
assert(2 <= n_args && n_args <= 4);
assert(MP_OBJ_IS_STR(args[0]));
assert(MP_OBJ_IS_STR(args[1]));
@@ -1425,26 +1438,28 @@ STATIC mp_obj_t str_count(uint n_args, const mp_obj_t *args) {
GET_STR_DATA_LEN(args[0], haystack, haystack_len);
GET_STR_DATA_LEN(args[1], needle, needle_len);
- machine_uint_t start = 0;
- machine_uint_t end = haystack_len;
+ const byte *start = haystack;
+ const byte *end = haystack + haystack_len;
if (n_args >= 3 && args[2] != mp_const_none) {
- start = mp_get_index(&mp_type_str, haystack_len, args[2], true);
+ start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
}
if (n_args >= 4 && args[3] != mp_const_none) {
- end = mp_get_index(&mp_type_str, haystack_len, args[3], true);
+ end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
}
// if needle_len is zero then we count each gap between characters as an occurrence
if (needle_len == 0) {
- return MP_OBJ_NEW_SMALL_INT(end - start + 1);
+ return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1);
}
// count the occurrences
machine_int_t num_occurrences = 0;
- for (machine_uint_t haystack_index = start; haystack_index + needle_len <= end; haystack_index++) {
- if (memcmp(&haystack[haystack_index], needle, needle_len) == 0) {
+ for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) {
+ if (memcmp(haystack_ptr, needle, needle_len) == 0) {
num_occurrences++;
- haystack_index += needle_len - 1;
+ haystack_ptr += needle_len;
+ } else {
+ haystack_ptr = utf8_next_char(haystack_ptr);
}
}
diff --git a/py/stream.c b/py/stream.c
index 71aaa4e99a..07a79248ab 100644
--- a/py/stream.c
+++ b/py/stream.c
@@ -217,7 +217,7 @@ STATIC mp_obj_t stream_unbuffered_readlines(mp_obj_t self) {
mp_obj_t lines = mp_obj_new_list(0, NULL);
for (;;) {
mp_obj_t line = stream_unbuffered_readline(1, &self);
- if (mp_obj_str_get_len(line) == 0) {
+ if (!mp_obj_is_true(line)) {
break;
}
mp_obj_list_append(lines, line);
@@ -228,7 +228,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_stream_unbuffered_readlines_obj, stream_unbuffered_
mp_obj_t mp_stream_unbuffered_iter(mp_obj_t self) {
mp_obj_t l_in = stream_unbuffered_readline(1, &self);
- if (mp_obj_str_get_len(l_in) != 0) {
+ if (mp_obj_is_true(l_in)) {
return l_in;
}
return MP_OBJ_STOP_ITERATION;
diff --git a/py/unicode.c b/py/unicode.c
index 131ddc8108..c8faa57009 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -65,12 +65,12 @@ STATIC const uint8_t attr[] = {
AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
};
-unichar utf8_get_char(const char *s) {
+unichar utf8_get_char(const byte *s) {
return *s;
}
-char *utf8_next_char(const char *s) {
- return (char*)(s + 1);
+const byte *utf8_next_char(const byte *s) {
+ return s + 1;
}
bool unichar_isspace(unichar c) {
diff --git a/tests/basics/string-repr.py b/tests/basics/string-repr.py
index 34da483a57..2a3ef2527c 100644
--- a/tests/basics/string-repr.py
+++ b/tests/basics/string-repr.py
@@ -1,3 +1,4 @@
# anything above 0xa0 is printed as Unicode by CPython
-for c in range(0xa1):
+# the above is a CPython implementation detail, so stick to ASCII
+for c in range(0x80):
print("0x%02x: %s" % (c, repr(chr(c))))