diff options
Diffstat (limited to 'py')
-rw-r--r-- | py/builtin.c | 50 | ||||
-rw-r--r-- | py/compile.c | 4 | ||||
-rw-r--r-- | py/emit.h | 5 | ||||
-rw-r--r-- | py/emitbc.c | 11 | ||||
-rw-r--r-- | py/emitcpy.c | 11 | ||||
-rw-r--r-- | py/emitnative.c | 110 | ||||
-rw-r--r-- | py/emitpass1.c | 4 | ||||
-rw-r--r-- | py/lexer.c | 29 | ||||
-rw-r--r-- | py/misc.h | 18 | ||||
-rw-r--r-- | py/mpconfig.h | 10 | ||||
-rw-r--r-- | py/obj.c | 9 | ||||
-rw-r--r-- | py/objfun.c | 2 | ||||
-rw-r--r-- | py/objstr.c | 112 | ||||
-rw-r--r-- | py/objstr.h | 48 | ||||
-rw-r--r-- | py/objstrunicode.c | 359 | ||||
-rw-r--r-- | py/py.mk | 1 | ||||
-rw-r--r-- | py/runtime.c | 2 | ||||
-rw-r--r-- | py/stackctrl.c | 12 | ||||
-rw-r--r-- | py/stackctrl.h | 14 | ||||
-rw-r--r-- | py/stream.c | 11 | ||||
-rw-r--r-- | py/unicode.c | 51 | ||||
-rw-r--r-- | py/unicode.h | 1 | ||||
-rw-r--r-- | py/vstr.c | 38 |
23 files changed, 811 insertions, 101 deletions
diff --git a/py/builtin.c b/py/builtin.c index d4b77d37a8..f4bbe0e237 100644 --- a/py/builtin.c +++ b/py/builtin.c @@ -172,13 +172,40 @@ STATIC mp_obj_t mp_builtin_callable(mp_obj_t o_in) { MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_callable_obj, mp_builtin_callable); STATIC mp_obj_t mp_builtin_chr(mp_obj_t o_in) { - int ord = mp_obj_get_int(o_in); + #if MICROPY_PY_BUILTINS_STR_UNICODE + machine_int_t c = mp_obj_get_int(o_in); + char str[4]; + int len = 0; + if (c < 0x80) { + *str = c; len = 1; + } else if (c < 0x800) { + str[0] = (c >> 6) | 0xC0; + str[1] = (c & 0x3F) | 0x80; + len = 2; + } else if (c < 0x10000) { + str[0] = (c >> 12) | 0xE0; + str[1] = ((c >> 6) & 0x3F) | 0x80; + str[2] = (c & 0x3F) | 0x80; + len = 3; + } else if (c < 0x110000) { + str[0] = (c >> 18) | 0xF0; + str[1] = ((c >> 12) & 0x3F) | 0x80; + str[2] = ((c >> 6) & 0x3F) | 0x80; + str[3] = (c & 0x3F) | 0x80; + len = 4; + } else { + nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)")); + } + return mp_obj_new_str(str, len, true); + #else + machine_int_t ord = mp_obj_get_int(o_in); if (0 <= ord && ord <= 0x10ffff) { char str[1] = {ord}; return mp_obj_new_str(str, 1, true); } else { nlr_raise(mp_obj_new_exception_msg(&mp_type_ValueError, "chr() arg not in range(0x110000)")); } + #endif } MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_chr_obj, mp_builtin_chr); @@ -344,13 +371,32 @@ MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_oct_obj, mp_builtin_oct); STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) { uint len; const char *str = mp_obj_str_get_data(o_in, &len); + #if MICROPY_PY_BUILTINS_STR_UNICODE + machine_uint_t charlen = unichar_charlen(str, len); + if (charlen == 1) { + if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) { + machine_int_t ord = *str++ & 0x7F; + for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) { + ord &= ~mask; + } + while (UTF8_IS_CONT(*str)) { + ord = (ord << 6) | (*str++ & 0x3F); + } + return mp_obj_new_int(ord); + } else { + return mp_obj_new_int(((const byte*)str)[0]); + } + } else { + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", charlen)); + } + #else if (len == 1) { // don't sign extend when converting to ord - // TODO unicode return mp_obj_new_int(((const byte*)str)[0]); } else { nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "ord() expected a character, but string of length %d found", len)); } + #endif } MP_DEFINE_CONST_FUN_OBJ_1(mp_builtin_ord_obj, mp_builtin_ord); diff --git a/py/compile.c b/py/compile.c index d49edd6ef3..f2a108074f 100644 --- a/py/compile.c +++ b/py/compile.c @@ -1894,7 +1894,7 @@ void compile_try_except(compiler_t *comp, mp_parse_node_t pn_body, int n_except, EMIT_ARG(jump, success_label); // jump over exception handler EMIT_ARG(label_assign, l1); // start of exception handler - EMIT_ARG(adjust_stack_size, 6); // stack adjust for the 3 exception items, +3 for possible UNWIND_JUMP state + EMIT(start_except_handler); uint l2 = comp_next_label(comp); @@ -1966,7 +1966,7 @@ void compile_try_except(compiler_t *comp, mp_parse_node_t pn_body, int n_except, compile_decrease_except_level(comp); EMIT(end_finally); - EMIT_ARG(adjust_stack_size, -5); // stack adjust + EMIT(end_except_handler); EMIT_ARG(label_assign, success_label); compile_node(comp, pn_else); // else block, can be null @@ -134,6 +134,11 @@ typedef struct _emit_method_table_t { void (*yield_value)(emit_t *emit); void (*yield_from)(emit_t *emit); + // these methods are used to control entry to/exit from an exception handler + // they may or may not emit code + void (*start_except_handler)(emit_t *emit); + void (*end_except_handler)(emit_t *emit); + #if MICROPY_EMIT_CPYTHON // these methods are only needed for emitcpy void (*load_const_verbatim_str)(emit_t *emit, const char *str); diff --git a/py/emitbc.c b/py/emitbc.c index 3eb0d575b9..f9fbed4aaf 100644 --- a/py/emitbc.c +++ b/py/emitbc.c @@ -849,6 +849,14 @@ STATIC void emit_bc_yield_from(emit_t *emit) { emit_write_bytecode_byte(emit, MP_BC_YIELD_FROM); } +STATIC void emit_bc_start_except_handler(emit_t *emit) { + emit_bc_adjust_stack_size(emit, 6); // stack adjust for the 3 exception items, +3 for possible UNWIND_JUMP state +} + +STATIC void emit_bc_end_except_handler(emit_t *emit) { + emit_bc_adjust_stack_size(emit, -5); // stack adjust +} + const emit_method_table_t emit_bc_method_table = { emit_bc_set_native_types, emit_bc_start_pass, @@ -934,6 +942,9 @@ const emit_method_table_t emit_bc_method_table = { emit_bc_raise_varargs, emit_bc_yield_value, emit_bc_yield_from, + + emit_bc_start_except_handler, + emit_bc_end_except_handler, }; #endif // !MICROPY_EMIT_CPYTHON diff --git a/py/emitcpy.c b/py/emitcpy.c index fbca3805b3..4ff99866a0 100644 --- a/py/emitcpy.c +++ b/py/emitcpy.c @@ -792,6 +792,14 @@ STATIC void emit_cpy_yield_from(emit_t *emit) { } } +STATIC void emit_cpy_start_except_handler(emit_t *emit) { + emit_cpy_adjust_stack_size(emit, 3); // stack adjust for the 3 exception items +} + +STATIC void emit_cpy_end_except_handler(emit_t *emit) { + emit_cpy_adjust_stack_size(emit, -5); // stack adjust +} + STATIC void emit_cpy_load_const_verbatim_str(emit_t *emit, const char *str) { emit_pre(emit, 1, 3); if (emit->pass == MP_PASS_EMIT) { @@ -899,6 +907,9 @@ const emit_method_table_t emit_cpython_method_table = { emit_cpy_yield_value, emit_cpy_yield_from, + emit_cpy_start_except_handler, + emit_cpy_end_except_handler, + // emitcpy specific functions emit_cpy_load_const_verbatim_str, emit_cpy_load_closure, diff --git a/py/emitnative.c b/py/emitnative.c index f291efe481..4cab3f4697 100644 --- a/py/emitnative.c +++ b/py/emitnative.c @@ -49,6 +49,7 @@ #include <assert.h> #include "mpconfig.h" +#include "nlr.h" #include "misc.h" #include "qstr.h" #include "lexer.h" @@ -723,7 +724,11 @@ STATIC void emit_native_load_const_str(emit_t *emit, qstr qstr, bool bytes) { assert(0); emit_post_push_imm(emit, VTYPE_PTR, (machine_uint_t)qstr_str(qstr)); } else { - emit_call_with_imm_arg(emit, MP_F_LOAD_CONST_STR, mp_load_const_str, qstr, REG_ARG_1); + if (bytes) { + emit_call_with_imm_arg(emit, 0, mp_load_const_bytes, qstr, REG_ARG_1); // TODO need to add function to runtime table + } else { + emit_call_with_imm_arg(emit, MP_F_LOAD_CONST_STR, mp_load_const_str, qstr, REG_ARG_1); + } emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); } } @@ -1057,17 +1062,33 @@ STATIC void emit_native_setup_with(emit_t *emit, uint label) { // not supported, or could be with runtime call assert(0); } + STATIC void emit_native_with_cleanup(emit_t *emit) { assert(0); } + STATIC void emit_native_setup_except(emit_t *emit, uint label) { - assert(0); + emit_native_pre(emit); + // need to commit stack because we may jump elsewhere + need_stack_settled(emit); + emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_1, sizeof(nlr_buf_t) / sizeof(machine_uint_t)); // arg1 = pointer to nlr buf + emit_call(emit, 0, nlr_push); // TODO need to add function to runtime table +#if N_X64 + asm_x64_test_r8_with_r8(emit->as, REG_RET, REG_RET); + asm_x64_jcc_label(emit->as, JCC_JNZ, label); +#elif N_THUMB + asm_thumb_cmp_rlo_i8(emit->as, REG_RET, 0); + asm_thumb_bcc_label(emit->as, THUMB_CC_NE, label); +#endif + emit_post(emit); } + STATIC void emit_native_setup_finally(emit_t *emit, uint label) { assert(0); } + STATIC void emit_native_end_finally(emit_t *emit) { - assert(0); + //assert(0); } STATIC void emit_native_get_iter(emit_t *emit) { @@ -1107,19 +1128,31 @@ STATIC void emit_native_for_iter_end(emit_t *emit) { STATIC void emit_native_pop_block(emit_t *emit) { emit_native_pre(emit); + emit_call(emit, 0, nlr_pop); // TODO need to add function to runtime table + adjust_stack(emit, -(machine_int_t)(sizeof(nlr_buf_t) / sizeof(machine_uint_t))); emit_post(emit); } STATIC void emit_native_pop_except(emit_t *emit) { - assert(0); + /* + emit_native_pre(emit); + emit_call(emit, 0, nlr_pop); // TODO need to add function to runtime table + adjust_stack(emit, -(machine_int_t)(sizeof(nlr_buf_t) / sizeof(machine_uint_t))); + emit_post(emit); + */ } STATIC void emit_native_unary_op(emit_t *emit, mp_unary_op_t op) { - vtype_kind_t vtype; - emit_pre_pop_reg(emit, &vtype, REG_ARG_2); - assert(vtype == VTYPE_PYOBJ); - emit_call_with_imm_arg(emit, MP_F_UNARY_OP, mp_unary_op, op, REG_ARG_1); - emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + if (op == MP_UNARY_OP_NOT) { + // we need to synthesise this operation + assert(0); + } else { + vtype_kind_t vtype; + emit_pre_pop_reg(emit, &vtype, REG_ARG_2); + assert(vtype == VTYPE_PYOBJ); + emit_call_with_imm_arg(emit, MP_F_UNARY_OP, mp_unary_op, op, REG_ARG_1); + emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + } } STATIC void emit_native_binary_op(emit_t *emit, mp_binary_op_t op) { @@ -1233,17 +1266,26 @@ STATIC void emit_native_set_add(emit_t *emit, int set_index) { STATIC void emit_native_build_slice(emit_t *emit, int n_args) { DEBUG_printf("build_slice %d\n", n_args); - assert(n_args == 2); - vtype_kind_t vtype_start, vtype_stop; - emit_pre_pop_reg_reg(emit, &vtype_stop, REG_ARG_2, &vtype_start, REG_ARG_1); // arg1 = start, arg2 = stop - assert(vtype_start == VTYPE_PYOBJ); - assert(vtype_stop == VTYPE_PYOBJ); - emit_call_with_imm_arg(emit, MP_F_NEW_SLICE, mp_obj_new_slice, (machine_uint_t)MP_OBJ_NULL, REG_ARG_3); // arg3 = step - emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + if (n_args == 2) { + vtype_kind_t vtype_start, vtype_stop; + emit_pre_pop_reg_reg(emit, &vtype_stop, REG_ARG_2, &vtype_start, REG_ARG_1); // arg1 = start, arg2 = stop + assert(vtype_start == VTYPE_PYOBJ); + assert(vtype_stop == VTYPE_PYOBJ); + emit_call_with_imm_arg(emit, MP_F_NEW_SLICE, mp_obj_new_slice, (machine_uint_t)mp_const_none, REG_ARG_3); // arg3 = step + emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + } else { + assert(n_args == 3); + vtype_kind_t vtype_start, vtype_stop, vtype_step; + emit_pre_pop_reg_reg_reg(emit, &vtype_step, REG_ARG_3, &vtype_stop, REG_ARG_2, &vtype_start, REG_ARG_1); // arg1 = start, arg2 = stop, arg3 = step + assert(vtype_start == VTYPE_PYOBJ); + assert(vtype_stop == VTYPE_PYOBJ); + assert(vtype_step == VTYPE_PYOBJ); + emit_call(emit, MP_F_NEW_SLICE, mp_obj_new_slice); + emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + } } STATIC void emit_native_unpack_sequence(emit_t *emit, int n_args) { - // TODO this is untested DEBUG_printf("unpack_sequence %d\n", n_args); vtype_kind_t vtype_base; emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1); // arg1 = seq @@ -1253,13 +1295,12 @@ STATIC void emit_native_unpack_sequence(emit_t *emit, int n_args) { } STATIC void emit_native_unpack_ex(emit_t *emit, int n_left, int n_right) { - // TODO this is untested DEBUG_printf("unpack_ex %d %d\n", n_left, n_right); vtype_kind_t vtype_base; emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1); // arg1 = seq assert(vtype_base == VTYPE_PYOBJ); - emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_3, n_left + n_right); // arg3 = dest ptr - emit_call_with_imm_arg(emit, MP_F_UNPACK_EX, mp_unpack_ex, n_left + n_right, REG_ARG_2); // arg2 = n_left + n_right + emit_get_stack_pointer_to_reg_for_push(emit, REG_ARG_3, n_left + n_right + 1); // arg3 = dest ptr + emit_call_with_imm_arg(emit, MP_F_UNPACK_EX, mp_unpack_ex, n_left | (n_right << 8), REG_ARG_2); // arg2 = n_left + n_right } STATIC void emit_native_make_function(emit_t *emit, scope_t *scope, uint n_pos_defaults, uint n_kw_defaults) { @@ -1368,9 +1409,16 @@ STATIC void emit_native_return_value(emit_t *emit) { } STATIC void emit_native_raise_varargs(emit_t *emit, int n_args) { - // call runtime - assert(0); + assert(n_args == 1); + vtype_kind_t vtype_err; + emit_pre_pop_reg(emit, &vtype_err, REG_ARG_1); // arg1 = object to raise + assert(vtype_err == VTYPE_PYOBJ); + emit_call(emit, 0, mp_make_raise_obj); // TODO need to add function to runtime table + emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET); + emit_pre_pop_reg(emit, &vtype_err, REG_ARG_1); + emit_call(emit, 0, nlr_jump); // TODO need to add function to runtime table } + STATIC void emit_native_yield_value(emit_t *emit) { // not supported (for now) assert(0); @@ -1380,6 +1428,21 @@ STATIC void emit_native_yield_from(emit_t *emit) { assert(0); } +STATIC void emit_native_start_except_handler(emit_t *emit) { + // This instruction follows an nlr_pop, so the stack counter is back to zero, when really + // it should be up by a whole nlr_buf_t. We then want to pop the nlr_buf_t here, but save + // the first 2 elements, so we can get the thrown value. + adjust_stack(emit, 2); + vtype_kind_t vtype_nlr; + emit_pre_pop_reg(emit, &vtype_nlr, REG_ARG_1); // get the thrown value + emit_pre_pop_discard(emit, &vtype_nlr); // discard the linked-list pointer in the nlr_buf + emit_post_push_reg_reg_reg(emit, VTYPE_PYOBJ, REG_ARG_1, VTYPE_PYOBJ, REG_ARG_1, VTYPE_PYOBJ, REG_ARG_1); // push the 3 exception items +} + +STATIC void emit_native_end_except_handler(emit_t *emit) { + adjust_stack(emit, -3); // stack adjust (not sure why it's this much...) +} + const emit_method_table_t EXPORT_FUN(method_table) = { emit_native_set_viper_types, emit_native_start_pass, @@ -1465,6 +1528,9 @@ const emit_method_table_t EXPORT_FUN(method_table) = { emit_native_raise_varargs, emit_native_yield_value, emit_native_yield_from, + + emit_native_start_except_handler, + emit_native_end_except_handler, }; #endif // (MICROPY_EMIT_X64 && N_X64) || (MICROPY_EMIT_THUMB && N_THUMB) diff --git a/py/emitpass1.c b/py/emitpass1.c index bea1af4dbf..b39597318a 100644 --- a/py/emitpass1.c +++ b/py/emitpass1.c @@ -214,6 +214,10 @@ const emit_method_table_t emit_pass1_method_table = { (void*)emit_pass1_dummy, (void*)emit_pass1_dummy, (void*)emit_pass1_dummy, + + (void*)emit_pass1_dummy, + (void*)emit_pass1_dummy, + #if MICROPY_EMIT_CPYTHON (void*)emit_pass1_dummy, (void*)emit_pass1_dummy, diff --git a/py/lexer.c b/py/lexer.c index 5d1113fb30..8732d64362 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -502,19 +502,32 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs case 'v': c = 0x0b; break; case 'f': c = 0x0c; break; case 'r': c = 0x0d; break; + case 'u': + case 'U': + if (is_bytes) { + // b'\u1234' == b'\\u1234' + vstr_add_char(&lex->vstr, '\\'); + break; + } + // Otherwise fall through. case 'x': { uint num = 0; - if (!get_hex(lex, 2, &num)) { + if (!get_hex(lex, (c == 'x' ? 2 : c == 'u' ? 4 : 8), &num)) { // TODO error message assert(0); } c = num; break; } - case 'N': break; // TODO \N{name} only in strings - case 'u': break; // TODO \uxxxx only in strings - case 'U': break; // TODO \Uxxxxxxxx only in strings + case 'N': + // Supporting '\N{LATIN SMALL LETTER A}' == 'a' would require keeping the + // entire Unicode name table in the core. As of Unicode 6.3.0, that's nearly + // 3MB of text; even gzip-compressed and with minimal structure, it'll take + // roughly half a meg of storage. This form of Unicode escape may be added + // later on, but it's definitely not a priority right now. -- CJA 20140607 + assert(!"Unicode name escapes not supported"); + break; default: if (c >= '0' && c <= '7') { // Octal sequence, 1-3 chars @@ -533,7 +546,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs } } if (c != MP_LEXER_CHAR_EOF) { - vstr_add_char(&lex->vstr, c); + if (c < 0x110000 && !is_bytes) { + vstr_add_char(&lex->vstr, c); + } else if (c < 0x100 && is_bytes) { + vstr_add_byte(&lex->vstr, c); + } else { + assert(!"TODO: Throw an error, invalid escape code probably"); + } } } else { vstr_add_char(&lex->vstr, CUR_CHAR(lex)); @@ -100,7 +100,9 @@ bool unichar_isupper(unichar c); bool unichar_islower(unichar c); unichar unichar_tolower(unichar c); unichar unichar_toupper(unichar c); -#define unichar_charlen(s, bytelen) (bytelen) +machine_uint_t unichar_charlen(const char *str, machine_uint_t len); +#define UTF8_IS_NONASCII(ch) ((ch) & 0x80) +#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80) /** variable string *********************************************/ @@ -164,4 +166,18 @@ int DEBUG_printf(const char *fmt, ...); extern uint mp_verbose_flag; +// This is useful for unicode handling. Some CPU archs has +// special instructions for efficient implentation of this +// function (e.g. CLZ on ARM). +// NOTE: this function is unused at the moment +#ifndef count_lead_ones +static inline uint count_lead_ones(byte val) { + uint c = 0; + for (byte mask = 0x80; val & mask; mask >>= 1) { + c++; + } + return c; +} +#endif + #endif // _INCLUDED_MINILIB_H diff --git a/py/mpconfig.h b/py/mpconfig.h index 5b27b910e7..3a9d342ea3 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -249,6 +249,11 @@ typedef double mp_float_t; /*****************************************************************************/ /* Fine control over Python builtins, classes, modules, etc */ +// Whether str object is proper unicode +#ifndef MICROPY_PY_BUILTINS_STR_UNICODE +#define MICROPY_PY_BUILTINS_STR_UNICODE (0) +#endif + // Whether to support bytearray object #ifndef MICROPY_PY_BUILTINS_BYTEARRAY #define MICROPY_PY_BUILTINS_BYTEARRAY (1) @@ -399,3 +404,8 @@ typedef double mp_float_t; #ifndef NORETURN #define NORETURN __attribute__((noreturn)) #endif + +// Modifier for weak functions +#ifndef MP_WEAK +#define MP_WEAK __attribute__((weak)) +#endif @@ -61,7 +61,7 @@ void printf_wrapper(void *env, const char *fmt, ...) { void mp_obj_print_helper(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t o_in, mp_print_kind_t kind) { // There can be data structures nested too deep, or just recursive - STACK_CHECK(); + MP_STACK_CHECK(); #if !NDEBUG if (o_in == NULL) { print(env, "(nil)"); @@ -357,7 +357,12 @@ uint mp_get_index(const mp_obj_type_t *type, machine_uint_t len, mp_obj_t index, // may return MP_OBJ_NULL mp_obj_t mp_obj_len_maybe(mp_obj_t o_in) { - if (MP_OBJ_IS_STR(o_in) || MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) { + if ( +#if !MICROPY_PY_BUILTINS_STR_UNICODE + // It's simple - unicode is slow, non-unicode is fast + MP_OBJ_IS_STR(o_in) || +#endif + MP_OBJ_IS_TYPE(o_in, &mp_type_bytes)) { return MP_OBJ_NEW_SMALL_INT((machine_int_t)mp_obj_str_get_len(o_in)); } else { mp_obj_type_t *type = mp_obj_get_type(o_in); diff --git a/py/objfun.c b/py/objfun.c index f75e9142a2..74e959f9d3 100644 --- a/py/objfun.c +++ b/py/objfun.c @@ -356,7 +356,7 @@ continue2:; STATIC mp_obj_t fun_bc_call(mp_obj_t self_in, uint n_args, uint n_kw, const mp_obj_t *args) { - STACK_CHECK(); + MP_STACK_CHECK(); DEBUG_printf("Input n_args: %d, n_kw: %d\n", n_args, n_kw); DEBUG_printf("Input pos args: "); diff --git a/py/objstr.c b/py/objstr.c index c84d7c900d..b13517b63d 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -32,6 +32,7 @@ #include "mpconfig.h" #include "nlr.h" #include "misc.h" +#include "unicode.h" #include "qstr.h" #include "obj.h" #include "runtime0.h" @@ -43,16 +44,7 @@ STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, uint n_args, const mp_obj_t *args, mp_obj_t dict); const mp_obj_t mp_const_empty_bytes; -// use this macro to extract the string hash -#define GET_STR_HASH(str_obj_in, str_hash) uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; } - -// use this macro to extract the string length -#define GET_STR_LEN(str_obj_in, str_len) uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; } - -// use this macro to extract the string data and length -#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; } - -STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str); STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in); STATIC NORETURN void arg_type_mixup(); @@ -259,7 +251,7 @@ STATIC const byte *find_subbytes(const byte *haystack, machine_uint_t hlen, cons return NULL; } -STATIC mp_obj_t str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) { +mp_obj_t mp_obj_str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in) { GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len); mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in); mp_obj_type_t *rhs_type = mp_obj_get_type(rhs_in); @@ -352,11 +344,14 @@ uncomparable: return MP_OBJ_NULL; // op not supported } +#if !MICROPY_PY_BUILTINS_STR_UNICODE +// objstrunicode defines own version const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, mp_obj_t index, bool is_slice) { machine_uint_t index_val = mp_get_index(type, self_len, index, is_slice); return self_data + index_val; } +#endif STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { mp_obj_type_t *type = mp_obj_get_type(self_in); @@ -571,7 +566,6 @@ STATIC mp_obj_t str_rsplit(uint n_args, const mp_obj_t *args) { return res; } - STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t direction, bool is_index) { const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); assert(2 <= n_args && n_args <= 4); @@ -600,6 +594,11 @@ STATIC mp_obj_t str_finder(uint n_args, const mp_obj_t *args, machine_int_t dire } } else { // found + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (self_type == &mp_type_str) { + return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p)); + } + #endif return MP_OBJ_NEW_SMALL_INT(p - haystack); } } @@ -1610,7 +1609,7 @@ STATIC mp_obj_t str_encode(uint n_args, const mp_obj_t *args) { } #endif -STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags) { +machine_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags) { if (flags == MP_BUFFER_READ) { GET_STR_DATA_LEN(self_in, str_data, str_len); bufinfo->buf = (void*)str_data; @@ -1627,38 +1626,45 @@ STATIC machine_int_t str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, } #if MICROPY_CPYTHON_COMPAT -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode); #endif -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, mp_obj_str_format); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace); -STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition); -STATIC MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper); -STATIC MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex); +MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, str_split); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); +MP_DEFINE_CONST_FUN_OBJ_VAR(str_format_obj, 1, mp_obj_str_format); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace); +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count); +MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition); +MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition); +MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower); +MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper); +MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace); +MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha); +MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit); +MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper); +MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower); STATIC const mp_map_elem_t str_locals_dict_table[] = { #if MICROPY_CPYTHON_COMPAT { MP_OBJ_NEW_QSTR(MP_QSTR_decode), (mp_obj_t)&bytes_decode_obj }, + #if !MICROPY_PY_BUILTINS_STR_UNICODE + // If we have separate unicode type, then here we have methods only + // for bytes type, and it should not have encode() methods. Otherwise, + // we have non-compliant-but-practical bytestring type, which shares + // method table with bytes, so they both have encode() and decode() + // methods (which should do type checking at runtime). { MP_OBJ_NEW_QSTR(MP_QSTR_encode), (mp_obj_t)&str_encode_obj }, + #endif #endif { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj }, { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj }, @@ -1688,17 +1694,19 @@ STATIC const mp_map_elem_t str_locals_dict_table[] = { STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table); +#if !MICROPY_PY_BUILTINS_STR_UNICODE const mp_obj_type_t mp_type_str = { { &mp_type_type }, .name = MP_QSTR_str, .print = str_print, .make_new = str_make_new, - .binary_op = str_binary_op, + .binary_op = mp_obj_str_binary_op, .subscr = str_subscr, .getiter = mp_obj_new_str_iterator, - .buffer_p = { .get_buffer = str_get_buffer }, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, .locals_dict = (mp_obj_t)&str_locals_dict, }; +#endif // Reuses most of methods from str const mp_obj_type_t mp_type_bytes = { @@ -1706,10 +1714,10 @@ const mp_obj_type_t mp_type_bytes = { .name = MP_QSTR_bytes, .print = str_print, .make_new = bytes_make_new, - .binary_op = str_binary_op, + .binary_op = mp_obj_str_binary_op, .subscr = str_subscr, .getiter = mp_obj_new_bytes_iterator, - .buffer_p = { .get_buffer = str_get_buffer }, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, .locals_dict = (mp_obj_t)&str_locals_dict, }; @@ -1866,6 +1874,7 @@ typedef struct _mp_obj_str_it_t { machine_uint_t cur; } mp_obj_str_it_t; +#if !MICROPY_PY_BUILTINS_STR_UNICODE STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { mp_obj_str_it_t *self = self_in; GET_STR_DATA_LEN(self->str, str, len); @@ -1885,6 +1894,15 @@ STATIC const mp_obj_type_t mp_type_str_it = { .iternext = str_it_iternext, }; +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { + mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); + o->base.type = &mp_type_str_it; + o->str = str; + o->cur = 0; + return o; +} +#endif + STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) { mp_obj_str_it_t *self = self_in; GET_STR_DATA_LEN(self->str, str, len); @@ -1904,14 +1922,6 @@ STATIC const mp_obj_type_t mp_type_bytes_it = { .iternext = bytes_it_iternext, }; -mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { - mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); - o->base.type = &mp_type_str_it; - o->str = str; - o->cur = 0; - return o; -} - mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str) { mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); o->base.type = &mp_type_bytes_it; diff --git a/py/objstr.h b/py/objstr.h index 5be137d36d..515890c6e1 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -35,5 +35,53 @@ typedef struct _mp_obj_str_t { #define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, (const byte*)str}; +// use this macro to extract the string hash +#define GET_STR_HASH(str_obj_in, str_hash) \ + uint str_hash; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_hash = qstr_hash(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_hash = ((mp_obj_str_t*)str_obj_in)->hash; } + +// use this macro to extract the string length +#define GET_STR_LEN(str_obj_in, str_len) \ + uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_len = qstr_len(MP_OBJ_QSTR_VALUE(str_obj_in)); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; } + +// use this macro to extract the string data and length +#define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) \ + const byte *str_data; uint str_len; if (MP_OBJ_IS_QSTR(str_obj_in)) \ + { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len); } \ + else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; } + mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args); mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len); + +mp_obj_t mp_obj_str_binary_op(int op, mp_obj_t lhs_in, mp_obj_t rhs_in); +machine_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, int flags); + +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice); + +MP_DECLARE_CONST_FUN_OBJ(str_encode_obj); +MP_DECLARE_CONST_FUN_OBJ(str_find_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rfind_obj); +MP_DECLARE_CONST_FUN_OBJ(str_index_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rindex_obj); +MP_DECLARE_CONST_FUN_OBJ(str_join_obj); +MP_DECLARE_CONST_FUN_OBJ(str_split_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rsplit_obj); +MP_DECLARE_CONST_FUN_OBJ(str_startswith_obj); +MP_DECLARE_CONST_FUN_OBJ(str_endswith_obj); +MP_DECLARE_CONST_FUN_OBJ(str_strip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_lstrip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rstrip_obj); +MP_DECLARE_CONST_FUN_OBJ(str_format_obj); +MP_DECLARE_CONST_FUN_OBJ(str_replace_obj); +MP_DECLARE_CONST_FUN_OBJ(str_count_obj); +MP_DECLARE_CONST_FUN_OBJ(str_partition_obj); +MP_DECLARE_CONST_FUN_OBJ(str_rpartition_obj); +MP_DECLARE_CONST_FUN_OBJ(str_lower_obj); +MP_DECLARE_CONST_FUN_OBJ(str_upper_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isspace_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isalpha_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isdigit_obj); +MP_DECLARE_CONST_FUN_OBJ(str_isupper_obj); +MP_DECLARE_CONST_FUN_OBJ(str_islower_obj); diff --git a/py/objstrunicode.c b/py/objstrunicode.c new file mode 100644 index 0000000000..d96ce0a552 --- /dev/null +++ b/py/objstrunicode.c @@ -0,0 +1,359 @@ +/* + * This file is part of the Micro Python project, http://micropython.org/ + * + * The MIT License (MIT) + * + * Copyright (c) 2013, 2014 Damien P. George + * Copyright (c) 2014 Paul Sokolovsky + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdbool.h> +#include <string.h> +#include <assert.h> + +#include "mpconfig.h" +#include "nlr.h" +#include "misc.h" +#include "qstr.h" +#include "obj.h" +#include "runtime0.h" +#include "runtime.h" +#include "pfenv.h" +#include "objstr.h" +#include "objlist.h" + +#if MICROPY_PY_BUILTINS_STR_UNICODE + +STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str); + +/******************************************************************************/ +/* str */ + +STATIC void uni_print_quoted(void (*print)(void *env, const char *fmt, ...), void *env, const byte *str_data, uint str_len) { + // this escapes characters, but it will be very slow to print (calling print many times) + bool has_single_quote = false; + bool has_double_quote = false; + for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) { + if (*s == '\'') { + has_single_quote = true; + } else if (*s == '"') { + has_double_quote = true; + } + } + int quote_char = '\''; + if (has_single_quote && !has_double_quote) { + quote_char = '"'; + } + print(env, "%c", quote_char); + const byte *s = str_data, *top = str_data + str_len; + while (s < top) { + unichar ch; + ch = utf8_get_char(s); + s = utf8_next_char(s); + if (ch == quote_char) { + print(env, "\\%c", quote_char); + } else if (ch == '\\') { + print(env, "\\\\"); + } else if (32 <= ch && ch <= 126) { + print(env, "%c", ch); + } else if (ch == '\n') { + print(env, "\\n"); + } else if (ch == '\r') { + print(env, "\\r"); + } else if (ch == '\t') { + print(env, "\\t"); + } else if (ch < 0x100) { + print(env, "\\x%02x", ch); + } else if (ch < 0x10000) { + print(env, "\\u%04x", ch); + } else { + print(env, "\\U%08x", ch); + } + } + print(env, "%c", quote_char); +} + +STATIC void uni_print(void (*print)(void *env, const char *fmt, ...), void *env, mp_obj_t self_in, mp_print_kind_t kind) { + GET_STR_DATA_LEN(self_in, str_data, str_len); + if (kind == PRINT_STR) { + print(env, "%.*s", str_len, str_data); + } else { + uni_print_quoted(print, env, str_data, str_len); + } +} + +STATIC mp_obj_t uni_unary_op(int op, mp_obj_t self_in) { + GET_STR_DATA_LEN(self_in, str_data, str_len); + switch (op) { + case MP_UNARY_OP_BOOL: + return MP_BOOL(str_len != 0); + case MP_UNARY_OP_LEN: + return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char *)str_data, str_len)); + default: + return MP_OBJ_NULL; // op not supported + } +} + +STATIC mp_obj_t str_make_new(mp_obj_t type_in, uint n_args, uint n_kw, const mp_obj_t *args) { +#if MICROPY_CPYTHON_COMPAT + if (n_kw != 0) { + mp_arg_error_unimpl_kw(); + } +#endif + + switch (n_args) { + case 0: + return MP_OBJ_NEW_QSTR(MP_QSTR_); + + case 1: + { + vstr_t *vstr = vstr_new(); + mp_obj_print_helper((void (*)(void*, const char*, ...))vstr_printf, vstr, args[0], PRINT_STR); + mp_obj_t s = mp_obj_new_str(vstr->buf, vstr->len, false); + vstr_free(vstr); + return s; + } + + case 2: + case 3: + { + // TODO: validate 2nd/3rd args + if (!MP_OBJ_IS_TYPE(args[0], &mp_type_bytes)) { + nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "bytes expected")); + } + GET_STR_DATA_LEN(args[0], str_data, str_len); + GET_STR_HASH(args[0], str_hash); + mp_obj_str_t *o = mp_obj_new_str_of_type(&mp_type_str, NULL, str_len); + o->data = str_data; + o->hash = str_hash; + return o; + } + + default: + nlr_raise(mp_obj_new_exception_msg(&mp_type_TypeError, "str takes at most 3 arguments")); + } +} + +// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or +// be capped to the first/last character of the string, depending on is_slice. +const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, uint self_len, + mp_obj_t index, bool is_slice) { + machine_int_t i; + // Copied from mp_get_index; I don't want bounds checking, just give me + // the integer as-is. (I can't bounds-check without scanning the whole + // string; an out-of-bounds index will be caught in the loops below.) + if (MP_OBJ_IS_SMALL_INT(index)) { + i = MP_OBJ_SMALL_INT_VALUE(index); + } else if (!mp_obj_get_int_maybe(index, &i)) { + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index))); + } + const byte *s, *top = self_data + self_len; + if (i < 0) + { + // Negative indexing is performed by counting from the end of the string. + for (s = top - 1; i; --s) { + if (s < self_data) { + if (is_slice) { + return self_data; + } + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); + } + if (!UTF8_IS_CONT(*s)) { + ++i; + } + } + ++s; + } else if (!i) { + return self_data; // Shortcut - str[0] is its base pointer + } else { + // Positive indexing, correspondingly, counts from the start of the string. + // It's assumed that negative indexing will generally be used with small + // absolute values (eg str[-1], not str[-1000000]), which means it'll be + // more efficient this way. + for (s = self_data; true; ++s) { + if (s >= top) { + if (is_slice) { + return top; + } + nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); + } + while (UTF8_IS_CONT(*s)) { + ++s; + } + if (!i--) { + return s; + } + } + } + return s; +} + +STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { + mp_obj_type_t *type = mp_obj_get_type(self_in); + GET_STR_DATA_LEN(self_in, self_data, self_len); + if (value == MP_OBJ_SENTINEL) { + // load +#if MICROPY_PY_BUILTINS_SLICE + if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) { + mp_obj_t ostart, ostop, ostep; + mp_obj_slice_get(index, &ostart, &ostop, &ostep); + if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) { + nlr_raise(mp_obj_new_exception_msg(&mp_type_NotImplementedError, + "only slices with step=1 (aka None) are supported")); + } + + if (type == &mp_type_bytes) { + machine_int_t start = 0, stop = self_len; + if (ostart != mp_const_none) { + start = MP_OBJ_SMALL_INT_VALUE(ostart); + if (start < 0) { + start = self_len + start; + } + } + if (ostop != mp_const_none) { + stop = MP_OBJ_SMALL_INT_VALUE(ostop); + if (stop < 0) { + stop = self_len + stop; + } + } + return mp_obj_new_str_of_type(type, self_data + start, stop - start); + } + const byte *pstart, *pstop; + if (ostart != mp_const_none) { + pstart = str_index_to_ptr(type, self_data, self_len, ostart, true); + } else { + pstart = self_data; + } + if (ostop != mp_const_none) { + // pstop will point just after the stop character. This depends on + // the \0 at the end of the string. + pstop = str_index_to_ptr(type, self_data, self_len, ostop, true); + } else { + pstop = self_data + self_len; + } + if (pstop < pstart) { + return MP_OBJ_NEW_QSTR(MP_QSTR_); + } + return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart); + } +#endif + if (type == &mp_type_bytes) { + uint index_val = mp_get_index(type, self_len, index, false); + return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]); + } + const byte *s = str_index_to_ptr(type, self_data, self_len, index, false); + int len = 1; + if (UTF8_IS_NONASCII(*s)) { + // Count the number of 1 bits (after the first) + for (char mask = 0x40; *s & mask; mask >>= 1) { + ++len; + } + } + return mp_obj_new_str((const char*)s, len, true); // This will create a one-character string + } else { + return MP_OBJ_NULL; // op not supported + } +} + +STATIC const mp_map_elem_t str_locals_dict_table[] = { +#if MICROPY_CPYTHON_COMPAT + { MP_OBJ_NEW_QSTR(MP_QSTR_encode), (mp_obj_t)&str_encode_obj }, +#endif + { MP_OBJ_NEW_QSTR(MP_QSTR_find), (mp_obj_t)&str_find_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rfind), (mp_obj_t)&str_rfind_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_index), (mp_obj_t)&str_index_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rindex), (mp_obj_t)&str_rindex_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_join), (mp_obj_t)&str_join_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_split), (mp_obj_t)&str_split_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rsplit), (mp_obj_t)&str_rsplit_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_startswith), (mp_obj_t)&str_startswith_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_endswith), (mp_obj_t)&str_endswith_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_strip), (mp_obj_t)&str_strip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_lstrip), (mp_obj_t)&str_lstrip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rstrip), (mp_obj_t)&str_rstrip_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_format), (mp_obj_t)&str_format_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_replace), (mp_obj_t)&str_replace_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_count), (mp_obj_t)&str_count_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_partition), (mp_obj_t)&str_partition_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_rpartition), (mp_obj_t)&str_rpartition_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_lower), (mp_obj_t)&str_lower_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_upper), (mp_obj_t)&str_upper_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isspace), (mp_obj_t)&str_isspace_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isalpha), (mp_obj_t)&str_isalpha_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isdigit), (mp_obj_t)&str_isdigit_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_isupper), (mp_obj_t)&str_isupper_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_islower), (mp_obj_t)&str_islower_obj }, +}; + +STATIC MP_DEFINE_CONST_DICT(str_locals_dict, str_locals_dict_table); + +const mp_obj_type_t mp_type_str = { + { &mp_type_type }, + .name = MP_QSTR_str, + .print = uni_print, + .make_new = str_make_new, + .unary_op = uni_unary_op, + .binary_op = mp_obj_str_binary_op, + .subscr = str_subscr, + .getiter = mp_obj_new_str_iterator, + .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, + .locals_dict = (mp_obj_t)&str_locals_dict, +}; + +/******************************************************************************/ +/* str iterator */ + +typedef struct _mp_obj_str_it_t { + mp_obj_base_t base; + mp_obj_t str; + machine_uint_t cur; +} mp_obj_str_it_t; + +STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { + mp_obj_str_it_t *self = self_in; + GET_STR_DATA_LEN(self->str, str, len); + if (self->cur < len) { + const byte *cur = str + self->cur; + const byte *end = utf8_next_char(str + self->cur); + mp_obj_t o_out = mp_obj_new_str((const char*)cur, end - cur, true); + self->cur += end - cur; + return o_out; + } else { + return MP_OBJ_STOP_ITERATION; + } +} + +STATIC const mp_obj_type_t mp_type_str_it = { + { &mp_type_type }, + .name = MP_QSTR_iterator, + .getiter = mp_identity, + .iternext = str_it_iternext, +}; + +mp_obj_t mp_obj_new_str_iterator(mp_obj_t str) { + mp_obj_str_it_t *o = m_new_obj(mp_obj_str_it_t); + o->base.type = &mp_type_str_it; + o->str = str; + o->cur = 0; + return o; +} + +#endif // MICROPY_PY_BUILTINS_STR_UNICODE @@ -75,6 +75,7 @@ PY_O_BASENAME = \ objset.o \ objslice.o \ objstr.o \ + objstrunicode.o \ objstringio.o \ objtuple.o \ objtype.o \ diff --git a/py/runtime.c b/py/runtime.c index f08ff9ff40..5490bcbac5 100644 --- a/py/runtime.c +++ b/py/runtime.c @@ -70,7 +70,7 @@ const mp_obj_module_t mp_module___main__ = { }; void mp_init(void) { - stack_ctrl_init(); + mp_stack_ctrl_init(); // call port specific initialization if any #ifdef MICROPY_PORT_INIT_FUNC diff --git a/py/stackctrl.c b/py/stackctrl.c index aef9bad9e9..724d54a1be 100644 --- a/py/stackctrl.c +++ b/py/stackctrl.c @@ -35,12 +35,12 @@ // Stack top at the start of program char *stack_top; -void stack_ctrl_init() { +void mp_stack_ctrl_init() { volatile int stack_dummy; stack_top = (char*)&stack_dummy; } -uint stack_usage() { +uint mp_stack_usage() { // Assumes descending stack volatile int stack_dummy; return stack_top - (char*)&stack_dummy; @@ -48,14 +48,14 @@ uint stack_usage() { #if MICROPY_STACK_CHECK -uint stack_limit = 10240; +static uint stack_limit = 10240; -void stack_set_limit(uint limit) { +void mp_stack_set_limit(uint limit) { stack_limit = limit; } -void stack_check() { - if (stack_usage() >= stack_limit) { +void mp_stack_check() { + if (mp_stack_usage() >= stack_limit) { nlr_raise(mp_obj_new_exception_msg(&mp_type_RuntimeError, "maximum recursion depth exceeded")); } } diff --git a/py/stackctrl.h b/py/stackctrl.h index a9a8d2e2d8..92de882bfa 100644 --- a/py/stackctrl.h +++ b/py/stackctrl.h @@ -24,18 +24,18 @@ * THE SOFTWARE. */ -void stack_ctrl_init(); -uint stack_usage(); +void mp_stack_ctrl_init(); +uint mp_stack_usage(); #if MICROPY_STACK_CHECK -void stack_set_limit(uint limit); -void stack_check(); -#define STACK_CHECK() stack_check() +void mp_stack_set_limit(uint limit); +void mp_stack_check(); +#define MP_STACK_CHECK() mp_stack_check() #else -#define stack_set_limit(limit) -#define STACK_CHECK() +#define mp_stack_set_limit(limit) +#define MP_STACK_CHECK() #endif diff --git a/py/stream.c b/py/stream.c index 07a79248ab..cfdea15cca 100644 --- a/py/stream.c +++ b/py/stream.c @@ -33,9 +33,13 @@ #include "qstr.h" #include "obj.h" #include "objstr.h" +#include "runtime.h" #include "stream.h" #if MICROPY_STREAMS_NON_BLOCK #include <errno.h> +#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR) +#define EWOULDBLOCK 140 +#endif #endif // This file defines generic Python stream read/write methods which @@ -67,6 +71,13 @@ STATIC mp_obj_t stream_read(uint n_args, const mp_obj_t *args) { if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) { return stream_readall(args[0]); } + + #if MICROPY_PY_BUILTINS_STR_UNICODE + if (!o->type->stream_p->is_bytes) { + mp_not_implemented("Reading from unicode text streams by character count"); + } + #endif + byte *buf = m_new(byte, sz); int error; machine_int_t out_sz = o->type->stream_p->read(o, buf, sz, &error); diff --git a/py/unicode.c b/py/unicode.c index 88f835131d..d69e81c8e0 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -65,14 +65,65 @@ STATIC const uint8_t attr[] = { AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0 }; +// TODO: Rename to str_get_char unichar utf8_get_char(const byte *s) { +#if MICROPY_PY_BUILTINS_STR_UNICODE + unichar ord = *s++; + if (!UTF8_IS_NONASCII(ord)) return ord; + ord &= 0x7F; + for (unichar mask = 0x40; ord & mask; mask >>= 1) { + ord &= ~mask; + } + while (UTF8_IS_CONT(*s)) { + ord = (ord << 6) | (*s++ & 0x3F); + } + return ord; +#else return *s; +#endif } +// TODO: Rename to str_next_char const byte *utf8_next_char(const byte *s) { +#if MICROPY_PY_BUILTINS_STR_UNICODE + ++s; + while (UTF8_IS_CONT(*s)) { + ++s; + } + return s; +#else return s + 1; +#endif +} + +machine_uint_t utf8_ptr_to_index(const char *s, const char *ptr) { + machine_uint_t i = 0; + while (ptr > s) { + if (!UTF8_IS_CONT(*--ptr)) { + i++; + } + } + + return i; +} + +// TODO: Rename to str_charlen +machine_uint_t unichar_charlen(const char *str, machine_uint_t len) +{ +#if MICROPY_PY_BUILTINS_STR_UNICODE + machine_uint_t charlen = 0; + for (const char *top = str + len; str < top; ++str) { + if (!UTF8_IS_CONT(*str)) { + ++charlen; + } + } + return charlen; +#else + return len; +#endif } +// Be aware: These unichar_is* functions are actually ASCII-only! bool unichar_isspace(unichar c) { return c < 128 && (attr[c] & FL_SPACE) != 0; } diff --git a/py/unicode.h b/py/unicode.h new file mode 100644 index 0000000000..2468b2fecf --- /dev/null +++ b/py/unicode.h @@ -0,0 +1 @@ +machine_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr); @@ -199,12 +199,48 @@ void vstr_add_byte(vstr_t *vstr, byte b) { } void vstr_add_char(vstr_t *vstr, unichar c) { - // TODO UNICODE +#if MICROPY_PY_BUILTINS_STR_UNICODE + // TODO: Can this be simplified and deduplicated? + // Is it worth just calling vstr_add_len(vstr, 4)? + if (c < 0x80) { + byte *buf = (byte*)vstr_add_len(vstr, 1); + if (buf == NULL) { + return; + } + *buf = (byte)c; + } else if (c < 0x800) { + byte *buf = (byte*)vstr_add_len(vstr, 2); + if (buf == NULL) { + return; + } + buf[0] = (c >> 6) | 0xC0; + buf[1] = (c & 0x3F) | 0x80; + } else if (c < 0x10000) { + byte *buf = (byte*)vstr_add_len(vstr, 3); + if (buf == NULL) { + return; + } + buf[0] = (c >> 12) | 0xE0; + buf[1] = ((c >> 6) & 0x3F) | 0x80; + buf[2] = (c & 0x3F) | 0x80; + } else { + assert(c < 0x110000); + byte *buf = (byte*)vstr_add_len(vstr, 4); + if (buf == NULL) { + return; + } + buf[0] = (c >> 18) | 0xF0; + buf[1] = ((c >> 12) & 0x3F) | 0x80; + buf[2] = ((c >> 6) & 0x3F) | 0x80; + buf[3] = (c & 0x3F) | 0x80; + } +#else byte *buf = (byte*)vstr_add_len(vstr, 1); if (buf == NULL) { return; } buf[0] = c; +#endif } void vstr_add_str(vstr_t *vstr, const char *str) { |