summaryrefslogtreecommitdiffstatshomepage
path: root/py/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'py/unicode.c')
-rw-r--r--py/unicode.c51
1 files changed, 51 insertions, 0 deletions
diff --git a/py/unicode.c b/py/unicode.c
index 88f835131d..a91e08078e 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -65,14 +65,65 @@ STATIC const uint8_t attr[] = {
AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
};
+// TODO: Rename to str_get_char
unichar utf8_get_char(const byte *s) {
+#if MICROPY_PY_BUILTINS_STR_UNICODE
+ unichar ord = *s++;
+ if (!UTF8_IS_NONASCII(ord)) return ord;
+ ord &= 0x7F;
+ for (unichar mask = 0x40; ord & mask; mask >>= 1) {
+ ord &= ~mask;
+ }
+ while (UTF8_IS_CONT(*s)) {
+ ord = (ord << 6) | (*s++ & 0x3F);
+ }
+ return ord;
+#else
return *s;
+#endif
}
+// TODO: Rename to str_next_char
const byte *utf8_next_char(const byte *s) {
+#if MICROPY_PY_BUILTINS_STR_UNICODE
+ ++s;
+ while (UTF8_IS_CONT(*s)) {
+ ++s;
+ }
+ return s;
+#else
return s + 1;
+#endif
+}
+
+machine_uint_t utf8_ptr_to_index(const char *s, const char *ptr) {
+ machine_uint_t i = 0;
+ while (ptr > s) {
+ if (!UTF8_IS_CONT(*--ptr)) {
+ i++;
+ }
+ }
+
+ return i;
+}
+
+// TODO: Rename to str_charlen; return machine_uint_t
+uint unichar_charlen(const char *str, uint len)
+{
+#if MICROPY_PY_BUILTINS_STR_UNICODE
+ uint charlen = 0;
+ for (const char *top = str + len; str < top; ++str) {
+ if (!UTF8_IS_CONT(*str)) {
+ ++charlen;
+ }
+ }
+ return charlen;
+#else
+ return len;
+#endif
}
+// Be aware: These unichar_is* functions are actually ASCII-only!
// Return true when c is below 128 and its entry in the attr table has the
// FL_SPACE flag set; any value of 128 or above yields false.
bool unichar_isspace(unichar c) {
    if (!(c < 128)) {
        // Outside the range the attr table is consulted for.
        return false;
    }
    return (attr[c] & FL_SPACE) != 0;
}