summary refs log tree commit diff stats homepage
diff options
context:
space:
mode:
-rw-r--r-- py/lexer.c 11
-rw-r--r-- tests/unicode/unicode_id.py 27
2 files changed, 32 insertions, 6 deletions
diff --git a/py/lexer.c b/py/lexer.c
index 12cb5ae5b2..97c84cf118 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
-// TODO UNICODE include unicode characters in definition of identifiers
+// to easily parse utf-8 identifiers we allow any raw byte with high bit set
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
- return is_letter(lex) || lex->chr0 == '_';
+ return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
}
-// TODO UNICODE include unicode characters in definition of identifiers
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
return is_head_of_identifier(lex) || is_digit(lex);
}
@@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
- // get first char
- vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+ // get first char (add as byte to remain 8-bit clean and support utf-8)
+ vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
// get tail chars
while (!is_end(lex) && is_tail_of_identifier(lex)) {
- vstr_add_char(&lex->vstr, CUR_CHAR(lex));
+ vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
}
diff --git a/tests/unicode/unicode_id.py b/tests/unicode/unicode_id.py
new file mode 100644
index 0000000000..10f540c503
--- /dev/null
+++ b/tests/unicode/unicode_id.py
@@ -0,0 +1,27 @@
+# test unicode in identifiers
+
+# comment
+# αβγδϵφζ
+
+# global identifiers
+α = 1
+αβγ = 2
+bβ = 3
+βb = 4
+print(α, αβγ, bβ, βb)
+
+# function, argument, local identifiers
+def α(β, γ):
+ δ = β + γ
+ print(β, γ, δ)
+α(1, 2)
+
+# class, method identifiers
+class φ:
+ def __init__(self):
+ pass
+ def δ(self, ϵ):
+ print(ϵ)
+zζzζz = φ()
+if hasattr(zζzζz, "δ"):
+ zζzζz.δ(ϵ=123)