summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r--py/lexer.c64
-rw-r--r--py/misc.h1
-rw-r--r--py/unicode.c4
-rw-r--r--tests/basics/string-escape.py11
4 files changed, 74 insertions, 6 deletions
diff --git a/py/lexer.c b/py/lexer.c
index daaeebf511..af413021b1 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -126,6 +126,10 @@ static bool is_following_digit(mp_lexer_t *lex) {
return unichar_isdigit(lex->chr1);
}
+static bool is_following_odigit(mp_lexer_t *lex) {
+ return lex->chr1 >= '0' && lex->chr1 <= '7'; // true when lex->chr1 is an octal digit ('0'..'7')
+}
+
// TODO UNICODE include unicode characters in definition of identifiers
static bool is_head_of_identifier(mp_lexer_t *lex) {
return is_letter(lex) || lex->chr0 == '_';
@@ -275,6 +279,32 @@ static const char *tok_kw[] = {
NULL,
};
+static int hex_digit(unichar c) {
+ // Returns the numeric value (0-15) of c; c is assumed to be a hex digit (not checked here)
+ int n = c - '0';
+ if (n > 9) { // above '9', so (given the assumption above) c is a letter a-f or A-F
+ n &= ~('a' - 'A'); // clear the case bit so 'a'..'f' yield the same value as 'A'..'F'
+ n -= ('A' - ('9' + 1)); // remove the gap between '9' and 'A' so that 'A' maps to 10
+ }
+ return n;
+}
+
+// Reads num_digits hex digits into *result. On entry CUR_CHAR() is the character before the first
+// hex digit; on success it is left on the last hex digit. Returns false at the first non-hex char.
+static bool get_hex(mp_lexer_t *lex, int num_digits, uint *result) {
+ uint num = 0;
+ while (num_digits-- != 0) {
+ next_char(lex);
+ unichar c = CUR_CHAR(lex);
+ if (!unichar_isxdigit(c)) {
+ return false; // *result is left unmodified on failure
+ }
+ num = (num << 4) + hex_digit(c); // shift in 4 bits per digit, most significant digit first
+ }
+ *result = num;
+ return true;
+}
+
static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
// skip white space and comments
bool had_physical_newline = false;
@@ -439,12 +469,34 @@ static void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
case 'v': c = 0x0b; break;
case 'f': c = 0x0c; break;
case 'r': c = 0x0d; break;
- // TODO \ooo octal
- case 'x': // TODO \xhh
- case 'N': // TODO \N{name} only in strings
- case 'u': // TODO \uxxxx only in strings
- case 'U': // TODO \Uxxxxxxxx only in strings
- default: break; // TODO error message
+ case 'x':
+ {
+ uint num;
+ if (!get_hex(lex, 2, &num)) {
+ // TODO error message
+ assert(0);
+ }
+ c = num;
+ break;
+ }
+ case 'N': break; // TODO \N{name} only in strings
+ case 'u': break; // TODO \uxxxx only in strings
+ case 'U': break; // TODO \Uxxxxxxxx only in strings
+ default:
+ if (c >= '0' && c <= '7') {
+ // Octal sequence, 1-3 chars
+ int digits = 3;
+ int num = c - '0';
+ while (is_following_odigit(lex) && --digits != 0) {
+ next_char(lex);
+ num = num * 8 + (CUR_CHAR(lex) - '0');
+ }
+ c = num;
+ } else {
+ // TODO error message
+ assert(0);
+ }
+ break;
}
if (c != MP_LEXER_CHAR_EOF) {
vstr_add_char(&lex->vstr, c);
diff --git a/py/misc.h b/py/misc.h
index 8756c25a07..52498c70bd 100644
--- a/py/misc.h
+++ b/py/misc.h
@@ -43,6 +43,7 @@ bool unichar_isspace(unichar c);
bool unichar_isalpha(unichar c);
bool unichar_isprint(unichar c);
bool unichar_isdigit(unichar c);
+bool unichar_isxdigit(unichar c);
/** string ******************************************************/
diff --git a/py/unicode.c b/py/unicode.c
index 58c860a0e4..52bc9b9f60 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -62,6 +62,10 @@ bool unichar_isdigit(unichar c) {
return c < 128 && (attr[c] & FL_DIGIT) != 0;
}
+bool unichar_isxdigit(unichar c) {
+ return unichar_isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); // digit per unichar_isdigit, or a letter a-f / A-F
+}
+
/*
bool char_is_alpha_or_digit(unichar c) {
return c < 128 && (attr[c] & (FL_ALPHA | FL_DIGIT)) != 0;
diff --git a/tests/basics/string-escape.py b/tests/basics/string-escape.py
new file mode 100644
index 0000000000..000a8713e6
--- /dev/null
+++ b/tests/basics/string-escape.py
@@ -0,0 +1,11 @@
+a = "a\1b"  # \1 is a one-digit octal escape
+print(len(a))  # 3
+print(ord(a[1]))  # 1
+print(len("a\123b"))  # 3: \123 is a single character
+a = "a\12345b"  # an octal escape takes at most 3 digits: \123 followed by "45"
+print(len(a))  # 5
+print(ord(a[1]))  # 83 (0o123)
+
+a = "a\xffb"  # \xhh is a two-digit hex escape
+print(len(a))  # 3
+print(ord(a[1]))  # 255