Diffstat (limited to 'py')
-rw-r--r--  py/emitcpy.c     10
-rw-r--r--  py/lexer.c      212
-rw-r--r--  py/lexer.h       21
-rw-r--r--  py/lexerfile.c   23
4 files changed, 118 insertions, 148 deletions
diff --git a/py/emitcpy.c b/py/emitcpy.c
index 6e3543da31..089352c0fe 100644
--- a/py/emitcpy.c
+++ b/py/emitcpy.c
@@ -211,7 +211,6 @@ static void emit_cpy_load_const_verbatim_strn(emit_t *emit, const char *str, int
}
static void emit_cpy_load_const_verbatim_quoted_str(emit_t *emit, qstr qstr, bool bytes) {
- // TODO strings should be escaped before we get here
if (emit->pass == PASS_3) {
const char *str = qstr_str(qstr);
int len = strlen(str);
@@ -237,13 +236,8 @@ static void emit_cpy_load_const_verbatim_quoted_str(emit_t *emit, qstr qstr, boo
for (int i = 0; i < len; i++) {
if (str[i] == '\n') {
printf("\\n");
- } else if (str[i] == '\\' && str[i + 1] == '\'') {
- i += 1;
- if (quote_single) {
- printf("\\'");
- } else {
- printf("'");
- }
+ } else if (str[i] == '\\') {
+ printf("\\\\");
} else if (str[i] == '\'' && quote_single) {
printf("\\'");
} else {
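Note on the emitcpy.c hunk above: since the lexer now decodes escape sequences into raw bytes before interning (see the lexer.c changes below), the emitter must re-escape every backslash unconditionally when printing, rather than special-casing the two-character \' sequence. A minimal standalone sketch of the resulting round-trip, with print_quoted as a hypothetical stand-in for the emit logic (not part of this patch):

#include <stdio.h>

// hypothetical stand-in for the post-patch emit logic: raw bytes in,
// re-escaped CPython-style literal out (single-quoted case only)
static void print_quoted(const char *str, int len) {
    printf("'");
    for (int i = 0; i < len; i++) {
        if (str[i] == '\n') {
            printf("\\n");      // raw newline byte -> \n
        } else if (str[i] == '\\') {
            printf("\\\\");     // raw backslash byte -> \\
        } else if (str[i] == '\'') {
            printf("\\'");      // quote needs escaping inside '...'
        } else {
            printf("%c", str[i]);
        }
    }
    printf("'\n");
}

int main(void) {
    const char decoded[] = {'a', '\n', 'b'};  // lexer output for source 'a\nb'
    print_quoted(decoded, 3);                 // prints 'a\nb' again
    return 0;
}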
diff --git a/py/lexer.c b/py/lexer.c
index 7167b93276..56f1ed0df4 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -9,48 +9,43 @@
#include "lexer.h"
#define TAB_SIZE (8)
-#define CHR_EOF (-1)
struct _py_lexer_t {
- const char *name; // (file) name of source
- bool free; // free source when done with it
+ const char *name; // name of source
+ void *stream_data; // data for stream
+ py_lexer_stream_next_char_t stream_next_char; // stream callback to get next char
+ py_lexer_stream_free_t stream_free; // stream callback to free
- const char *src_beg; // beginning of source
- const char *src_cur; // current location in source; points to chr0
- const char *src_end; // end (exclusive) of source
- unichar chr0, chr1, chr2; // current characters from source
+ unichar chr0, chr1, chr2; // current cached characters from source
uint line; // source line
uint column; // source column
- uint cont_line; // continued line
-
- int emit_dent;
- int nested_bracket_level;
+ int emit_dent; // non-zero when there are INDENT/DEDENT tokens to emit
+ int nested_bracket_level; // >0 when there are nested brackets over multiple lines
uint alloc_indent_level;
uint num_indent_level;
uint16_t *indent_level;
+ vstr_t vstr;
py_token_t tok_cur;
- py_token_t tok_next;
};
-static bool py_token_is_str(const py_token_t *tok, const char *str) {
+bool str_strn_equal(const char *str, const char *strn, int len) {
uint i = 0;
- const char *tstr = tok->str;
- while (i < tok->len && *tstr == *str) {
+ while (i < len && *str == *strn) {
++i;
- ++tstr;
++str;
+ ++strn;
}
- return i == tok->len && *str == 0;
+ return i == len && *str == 0;
}
void py_token_show(const py_token_t *tok) {
- printf("(%s:%d:%d) kind:%d cont_line:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->cont_line, tok->str, tok->len);
+ printf("(%s:%d:%d) kind:%d str:%p len:%d", tok->src_name, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
if (tok->str != NULL && tok->len > 0) {
const char *i = tok->str;
const char *j = i + tok->len;
@@ -77,8 +72,10 @@ bool py_token_show_error(const py_token_t *tok, const char *msg) {
return false;
}
+#define CUR_CHAR(lex) ((lex)->chr0)
+
static bool is_end(py_lexer_t *lex) {
- return lex->chr0 == CHR_EOF;
+ return lex->chr0 == PY_LEXER_CHAR_EOF;
}
static bool is_physical_newline(py_lexer_t *lex) {
@@ -142,7 +139,7 @@ static bool is_tail_of_identifier(py_lexer_t *lex) {
}
static void next_char(py_lexer_t *lex) {
- if (lex->chr0 == CHR_EOF) {
+ if (lex->chr0 == PY_LEXER_CHAR_EOF) {
return;
}
@@ -152,12 +149,10 @@ static void next_char(py_lexer_t *lex) {
// LF is a new line
++lex->line;
lex->column = 1;
- lex->cont_line = lex->line;
} else if (lex->chr0 == '\r') {
// CR is a new line
++lex->line;
lex->column = 1;
- lex->cont_line = lex->line;
if (lex->chr1 == '\n') {
// CR LF is a single new line
advance = 2;
@@ -173,15 +168,11 @@ static void next_char(py_lexer_t *lex) {
for (; advance > 0; advance--) {
lex->chr0 = lex->chr1;
lex->chr1 = lex->chr2;
- lex->src_cur++;
- if (lex->src_cur + 2 < lex->src_end) {
- lex->chr2 = lex->src_cur[2];
- } else {
+ lex->chr2 = lex->stream_next_char(lex->stream_data);
+ if (lex->chr2 == PY_LEXER_CHAR_EOF) {
// EOF
- if (lex->chr1 != CHR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
+ if (lex->chr1 != PY_LEXER_CHAR_EOF && lex->chr1 != '\n' && lex->chr1 != '\r') {
lex->chr2 = '\n'; // insert newline at end of file
- } else {
- lex->chr2 = CHR_EOF;
}
}
}
@@ -286,9 +277,9 @@ static const char *tok_kw[] = {
NULL,
};
-static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
+static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok, bool first_token) {
+ // skip white space and comments
bool had_physical_newline = false;
-
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
had_physical_newline = true;
@@ -315,15 +306,22 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
}
}
+ // set token source information
tok->src_name = lex->name;
tok->src_line = lex->line;
tok->src_column = lex->column;
- tok->kind = PY_TOKEN_INVALID;
- tok->cont_line = lex->cont_line;
- tok->str = lex->src_cur;
- tok->len = 0;
- if (lex->emit_dent < 0) {
+ // start new token text
+ vstr_reset(&lex->vstr);
+
+ if (first_token && lex->line == 1 && lex->column != 1) {
+ // check that the first token is in the first column
+ // if first token is not on first line, we get a physical newline and
+ // this check is done as part of normal indent/dedent checking below
+ // (done to get equivalence with CPython)
+ tok->kind = PY_TOKEN_INDENT;
+
+ } else if (lex->emit_dent < 0) {
tok->kind = PY_TOKEN_DEDENT;
lex->emit_dent += 1;
@@ -414,19 +412,42 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
num_quotes = 1;
}
- // set start of token
- tok->str = lex->src_cur;
-
// parse the literal
- // TODO proper escaping
int n_closing = 0;
while (!is_end(lex) && (num_quotes > 1 || !is_char(lex, '\n')) && n_closing < num_quotes) {
if (is_char(lex, quote_char)) {
n_closing += 1;
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
} else {
n_closing = 0;
if (!is_raw && is_char(lex, '\\')) {
next_char(lex);
+ unichar c = CUR_CHAR(lex);
+ switch (c) {
+ case PY_LEXER_CHAR_EOF: break; // TODO a proper error message?
+ case '\n': c = PY_LEXER_CHAR_EOF; break; // TODO check this works correctly (we are supposed to ignore the backslash-newline)
+ case '\\': break;
+ case '\'': break;
+ case '"': break;
+ case 'a': c = 0x07; break;
+ case 'b': c = 0x08; break;
+ case 't': c = 0x09; break;
+ case 'n': c = 0x0a; break;
+ case 'v': c = 0x0b; break;
+ case 'f': c = 0x0c; break;
+ case 'r': c = 0x0d; break;
+ // TODO \ooo octal
+ case 'x': // TODO \xhh
+ case 'N': // TODO \N{name} only in strings
+ case 'u': // TODO \uxxxx only in strings
+ case 'U': // TODO \Uxxxxxxxx only in strings
+ default: break; // TODO error message
+ }
+ if (c != PY_LEXER_CHAR_EOF) {
+ vstr_add_char(&lex->vstr, c);
+ }
+ } else {
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
}
}
next_char(lex);
@@ -437,33 +458,40 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
tok->kind = PY_TOKEN_LONELY_STRING_OPEN;
}
- // set token string (byte) length
- tok->len = lex->src_cur - tok->str - n_closing;
-
- // we set the length, return now so it's not set incorrectly below
- return;
+ // cut off the end quotes from the token text
+ vstr_cut_tail(&lex->vstr, n_closing);
} else if (is_head_of_identifier(lex)) {
tok->kind = PY_TOKEN_NAME;
+ // get first char
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
+ // get tail chars
while (!is_end(lex) && is_tail_of_identifier(lex)) {
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
}
} else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
tok->kind = PY_TOKEN_NUMBER;
+ // get first char
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
+ // get tail chars
while (!is_end(lex)) {
if (is_char_or(lex, 'e', 'E')) {
+ vstr_add_char(&lex->vstr, 'e');
next_char(lex);
if (is_char(lex, '+') || is_char(lex, '-')) {
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
}
} else if (is_letter(lex) || is_digit(lex) || is_char_or(lex, '_', '.')) {
+ vstr_add_char(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
} else {
break;
@@ -546,13 +574,14 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
}
}
- // set token string (byte) length
- tok->len = lex->src_cur - tok->str;
+ // point token text to vstr buffer
+ tok->str = vstr_str(&lex->vstr);
+ tok->len = vstr_len(&lex->vstr);
- // check for keywords (must be done after setting token string length)
+ // check for keywords
if (tok->kind == PY_TOKEN_NAME) {
for (int i = 0; tok_kw[i] != NULL; i++) {
- if (py_token_is_str(tok, tok_kw[i])) {
+ if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
tok->kind = PY_TOKEN_KW_FALSE + i;
break;
}
@@ -560,83 +589,58 @@ static void py_lexer_next_token_into(py_lexer_t *lex, py_token_t *tok) {
}
}
-py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint len, bool free_str) {
- py_lexer_t *lex;
+py_lexer_t *py_lexer_new(const char *src_name, void *stream_data, py_lexer_stream_next_char_t stream_next_char, py_lexer_stream_free_t stream_free) {
+ py_lexer_t *lex = m_new(py_lexer_t, 1);
- lex = m_new(py_lexer_t, 1);
-
- //lex->name = g_strdup(src_name); // TODO
- lex->name = src_name;
- lex->free = free_str;
- lex->src_beg = str;
- lex->src_cur = str;
- lex->src_end = str + len;
+ lex->name = src_name; // TODO do we need to strdup this?
+ lex->stream_data = stream_data;
+ lex->stream_next_char = stream_next_char;
+ lex->stream_free = stream_free;
lex->line = 1;
lex->column = 1;
- lex->cont_line = lex->line;
lex->emit_dent = 0;
lex->nested_bracket_level = 0;
lex->alloc_indent_level = 16;
lex->num_indent_level = 1;
lex->indent_level = m_new(uint16_t, lex->alloc_indent_level);
lex->indent_level[0] = 0;
+ vstr_init(&lex->vstr);
// preload characters
- // TODO unicode
- if (len == 0) {
- lex->chr0 = '\n'; // insert newline at end of file
- lex->chr1 = CHR_EOF;
- lex->chr2 = CHR_EOF;
- } else if (len == 1) {
- lex->chr0 = str[0];
+ lex->chr0 = stream_next_char(stream_data);
+ lex->chr1 = stream_next_char(stream_data);
+ lex->chr2 = stream_next_char(stream_data);
+
+ // if input stream is 0, 1 or 2 characters long and doesn't end in a newline, then insert a newline at the end
+ if (lex->chr0 == PY_LEXER_CHAR_EOF) {
+ lex->chr0 = '\n';
+ } else if (lex->chr1 == PY_LEXER_CHAR_EOF) {
if (lex->chr0 != '\n' && lex->chr0 != '\r') {
- lex->chr1 = '\n'; // insert newline at end of file
- } else {
- lex->chr1 = CHR_EOF;
+ lex->chr1 = '\n';
}
- lex->chr2 = CHR_EOF;
- } else if (len == 2) {
- lex->chr0 = str[0];
- lex->chr1 = str[1];
+ } else if (lex->chr2 == PY_LEXER_CHAR_EOF) {
if (lex->chr1 != '\n' && lex->chr1 != '\r') {
- lex->chr2 = '\n'; // insert newline at end of file
- } else {
- lex->chr2 = CHR_EOF;
+ lex->chr2 = '\n';
}
- } else {
- lex->chr0 = str[0];
- lex->chr1 = str[1];
- lex->chr2 = str[2];
}
- py_lexer_next_token_into(lex, &lex->tok_cur);
-
- // check that the first token is in the first column
- // (done to get equivalence with CPython)
- if (lex->tok_cur.src_line == 1 && lex->tok_cur.src_column != 1) {
- lex->tok_next = lex->tok_cur;
- lex->tok_cur.kind = PY_TOKEN_INDENT;
- } else {
- py_lexer_next_token_into(lex, &lex->tok_next);
- }
+ // preload first token
+ py_lexer_next_token_into(lex, &lex->tok_cur, true);
return lex;
}
void py_lexer_free(py_lexer_t *lex) {
- if (lex == NULL) {
- return;
- }
- //m_free(lex->name);
- if (lex->free) {
- m_free((char*)lex->src_beg);
+ if (lex) {
+ if (lex->stream_free) {
+ lex->stream_free(lex->stream_data);
+ }
+ m_free(lex);
}
- m_free(lex);
}
void py_lexer_to_next(py_lexer_t *lex) {
- lex->tok_cur = lex->tok_next;
- py_lexer_next_token_into(lex, &lex->tok_next);
+ py_lexer_next_token_into(lex, &lex->tok_cur, false);
}
const py_token_t *py_lexer_cur(const py_lexer_t *lex) {
@@ -652,14 +656,6 @@ bool py_lexer_is_str(py_lexer_t *lex, const char *str) {
return py_token_is_str(&lex->tok_cur, str);
}
-bool py_lexer_is_next_kind(py_lexer_t *lex, py_token_kind_t kind) {
- return lex->tok_next.kind == kind;
-}
-
-bool py_lexer_is_next_str(py_lexer_t *lex, const char *str) {
- return py_token_is_str(&lex->tok_next, str);
-}
-
bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind) {
if (py_lexer_is_kind(lex, kind)) {
py_lexer_to_next(lex);
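The removed py_lexer_from_str_len constructor can be rebuilt on top of the new callback-based py_lexer_new. A sketch of a string-backed stream satisfying the py_lexer_stream_next_char_t contract (str_stream_t and these helpers are illustrative, not part of this patch; unichar and uint are assumed to come from misc.h as elsewhere in this tree, and the old free_str flag is omitted for brevity):

#include <stdlib.h>

#include "misc.h"
#include "lexer.h"

typedef struct {
    const char *cur;
    const char *end;
} str_stream_t;

// py_lexer_stream_next_char_t: yields one char per call and keeps
// returning PY_LEXER_CHAR_EOF once the buffer is exhausted
static unichar str_stream_next_char(void *data) {
    str_stream_t *ss = data;
    if (ss->cur >= ss->end) {
        return PY_LEXER_CHAR_EOF;
    }
    return *ss->cur++;
}

// py_lexer_stream_free_t: release the stream state
static void str_stream_free(void *data) {
    free(data);
}

py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint len) {
    str_stream_t *ss = malloc(sizeof(str_stream_t));
    ss->cur = str;
    ss->end = str + len;
    return py_lexer_new(src_name, ss, str_stream_next_char, str_stream_free);
}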
diff --git a/py/lexer.h b/py/lexer.h
index 948901259d..889a55e2bd 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -108,32 +108,35 @@ typedef enum _py_token_kind_t {
} py_token_kind_t;
typedef struct _py_token_t {
- const char *src_name; // (file) name of source
- uint src_line; // actual source line
- uint src_column; // actual source column
+ const char *src_name; // name of source
+ uint src_line; // source line
+ uint src_column; // source column
py_token_kind_t kind; // kind of token
- uint cont_line; // token belongs to this line in a continued line
- const char *str; // string of token
+ const char *str; // string of token (valid only while this token is current token)
uint len; // (byte) length of string of token
} py_token_t;
+// the next-char function must return the next character in the stream
+// it must return PY_LEXER_CHAR_EOF if end of stream
+// it can be called again after returning PY_LEXER_CHAR_EOF, and in that case must return PY_LEXER_CHAR_EOF
+#define PY_LEXER_CHAR_EOF (-1)
+typedef unichar (*py_lexer_stream_next_char_t)(void*);
+typedef void (*py_lexer_stream_free_t)(void*);
+
typedef struct _py_lexer_t py_lexer_t;
void py_token_show(const py_token_t *tok);
void py_token_show_error_prefix(const py_token_t *tok);
bool py_token_show_error(const py_token_t *tok, const char *msg);
-py_lexer_t *py_lexer_from_file(const char *filename);
-py_lexer_t *py_lexer_from_str_len(const char *src_name, const char *str, uint len, bool free_str);
+py_lexer_t *py_lexer_new(const char *src_name, void *stream_data, py_lexer_stream_next_char_t stream_next_char, py_lexer_stream_free_t stream_free);
void py_lexer_free(py_lexer_t *lex);
void py_lexer_to_next(py_lexer_t *lex);
const py_token_t *py_lexer_cur(const py_lexer_t *lex);
bool py_lexer_is_kind(py_lexer_t *lex, py_token_kind_t kind);
/* unused
bool py_lexer_is_str(py_lexer_t *lex, const char *str);
-bool py_lexer_is_next_kind(py_lexer_t *lex, py_token_kind_t kind);
-bool py_lexer_is_next_str(py_lexer_t *lex, const char *str);
bool py_lexer_opt_kind(py_lexer_t *lex, py_token_kind_t kind);
bool py_lexer_opt_str(py_lexer_t *lex, const char *str);
*/
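lexerfile.c (deleted below) read the whole file into memory up front; under the new API a port can instead stream straight from a file descriptor. One possible replacement, again just a sketch (reading one byte per call is simple but slow; a buffered variant would follow the same contract):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "misc.h"
#include "lexer.h"

typedef struct {
    int fd;
} fd_stream_t;

// py_lexer_stream_next_char_t: one byte per call; EOF and read errors
// both map to PY_LEXER_CHAR_EOF, as the header above requires
static unichar fd_stream_next_char(void *data) {
    fd_stream_t *fs = data;
    unsigned char c;
    if (read(fs->fd, &c, 1) != 1) {
        return PY_LEXER_CHAR_EOF;
    }
    return c;
}

// py_lexer_stream_free_t: close the file and free the stream state
static void fd_stream_free(void *data) {
    fd_stream_t *fs = data;
    close(fs->fd);
    free(fs);
}

py_lexer_t *py_lexer_from_file(const char *filename) {
    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        printf("cannot open file %s\n", filename);
        return NULL;
    }
    fd_stream_t *fs = malloc(sizeof(fd_stream_t));
    fs->fd = fd;
    return py_lexer_new(filename, fs, fd_stream_next_char, fd_stream_free);
}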
diff --git a/py/lexerfile.c b/py/lexerfile.c
deleted file mode 100644
index 74bb5a061a..0000000000
--- a/py/lexerfile.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "misc.h"
-#include "lexer.h"
-
-py_lexer_t *py_lexer_from_file(const char *filename) {
- // TODO abstract away file functionality
- int fd = open(filename, O_RDONLY);
- if (fd < 0) {
- printf("cannot open file %s\n", filename);
- return NULL;
- }
- uint size = lseek(fd, 0, SEEK_END);
- lseek(fd, 0, SEEK_SET);
- char *data = m_new(char, size);
- read(fd, data, size);
- close(fd);
-
- return py_lexer_from_str_len(filename, data, size, true);
-}