aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/Parser/tokenizer/utf8_tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'Parser/tokenizer/utf8_tokenizer.c')
-rw-r--r--Parser/tokenizer/utf8_tokenizer.c55
1 files changed, 55 insertions, 0 deletions
diff --git a/Parser/tokenizer/utf8_tokenizer.c b/Parser/tokenizer/utf8_tokenizer.c
new file mode 100644
index 00000000000..1a925f44540
--- /dev/null
+++ b/Parser/tokenizer/utf8_tokenizer.c
@@ -0,0 +1,55 @@
+#include "Python.h"
+#include "errcode.h"
+
+#include "helpers.h"
+#include "../lexer/state.h"
+
+/* Underflow callback installed by _PyTokenizer_FromUTF8: advance tok->inp
+   past the next line of the string being tokenized.
+
+   Returns 1 once tok->inp has been moved to the end of the next line.
+   Returns 0, with tok->done set to E_EOF, when no characters remain at
+   tok->inp. */
+static int
+tok_underflow_string(struct tok_state *tok) {
+    char *newline = strchr(tok->inp, '\n');
+    /* A line ends just past its '\n'; without one it ends at the NUL. */
+    char *next = (newline != NULL) ? newline + 1 : strchr(tok->inp, '\0');
+    if (next == tok->inp) {
+        /* Only reachable when no newline was found and the text is empty. */
+        tok->done = E_EOF;
+        return 0;
+    }
+    if (tok->start == NULL) {
+        /* Rebase the buffer start onto the current read position. */
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+    ADVANCE_LINENO();
+    tok->inp = next;
+    return 1;
+}
+
+/* Create a tokenizer that reads its source from the UTF-8 string `str`.
+
+   `str` is first passed through _PyTokenizer_translate_newlines() together
+   with exec_input and preserve_crlf; the tokenizer then reads from the
+   translated text, which is stored in tok->input.
+
+   Returns the new tok_state, or NULL if creating the state, translating
+   the newlines or creating the encoding name string fails.  On the latter
+   two failures the partially initialised state is released with
+   _PyTokenizer_Free() before returning. */
+struct tok_state *
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
+{
+    char *source;
+    struct tok_state *tok = _PyTokenizer_tok_new();
+    if (tok == NULL) {
+        return NULL;
+    }
+
+    source = _PyTokenizer_translate_newlines(str, exec_input, preserve_crlf, tok);
+    tok->input = source;
+    if (source == NULL) {
+        goto error;
+    }
+
+    tok->decoding_state = STATE_NORMAL;
+    tok->enc = NULL;
+    tok->str = source;
+    tok->encoding = _PyTokenizer_new_string("utf-8", 5, tok);
+    if (tok->encoding == NULL) {
+        goto error;
+    }
+
+    /* Nothing has been read yet, so every buffer pointer starts at the
+       first character; tok_underflow_string() moves tok->inp forward. */
+    tok->buf = source;
+    tok->cur = source;
+    tok->inp = source;
+    tok->end = source;
+    tok->underflow = &tok_underflow_string;
+    return tok;
+
+error:
+    _PyTokenizer_Free(tok);
+    return NULL;
+}