summaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
authorPaul Sokolovsky <pfalcon@users.sourceforge.net>2014-10-11 14:26:29 +0300
committerPaul Sokolovsky <pfalcon@users.sourceforge.net>2014-10-11 14:36:32 +0300
commit5edbadefc1acbcae90d373bd49cd1e13f0f4332b (patch)
tree64eb26f5b74252c694fab3046aaa271b5cf6ab1f
parentc71e045165bffbbe085e407008feba3cdfda3298 (diff)
downloadmicropython-5edbadefc1acbcae90d373bd49cd1e13f0f4332b.tar.gz
micropython-5edbadefc1acbcae90d373bd49cd1e13f0f4332b.zip
modure: Import needed files from re1.5 v0.5.
https://github.com/pfalcon/re1.5
-rw-r--r--extmod/re1.5/compilecode.c213
-rw-r--r--extmod/re1.5/dumpcode.c50
-rw-r--r--extmod/re1.5/recursiveloop.c71
-rw-r--r--extmod/re1.5/regexp.h138
4 files changed, 472 insertions, 0 deletions
diff --git a/extmod/re1.5/compilecode.c b/extmod/re1.5/compilecode.c
new file mode 100644
index 0000000000..5b5d28c2a0
--- /dev/null
+++ b/extmod/re1.5/compilecode.c
@@ -0,0 +1,213 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+static void insert_code(char *code, int at, int num, int *pc)
+{
+ memmove(code + at + num, code + at, *pc - at);
+ *pc += num;
+}
+
+#define REL(at, to) (to - at - 2)
+
+int re1_5_sizecode(const char *re)
+{
+ int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
+
+ for (; *re; re++) {
+ switch (*re) {
+ case '\\':
+ re++;
+ default:
+ pc += 2;
+ break;
+ case '+':
+ // Skip entire "+?"
+ if (re[1] == '?')
+ re++;
+ case '?':
+ pc += 2;
+ break;
+ case '.':
+ case '^':
+ case '$':
+ pc++;
+ break;
+ case '*':
+ // Skip entire "*?"
+ if (re[1] == '?')
+ re++;
+ case '|':
+ case '(':
+ pc += 4;
+ break;
+ case ')':
+ break;
+ }
+ }
+
+ return pc;
+}
+
+#define EMIT(at, byte) code[at] = byte
+
+const char *_compilecode(const char *re, ByteProg *prog)
+{
+ char *code = prog->insts;
+ int pc = prog->bytelen;
+ int start = pc;
+ int term = pc;
+ int alt_label = 0;
+
+ for (; *re && *re != ')'; re++) {
+ switch (*re) {
+ case '\\':
+ re++;
+ default:
+ term = pc;
+ EMIT(pc++, Char);
+ EMIT(pc++, *re);
+ prog->len++;
+ break;
+ case '.':
+ term = pc;
+ EMIT(pc++, Any);
+ prog->len++;
+ break;
+ case '(':
+ term = pc;
+
+ EMIT(pc++, Save);
+ EMIT(pc++, 2 * ++prog->sub);
+ prog->len++;
+
+ prog->bytelen = pc;
+ re = _compilecode(re + 1, prog);
+ pc = prog->bytelen;
+
+ EMIT(pc++, Save);
+ EMIT(pc++, 2 * prog->sub + 1);
+ prog->len++;
+
+ break;
+ case '?':
+ insert_code(code, term, 2, &pc);
+ EMIT(term, Split);
+ EMIT(term + 1, REL(term, pc));
+ prog->len++;
+ break;
+ case '*':
+ insert_code(code, term, 2, &pc);
+ EMIT(pc, Jmp);
+ EMIT(pc + 1, REL(pc, term));
+ pc += 2;
+ if (re[1] == '?') {
+ EMIT(term, RSplit);
+ re++;
+ } else {
+ EMIT(term, Split);
+ }
+ EMIT(term + 1, REL(term, pc));
+ prog->len += 2;
+ break;
+ case '+':
+ if (re[1] == '?') {
+ EMIT(pc, Split);
+ re++;
+ } else {
+ EMIT(pc, RSplit);
+ }
+ EMIT(pc + 1, REL(pc, term));
+ pc += 2;
+ prog->len++;
+ break;
+ case '|':
+ if (alt_label) {
+ EMIT(alt_label, REL(alt_label, pc) + 1);
+ }
+ insert_code(code, start, 2, &pc);
+ EMIT(pc++, Jmp);
+ alt_label = pc++;
+ EMIT(start, Split);
+ EMIT(start + 1, REL(start, pc));
+ prog->len += 2;
+ break;
+ case '^':
+ EMIT(pc++, Bol);
+ prog->len++;
+ break;
+ case '$':
+ EMIT(pc++, Eol);
+ prog->len++;
+ break;
+ }
+ }
+
+ if (alt_label) {
+ EMIT(alt_label, REL(alt_label, pc) + 1);
+ }
+ prog->bytelen = pc;
+ return re;
+}
+
+int re1_5_compilecode(ByteProg *prog, const char *re)
+{
+ prog->len = 0;
+ prog->bytelen = 0;
+ prog->sub = 0;
+
+ // Add code to implement non-anchored operation ("search"),
+ // for anchored operation ("match"), this code will be just skipped.
+ // TODO: Implement search in much more efficient manner
+ prog->insts[prog->bytelen++] = RSplit;
+ prog->insts[prog->bytelen++] = 3;
+ prog->insts[prog->bytelen++] = Any;
+ prog->insts[prog->bytelen++] = Jmp;
+ prog->insts[prog->bytelen++] = -5;
+ prog->len += 3;
+
+ prog->insts[prog->bytelen++] = Save;
+ prog->insts[prog->bytelen++] = 0;
+ prog->len++;
+
+ _compilecode(re, prog);
+
+ prog->insts[prog->bytelen++] = Save;
+ prog->insts[prog->bytelen++] = 1;
+ prog->len++;
+
+ prog->insts[prog->bytelen++] = Match;
+ prog->len++;
+
+ return 0;
+}
+
+void
+cleanmarks(ByteProg *prog)
+{
+ char *pc = prog->insts;
+ char *end = pc + prog->bytelen;
+ while (pc < end) {
+ *pc &= 0x7f;
+ switch (*pc) {
+ case Jmp:
+ case Split:
+ case RSplit:
+ case Save:
+ case Char:
+ pc++;
+ }
+ pc++;
+ }
+}
+
+#if 0
+int main(int argc, char *argv[])
+{
+ int pc = 0;
+ ByteProg *code = re1_5_compilecode(argv[1]);
+ re1_5_dumpcode(code);
+}
+#endif
diff --git a/extmod/re1.5/dumpcode.c b/extmod/re1.5/dumpcode.c
new file mode 100644
index 0000000000..b91ded03a6
--- /dev/null
+++ b/extmod/re1.5/dumpcode.c
@@ -0,0 +1,50 @@
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+void re1_5_dumpcode(ByteProg *prog)
+{
+ int pc = 0;
+ char *code = prog->insts;
+ while (pc < prog->bytelen) {
+ printf("%2d: ", pc);
+ switch(code[pc++]) {
+ default:
+ assert(0);
+// re1_5_fatal("printprog");
+ case Split:
+ printf("split %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case RSplit:
+ printf("rsplit %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case Jmp:
+ printf("jmp %d (%d)\n", pc + (signed char)code[pc] + 1, (signed char)code[pc]);
+ pc++;
+ break;
+ case Char:
+ printf("char %c\n", code[pc++]);
+ break;
+ case Any:
+ printf("any\n");
+ break;
+ case Match:
+ printf("match\n");
+ break;
+ case Save:
+ printf("save %d\n", (unsigned char)code[pc++]);
+ break;
+ case Bol:
+ printf("assert bol\n");
+ break;
+ case Eol:
+ printf("assert eol\n");
+ break;
+ }
+ }
+ printf("Bytes: %d, insts: %d\n", prog->bytelen, prog->len);
+}
diff --git a/extmod/re1.5/recursiveloop.c b/extmod/re1.5/recursiveloop.c
new file mode 100644
index 0000000000..7b95eb4c95
--- /dev/null
+++ b/extmod/re1.5/recursiveloop.c
@@ -0,0 +1,71 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "regexp.h"
+
+static int
+recursiveloop(char *pc, const char *sp, Subject *input, const char **subp, int nsubp)
+{
+ const char *old;
+ int off;
+
+ for(;;) {
+ if(inst_is_consumer(*pc)) {
+ // If we need to match a character, but there's none left, it's fail
+ if(sp >= input->end)
+ return 0;
+ }
+ switch(*pc++) {
+ case Char:
+ if(*sp != *pc++)
+ return 0;
+ case Any:
+ sp++;
+ continue;
+ case Match:
+ return 1;
+ case Jmp:
+ off = (signed char)*pc++;
+ pc = pc + off;
+ continue;
+ case Split:
+ off = (signed char)*pc++;
+ if(recursiveloop(pc, sp, input, subp, nsubp))
+ return 1;
+ pc = pc + off;
+ continue;
+ case RSplit:
+ off = (signed char)*pc++;
+ if(recursiveloop(pc + off, sp, input, subp, nsubp))
+ return 1;
+ continue;
+ case Save:
+ off = (unsigned char)*pc++;
+ if(off >= nsubp) {
+ continue;
+ }
+ old = subp[off];
+ subp[off] = sp;
+ if(recursiveloop(pc, sp, input, subp, nsubp))
+ return 1;
+ subp[off] = old;
+ return 0;
+ case Bol:
+ if(sp != input->begin)
+ return 0;
+ continue;
+ case Eol:
+ if(sp != input->end)
+ return 0;
+ continue;
+ }
+ re1_5_fatal("recursiveloop");
+ }
+}
+
+int
+re1_5_recursiveloopprog(ByteProg *prog, Subject *input, const char **subp, int nsubp, int is_anchored)
+{
+ return recursiveloop(HANDLE_ANCHORED(prog->insts, is_anchored), input->begin, input, subp, nsubp);
+}
diff --git a/extmod/re1.5/regexp.h b/extmod/re1.5/regexp.h
new file mode 100644
index 0000000000..78492eb0f1
--- /dev/null
+++ b/extmod/re1.5/regexp.h
@@ -0,0 +1,138 @@
+// Copyright 2007-2009 Russ Cox. All Rights Reserved.
+// Copyright 2014 Paul Sokolovsky.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#define nil ((void*)0)
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+
+typedef struct Regexp Regexp;
+typedef struct Prog Prog;
+typedef struct ByteProg ByteProg;
+typedef struct Inst Inst;
+typedef struct Subject Subject;
+
+struct Regexp
+{
+ int type;
+ int n;
+ int ch;
+ Regexp *left;
+ Regexp *right;
+};
+
+enum /* Regexp.type */
+{
+ Alt = 1,
+ Cat,
+ Lit,
+ Dot,
+ Paren,
+ Quest,
+ Star,
+ Plus,
+};
+
+Regexp *parse(char*);
+Regexp *reg(int type, Regexp *left, Regexp *right);
+void printre(Regexp*);
+#ifndef re1_5_fatal
+void re1_5_fatal(char*);
+#endif
+void *mal(int);
+
+struct Prog
+{
+ Inst *start;
+ int len;
+};
+
+struct ByteProg
+{
+ int bytelen;
+ int len;
+ int sub;
+ char insts[0];
+};
+
+struct Inst
+{
+ int opcode;
+ int c;
+ int n;
+ Inst *x;
+ Inst *y;
+ int gen; // global state, oooh!
+};
+
+enum /* Inst.opcode */
+{
+ // Instructions which consume input bytes (and thus fail if none left)
+ CONSUMERS = 1,
+ Char = CONSUMERS,
+ Any,
+ ASSERTS = 0x50,
+ Bol = ASSERTS,
+ Eol,
+ // Instructions which take relative offset as arg
+ JUMPS = 0x60,
+ Jmp = JUMPS,
+ Split,
+ RSplit,
+ // Other (special) instructions
+ Save = 0x7e,
+ Match = 0x7f,
+};
+
+#define inst_is_consumer(inst) ((inst) < ASSERTS)
+#define inst_is_jump(inst) ((inst) & 0x70 == JUMPS)
+
+Prog *compile(Regexp*);
+void printprog(Prog*);
+
+extern int gen;
+
+enum {
+ MAXSUB = 20
+};
+
+typedef struct Sub Sub;
+
+struct Sub
+{
+ int ref;
+ int nsub;
+ const char *sub[MAXSUB];
+};
+
+Sub *newsub(int n);
+Sub *incref(Sub*);
+Sub *copy(Sub*);
+Sub *update(Sub*, int, const char*);
+void decref(Sub*);
+
+struct Subject {
+ const char *begin;
+ const char *end;
+};
+
+
+#define NON_ANCHORED_PREFIX 5
+#define HANDLE_ANCHORED(bytecode, is_anchored) ((is_anchored) ? (bytecode) + NON_ANCHORED_PREFIX : (bytecode))
+
+int re1_5_backtrack(ByteProg*, Subject*, const char**, int, int);
+int re1_5_pikevm(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveloopprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_recursiveprog(ByteProg*, Subject*, const char**, int, int);
+int re1_5_thompsonvm(ByteProg*, Subject*, const char**, int, int);
+
+int re1_5_sizecode(const char *re);
+int re1_5_compilecode(ByteProg *prog, const char *re);
+void re1_5_dumpcode(ByteProg *prog);
+void cleanmarks(ByteProg *prog);