diff options
Diffstat (limited to 'extmod/re1.5/compilecode.c')
-rw-r--r-- | extmod/re1.5/compilecode.c | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/extmod/re1.5/compilecode.c b/extmod/re1.5/compilecode.c new file mode 100644 index 0000000000..5b5d28c2a0 --- /dev/null +++ b/extmod/re1.5/compilecode.c @@ -0,0 +1,213 @@ +// Copyright 2014 Paul Sokolovsky. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "regexp.h" + +static void insert_code(char *code, int at, int num, int *pc) +{ + memmove(code + at + num, code + at, *pc - at); + *pc += num; +} + +#define REL(at, to) (to - at - 2) + +int re1_5_sizecode(const char *re) +{ + int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code + + for (; *re; re++) { + switch (*re) { + case '\\': + re++; + default: + pc += 2; + break; + case '+': + // Skip entire "+?" + if (re[1] == '?') + re++; + case '?': + pc += 2; + break; + case '.': + case '^': + case '$': + pc++; + break; + case '*': + // Skip entire "*?" + if (re[1] == '?') + re++; + case '|': + case '(': + pc += 4; + break; + case ')': + break; + } + } + + return pc; +} + +#define EMIT(at, byte) code[at] = byte + +const char *_compilecode(const char *re, ByteProg *prog) +{ + char *code = prog->insts; + int pc = prog->bytelen; + int start = pc; + int term = pc; + int alt_label = 0; + + for (; *re && *re != ')'; re++) { + switch (*re) { + case '\\': + re++; + default: + term = pc; + EMIT(pc++, Char); + EMIT(pc++, *re); + prog->len++; + break; + case '.': + term = pc; + EMIT(pc++, Any); + prog->len++; + break; + case '(': + term = pc; + + EMIT(pc++, Save); + EMIT(pc++, 2 * ++prog->sub); + prog->len++; + + prog->bytelen = pc; + re = _compilecode(re + 1, prog); + pc = prog->bytelen; + + EMIT(pc++, Save); + EMIT(pc++, 2 * prog->sub + 1); + prog->len++; + + break; + case '?': + insert_code(code, term, 2, &pc); + EMIT(term, Split); + EMIT(term + 1, REL(term, pc)); + prog->len++; + break; + case '*': + insert_code(code, term, 2, &pc); + EMIT(pc, Jmp); + EMIT(pc + 1, REL(pc, term)); + pc += 2; + if (re[1] == '?') { + EMIT(term, RSplit); + re++; + } else { + EMIT(term, Split); + } + EMIT(term + 1, REL(term, pc)); + prog->len += 2; + break; + case '+': + if (re[1] == '?') { + EMIT(pc, Split); + re++; + } else { + EMIT(pc, RSplit); + } + EMIT(pc + 1, REL(pc, term)); + pc += 2; + prog->len++; + break; + case '|': + if (alt_label) { + EMIT(alt_label, REL(alt_label, pc) + 1); + } + insert_code(code, start, 2, &pc); + EMIT(pc++, Jmp); + alt_label = pc++; + EMIT(start, Split); + EMIT(start + 1, REL(start, pc)); + prog->len += 2; + break; + case '^': + EMIT(pc++, Bol); + prog->len++; + break; + case '$': + EMIT(pc++, Eol); + prog->len++; + break; + } + } + + if (alt_label) { + EMIT(alt_label, REL(alt_label, pc) + 1); + } + prog->bytelen = pc; + return re; +} + +int re1_5_compilecode(ByteProg *prog, const char *re) +{ + prog->len = 0; + prog->bytelen = 0; + prog->sub = 0; + + // Add code to implement non-anchored operation ("search"), + // for anchored operation ("match"), this code will be just skipped. + // TODO: Implement search in much more efficient manner + prog->insts[prog->bytelen++] = RSplit; + prog->insts[prog->bytelen++] = 3; + prog->insts[prog->bytelen++] = Any; + prog->insts[prog->bytelen++] = Jmp; + prog->insts[prog->bytelen++] = -5; + prog->len += 3; + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 0; + prog->len++; + + _compilecode(re, prog); + + prog->insts[prog->bytelen++] = Save; + prog->insts[prog->bytelen++] = 1; + prog->len++; + + prog->insts[prog->bytelen++] = Match; + prog->len++; + + return 0; +} + +void +cleanmarks(ByteProg *prog) +{ + char *pc = prog->insts; + char *end = pc + prog->bytelen; + while (pc < end) { + *pc &= 0x7f; + switch (*pc) { + case Jmp: + case Split: + case RSplit: + case Save: + case Char: + pc++; + } + pc++; + } +} + +#if 0 +int main(int argc, char *argv[]) +{ + int pc = 0; + ByteProg *code = re1_5_compilecode(argv[1]); + re1_5_dumpcode(code); +} +#endif |