summary | refs | log | tree | commit | diff | stats | homepage
path: root/extmod/re1.5/compilecode.c
diff options
context:
space:
mode:
author: Paul Sokolovsky <pfalcon@users.sourceforge.net>, 2015-11-01 00:37:44 +0300
committer: Paul Sokolovsky <pfalcon@users.sourceforge.net>, 2015-11-01 00:38:00 +0300
commit: 7cce2f664c9a93b0ed42e1bf51a601bddf2b7059 (patch)
tree: 6b3200e8e723bc7b361d0a42b052fd5f0a40c6bf /extmod/re1.5/compilecode.c
parent: 000a12783c7364527b9f8a59c6924fc88a66f290 (diff)
download: micropython-7cce2f664c9a93b0ed42e1bf51a601bddf2b7059.tar.gz
download: micropython-7cce2f664c9a93b0ed42e1bf51a601bddf2b7059.zip
extmod/re1.5: Update to 0.8.
Contains implementations of ?: (non-capturing groups) and ?? (non-greedy ?), as well as much-improved robustness, edge-case handling, and error handling by Amir Plivatsky (@ampli).
Diffstat (limited to 'extmod/re1.5/compilecode.c')
-rw-r--r-- extmod/re1.5/compilecode.c | 223
1 files changed, 88 insertions, 135 deletions
diff --git a/extmod/re1.5/compilecode.c b/extmod/re1.5/compilecode.c
index 85a165c7a0..7a1b98128b 100644
--- a/extmod/re1.5/compilecode.c
+++ b/extmod/re1.5/compilecode.c
@@ -4,203 +4,175 @@
#include "re1.5.h"
-static void insert_code(char *code, int at, int num, int *pc)
-{
- memmove(code + at + num, code + at, *pc - at);
- *pc += num;
-}
-
+#define INSERT_CODE(at, num, pc) \
+ ((code ? memmove(code + at + num, code + at, pc - at) : (void)0), pc += num)
#define REL(at, to) (to - at - 2)
+#define EMIT(at, byte) (code ? (code[at] = byte) : (void)(at))
+#define PC (prog->bytelen)
-int re1_5_sizecode(const char *re)
-{
- int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
-
- for (; *re; re++) {
- switch (*re) {
- case '\\':
- re++;
- default:
- pc += 2;
- break;
- case '+':
- // Skip entire "+?"
- if (re[1] == '?')
- re++;
- case '?':
- pc += 2;
- break;
- case '.':
- case '^':
- case '$':
- pc++;
- break;
- case '*':
- // Skip entire "*?"
- if (re[1] == '?')
- re++;
- case '|':
- case '(':
- pc += 4;
- break;
- case ')':
- break;
- case '[': {
- pc += 2;
- re++;
- if (*re == '^') re++;
- while (*re != ']') {
- if (!*re) return -1;
- if (re[1] == '-') {
- re += 2;
- }
- pc += 2;
- re++;
- }
- }
- }
- }
-
- return pc;
-}
-
-#define EMIT(at, byte) code[at] = byte
-
-static const char *_compilecode(const char *re, ByteProg *prog)
+static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
{
- char *code = prog->insts;
- int pc = prog->bytelen;
- int start = pc;
- int term = pc;
+ char *code = sizecode ? NULL : prog->insts;
+ int start = PC;
+ int term = PC;
int alt_label = 0;
for (; *re && *re != ')'; re++) {
switch (*re) {
case '\\':
re++;
+ if (!*re) return NULL; // Trailing backslash
if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
- term = pc;
- EMIT(pc++, NamedClass);
- EMIT(pc++, *re);
+ term = PC;
+ EMIT(PC++, NamedClass);
+ EMIT(PC++, *re);
prog->len++;
break;
}
default:
- term = pc;
- EMIT(pc++, Char);
- EMIT(pc++, *re);
+ term = PC;
+ EMIT(PC++, Char);
+ EMIT(PC++, *re);
prog->len++;
break;
case '.':
- term = pc;
- EMIT(pc++, Any);
+ term = PC;
+ EMIT(PC++, Any);
prog->len++;
break;
case '[': {
int cnt;
- term = pc;
+ term = PC;
re++;
if (*re == '^') {
- EMIT(pc++, ClassNot);
+ EMIT(PC++, ClassNot);
re++;
} else {
- EMIT(pc++, Class);
+ EMIT(PC++, Class);
}
- pc++; // Skip # of pair byte
+ PC++; // Skip # of pair byte
prog->len++;
for (cnt = 0; *re != ']'; re++, cnt++) {
if (!*re) return NULL;
- EMIT(pc++, *re);
+ EMIT(PC++, *re);
if (re[1] == '-') {
re += 2;
}
- EMIT(pc++, *re);
+ EMIT(PC++, *re);
}
EMIT(term + 1, cnt);
break;
}
case '(': {
- term = pc;
- int sub = ++prog->sub;
-
- EMIT(pc++, Save);
- EMIT(pc++, 2 * sub);
- prog->len++;
+ term = PC;
+ int sub;
+ int capture = re[1] != '?' || re[2] != ':';
+
+ if (capture) {
+ sub = ++prog->sub;
+ EMIT(PC++, Save);
+ EMIT(PC++, 2 * sub);
+ prog->len++;
+ } else {
+ re += 2;
+ }
- prog->bytelen = pc;
- re = _compilecode(re + 1, prog);
+ re = _compilecode(re + 1, prog, sizecode);
if (re == NULL || *re != ')') return NULL; // error, or no matching paren
- pc = prog->bytelen;
- EMIT(pc++, Save);
- EMIT(pc++, 2 * sub + 1);
- prog->len++;
+ if (capture) {
+ EMIT(PC++, Save);
+ EMIT(PC++, 2 * sub + 1);
+ prog->len++;
+ }
break;
}
case '?':
- if (pc == term) return NULL; // nothing to repeat
- insert_code(code, term, 2, &pc);
- EMIT(term, Split);
- EMIT(term + 1, REL(term, pc));
+ if (PC == term) return NULL; // nothing to repeat
+ INSERT_CODE(term, 2, PC);
+ if (re[1] == '?') {
+ EMIT(term, RSplit);
+ re++;
+ } else {
+ EMIT(term, Split);
+ }
+ EMIT(term + 1, REL(term, PC));
prog->len++;
+ term = PC;
break;
case '*':
- if (pc == term) return NULL; // nothing to repeat
- insert_code(code, term, 2, &pc);
- EMIT(pc, Jmp);
- EMIT(pc + 1, REL(pc, term));
- pc += 2;
+ if (PC == term) return NULL; // nothing to repeat
+ INSERT_CODE(term, 2, PC);
+ EMIT(PC, Jmp);
+ EMIT(PC + 1, REL(PC, term));
+ PC += 2;
if (re[1] == '?') {
EMIT(term, RSplit);
re++;
} else {
EMIT(term, Split);
}
- EMIT(term + 1, REL(term, pc));
+ EMIT(term + 1, REL(term, PC));
prog->len += 2;
+ term = PC;
break;
case '+':
- if (pc == term) return NULL; // nothing to repeat
+ if (PC == term) return NULL; // nothing to repeat
if (re[1] == '?') {
- EMIT(pc, Split);
+ EMIT(PC, Split);
re++;
} else {
- EMIT(pc, RSplit);
+ EMIT(PC, RSplit);
}
- EMIT(pc + 1, REL(pc, term));
- pc += 2;
+ EMIT(PC + 1, REL(PC, term));
+ PC += 2;
prog->len++;
+ term = PC;
break;
case '|':
if (alt_label) {
- EMIT(alt_label, REL(alt_label, pc) + 1);
+ EMIT(alt_label, REL(alt_label, PC) + 1);
}
- insert_code(code, start, 2, &pc);
- EMIT(pc++, Jmp);
- alt_label = pc++;
+ INSERT_CODE(start, 2, PC);
+ EMIT(PC++, Jmp);
+ alt_label = PC++;
EMIT(start, Split);
- EMIT(start + 1, REL(start, pc));
+ EMIT(start + 1, REL(start, PC));
prog->len += 2;
+ term = PC;
break;
case '^':
- EMIT(pc++, Bol);
+ EMIT(PC++, Bol);
prog->len++;
+ term = PC;
break;
case '$':
- EMIT(pc++, Eol);
+ EMIT(PC++, Eol);
prog->len++;
+ term = PC;
break;
}
}
if (alt_label) {
- EMIT(alt_label, REL(alt_label, pc) + 1);
+ EMIT(alt_label, REL(alt_label, PC) + 1);
}
- prog->bytelen = pc;
return re;
}
+int re1_5_sizecode(const char *re)
+{
+ ByteProg dummyprog = {
+ // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
+ .bytelen = 5 + NON_ANCHORED_PREFIX
+ };
+
+ if (_compilecode(re, &dummyprog, /*sizecode*/1) == NULL) return -1;
+
+ return dummyprog.bytelen;
+}
+
int re1_5_compilecode(ByteProg *prog, const char *re)
{
prog->len = 0;
@@ -221,7 +193,7 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
prog->insts[prog->bytelen++] = 0;
prog->len++;
- re = _compilecode(re, prog);
+ re = _compilecode(re, prog, /*sizecode*/0);
if (re == NULL || *re) return 1;
prog->insts[prog->bytelen++] = Save;
@@ -234,25 +206,6 @@ int re1_5_compilecode(ByteProg *prog, const char *re)
return 0;
}
-void
-cleanmarks(ByteProg *prog)
-{
- char *pc = prog->insts;
- char *end = pc + prog->bytelen;
- while (pc < end) {
- *pc &= 0x7f;
- switch (*pc) {
- case Jmp:
- case Split:
- case RSplit:
- case Save:
- case Char:
- pc++;
- }
- pc++;
- }
-}
-
#if 0
int main(int argc, char *argv[])
{