From de38f7bce81ab15cedbab4a77e491f6714adf3cd Mon Sep 17 00:00:00 2001 From: Dan Hirsch <thequux@upstandinghackers.com> Date: Fri, 24 May 2013 02:50:05 +0200 Subject: [PATCH] Got more regex tests passing --- src/backends/regex.c | 15 ++++++++------- src/backends/regex_debug.c | 19 ++++++++++++++++++- src/parsers/charset.c | 26 +++++++++++++++++++------- src/parsers/ignoreseq.c | 2 +- src/parsers/whitespace.c | 10 ++++++++-- 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/src/backends/regex.c b/src/backends/regex.c index e0f3f7ee..b0cfc2b8 100644 --- a/src/backends/regex.c +++ b/src/backends/regex.c @@ -98,7 +98,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ if (!heads_p[ip_s]) continue; THREAD.ip = ip_s; - + THREAD.trace = heads_p[ip_s]; uint8_t hi, lo; uint16_t arg; while(ipq_top > 0) { @@ -112,8 +112,6 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ ret_trace = THREAD.trace; goto run_trace; case RVM_MATCH: - // Doesn't actually validate the "must be followed by MATCH - // or STEP. It should. Preproc perhaps? hi = (arg >> 8) & 0xff; lo = arg & 0xff; THREAD.ip++; @@ -171,7 +169,6 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ run_trace: // Invert the direction of the trace linked list. - ret_trace = invert_trace(ret_trace); HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len); // ret is in its own arena @@ -234,9 +231,13 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, tmp_res->bytes.len = cur->input_pos - tmp_res->index; break; case SVM_ACCEPT: - assert(ctx.stack_count == 1); - HParseResult *res = a_new(HParseResult, 1); - res->ast = ctx.stack[0]; + assert(ctx.stack_count <= 1); + HParseResult *res = a_new(HParseResult, 1); + if (ctx.stack_count == 1) { + res->ast = ctx.stack[0]; + } else { + res->ast = NULL; + } res->bit_length = cur->input_pos * 8; res->arena = arena; return res; diff --git a/src/backends/regex_debug.c b/src/backends/regex_debug.c index 520c5a9b..70ed37bb 100644 --- a/src/backends/regex_debug.c +++ b/src/backends/regex_debug.c @@ -61,7 +61,7 @@ void dump_rvm_prog(HRVMProg *prog) { uint8_t low, high; low = insn->arg & 0xff; high = (insn->arg >> 8) & 0xff; - if (high > low) + if (high < low) printf("NONE\n"); else { if (low >= 0x32 && low <= 0x7e) @@ -81,3 +81,20 @@ void dump_rvm_prog(HRVMProg *prog) { } } } + +void dump_svm_prog(HRVMProg *prog, HRVMTrace *trace) { + char* symref; + for (; trace != NULL; trace = trace->next) { + printf("@%04zd %-10s", trace->input_pos, svm_op_names[trace->opcode]); + switch (trace->opcode) { + case SVM_ACTION: + symref = getsym(prog->actions[trace->arg].action); + // TODO: somehow format the argument to action + printf("%s\n", symref); + free(symref); + break; + default: + printf("\n"); + } + } +} diff --git a/src/parsers/charset.c b/src/parsers/charset.c index 8984b576..a1fe668c 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -41,13 +41,25 @@ static bool cs_ctrvm(HRVMProg *prog, void *env) { h_rvm_insert_insn(prog, RVM_PUSH, 0); uint16_t start = h_rvm_get_ip(prog); - for (size_t i=0; i<256; ++i) { - // TODO: merge ranges. - if (charset_isset(cs, i)) { - uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); - h_rvm_insert_insn(prog, RVM_MATCH, i | i << 8); - h_rvm_insert_insn(prog, RVM_GOTO, 0); - h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + + uint8_t range_start = 0; + bool collecting = false; + for (size_t i=0; i<257; ++i) { + // Position 256 is only there so that every included character has + // a non-included character after it. + if (i < 256 && charset_isset(cs, i)) { + if (!collecting) { + collecting = true; + range_start = i; + } + } else { + if (collecting) { + collecting = false; + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + h_rvm_insert_insn(prog, RVM_MATCH, range_start | i << 8); + h_rvm_insert_insn(prog, RVM_GOTO, 0); + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + } } } h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF); diff --git a/src/parsers/ignoreseq.c b/src/parsers/ignoreseq.c index 6a61f3ea..3b920f18 100644 --- a/src/parsers/ignoreseq.c +++ b/src/parsers/ignoreseq.c @@ -97,7 +97,7 @@ static bool is_ctrvm(HRVMProg *prog, void* env) { HIgnoreSeq *seq = (HIgnoreSeq*)env; for (size_t i=0; i<seq->len; ++i) { h_rvm_insert_insn(prog, RVM_PUSH, 0); - if (!h_compile_regex(prog, seq->parsers[i]->env)) + if (!h_compile_regex(prog, seq->parsers[i])) return false; } h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ignoreseq, env)); diff --git a/src/parsers/whitespace.c b/src/parsers/whitespace.c index eb89446f..454e04ed 100644 --- a/src/parsers/whitespace.c +++ b/src/parsers/whitespace.c @@ -67,9 +67,15 @@ static bool ws_ctrvm(HRVMProg *prog, void *env) { uint16_t start = h_rvm_get_ip(prog); uint16_t next; - for (int i = 0; i < 6; i++) { + uint16_t ranges[2] = { + 0x0d09, + 0x2020, + }; + + for (int i = 0; i < 2; i++) { next = h_rvm_insert_insn(prog, RVM_FORK, 0); - h_rvm_insert_insn(prog, RVM_MATCH, (SPACE_CHRS[i] << 8) | (SPACE_CHRS[i])); + h_rvm_insert_insn(prog, RVM_MATCH, ranges[i]); + h_rvm_insert_insn(prog, RVM_STEP, 0); h_rvm_insert_insn(prog, RVM_GOTO, start); h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog)); } -- GitLab