From de38f7bce81ab15cedbab4a77e491f6714adf3cd Mon Sep 17 00:00:00 2001
From: Dan Hirsch <thequux@upstandinghackers.com>
Date: Fri, 24 May 2013 02:50:05 +0200
Subject: [PATCH] Got more regex tests passing

---
 src/backends/regex.c       | 15 ++++++++-------
 src/backends/regex_debug.c | 19 ++++++++++++++++++-
 src/parsers/charset.c      | 26 +++++++++++++++++++-------
 src/parsers/ignoreseq.c    |  2 +-
 src/parsers/whitespace.c   | 10 ++++++++--
 5 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/src/backends/regex.c b/src/backends/regex.c
index e0f3f7ee..b0cfc2b8 100644
--- a/src/backends/regex.c
+++ b/src/backends/regex.c
@@ -98,7 +98,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_
       if (!heads_p[ip_s])
 	continue;
       THREAD.ip = ip_s;
-
+      THREAD.trace = heads_p[ip_s];
       uint8_t hi, lo;
       uint16_t arg;
       while(ipq_top > 0) {
@@ -112,8 +112,6 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_
 	  ret_trace = THREAD.trace;
 	  goto run_trace;
 	case RVM_MATCH:
-	  // Doesn't actually validate the "must be followed by MATCH
-	  // or STEP. It should. Preproc perhaps?
 	  hi = (arg >> 8) & 0xff;
 	  lo = arg & 0xff;
 	  THREAD.ip++;
@@ -171,7 +169,6 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_
  run_trace:
   // Invert the direction of the trace linked list.
 
-  
   ret_trace = invert_trace(ret_trace);
   HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
   // ret is in its own arena
@@ -234,9 +231,13 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace,
       tmp_res->bytes.len = cur->input_pos - tmp_res->index;
       break;
     case SVM_ACCEPT:
-      assert(ctx.stack_count == 1);
-      HParseResult *res = a_new(HParseResult, 1);
-      res->ast = ctx.stack[0];
+      assert(ctx.stack_count <= 1);
+	HParseResult *res = a_new(HParseResult, 1);
+      if (ctx.stack_count == 1) {
+	res->ast = ctx.stack[0];
+      } else {
+	res->ast = NULL;
+      }
       res->bit_length = cur->input_pos * 8;
       res->arena = arena;
       return res;
diff --git a/src/backends/regex_debug.c b/src/backends/regex_debug.c
index 520c5a9b..70ed37bb 100644
--- a/src/backends/regex_debug.c
+++ b/src/backends/regex_debug.c
@@ -61,7 +61,7 @@ void dump_rvm_prog(HRVMProg *prog) {
       uint8_t low, high;
       low = insn->arg & 0xff;
       high = (insn->arg >> 8) & 0xff;
-      if (high > low)
+      if (high < low)
 	printf("NONE\n");
       else {
 	if (low >= 0x32 && low <= 0x7e)
@@ -81,3 +81,20 @@ void dump_rvm_prog(HRVMProg *prog) {
     }
   }
 }
+
+void dump_svm_prog(HRVMProg *prog, HRVMTrace *trace) { // Print one line per entry of the trace list to stdout.
+  char* symref; // string returned by getsym() for an action entry
+  for (; trace != NULL; trace = trace->next) { // follow the ->next links until the list ends
+    printf("@%04zd %-10s", trace->input_pos, svm_op_names[trace->opcode]); // NOTE(review): %zd is the signed size conversion -- confirm input_pos's type
+    switch (trace->opcode) {
+    case SVM_ACTION: // action entries also print the symbol looked up for prog->actions[trace->arg].action
+      symref = getsym(prog->actions[trace->arg].action); // presumably heap-allocated, since it is free()d below -- verify against getsym
+      // TODO: somehow format the argument to action
+      printf("%s\n", symref);
+      free(symref);
+      break;
+    default: // every other opcode prints only the position and opcode name
+      printf("\n");
+    }
+  }
+}
diff --git a/src/parsers/charset.c b/src/parsers/charset.c
index 8984b576..a1fe668c 100644
--- a/src/parsers/charset.c
+++ b/src/parsers/charset.c
@@ -41,13 +41,25 @@ static bool cs_ctrvm(HRVMProg *prog, void *env) {
   h_rvm_insert_insn(prog, RVM_PUSH, 0);
 
   uint16_t start = h_rvm_get_ip(prog);
-  for (size_t i=0; i<256; ++i) {
-    // TODO: merge ranges.
-    if (charset_isset(cs, i)) {
-      uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0);
-      h_rvm_insert_insn(prog, RVM_MATCH, i | i << 8);
-      h_rvm_insert_insn(prog, RVM_GOTO, 0);
-      h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog));
+
+  uint8_t range_start = 0;
+  bool collecting = false;
+  for (size_t i=0; i<257; ++i) {
+    // Position 256 is only there so that every included character has
+    // a non-included character after it.
+    if (i < 256 && charset_isset(cs, i)) {
+      if (!collecting) {
+	collecting = true;
+	range_start = i;
+      }
+    } else {
+      if (collecting) {
+	collecting = false;
+	uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0);
+	h_rvm_insert_insn(prog, RVM_MATCH, range_start | i << 8);
+	h_rvm_insert_insn(prog, RVM_GOTO, 0);
+	h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog));
+      }
     }
   }
   h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF);
diff --git a/src/parsers/ignoreseq.c b/src/parsers/ignoreseq.c
index 6a61f3ea..3b920f18 100644
--- a/src/parsers/ignoreseq.c
+++ b/src/parsers/ignoreseq.c
@@ -97,7 +97,7 @@ static bool is_ctrvm(HRVMProg *prog, void* env) {
   HIgnoreSeq *seq = (HIgnoreSeq*)env;
   for (size_t i=0; i<seq->len; ++i) {
     h_rvm_insert_insn(prog, RVM_PUSH, 0);
-    if (!h_compile_regex(prog, seq->parsers[i]->env))
+    if (!h_compile_regex(prog, seq->parsers[i]))
       return false;
   }
   h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ignoreseq, env));
diff --git a/src/parsers/whitespace.c b/src/parsers/whitespace.c
index eb89446f..454e04ed 100644
--- a/src/parsers/whitespace.c
+++ b/src/parsers/whitespace.c
@@ -67,9 +67,15 @@ static bool ws_ctrvm(HRVMProg *prog, void *env) {
   uint16_t start = h_rvm_get_ip(prog);
   uint16_t next;
 
-  for (int i = 0; i < 6; i++) {
+  uint16_t ranges[2] = {
+    0x0d09,
+    0x2020,
+  };
+  
+  for (int i = 0; i < 2; i++) {
     next = h_rvm_insert_insn(prog, RVM_FORK, 0);
-    h_rvm_insert_insn(prog, RVM_MATCH, (SPACE_CHRS[i] << 8) | (SPACE_CHRS[i]));
+    h_rvm_insert_insn(prog, RVM_MATCH, ranges[i]);
+    h_rvm_insert_insn(prog, RVM_STEP, 0);
     h_rvm_insert_insn(prog, RVM_GOTO, start);
     h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog));
   }
-- 
GitLab