From c19d7bb66e05e581ec6edd8cfe7f013532278e05 Mon Sep 17 00:00:00 2001
From: Dan Hirsch <>
Date: Sat, 9 Mar 2013 21:42:49 -0800
Subject: [PATCH] Regex VM finished but untested.

 docs/         |  32 +++++++++
 src/Makefile                |   3 +-
 src/backends/regex.c        | 131 ++++++++++++++++++++++++++++--------
 src/backends/regex.h        |  21 +++++-
 src/backends/ | 112 ++++++++++++++++++++++++++++++
 src/hammer.c                |   8 +++
 src/hammer.h                |   5 +-
 src/internal.h              |   3 +
 8 files changed, 283 insertions(+), 32 deletions(-)
 create mode 100644 docs/
 create mode 100644 src/backends/

diff --git a/docs/ b/docs/
new file mode 100644
index 0000000..9ae28b0
--- /dev/null
+++ b/docs/
@@ -0,0 +1,32 @@
+digraph {
+	graph [rankdir=LR];
+subgraph complete {
+	 node [color="gray",fontcolor="gray"];
+	 regex_gen;
+	 glue;
+/* The end result of the milestone, along with the subtasks listed */
+milestone2 [color="green",style="filled"];
+llk -> milestone2;
+lr -> milestone2;
+lalr8_gen -> lr;
+glr_gen -> lr;
+lr_driver -> lr;
+regex -> milestone2;
+glue -> milestone2; // Meredith knows what glue referred to here.
+tests -> milestone2;
+regex_gen -> regex;
+regex_driver -> regex;
+llk_driver -> llk;
+llk_gen -> llk;
+ * 
+ */
+ desugaring -> llk_gen;
+ desugaring -> lalr8_gen;
+ desugaring -> glr_gen;
diff --git a/src/Makefile b/src/Makefile
index 128de05..bb83e83 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,7 +26,8 @@ PARSERS := \
-	packrat
+	packrat \
+	regex
 	bitreader.o \
diff --git a/src/backends/regex.c b/src/backends/regex.c
index 659b2d9..0b1991d 100644
--- a/src/backends/regex.c
+++ b/src/backends/regex.c
@@ -1,8 +1,11 @@
+#include <string.h>
+#include <assert.h>
 #include "../internal.h"
 #include "../parsers/parser_internal.h"
+#include "regex.h"
 #undef a_new
-#define a_new(typ, count) a_new_(arena, typ, count);
+#define a_new(typ, count) a_new_(arena, typ, count)
 // Stack VM
 typedef enum HSVMOp_ {
   SVM_PUSH, // Push a mark. There is no VM insn to push an object.
@@ -16,6 +19,7 @@ typedef struct HRVMTrace_ {
   struct HRVMTrace_ *next; // When parsing, these are
 			   // reverse-threaded. There is a postproc
 			   // step that inverts all the pointers.
+  size_t input_pos;
   uint16_t arg;
   uint8_t opcode;
 } HRVMTrace;
@@ -25,13 +29,27 @@ typedef struct HRVMThread_ {
   uint16_t ip;
 } HRVMThread;
-// TODO(thequux): This function could really use a refactoring, at the
-// very least, to split the two VMs.
-void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t len) {
+HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len);
+// Reverse a singly-linked trace list in place.
+//
+// While parsing, trace entries are threaded newest-first; this relinks
+// every `next` pointer so the list can be walked in execution order.
+// Returns the new head (the former tail), or NULL if `trace` is NULL.
+HRVMTrace *invert_trace(HRVMTrace *trace) {
+  HRVMTrace *last = NULL;
+  // Walk until trace itself is NULL, not merely until the final node:
+  // stopping one node early leaves the old tail with next == NULL,
+  // which truncates the inverted list to a single element.
+  while (trace) {
+    HRVMTrace *next = trace->next;
+    trace->next = last;
+    last = trace;
+    trace = next;
+  }
+  return last;
+void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
   HArena *arena = h_new_arena(mm__, 0);
   HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length),
-    **heads_n = a_new(HRVMTrace*, prog->length),
-    **heads_t;
+    **heads_n = a_new(HRVMTrace*, prog->length);
   HRVMTrace *ret_trace;
@@ -39,12 +57,16 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
   HRVMThread *ip_queue = a_new(HRVMThread, prog->length);
   size_t ipq_top;
 #define THREAD ip_queue[ipq_top-1]
 #define PUSH_SVM(op_, arg_) do { \
 	  HRVMTrace *nt = a_new(HRVMTrace, 1); \
 	  nt->arg = (arg_);		       \
 	  nt->opcode = (op_);		       \
 	  nt->next = THREAD.trace;	       \
+	  nt->input_pos = off;		       \
 	  THREAD.trace = nt;		       \
   } while(0)
@@ -55,7 +77,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
   int live_threads = 1;
   for (off = 0; off <= len; off++) {
     uint8_t ch = ((off == len) ? 0 : input[off]);
-    size_t ip_s, ip;
+    size_t ip_s; // BUG: there was an unused variable ip. Not sure if
+		 // I intended to use it somewhere.
     /* scope */ {
       HRVMTrace **heads_t;
       heads_t = heads_n;
@@ -77,9 +100,9 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
       uint8_t hi, lo;
       uint16_t arg;
       while(ipq_top > 0) {
-	if (insns_seen[THREAD.ip] == 1)
+	if (insn_seen[THREAD.ip] == 1)
-	insns_seen[THREAD.ip] = 1;
+	insn_seen[THREAD.ip] = 1;
 	arg = prog->insns[THREAD.ip].arg;
 	switch(prog->insns[THREAD.ip].op) {
 	case RVM_ACCEPT:
@@ -100,8 +123,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
 	  goto next_insn;
 	case RVM_FORK:
-	  if (!insns_seen[arg]) {
-	    insns_seen[THREAD.ip] = 2;
+	  if (!insn_seen[arg]) {
+	    insn_seen[THREAD.ip] = 2;
 	    HRVMTrace* tr = THREAD.trace;
 	    THREAD.ip = arg;
@@ -109,7 +132,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
 	  goto next_insn;
 	case RVM_PUSH:
-	  PUSH_SVM(SVM_PUSH, off);
 	  goto next_insn;
 	case RVM_ACTION:
@@ -133,6 +156,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
 	  goto next_insn;
+	;
@@ -147,27 +171,78 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
   ret_trace = invert_trace(ret_trace);
-  HParseResult *ret = run_trace(mm__, ret_trace, input, length);
+  HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
   // ret is in its own arena
   return ret;
+#undef PUSH_SVM
+#undef THREAD
-HRVMTrace *invert_trace(HRVMTrace *trace) {
-  HRVMTrace *next, *last = NULL;
-  if (!trace)
-    return NULL;
-  if (!trace->next)
-    return trace;
-  do {
-    HRVMTrace *next = trace->next;
-    trace->next = last;
-    last = trace;
-    trace = next;
-  } while (trace->next);
-  return trace;
+// Make sure the SVM value stack has room for `addl` more entries.
+//
+// The capacity is doubled as many times as needed (a single doubling
+// is not enough when addl is large), and ctx is only updated once the
+// new buffer has actually been obtained.
+void svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) {
+  if (ctx->stack_count + addl < ctx->stack_capacity)
+    return;
+  // Start from 1 so that a zero capacity can still grow by doubling.
+  size_t new_cap = ctx->stack_capacity ? ctx->stack_capacity : 1;
+  while (ctx->stack_count + addl >= new_cap)
+    new_cap *= 2;
+  HParsedToken **grown = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * new_cap);
+  // TODO: report realloc failure to the caller instead of asserting
+  assert(grown != NULL);
+  ctx->stack = grown;
+  ctx->stack_capacity = new_cap;
-HParseResult *run_trace(HAllocator mm__, HRVMTrace *trace, uint8_t *input, int len) {
+// Replay an (already inverted) SVM trace against `input` and build the
+// parse result.
+//
+// orig_prog is only used for its action table; `len` is currently
+// unused. Returns a result allocated in a fresh arena when an
+// SVM_ACCEPT entry is reached. Returns NULL if an action reports
+// failure or the trace ends without accepting. The temporary value
+// stack is released on every exit path.
+HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) {
+  HSVMContext ctx;
+  HArena *arena = h_new_arena(mm__, 0);
+  ctx.stack_count = 0;
+  ctx.stack_capacity = 16;
+  ctx.stack = h_new(HParsedToken*, ctx.stack_capacity);
+  HParsedToken *tmp_res;
+  HRVMTrace *cur;
+  for (cur = trace; cur; cur = cur->next) {
+    switch (cur->opcode) {
+    case SVM_PUSH:
+      // Push a mark remembering where in the input we currently are.
+      svm_stack_ensure_cap(mm__, &ctx, 1);
+      tmp_res = a_new(HParsedToken, 1);
+      tmp_res->token_type = TT_MARK;
+      tmp_res->index = cur->input_pos;
+      tmp_res->bit_offset = 0;
+      ctx.stack[ctx.stack_count++] = tmp_res;
+      break;
+    case SVM_NOP:
+      break;
+    case SVM_ACTION:
+      // Action should modify stack appropriately; a false return
+      // means the action failed, which fails the whole parse.
+      if (!orig_prog->actions[cur->arg].fn(arena, &ctx, orig_prog->actions[cur->arg].env))
+        goto fail;
+      break;
+    case SVM_CAPTURE:
+      // Top of stack must be a mark
+      // This replaces said mark in-place with a TT_BYTES.
+      // The top element lives at stack_count - 1; stack_count itself
+      // indexes the first unused slot.
+      assert(ctx.stack_count > 0);
+      tmp_res = ctx.stack[ctx.stack_count - 1];
+      assert(tmp_res->token_type == TT_MARK);
+      tmp_res->token_type = TT_BYTES;
+      // TODO: Will need to copy if bit_offset is nonzero
+      assert(tmp_res->bit_offset == 0);
+      tmp_res->bytes.token = input + tmp_res->index;
+      tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive
+      break;
+    case SVM_ACCEPT: {
+      assert(ctx.stack_count == 1);
+      HParseResult *res = a_new(HParseResult, 1);
+      res->ast = ctx.stack[0];
+      res->bit_length = cur->input_pos * 8;
+      res->arena = arena;
+      // The tokens live in the arena; only the pointer array itself
+      // came from mm__ and has to be handed back.
+      mm__->free(mm__, ctx.stack);
+      return res;
+    }
+    }
+  }
+ fail:
+  mm__->free(mm__, ctx.stack);
+  h_delete_arena(arena);
+  return NULL;
+    // TODO: Implement the primitive actions
diff --git a/src/backends/regex.h b/src/backends/regex.h
index c406c84..a0bc5b8 100644
--- a/src/backends/regex.h
+++ b/src/backends/regex.h
@@ -27,12 +27,29 @@ typedef struct HRVMInsn_{
   uint16_t arg;
 } HRVMInsn;
+const HTokenType TT_MARK = TT_RESERVED_1;
+// Value stack used while replaying an SVM trace.
+typedef struct HSVMContext_ {
+  // Array of token pointers; entries [0, stack_count) are in use.
+  HParsedToken **stack;
+  // Number of entries currently on the stack.
+  size_t stack_count;
+  // Number of entries `stack` has room for.
+  size_t stack_capacity;
+} HSVMContext;
+// These actions all assume that the items on the stack are not
+// aliased anywhere.
+typedef struct HSVMAction_ {
+  // Callback run for an SVM_ACTION trace entry. It receives the result
+  // arena, the value stack, and `env`; a false return signals failure.
+  bool (*fn)(HArena *arena, HSVMContext *ctx, void* env);
+  // Opaque pointer passed through to `fn` unchanged.
+  void* env;
+} HSVMAction;
 typedef struct HRVMProg_ {
   size_t length;
   size_t action_count;
-  HAction *actions;
   HRVMInsn *insns;
+  HSVMAction *actions;
+} HRVMProg;
diff --git a/src/backends/ b/src/backends/
new file mode 100644
index 0000000..998b840
--- /dev/null
+++ b/src/backends/
@@ -0,0 +1,112 @@
+#!/usr/bin/perl -w
+use strict;
+# The input file consists of a sequence of blocks, which can be parsed
+# as SVM test cases, RVM test cases, or C functions. Each block starts
+# with a header line, then a sequence of options, and finally text in
+# a format defined by the block type.
+# Header lines start with "+TYPE", optionally followed by a name. This
+# name is semantically meaningful for SVM and RVM blocks; it
+# determines the name of the test case.
+# A C block's name is not used, and it takes no options. The body
+# (which continues until the first line that looks like a header), is
+# just passed straight through into the C source.
+# SVM blocks' names are the GLib test case name. The underlying
+# function's name is derived by substituting invalid characters with
+# '_'. Note that this can result in collisions (eg, /foo_bar/baz
+# collides with /foo/bar_baz). If this happens, it's your own damn
+# fault; rename the blocks. SVM blocks take three different options:
+# @input, @output, and @pre. The @input pragma's argument is a
+# C-quoted string that gets passed into the VM as the input string,
+# and @output is a C-quoted string that is compared against
+# h_write_result_unamb.  @pre lines are prepended verbatim to the
+# function body (with the @pre stripped, of course); they can be used
+# to initialize environment values.
+# SVM instructions consist of either two or four fields:
+#     input_pos opcode [arg env]
+# input_pos and opcode correspond to the fields in HRVMTrace.  arg and
+# env are used to populate an HSVMAction; arg is the function, and env
+# is the object whose address should be used as the env.
+# RVM blocks are very similar to SVM blocks; the name and options are
+# handled exactly the same way. The assembly text is handled slightly
+# differently; the format is:
+#     [label:] opcode [arg ...]
+# For FORK and GOTO, the arg should be a label that is defined
+# elsewhere.
+# For ACTION, the arguments are handled the same way as with SVM.
+# MATCH takes two arguments, each of which can be any C integer
+# constant (not including character constants), which form the lower
+# and upper bounds of the matched character, respectively.
+# No other RVM instructions take an argument.
+# At the beginning of any line, comments preceded by '#' are allowed;
+# they are replaced by C++ comments and inserted in the nearest valid
+# location in the output.
+# Current block type; "TOP" until the first "+TYPE" header line is seen.
+my $mode = "TOP";
+# common regexes:
+my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/;
+my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/;
+# Handlers for SVM blocks: `name` records the test-case name, `pragma`
+# stores an @input/@output/@pre option, and `body` parses one
+# instruction line.
+my %svm = (
+    name => sub {
+	my ($env, $name) = @_;
+	$env->{name} = $name;
+    },
+    pragma => sub {
+	my ($env, $name, $val) = @_;
+	if ($name eq "input") {
+	    chomp($env->{input} = $val);
+	} elsif ($name eq "output") {
+	    chomp($env->{output} = $val);
+	} elsif ($name eq "pre") {
+	    # The braces are required: @$env->{pre} would dereference
+	    # $env itself as an array instead of the list stored
+	    # under {pre}.
+	    push(@{$env->{pre}}, $val);
+	} else {
+	    warn "Invalid SVM pragma";
+	}
+    },
+    body => sub {
+	my ($env, $line) = @_;
+	my ($ipos, $op, $arg, $argenv);
+	if ($line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) {
+	    if ($2 eq "PUSH") {
+		# TODO: implement all the opcodes
+	    }
+	}
+    }
+    );
+# Main loop: read the input line by line, switching $mode whenever a
+# "+C", "+RVM" or "+SVM" header line is seen.
+while (<>) {
+    # The '+' must be escaped; unescaped it is a quantifier on '^' and
+    # header lines never match.
+    if (/^\+(C|RVM|SVM)/) {
+	$mode = $1;
+    }
+    if ($mode eq "TOP") {
+	if (/^#(.*)/) {
+	    # (.*) stops before the line's newline, so add one back;
+	    # otherwise the next output line ends up inside the comment.
+	    print "// $1\n";
+	    next;
+	}
+    } elsif ($mode eq "SVM") {
+    } elsif ($mode eq "RVM") {
+    } elsif ($mode eq "C") {
+    }
diff --git a/src/hammer.c b/src/hammer.c
index c33f6c8..c369f64 100644
--- a/src/hammer.c
+++ b/src/hammer.c
@@ -84,4 +84,12 @@ void h_parse_result_free(HParseResult *result) {
+// Always returns false; `env` is accepted but ignored.
+bool h_false(void* env) {
+  (void)env;
+  return false;
+// Always returns true; `env` is accepted but ignored.
+bool h_true(void* env) {
+  (void)env;
+  return true;
diff --git a/src/hammer.h b/src/hammer.h
index 4512685..6678db9 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -47,6 +47,7 @@ typedef enum HTokenType_ {
+  TT_RESERVED_1, // reserved for internal use
   TT_USER = 64,
@@ -78,7 +79,9 @@ typedef struct HParsedToken_ {
 } HParsedToken;
- * The result of a successful parse.
+ * The result of a successful parse. Note that this may reference the
+ * input string.
+ *
  * If a parse fails, the parse result will be NULL.
  * If a parse is successful but there's nothing there (i.e., if end_p 
  * succeeds) then there's a parse result but its ast is NULL.
diff --git a/src/internal.h b/src/internal.h
index d35ebaa..116af89 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -223,6 +223,9 @@ int   h_hashtable_present(HHashTable* ht, void* key);
 void  h_hashtable_del(HHashTable* ht, void* key);
 void  h_hashtable_free(HHashTable* ht);
+bool h_false(void*);
+bool h_true(void*);
 #if 0
 #include <stdlib.h>
 #define h_arena_malloc(a, s) malloc(s)