From c19d7bb66e05e581ec6edd8cfe7f013532278e05 Mon Sep 17 00:00:00 2001
From: Dan Hirsch <thequux@upstandinghackers.com>
Date: Sat, 9 Mar 2013 21:42:49 -0800
Subject: [PATCH] Regex VM finished but untested.

---
 docs/milestone2.dot         |  32 +++++++++
 src/Makefile                |   3 +-
 src/backends/regex.c        | 131 ++++++++++++++++++++++++++++--------
 src/backends/regex.h        |  21 +++++-
 src/backends/regexvm_asm.pl | 112 ++++++++++++++++++++++++++++++
 src/hammer.c                |   8 +++
 src/hammer.h                |   5 +-
 src/internal.h              |   3 +
 8 files changed, 283 insertions(+), 32 deletions(-)
 create mode 100644 docs/milestone2.dot
 create mode 100644 src/backends/regexvm_asm.pl

diff --git a/docs/milestone2.dot b/docs/milestone2.dot
new file mode 100644
index 00000000..9ae28b05
--- /dev/null
+++ b/docs/milestone2.dot
@@ -0,0 +1,32 @@
+digraph {
+  graph [rankdir=LR];
+subgraph complete {
+  node [color="gray",fontcolor="gray"];
+  regex_gen;
+  glue;
+}
+/* The end result of the milestone, along with the subtasks listed */
+milestone2 [color="green",style="filled"];
+llk -> milestone2;
+lr -> milestone2;
+lalr8_gen -> lr;
+glr_gen -> lr;
+lr_driver -> lr;
+regex -> milestone2;
+glue -> milestone2; // Meredith knows what glue referred to here.
+tests -> milestone2;
+
+regex_gen -> regex;
+regex_driver -> regex;
+llk_driver -> llk;
+llk_gen -> llk;
+
+
+/*
+ * Desugaring is a shared prerequisite of the llk, lalr8 and glr generators.
+ */
+  desugaring -> llk_gen;
+  desugaring -> lalr8_gen;
+  desugaring -> glr_gen;
+
+}
diff --git a/src/Makefile b/src/Makefile
index 128de050..bb83e839 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -26,7 +26,8 @@ PARSERS := \
 	indirect
 
 BACKENDS := \
-	packrat
+	packrat \
+	regex
 
 HAMMER_PARTS := \
 	bitreader.o \
diff --git a/src/backends/regex.c b/src/backends/regex.c
index 659b2d93..0b1991da 100644
--- a/src/backends/regex.c
+++ b/src/backends/regex.c
@@ -1,8 +1,11 @@
+#include <string.h>
+#include <assert.h>
 #include "../internal.h"
 #include "../parsers/parser_internal.h"
+#include "regex.h"
 
 #undef a_new
-#define a_new(typ, count) a_new_(arena, typ, count);
+#define a_new(typ, count) a_new_(arena, typ, count)
 // Stack VM
 typedef enum HSVMOp_ {
   SVM_PUSH, // Push a mark. There is no VM insn to push an object.
@@ -16,6 +19,7 @@ typedef struct HRVMTrace_ {
   struct HRVMTrace_ *next; // When parsing, these are
                            // reverse-threaded. There is a postproc
                            // step that inverts all the pointers.
+  size_t input_pos;
   uint16_t arg;
   uint8_t opcode;
 } HRVMTrace;
@@ -25,13 +29,27 @@ typedef struct HRVMThread_ {
   uint16_t ip;
 } HRVMThread;
 
-// TODO(thequux): This function could really use a refactoring, at the
-// very least, to split the two VMs.
-void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t len) {
+HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len);
+
+HRVMTrace *invert_trace(HRVMTrace *trace) { // reverse the list in place; returns the new head
+  HRVMTrace *last = NULL;
+  if (!trace)
+    return NULL;
+  if (!trace->next)
+    return trace;
+  do {
+    HRVMTrace *next = trace->next;
+    trace->next = last;
+    last = trace;
+    trace = next;
+  } while (trace); // relink the final node too; stopping at trace->next left it detached
+  return last;
+}
+
+void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
   HArena *arena = h_new_arena(mm__, 0);
   HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length),
-    **heads_n = a_new(HRVMTrace*, prog->length),
-    **heads_t;
+    **heads_n = a_new(HRVMTrace*, prog->length);
 
   HRVMTrace *ret_trace;
 
@@ -39,12 +57,16 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
   HRVMThread *ip_queue = a_new(HRVMThread, prog->length);
   size_t ipq_top;
 
+
+
+
 #define THREAD ip_queue[ipq_top-1]
 #define PUSH_SVM(op_, arg_) do { \
     HRVMTrace *nt = a_new(HRVMTrace, 1); \
     nt->arg = (arg_); \
     nt->opcode = (op_); \
     nt->next = THREAD.trace; \
+    nt->input_pos = off; \
     THREAD.trace = nt; \
   } while(0)
 
@@ -55,7 +77,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
   int live_threads = 1;
   for (off = 0; off <= len; off++) {
     uint8_t ch = ((off == len) ? 0 : input[off]);
-    size_t ip_s, ip;
+    size_t ip_s; // BUG: there was an unused variable ip. Not sure if
+                 // I intended to use it somewhere.
     /* scope */ {
       HRVMTrace **heads_t;
       heads_t = heads_n;
@@ -77,9 +100,9 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
       uint8_t hi, lo;
       uint16_t arg;
       while(ipq_top > 0) {
-        if (insns_seen[THREAD.ip] == 1)
+        if (insn_seen[THREAD.ip] == 1) // FIXME(review): the continue below never pops the thread (ipq_top--), so this looks like it spins forever - confirm
           continue;
-        insns_seen[THREAD.ip] = 1;
+        insn_seen[THREAD.ip] = 1;
         arg = prog->insns[THREAD.ip].arg;
         switch(prog->insns[THREAD.ip].op) {
         case RVM_ACCEPT:
@@ -100,8 +123,8 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
           goto next_insn;
         case RVM_FORK:
           THREAD.ip++;
-          if (!insns_seen[arg]) {
-            insns_seen[THREAD.ip] = 2;
+          if (!insn_seen[arg]) {
+            insn_seen[THREAD.ip] = 2;
             HRVMTrace* tr = THREAD.trace;
             ipq_top++;
             THREAD.ip = arg;
@@ -109,7 +132,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
           }
           goto next_insn;
         case RVM_PUSH:
-          PUSH_SVM(SVM_PUSH, off);
+          PUSH_SVM(SVM_PUSH, 0);
           THREAD.ip++;
           goto next_insn;
         case RVM_ACTION:
@@ -133,6 +156,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
           goto next_insn;
         }
       next_insn:
+        ;
 
       }
     }
@@ -147,27 +171,78 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const char* input, size_t l
 
   ret_trace = invert_trace(ret_trace);
 
-  HParseResult *ret = run_trace(mm__, ret_trace, input, length);
+  HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
   // ret is in its own arena
   h_delete_arena(arena);
   return ret;
 }
+#undef PUSH_SVM
+#undef THREAD
-HRVMTrace *invert_trace(HRVMTrace *trace) {
-  HRVMTrace *next, *last = NULL;
-  if (!trace)
-    return NULL;
-  if (!trace->next)
-    return trace;
-  do {
-    HRVMTrace *next = trace->next;
-    trace->next = last;
-    last = trace;
-    trace = next;
-  } while (trace->next);
-  return trace;
+
+
+
+void svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) {
+  if (ctx->stack_count + addl >= ctx->stack_capacity) {
+    ctx->stack = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * (ctx->stack_capacity *= 2));
+    // TODO: check for realloc failure
+  }
 }
 
-HParseResult *run_trace(HAllocator mm__, HRVMTrace *trace, uint8_t *input, int len) {
-  
+HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) {
+  // orig_prog is only used for the action table
+  HSVMContext ctx;
+  HArena *arena = h_new_arena(mm__, 0);
+  ctx.stack_count = 0;
+  ctx.stack_capacity = 16;
+  ctx.stack = h_new(HParsedToken*, ctx.stack_capacity); // FIXME: not freed on either return path
+
+  HParsedToken *tmp_res;
+  HRVMTrace *cur;
+  for (cur = trace; cur; cur = cur->next) {
+    switch (cur->opcode) {
+    case SVM_PUSH:
+      svm_stack_ensure_cap(mm__, &ctx, 1);
+      tmp_res = a_new(HParsedToken, 1);
+      tmp_res->token_type = TT_MARK;
+      tmp_res->index = cur->input_pos;
+      tmp_res->bit_offset = 0;
+      ctx.stack[ctx.stack_count++] = tmp_res;
+      break;
+    case SVM_NOP:
+      break;
+    case SVM_ACTION:
+      // Action should modify stack appropriately
+      if (!orig_prog->actions[cur->arg].fn(arena, &ctx, orig_prog->actions[cur->arg].env)) {
+        // action failed... abort somehow
+        // TODO: Actually abort
+      }
+      break;
+    case SVM_CAPTURE:
+      // Top of stack must be a mark
+      // This replaces said mark in-place with a TT_BYTES.
+      assert(ctx.stack_count > 0 && ctx.stack[ctx.stack_count - 1]->token_type == TT_MARK);
+
+      tmp_res = ctx.stack[ctx.stack_count - 1]; // top of stack is the last pushed slot, not one past it
+      tmp_res->token_type = TT_BYTES;
+      // TODO: Will need to copy if bit_offset is nonzero
+      assert(tmp_res->bit_offset == 0);
+
+      tmp_res->bytes.token = input + tmp_res->index;
+      tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive
+      break;
+    case SVM_ACCEPT:
+      assert(ctx.stack_count == 1);
+      HParseResult *res = a_new(HParseResult, 1);
+      res->ast = ctx.stack[0];
+      res->bit_length = cur->input_pos * 8;
+      res->arena = arena;
+      return res;
+    }
+  }
+
+  h_delete_arena(arena);
+  return NULL;
 }
+
+
 // TODO: Implement the primitive actions
diff --git a/src/backends/regex.h b/src/backends/regex.h
index c406c842..a0bc5b85 100644
--- a/src/backends/regex.h
+++ b/src/backends/regex.h
@@ -27,12 +27,29 @@ typedef struct HRVMInsn_{
   uint16_t arg;
 } HRVMInsn;
 
+static const HTokenType TT_MARK = TT_RESERVED_1; // static: an object defined in a header must not have external linkage
+
+typedef struct HSVMContext_ {
+  HParsedToken **stack;
+  size_t stack_count;
+  size_t stack_capacity;
+} HSVMContext;
+
+// These actions all assume that the items on the stack are not
+// aliased anywhere.
+typedef struct HSVMAction_ {
+  bool (*fn)(HArena *arena, HSVMContext *ctx, void* env);
+  void* env;
+} HSVMAction;
 
 typedef struct HRVMProg_ {
   size_t length;
   size_t action_count;
-  HAction *actions;
   HRVMInsn *insns;
-};
+  HSVMAction *actions;
+} HRVMProg;
+
+
+
 
 #endif
diff --git a/src/backends/regexvm_asm.pl b/src/backends/regexvm_asm.pl
new file mode 100644
index 00000000..998b8408
--- /dev/null
+++ b/src/backends/regexvm_asm.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl -w
+
+use strict;
+# The input file consists of a sequence of blocks, which can be parsed
+# as SVM test cases, RVM test cases, or C functions. Each block starts
+# with a header line, then a sequence of options, and finally text in
+# a format defined by the block type.
+#
+# Header lines start with "+TYPE", optionally followed by a name. This
+# name is semantically meaningful for SVM and RVM blocks; it
+# determines the name of the test case.
+
+# A C block's name is not used, and it takes no options. The body
+# (which continues until the first line that looks like a header) is
+# just passed straight through into the C source.
+
+# SVM blocks' names are the GLib test case name. The underlying
+# function's name is derived by substituting invalid characters with
+# '_'. Note that this can result in collisions (e.g., /foo_bar/baz
+# collides with /foo/bar_baz). If this happens, it's your own damn
+# fault; rename the blocks. SVM blocks take three different options:
+# @input, @output, and @pre. The @input pragma's argument is a
+# C-quoted string that gets passed into the VM as the input string,
+# and @output is a C-quoted string that is compared against
+# h_write_result_unamb. @pre lines are prepended verbatim to the
+# function body (with the @pre stripped, of course); they can be used
+# to initialize environment values.
+#
+# SVM instructions consist of either two or four fields:
+#
+#   input_pos opcode [arg env]
+#
+# input_pos and opcode correspond to the fields in HRVMTrace. arg and
+# env are used to populate an HSVMAction; arg is the function, and env
+# is the object whose address should be used as the env.
+
+# RVM blocks are very similar to SVM blocks; the name and options are
+# handled exactly the same way. The assembly text is handled slightly
+# differently; the format is:
+#
+#   [label:] opcode [arg ...]
+#
+# For FORK and GOTO, the arg should be a label that is defined
+# elsewhere.
+#
+# For ACTION, the arguments are handled the same way as with SVM.
+#
+# MATCH takes two arguments, each of which can be any C integer
+# constant (not including character constants), which form the lower
+# and upper bounds of the matched character, respectively.
+#
+# No other RVM instructions take an argument.
+
+# At the beginning of any line, comments preceded by '#' are allowed;
+# they are replaced by C++ comments and inserted in the nearest valid
+# location in the output.
+
+my $mode = "TOP"; # parser state: TOP until the first "+TYPE" header is seen
+
+# common regexes:
+my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/;
+my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/;
+
+
+my %svm = (
+    name => sub {
+        my ($env, $name) = @_;
+        $env->{name} = $name;
+    },
+    pragma => sub {
+        my ($env, $name, $val) = @_;
+        if ($name eq "input") {
+            chomp($env->{input} = $val);
+        } elsif ($name eq "output") {
+            chomp($env->{output} = $val);
+        } elsif ($name eq "pre") {
+            # Braces are required: @$env->{pre} would dereference $env itself as an array.
+            push(@{$env->{pre}}, $val);
+        } else {
+            warn "Invalid SVM pragma";
+        }
+    },
+    body => sub {
+        my ($env, $line) = @_;
+        my ($ipos, $op, $arg, $argenv);
+        if ($line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) {
+            if ($2 eq "PUSH") {
+                # TODO: implement all the opcodes
+            }
+        }
+    }
+    );
+
+
+while (<>) {
+    if (/^\+(C|RVM|SVM)/) { # '+' must be escaped to match the literal header marker
+        $mode = $1;
+    }
+
+    if ($mode eq "TOP") {
+        if (/^#(.*)/) {
+            print "// $1\n"; # (.*) stops before the newline, so put it back
+            next;
+        }
+    } elsif ($mode eq "SVM") {
+    } elsif ($mode eq "RVM") {
+    } elsif ($mode eq "C") {
+    }
+
+}
+
+
diff --git a/src/hammer.c b/src/hammer.c
index c33f6c8b..c369f64b 100644
--- a/src/hammer.c
+++ b/src/hammer.c
@@ -84,4 +84,12 @@ void h_parse_result_free(HParseResult *result) {
   h_delete_arena(result->arena);
 }
 
+bool h_false(void* env) {
+  (void)env;
+  return false;
+}
 
+bool h_true(void* env) {
+  (void)env;
+  return true;
+}
diff --git a/src/hammer.h b/src/hammer.h
index 45126856..6678db95 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -47,6 +47,7 @@ typedef enum HTokenType_ {
   TT_SINT,
   TT_UINT,
   TT_SEQUENCE,
+  TT_RESERVED_1, // reserved for internal use
   TT_USER = 64,
   TT_ERR,
   TT_MAX
@@ -78,7 +79,9 @@ typedef struct HParsedToken_ {
 } HParsedToken;
 
 /**
- * The result of a successful parse.
+ * The result of a successful parse. Note that this may reference the
+ * input string.
+ *
  * If a parse fails, the parse result will be NULL.
  * If a parse is successful but there's nothing there (i.e., if end_p
  * succeeds) then there's a parse result but its ast is NULL.
diff --git a/src/internal.h b/src/internal.h
index d35ebaab..116af899 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -223,6 +223,9 @@ int h_hashtable_present(HHashTable* ht, void* key);
 void h_hashtable_del(HHashTable* ht, void* key);
 void h_hashtable_free(HHashTable* ht);
 
+bool h_false(void*);
+bool h_true(void*);
+
 #if 0
 #include <stdlib.h>
 #define h_arena_malloc(a, s) malloc(s)
-- 
GitLab