Forked from
Hammer / hammer
357 commits behind, 7 commits ahead of the upstream repository.
-
Meredith L. Patterson authored
regex.c 10.66 KiB
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <string.h>
#include <assert.h>
#include "../internal.h"
#include "../parsers/parser_internal.h"
#include "regex.h"
#undef a_new
#define a_new(typ, count) a_new_(arena, typ, count)
// Stack VM
typedef enum HSVMOp_ {
SVM_PUSH, // Push a mark. There is no VM insn to push an object.
SVM_NOP, // Used to start the chain, and possibly elsewhere. Does nothing.
SVM_ACTION, // Same meaning as RVM_ACTION
SVM_CAPTURE, // Same meaning as RVM_CAPTURE
SVM_ACCEPT,
SVM_OPCOUNT
} HSVMOp;
typedef struct HRVMTrace_ {
struct HRVMTrace_ *next; // When parsing, these are
// reverse-threaded. There is a postproc
// step that inverts all the pointers.
size_t input_pos;
uint16_t arg;
uint8_t opcode;
} HRVMTrace;
typedef struct HRVMThread_ {
HRVMTrace *trace;
uint16_t ip;
} HRVMThread;
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len);
HRVMTrace *invert_trace(HRVMTrace *trace) {
HRVMTrace *last = NULL;
if (!trace) {
return NULL;
}
if (!trace->next) {
return trace;
}
do {
HRVMTrace *next = trace->next;
trace->next = last;
last = trace;
trace = next;
} while (trace);
return last;
}
void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) {
HArena *arena = h_new_arena(mm__, 0);
HSArray *heads_n = h_sarray_new(mm__, prog->length), // Both of these contain HRVMTrace*'s
*heads_p = h_sarray_new(mm__, prog->length);
HRVMTrace *ret_trace = NULL;
uint8_t *insn_seen = a_new(uint8_t, prog->length); // 0 -> not seen, 1->processed, 2->queued
HRVMThread *ip_queue = a_new(HRVMThread, prog->length);
size_t ipq_top;
#define THREAD ip_queue[ipq_top-1]
#define PUSH_SVM(op_, arg_) do { \
HRVMTrace *nt = a_new(HRVMTrace, 1); \
nt->arg = (arg_); \
nt->opcode = (op_); \
nt->next = THREAD.trace; \
nt->input_pos = off; \
THREAD.trace = nt; \
} while(0)
((HRVMTrace*)h_sarray_set(heads_n, 0, a_new(HRVMTrace, 1)))->opcode = SVM_NOP; // Initial thread
size_t off = 0;
int live_threads = 1; // May be redundant
for (off = 0; off <= len; off++) {
uint8_t ch = ((off == len) ? 0 : input[off]);
/* scope */ {
HSArray *heads_t;
heads_t = heads_n;
heads_n = heads_p;
heads_p = heads_t;
h_sarray_clear(heads_n);
}
memset(insn_seen, 0, prog->length); // no insns seen yet
if (!live_threads) {
goto match_fail;
}
live_threads = 0;
HRVMTrace *tr_head;
H_SARRAY_FOREACH_KV(tr_head,ip_s,heads_p) {
ipq_top = 1;
// TODO: Write this as a threaded VM
THREAD.ip = ip_s;
THREAD.trace = tr_head;
uint8_t hi, lo;
uint16_t arg;
while(ipq_top > 0) {
if (insn_seen[THREAD.ip] == 1) {
ipq_top--; // Kill thread.
continue;
}
insn_seen[THREAD.ip] = 1;
arg = prog->insns[THREAD.ip].arg;
switch(prog->insns[THREAD.ip].op) {
case RVM_ACCEPT:
PUSH_SVM(SVM_ACCEPT, 0);
ret_trace = THREAD.trace;
ipq_top--;
goto next_insn;
case RVM_MATCH:
hi = (arg >> 8) & 0xff;
lo = arg & 0xff;
THREAD.ip++;
if (ch < lo || ch > hi) {
ipq_top--; // terminate thread
}
goto next_insn;
case RVM_GOTO:
THREAD.ip = arg;
goto next_insn;
case RVM_FORK:
THREAD.ip++;
if (!insn_seen[arg]) {
insn_seen[THREAD.ip] = 2;
HRVMTrace* tr = THREAD.trace;
ipq_top++;
THREAD.ip = arg;
THREAD.trace = tr;
}
goto next_insn;
case RVM_PUSH:
PUSH_SVM(SVM_PUSH, 0);
THREAD.ip++;
goto next_insn;
case RVM_ACTION:
PUSH_SVM(SVM_ACTION, arg);
THREAD.ip++;
goto next_insn;
case RVM_CAPTURE:
PUSH_SVM(SVM_CAPTURE, 0);
THREAD.ip++;
goto next_insn;
case RVM_EOF:
THREAD.ip++;
if (off != len) {
ipq_top--; // Terminate thread
}
goto next_insn;
case RVM_STEP:
// save thread
live_threads++;
h_sarray_set(heads_n, ++THREAD.ip, THREAD.trace);
ipq_top--;
goto next_insn;
}
next_insn:
;
}
}
}
// No accept was reached.
match_fail:
if (ret_trace == NULL) {
// No match found; definite failure.
h_delete_arena(arena);
return NULL;
}
// Invert the direction of the trace linked list.
ret_trace = invert_trace(ret_trace);
HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len);
// ret is in its own arena
h_delete_arena(arena);
return ret;
}
#undef PUSH_SVM
#undef THREAD
bool svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) {
if (ctx->stack_count + addl >= ctx->stack_capacity) {
ctx->stack = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * (ctx->stack_capacity *= 2));
if (!ctx->stack) {
return false;
}
return true;
}
return true;
}
HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) {
// orig_prog is only used for the action table
HSVMContext ctx;
HArena *arena = h_new_arena(mm__, 0);
ctx.stack_count = 0;
ctx.stack_capacity = 16;
ctx.stack = h_new(HParsedToken*, ctx.stack_capacity);
HParsedToken *tmp_res;
HRVMTrace *cur;
for (cur = trace; cur; cur = cur->next) {
switch (cur->opcode) {
case SVM_PUSH:
if (!svm_stack_ensure_cap(mm__, &ctx, 1)) {
goto fail;
}
tmp_res = a_new(HParsedToken, 1);
tmp_res->token_type = TT_MARK;
tmp_res->index = cur->input_pos;
tmp_res->bit_offset = 0;
ctx.stack[ctx.stack_count++] = tmp_res;
break;
case SVM_NOP:
break;
case SVM_ACTION:
// Action should modify stack appropriately
if (!orig_prog->actions[cur->arg].action(arena, &ctx, orig_prog->actions[cur->arg].env)) {
// action failed... abort somehow
goto fail;
}
break;
case SVM_CAPTURE:
// Top of stack must be a mark
// This replaces said mark in-place with a TT_BYTES.
assert(ctx.stack[ctx.stack_count-1]->token_type == TT_MARK);
tmp_res = ctx.stack[ctx.stack_count-1];
tmp_res->token_type = TT_BYTES;
// TODO: Will need to copy if bit_offset is nonzero
assert(tmp_res->bit_offset == 0);
tmp_res->bytes.token = input + tmp_res->index;
tmp_res->bytes.len = cur->input_pos - tmp_res->index;
break;
case SVM_ACCEPT:
assert(ctx.stack_count <= 1);
HParseResult *res = a_new(HParseResult, 1);
if (ctx.stack_count == 1) {
res->ast = ctx.stack[0];
} else {
res->ast = NULL;
}
res->bit_length = cur->input_pos * 8;
res->arena = arena;
return res;
}
}
fail:
h_delete_arena(arena);
return NULL;
}
uint16_t h_rvm_create_action(HRVMProg *prog, HSVMActionFunc action_func, void* env) {
for (uint16_t i = 0; i < prog->action_count; i++) {
if (prog->actions[i].action == action_func && prog->actions[i].env == env) {
return i;
}
}
// Ensure that there's room in the action array...
if (!(prog->action_count & (prog->action_count + 1))) {
// needs to be scaled up.
size_t array_size = (prog->action_count + 1) * 2; // action_count+1 is a
// power of two
prog->actions = prog->allocator->realloc(prog->allocator, prog->actions, array_size * sizeof(*prog->actions));
if (!prog->actions) {
longjmp(prog->except, 1);
}
}
HSVMAction *action = &prog->actions[prog->action_count];
action->action = action_func;
action->env = env;
return prog->action_count++;
}
uint16_t h_rvm_insert_insn(HRVMProg *prog, HRVMOp op, uint16_t arg) {
// Ensure that there's room in the insn array...
if (!(prog->length & (prog->length + 1))) {
// needs to be scaled up.
size_t array_size = (prog->length + 1) * 2; // action_count+1 is a
// power of two
prog->insns = prog->allocator->realloc(prog->allocator, prog->insns, array_size * sizeof(*prog->insns));
if (!prog->insns) {
longjmp(prog->except, 1);
}
}
prog->insns[prog->length].op = op;
prog->insns[prog->length].arg = arg;
return prog->length++;
}
uint16_t h_rvm_get_ip(HRVMProg *prog) {
return prog->length;
}
void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val) {
assert(prog->length > ip);
prog->insns[ip].arg = new_val;
}
size_t h_svm_count_to_mark(HSVMContext *ctx) {
size_t ctm;
for (ctm = 0; ctm < ctx->stack_count; ctm++) {
if (ctx->stack[ctx->stack_count - 1 - ctm]->token_type == TT_MARK) {
return ctm;
}
}
return ctx->stack_count;
}
// TODO: Implement the primitive actions
bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env) {
size_t n_items = h_svm_count_to_mark(ctx);
assert (n_items < ctx->stack_count);
HParsedToken *res = ctx->stack[ctx->stack_count - 1 - n_items];
assert (res->token_type == TT_MARK);
res->token_type = TT_SEQUENCE;
HCountedArray *ret_carray = h_carray_new_sized(arena, n_items);
res->seq = ret_carray;
// res index and bit offset are the same as the mark.
for (size_t i = 0; i < n_items; i++) {
ret_carray->elements[i] = ctx->stack[ctx->stack_count - n_items + i];
}
ret_carray->used = n_items;
ctx->stack_count -= n_items;
return true;
}
bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env) {
while (ctx->stack_count > 0) {
if (ctx->stack[--ctx->stack_count]->token_type == TT_MARK) {
return true;
}
}
return false; // no mark found.
}
// Glue regex backend to rest of system
bool h_compile_regex(HRVMProg *prog, const HParser *parser) {
return parser->vtable->compile_to_rvm(prog, parser->env);
}
static void h_regex_free(HParser *parser) {
HRVMProg *prog = (HRVMProg*)parser->backend_data;
HAllocator *mm__ = prog->allocator;
h_free(prog->insns);
h_free(prog->actions);
h_free(prog);
parser->backend_data = NULL;
parser->backend = PB_PACKRAT;
}
static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params) {
if (!parser->vtable->isValidRegular(parser->env)) {
return -1;
}
HRVMProg *prog = h_new(HRVMProg, 1);
prog->length = prog->action_count = 0;
prog->insns = NULL;
prog->actions = NULL;
prog->allocator = mm__;
if (setjmp(prog->except)) {
return false;
}
if (!h_compile_regex(prog, parser)) {
h_free(prog->insns);
h_free(prog->actions);
h_free(prog);
return 2;
}
memset(prog->except, 0, sizeof(prog->except));
h_rvm_insert_insn(prog, RVM_ACCEPT, 0);
parser->backend_data = prog;
return 0;
}
static HParseResult *h_regex_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) {
return h_rvm_run__m(mm__, (HRVMProg*)parser->backend_data, input_stream->input, input_stream->length);
}
HParserBackendVTable h__regex_backend_vtable = {
.compile = h_regex_compile,
.parse = h_regex_parse,
.free = h_regex_free
};
#ifndef NDEBUG
#include "regex_debug.c"
#endif