diff --git a/.gitignore b/.gitignore index db2ee3a2f3fa54431b73adf65233086e9cde2441..40bd0e3389623b146b0c1f97d177825fd3afef5e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,7 @@ examples/base64 TAGS *.swp *.swo +\#* +.* +docs/milestone2.dot.pdf +*.dot.pdf diff --git a/Makefile b/Makefile index bd383a22ff5a0768bd2fd7cb3c2a36baada49630..fbd96c4fdf9866b28d806cb2bbcdfbb00b026e93 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,9 @@ CONFIG_VARS= INCLUDE_TESTS test: src/test_suite $< +examples/all: src/all +examples/compile: src/compile + define SUBDIR_TEMPLATE $(1)/%: $$(MAKE) -C $(1) $$* diff --git a/common.mk b/common.mk index a57429db48e76179ff7d2537d78ef73d8b80dec2..143a0f40959a7474b578214f2815d40bd59dff33 100644 --- a/common.mk +++ b/common.mk @@ -6,7 +6,7 @@ endif include $(TOPLEVEL)/config.mk TEST_CFLAGS = $(shell pkg-config --cflags glib-2.0) -DINCLUDE_TESTS -TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) +TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt CFLAGS := -std=gnu99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes LDFLAGS := diff --git a/docs/milestone2.dot b/docs/milestone2.dot new file mode 100644 index 0000000000000000000000000000000000000000..0bbd9b23afafe98c18b7d11f0dbc2fad4e417e74 --- /dev/null +++ b/docs/milestone2.dot @@ -0,0 +1,36 @@ +digraph { + graph [rankdir=LR]; + subgraph complete { + node [color="gray",fontcolor="gray"]; + glue; + regex_svm; + regex_rvm; + desugaring; // Needs merged. 
+ } + /* The end result of the milestone, along with the subtasks listed */ + milestone2 [color="green",style="filled"]; + llk -> milestone2; + lr -> milestone2; + lalr8_gen -> lr; // Generate parse tables for LALR(8) + glr_gen -> lr; // Generate parse tables for GLR + llk_gen -> llk; // Generate parse tables for LL(k) + lr_driver -> lr; // Write driver for all LR-type algs; analagous to SVM and RVM implementations + llk_driver -> llk; // Write driver for LL(k) + regex -> milestone2; + glue -> milestone2; + tests -> milestone2; + + regex_gen -> regex; // should be mostly done; the rest is concurrent with regex_svm_actions + regex_driver -> regex; + regex_svm -> regex_driver; + regex_rvm -> regex_driver; + regex_svm_actions -> regex_driver; // 1 for each way that an HParsedToken can be extracted from the stack. + + + /* + * + */ + desugaring -> llk_gen; + desugaring -> lalr8_gen; + desugaring -> glr_gen; +} diff --git a/docs/milestone3.dot b/docs/milestone3.dot new file mode 100644 index 0000000000000000000000000000000000000000..66a5fb357f7c6dd82a0e71cd81aeb3b6c2b7fae5 --- /dev/null +++ b/docs/milestone3.dot @@ -0,0 +1,65 @@ +digraph { + graph [rankdir=LR]; + + subgraph complete { + node [color="gray",fontcolor="gray"]; + } + + subgraph groups { + node [color="blue",fontcolor="blue"]; + cpp; + python; + ruby; + go; + php; + dotnet; + } + + milestone3 [color="green",style="filled"]; + + + function_desc_fmt -> function_descs; + function_desc_fmt -> binding_generator; + + binding_generator -> cpp_gen; + binding_generator -> python_gen; + binding_generator -> ruby_gen; + binding_generator -> go_gen; + binding_generator -> php_gen; + binding_generator -> dotnet_gen; + + function_descs -> cpp_gen; + function_descs -> python_gen; + function_descs -> ruby_gen; + function_descs -> go_gen; + function_descs -> php_gen; + function_descs -> dotnet_gen; + + + // Plugins to generate a type of code + cpp_gen -> cpp; + python_gen -> python; + ruby_gen -> ruby; + go_gen -> go; + 
php_gen -> php; + dotnet_gen -> dotnet; + + // base code... developed concurrently with _gen's + cpp_base -> cpp; + python_base -> python; + ruby_base -> ruby; + go_base -> go; + php_base -> php; + dotnet_base -> dotnet; + + // Bindings for various languages. These are just groupings. + cpp -> milestone3; + python -> milestone3; + ruby -> milestone3; + go -> milestone3; + php -> milestone3; + dotnet -> milestone3; + + + +} \ No newline at end of file diff --git a/docs/rvm_sample_input.rvm b/docs/rvm_sample_input.rvm new file mode 100644 index 0000000000000000000000000000000000000000..07fdf60f82befdaa01789d244e60c076ac11ba1f --- /dev/null +++ b/docs/rvm_sample_input.rvm @@ -0,0 +1,17 @@ ++C +int foo() { + return 42; +} + ++SVM /svm/simple +@input "" +@output "()" +0 ACCEPT + ++SVM /svm/string +@input "quux" +@output "(<5555>)" +1 MARK +2 CAPTURE +2 ACCEPT + diff --git a/src/Makefile b/src/Makefile index 06da6c142512fb7edc2b2fa7637e328ccaa584f7..870aad5c37e037c39d8d829336ff5b16d5d23eb3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -27,7 +27,8 @@ PARSERS := \ BACKENDS := \ packrat \ - ll + llk \ + regex HAMMER_PARTS := \ bitreader.o \ @@ -39,7 +40,6 @@ HAMMER_PARTS := \ datastructures.o \ system_allocator.o \ benchmark.o \ - compile.o \ cfgrammar.o \ $(PARSERS:%=parsers/%.o) \ $(BACKENDS:%=backends/%.o) diff --git a/src/allocator.h b/src/allocator.h index e83cae7cbfecebc58dd810671385b1d63f72d9fb..2dfc14e689f825efabc0d7c46b515217ccd90abb 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -19,6 +19,7 @@ #define HAMMER_ALLOCATOR__H__ #include <sys/types.h> +// TODO(thequux): Turn this into an "HAllocatorVtable", and add a wrapper that also takes an environment pointer.
typedef struct HAllocator_ { void* (*alloc)(struct HAllocator_* allocator, size_t size); void* (*realloc)(struct HAllocator_* allocator, void* ptr, size_t size); diff --git a/src/backends/ll.c b/src/backends/llk.c similarity index 77% rename from src/backends/ll.c rename to src/backends/llk.c index 338e4c6de0a3a073e37659a6b91a9c0ddb7b9dbc..d0a5f08e5214827fc07a7b9160ee177cd414f29d 100644 --- a/src/backends/ll.c +++ b/src/backends/llk.c @@ -3,22 +3,23 @@ #include "../cfgrammar.h" #include "../parsers/parser_internal.h" +// XXX despite the names, this is all LL(1) right now. TODO -/* Generating the LL parse table */ +/* Generating the LL(k) parse table */ /* Maps each nonterminal (HCFChoice) of the grammar to another hash table that * maps lookahead tokens (HCFToken) to productions (HCFSequence). */ -typedef struct HLLTable_ { +typedef struct HLLkTable_ { HHashTable *rows; HCFChoice *start; // start symbol HArena *arena; HAllocator *mm__; -} HLLTable; +} HLLkTable; /* Interface to look up an entry in the parse table. */ -const HCFSequence *h_ll_lookup(const HLLTable *table, const HCFChoice *x, HCFToken tok) +const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HCFToken tok) { const HHashTable *row = h_hashtable_get(table->rows, x); assert(row != NULL); // the table should have one row for each nonterminal @@ -28,7 +29,7 @@ const HCFSequence *h_ll_lookup(const HLLTable *table, const HCFChoice *x, HCFTok } /* Allocate a new parse table. */ -HLLTable *h_lltable_new(HAllocator *mm__) +HLLkTable *h_llktable_new(HAllocator *mm__) { // NB the parse table gets an arena separate from the grammar so we can free // the latter after table generation. 
@@ -37,7 +38,7 @@ HLLTable *h_lltable_new(HAllocator *mm__) HHashTable *rows = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); assert(rows != NULL); - HLLTable *table = h_new(HLLTable, 1); + HLLkTable *table = h_new(HLLkTable, 1); assert(table != NULL); table->mm__ = mm__; table->arena = arena; @@ -46,7 +47,7 @@ HLLTable *h_lltable_new(HAllocator *mm__) return table; } -void h_lltable_free(HLLTable *table) +void h_llktable_free(HLLkTable *table) { HAllocator *mm__ = table->mm__; h_delete_arena(table->arena); @@ -95,10 +96,10 @@ int fill_table_row(HCFGrammar *g, HHashTable *row, return 0; } -/* Generate the LL parse table from the given grammar. +/* Generate the LL(k) parse table from the given grammar. * Returns -1 on error, 0 on success. */ -static int fill_table(HCFGrammar *g, HLLTable *table) +static int fill_table(HCFGrammar *g, HLLkTable *table) { table->start = g->start; @@ -120,7 +121,7 @@ static int fill_table(HCFGrammar *g, HLLTable *table) for(s = a->seq; *s; s++) { // record this production in row as appropriate // this can signal an ambiguity conflict. - // NB we don't worry about deallocating anything, h_ll_compile will + // NB we don't worry about deallocating anything, h_llk_compile will // delete the whole arena for us. if(fill_table_row(g, row, a, *s) < 0) return -1; @@ -131,7 +132,7 @@ static int fill_table(HCFGrammar *g, HLLTable *table) return 0; } -int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params) +int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) { // Convert parser to a CFG. This can fail as indicated by a NULL return. HCFGrammar *grammar = h_cfgrammar(mm__, parser); @@ -143,11 +144,11 @@ int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params) // TODO: avoid conflicts by splitting occurances? // generate table and store in parser->data. 
- HLLTable *table = h_lltable_new(mm__); + HLLkTable *table = h_llktable_new(mm__); if(fill_table(grammar, table) < 0) { // the table was ambiguous h_cfgrammar_free(grammar); - h_lltable_free(table); + h_llktable_free(table); return -1; } parser->data = table; @@ -161,13 +162,14 @@ int h_ll_compile(HAllocator* mm__, HParser* parser, const void* params) -/* LL driver */ +/* LL(k) driver */ -HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* state) +HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) { - const HLLTable *table = parser->data; - HArena *arena = state->arena; - HSlist *stack = h_slist_new(arena); + const HLLkTable *table = parser->data; + HArena *arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse + HSlist *stack = h_slist_new(tarena); HCountedArray *seq = h_carray_new(arena); // accumulates current parse result // in order to construct the parse tree, we delimit the symbol stack into @@ -177,7 +179,7 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s // frame delimiter. // also on the stack below the mark, we store the previously accumulated // value for the surrounding production. - void *mark = h_arena_malloc(arena, 1); + void *mark = h_arena_malloc(tarena, 1); // initialize with the start symbol on the stack. 
h_slist_push(stack, table->start); @@ -188,8 +190,8 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s while(!h_slist_empty(stack)) { // fill up lookahead buffer as required if(lookahead == 0) { - uint8_t c = h_read_bits(&state->input_stream, 8, false); - if(state->input_stream.overrun) + uint8_t c = h_read_bits(stream, 8, false); + if(stream->overrun) lookahead = end_token; else lookahead = char_token(c); @@ -203,16 +205,16 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s // hit stack frame boundary // wrap the accumulated parse result, this sequence is finished - HParsedToken *tok = a_new(HParsedToken, 1); + HParsedToken *tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_SEQUENCE; tok->seq = seq; // XXX tok->index and tok->bit_offset (don't take directly from stream, cuz peek!) // call validation and semantic action, if present - if(x->pred && !x->pred(make_result(state, tok))) - return NULL; // validation failed -> no parse + if(x->pred && !x->pred(make_result(tarena, tok))) + goto no_parse; // validation failed -> no parse if(x->action) - tok = (HParsedToken *)x->action(make_result(state, tok)); + tok = (HParsedToken *)x->action(make_result(arena, tok)); // result becomes next left-most element of higher-level sequence seq = h_slist_pop(stack); @@ -230,7 +232,7 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s seq = h_carray_new(arena); // look up applicable production in parse table - const HCFSequence *p = h_ll_lookup(table, x, lookahead); + const HCFSequence *p = h_llk_lookup(table, x, lookahead); // push production's rhs onto the stack (in reverse order) HCFChoice **s; @@ -250,40 +252,40 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s switch(x->type) { case HCF_END: if(input != end_token) - return NULL; + goto no_parse; tok = NULL; break; case HCF_CHAR: if(input != char_token(x->chr)) - return NULL; 
- tok = a_new(HParsedToken, 1); + goto no_parse; + tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = x->chr; break; case HCF_CHARSET: if(input == end_token) - return NULL; + goto no_parse; if(!charset_isset(x->charset, token_char(input))) - return NULL; - tok = a_new(HParsedToken, 1); + goto no_parse; + tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = token_char(input); break; default: // should not be reached assert_message(0, "unknown HCFChoice type"); - return NULL; + goto no_parse; } // XXX tok->index and tok->bit_offset (don't take directly from stream, cuz peek!) // call validation and semantic action, if present - if(x->pred && !x->pred(make_result(state, tok))) - return NULL; // validation failed -> no parse + if(x->pred && !x->pred(make_result(tarena, tok))) + goto no_parse; // validation failed -> no parse if(x->action) - tok = (HParsedToken *)x->action(make_result(state, tok)); + tok = (HParsedToken *)x->action(make_result(arena, tok)); // append to result sequence h_carray_append(seq, tok); @@ -293,25 +295,31 @@ HParseResult *h_ll_parse(HAllocator* mm__, const HParser* parser, HParseState* s // since we started with a single nonterminal on the stack, seq should // contain exactly the parse result. assert(seq->used == 1); - return make_result(state, seq->elements[0]); + h_delete_arena(tarena); + return make_result(arena, seq->elements[0]); + + no_parse: + h_delete_arena(tarena); + h_delete_arena(arena); + return NULL; } -HParserBackendVTable h__ll_backend_vtable = { - .compile = h_ll_compile, - .parse = h_ll_parse +HParserBackendVTable h__llk_backend_vtable = { + .compile = h_llk_compile, + .parse = h_llk_parse }; // dummy! 
-int test_ll(void) +int test_llk(void) { - const HParser *c = h_many(h_ch('x')); - const HParser *q = h_sequence(c, h_ch('y'), NULL); - const HParser *p = h_choice(q, h_end_p(), NULL); + HParser *c = h_many(h_ch('x')); + HParser *q = h_sequence(c, h_ch('y'), NULL); + HParser *p = h_choice(q, h_end_p(), NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); diff --git a/src/backends/packrat.c b/src/backends/packrat.c index 1f8d113531d27aa27748be87fdbb19a136247b69..4c53ef100f519b8249b9b85ad87facd45287f71f 100644 --- a/src/backends/packrat.c +++ b/src/backends/packrat.c @@ -1,7 +1,16 @@ #include <assert.h> +#include <string.h> #include "../internal.h" #include "../parsers/parser_internal.h" +static uint32_t djbhash(const uint8_t *buf, size_t len) { + uint32_t hash = 5381; + while (len--) { + hash = hash * 33 + *buf++; + } + return hash; +} + // short-hand for constructing HCachedResult's static HCachedResult *cached_result(const HParseState *state, HParseResult *result) { HCachedResult *ret = a_new(HCachedResult, 1); @@ -191,12 +200,37 @@ HParseResult* h_do_parse(const HParser* parser, HParseState *state) { } int h_packrat_compile(HAllocator* mm__, HParser* parser, const void* params) { + parser->backend = PB_PACKRAT; return 0; // No compilation necessary, and everything should work // out of the box. 
} -HParseResult *h_packrat_parse(HAllocator* mm__, const HParser* parser, HParseState* parse_state) { - return h_do_parse(parser, parse_state); +static uint32_t cache_key_hash(const void* key) { + return djbhash(key, sizeof(HParserCacheKey)); +} +static bool cache_key_equal(const void* key1, const void* key2) { + return memcmp(key1, key2, sizeof(HParserCacheKey)) == 0; +} + +HParseResult *h_packrat_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) { + HArena * arena = h_new_arena(mm__, 0); + HParseState *parse_state = a_new_(arena, HParseState, 1); + parse_state->cache = h_hashtable_new(arena, cache_key_equal, // key_equal_func + cache_key_hash); // hash_func + parse_state->input_stream = *input_stream; + parse_state->lr_stack = h_slist_new(arena); + parse_state->recursion_heads = h_hashtable_new(arena, cache_key_equal, + cache_key_hash); + parse_state->arena = arena; + HParseResult *res = h_do_parse(parser, parse_state); + h_slist_free(parse_state->lr_stack); + h_hashtable_free(parse_state->recursion_heads); + // tear down the parse state + h_hashtable_free(parse_state->cache); + if (!res) + h_delete_arena(parse_state->arena); + + return res; } HParserBackendVTable h__packrat_backend_vtable = { diff --git a/src/backends/regex.c b/src/backends/regex.c new file mode 100644 index 0000000000000000000000000000000000000000..3cbbb2d4721afafca68ba56aa2bb98bcebbffc9f --- /dev/null +++ b/src/backends/regex.c @@ -0,0 +1,366 @@ +#include <string.h> +#include <assert.h> +#include "../internal.h" +#include "../parsers/parser_internal.h" +#include "regex.h" + +#undef a_new +#define a_new(typ, count) a_new_(arena, typ, count) +// Stack VM +typedef enum HSVMOp_ { + SVM_PUSH, // Push a mark. There is no VM insn to push an object. + SVM_NOP, // Used to start the chain, and possibly elsewhere. Does nothing. 
+ SVM_ACTION, // Same meaning as RVM_ACTION + SVM_CAPTURE, // Same meaning as RVM_CAPTURE + SVM_ACCEPT, +} HSVMOp; + +typedef struct HRVMTrace_ { + struct HRVMTrace_ *next; // When parsing, these are + // reverse-threaded. There is a postproc + // step that inverts all the pointers. + size_t input_pos; + uint16_t arg; + uint8_t opcode; +} HRVMTrace; + +typedef struct HRVMThread_ { + HRVMTrace *trace; + uint16_t ip; +} HRVMThread; + +HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len); + +HRVMTrace *invert_trace(HRVMTrace *trace) { /* Reverses the singly linked trace in place; returns the new head (the former tail), or NULL for an empty trace. */ + HRVMTrace *last = NULL; + if (!trace) + return NULL; + if (!trace->next) + return trace; + do { + HRVMTrace *next = trace->next; + trace->next = last; + last = trace; + trace = next; + } while (trace); /* relink every node, including the final one */ + return last; /* the last node visited is the new head */ +} + +void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) { + HArena *arena = h_new_arena(mm__, 0); + HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length), + **heads_n = a_new(HRVMTrace*, prog->length); + + HRVMTrace *ret_trace; + + uint8_t *insn_seen = a_new(uint8_t, prog->length); // 0 -> not seen, 1->processed, 2->queued + HRVMThread *ip_queue = a_new(HRVMThread, prog->length); + size_t ipq_top; + + + + +#define THREAD ip_queue[ipq_top-1] +#define PUSH_SVM(op_, arg_) do { \ + HRVMTrace *nt = a_new(HRVMTrace, 1); \ + nt->arg = (arg_); \ + nt->opcode = (op_); \ + nt->next = THREAD.trace; \ + nt->input_pos = off; \ + THREAD.trace = nt; \ + } while(0) + + heads_n[0] = a_new(HRVMTrace, 1); // zeroing + heads_n[0]->opcode = SVM_NOP; + + size_t off = 0; + int live_threads = 1; + for (off = 0; off <= len; off++) { + uint8_t ch = ((off == len) ? 0 : input[off]); + size_t ip_s; // BUG: there was an unused variable ip. Not sure if + // I intended to use it somewhere.
+ /* scope */ { + HRVMTrace **heads_t; + heads_t = heads_n; + heads_n = heads_p; + heads_p = heads_t; + memset(heads_n, 0, prog->length * sizeof(*heads_n)); + } + memset(insn_seen, 0, prog->length); // no insns seen yet + if (!live_threads) + goto match_fail; + live_threads = 0; + for (ip_s = 0; ip_s < prog->length; ip_s++) { + ipq_top = 1; + // TODO: Write this as a threaded VM + if (!heads_p[ip_s]) + continue; + THREAD.ip = ip_s; THREAD.trace = heads_p[ip_s]; /* resume with the trace saved for this ip at the previous offset */ + + uint8_t hi, lo; + uint16_t arg; + while(ipq_top > 0) { + if (insn_seen[THREAD.ip] == 1) { ipq_top--; /* already executed at this offset: drop the thread instead of spinning forever */ + continue; } + insn_seen[THREAD.ip] = 1; + arg = prog->insns[THREAD.ip].arg; + switch(prog->insns[THREAD.ip].op) { + case RVM_ACCEPT: + PUSH_SVM(SVM_ACCEPT, 0); + ret_trace = THREAD.trace; + goto run_trace; + case RVM_MATCH: + // Doesn't actually validate the "must be followed by MATCH + // or STEP. It should. Preproc perhaps? + hi = (arg >> 8) & 0xff; + lo = arg & 0xff; + THREAD.ip++; + if (ch < lo || ch > hi) + ipq_top--; // terminate thread + goto next_insn; + case RVM_GOTO: + THREAD.ip = arg; + goto next_insn; + case RVM_FORK: + THREAD.ip++; + if (!insn_seen[arg]) { + insn_seen[THREAD.ip] = 2; + HRVMTrace* tr = THREAD.trace; + ipq_top++; + THREAD.ip = arg; + THREAD.trace = tr; + } + goto next_insn; + case RVM_PUSH: + PUSH_SVM(SVM_PUSH, 0); + THREAD.ip++; + goto next_insn; + case RVM_ACTION: + PUSH_SVM(SVM_ACTION, arg); + THREAD.ip++; + goto next_insn; + case RVM_CAPTURE: + PUSH_SVM(SVM_CAPTURE, 0); + THREAD.ip++; + goto next_insn; + case RVM_EOF: + THREAD.ip++; + if (off != len) + ipq_top--; // Terminate thread + goto next_insn; + case RVM_STEP: + // save thread + live_threads++; + heads_n[++THREAD.ip] = THREAD.trace; /* resume after the STEP at the next offset, not on it */ + ipq_top--; + goto next_insn; + } + next_insn: + ; + + } + } + } + // No accept was reached.
+ + + ret_trace = invert_trace(ret_trace); + HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len); + // ret is in its own arena + h_delete_arena(arena); + return ret; +} +#undef PUSH_SVM +#undef THREAD + + + + +void svm_stack_ensure_cap(HAllocator *mm__, HSVMContext *ctx, size_t addl) { + if (ctx->stack_count + addl >= ctx->stack_capacity) { + ctx->stack = mm__->realloc(mm__, ctx->stack, sizeof(*ctx->stack) * (ctx->stack_capacity *= 2)); + // TODO: check for realloc failure + } +} + +HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, const uint8_t *input, int len) { + // orig_prog is only used for the action table + HSVMContext ctx; + HArena *arena = h_new_arena(mm__, 0); + ctx.stack_count = 0; + ctx.stack_capacity = 16; + ctx.stack = h_new(HParsedToken*, ctx.stack_capacity); + + HParsedToken *tmp_res; + HRVMTrace *cur; + for (cur = trace; cur; cur = cur->next) { + switch (cur->opcode) { + case SVM_PUSH: + svm_stack_ensure_cap(mm__, &ctx, 1); + tmp_res = a_new(HParsedToken, 1); + tmp_res->token_type = TT_MARK; + tmp_res->index = cur->input_pos; + tmp_res->bit_offset = 0; + ctx.stack[ctx.stack_count++] = tmp_res; + break; + case SVM_NOP: + break; + case SVM_ACTION: + // Action should modify stack appropriately + if (!orig_prog->actions[cur->arg].action(arena, &ctx, orig_prog->actions[cur->arg].env)) { + // action failed... abort somehow + // TODO: Actually abort + } + break; + case SVM_CAPTURE: + // Top of stack must be a mark + // This replaces said mark in-place with a TT_BYTES. 
+ assert(ctx.stack_count > 0 && ctx.stack[ctx.stack_count - 1]->token_type == TT_MARK); /* stack[stack_count] is the first unused slot; the top is one below */ + + tmp_res = ctx.stack[ctx.stack_count - 1]; + tmp_res->token_type = TT_BYTES; + // TODO: Will need to copy if bit_offset is nonzero + assert(tmp_res->bit_offset == 0); + + tmp_res->bytes.token = input + tmp_res->index; + tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive + break; + case SVM_ACCEPT: + assert(ctx.stack_count == 1); + HParseResult *res = a_new(HParseResult, 1); + res->ast = ctx.stack[0]; + res->bit_length = cur->input_pos * 8; + res->arena = arena; + h_free(ctx.stack); return res; /* the stack array came from mm__, not the arena, so release it explicitly */ + } + } + + h_free(ctx.stack); h_delete_arena(arena); /* no ACCEPT in the trace: release the stack and the result arena */ + return NULL; +} + +uint16_t h_rvm_create_action(HRVMProg *prog, HSVMActionFunc action_func, void* env) { + for (uint16_t i = 0; i < prog->action_count; i++) { + if (prog->actions[i].action == action_func && prog->actions[i].env == env) + return i; + } + // Ensure that there's room in the action array... + if (!(prog->action_count & (prog->action_count + 1))) { + // needs to be scaled up. + size_t array_size = (prog->action_count + 1) * 2; // action_count+1 is a + // power of two + prog->actions = prog->allocator->realloc(prog->allocator, prog->actions, array_size * sizeof(*prog->actions)); + // TODO: Handle the allocation failed case nicely. + } + + HSVMAction *action = &prog->actions[prog->action_count]; + action->action = action_func; + action->env = env; + return prog->action_count++; +} + +uint16_t h_rvm_insert_insn(HRVMProg *prog, HRVMOp op, uint16_t arg) { + // Ensure that there's room in the insn array... + if (!(prog->length & (prog->length + 1))) { + // needs to be scaled up. + size_t array_size = (prog->length + 1) * 2; // length+1 is a + // power of two + prog->insns = prog->allocator->realloc(prog->allocator, prog->insns, array_size * sizeof(*prog->insns)); + // TODO: Handle the allocation failed case nicely.
+ } + + prog->insns[prog->length].op = op; + prog->insns[prog->length].arg = arg; + return prog->length++; +} + +uint16_t h_rvm_get_ip(HRVMProg *prog) { + return prog->length; +} + +void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val) { + assert(prog->length > ip); + prog->insns[ip].arg = new_val; +} + +size_t h_svm_count_to_mark(HSVMContext *ctx) { /* Returns how many items sit above the topmost TT_MARK, or stack_count when the stack holds no mark. */ + size_t ctm; + for (ctm = 0; ctm < ctx->stack_count; ctm++) { /* scan down to and including the bottom slot; also safe on an empty stack */ + if (ctx->stack[ctx->stack_count - 1 - ctm]->token_type == TT_MARK) + return ctm; + } + return ctx->stack_count; +} + +// TODO: Implement the primitive actions +bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env) { + size_t n_items = h_svm_count_to_mark(ctx); + assert (n_items < ctx->stack_count); + HParsedToken *res = ctx->stack[ctx->stack_count - 1 - n_items]; + assert (res->token_type == TT_MARK); + res->token_type = TT_SEQUENCE; + + HCountedArray *ret_carray = h_carray_new_sized(arena, n_items); + res->seq = ret_carray; + // res index and bit offset are the same as the mark. + for (size_t i = 0; i < n_items; i++) { + ret_carray->elements[i] = ctx->stack[ctx->stack_count - n_items + i]; + } + ret_carray->used = n_items; ctx->stack_count -= n_items; /* record the element count (harmless if the constructor already set it), then pop the moved items */ + return true; +} + +bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env) { + while (ctx->stack_count > 0) { + if (ctx->stack[--ctx->stack_count]->token_type == TT_MARK) + return true; + } + return false; // no mark found.
+} + +// Glue regex backend to rest of system + +bool h_compile_regex(HRVMProg *prog, const HParser *parser) { + return parser->vtable->compile_to_rvm(prog, parser->env); +} + +static void h_regex_free(HParser *parser) { + HRVMProg *prog = (HRVMProg*)parser->backend_data; + HAllocator *mm__ = prog->allocator; + h_free(prog->insns); + h_free(prog->actions); + h_free(prog); + parser->backend_data = NULL; + parser->backend = PB_PACKRAT; +} + +static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params) { + if (!parser->vtable->isValidRegular(parser->env)) + return 1; + HRVMProg *prog = h_new(HRVMProg, 1); + prog->allocator = mm__; + if (!h_compile_regex(prog, parser)) { + h_free(prog->insns); + h_free(prog->actions); + h_free(prog); + return 2; + } + parser->backend_data = prog; + return 0; +} + +static HParseResult *h_regex_parse(HAllocator* mm__, const HParser* parser, HInputStream *input_stream) { + return h_rvm_run__m(mm__, (HRVMProg*)parser->backend_data, input_stream->input, input_stream->length); +} + +HParserBackendVTable h__regex_backend_vtable = { + .compile = h_regex_compile, + .parse = h_regex_parse, + .free = h_regex_free +}; diff --git a/src/backends/regex.h b/src/backends/regex.h new file mode 100644 index 0000000000000000000000000000000000000000..a84904d0b6bcd4ea18b8b0281b625c4c303e9a8d --- /dev/null +++ b/src/backends/regex.h @@ -0,0 +1,80 @@ +// Internal defs +#ifndef HAMMER_BACKEND_REGEX__H +#define HAMMER_BACKEND_REGEX__H + +// each insn is an 8-bit opcode and a 16-bit parameter +// [a] are actions; they add an instruction to the stackvm that is being output. +// [m] are match ops; they can either succeed or fail, depending on the current character +// [c] are control ops. They affect the pc non-linearly. 
+typedef enum HRVMOp_ { + RVM_ACCEPT, // [a] + RVM_GOTO, // [c] parameter is an offset into the instruction table + RVM_FORK, // [c] parameter is an offset into the instruction table + RVM_PUSH, // [a] No arguments, just pushes a mark (pointer to some + // character in the input string) onto the stack + RVM_ACTION, // [a] argument is an action ID + RVM_CAPTURE, // [a] Capture the last string (up to the current + // position, non-inclusive), and push it on the + // stack. No arg. + RVM_EOF, // [m] Succeeds only if at EOF. + RVM_MATCH, // [m] The high byte of the parameter is an upper bound + // and the low byte is a lower bound, both + // inclusive. An inverted match should be handled + // as two ranges. + RVM_STEP, // [a] Step to the next byte of input + RVM_OPCOUNT +} HRVMOp; + +typedef struct HRVMInsn_{ + uint8_t op; + uint16_t arg; +} HRVMInsn; + +#define TT_MARK TT_RESERVED_1 + +typedef struct HSVMContext_ { + HParsedToken **stack; + size_t stack_count; // number of items on the stack. Thus stack[stack_count] is the first unused item on the stack. + size_t stack_capacity; +} HSVMContext; + +// These actions all assume that the items on the stack are not +// aliased anywhere. +typedef bool (*HSVMActionFunc)(HArena *arena, HSVMContext *ctx, void* env); +typedef struct HSVMAction_ { + HSVMActionFunc action; + void* env; +} HSVMAction; + +struct HRVMProg_ { + HAllocator *allocator; + size_t length; + size_t action_count; + HRVMInsn *insns; + HSVMAction *actions; +}; + +// Returns true IFF the provided parser could be compiled. +bool h_compile_regex(HRVMProg *prog, const HParser* parser); + +// These functions are used by the compile_to_rvm method of HParser +uint16_t h_rvm_create_action(HRVMProg *prog, HSVMActionFunc action_func, void* env); + +// returns the address of the instruction just created +uint16_t h_rvm_insert_insn(HRVMProg *prog, HRVMOp op, uint16_t arg); + +// returns the address of the next insn to be created. 
+uint16_t h_rvm_get_ip(HRVMProg *prog); + +// Used to insert forward references; the idea is to generate a JUMP +// or FORK instruction with a target of 0, then update it once the +// correct target is known. +void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val); + +// Common SVM action funcs... +bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env); +bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env); + +extern HParserBackendVTable h__regex_backend_vtable; + +#endif diff --git a/src/backends/regexvm_asm.pl b/src/backends/regexvm_asm.pl new file mode 100644 index 0000000000000000000000000000000000000000..998b8408db4ace734493bb83b1030acc879596e8 --- /dev/null +++ b/src/backends/regexvm_asm.pl @@ -0,0 +1,112 @@ +#!/usr/bin/perl -w + +use strict; +# The input file consists of a sequence of blocks, which can be parsed +# as SVM test cases, RVM test cases, or C functions. Each block starts +# with a header line, then a sequence of options, and finally text in +# a format defined by the block type. +# +# Header lines start with "+TYPE", optionally followed by a name. This +# name is semantically meaningful for SVM and RVM blocks; it +# determines the name of the test case. + +# A C block's name is not used, and it takes no options. The body +# (which continues until the first line that looks like a header), is +# just passed straight through into the C source. + +# SVM blocks' names are the GLib test case name. The underlying +# function's name is derived by substituting invalid characters with +# '_'. Note that this can result in collisions (eg, /foo_bar/baz +# collides with /foo/bar_baz). If this happens, it's your own damn +# fault; rename the blocks. SVM blocks take three different options: +# @input, @output, and @pre. The @input pragma's argument is a +# C-quoted string that gets passed into the VM as the input string, +# and @output is a C-quoted string that is compared against +# h_write_result_unamb. 
@pre lines are prepended verbatim to the +# function body (with the @pre stripped, of course); they can be used +# to initialize environment values. +# +# SVM instructions consist of either two or four fields: +# +# input_pos opcode [arg env] +# +# input_pos and opcode correspond to the fields in HRVMTrace. arg and +# env are used to populate an HSVMAction; arg is the function, and env +# is the object whose address should be used as the env. + +# RVM blocks are very similar to SVM blocks; the name and options are +# handled exactly the same way. The assembly text is handled slightly +# differently; the format is: +# +# [label:] opcode [arg ...] +# +# For FORK and GOTO, the arg should be a label that is defined +# elsewhere. +# +# For ACTION, the arguments are handled the same way as with SVM. +# +# MATCH takes two arguments, each of which can be any C integer +# constant (not including character constants), which form the lower +# and upper bounds of the matched character, respectively. +# +# No other RVM instructions take an argument. + +# At the beginning of any line, comments preceded by '#' are allowed; +# they are replaced by C++ comments and inserted in the nearest valid +# location in the output. + +my $mode = "TOP"; # current block type; stays "TOP" until the first +C/+RVM/+SVM header is read + +# common regexes: +my $re_ident = qr/[A-Za-z_][A-Za-z0-9_]*/; +my $re_cstr = qr/"(?:[^\\"]|\\["'abefnrtv0\\]|\\x[0-9a-fA-F]{2}|\\[0-7]{3})*"/; + + +my %svm = ( + name => sub { + my ($env, $name) = @_; + $env->{name} = $name; + }, + pragma => sub { + my ($env, $name, $val) = @_; + if ($name eq "input") { + chomp($env->{input} = $val); + } elsif ($name eq "output") { + chomp($env->{output} = $val); + } elsif ($name eq "pre") { + # Do I have the ref precedence right here? 
+ push(@{$env->{pre}}, $val); # append to the array stored under {pre}; @$env->{pre} would dereference $env itself + } else { + warn "Invalid SVM pragma"; + } + }, + body => sub { + my ($env, $line) = @_; + my ($ipos, $op, $arg, $argenv); + if ($line =~ /^\s*(\d+)\s+(PUSH|NOP|ACTION|CAPTURE|ACCEPT)(?:\s+($re_ident)\s+($re_ident))?/) { + if ($2 eq "PUSH") { + # TODO: implement all the opcodes + } + } + } + ); + + +while (<>) { + if (/^\+(C|RVM|SVM)/) { # header lines start with a literal '+', which must be escaped + $mode = $1; + } + + if ($mode eq "TOP") { + if (/^#(.*)/) { + print "// $1\n"; # (.*) never captures the newline, so put it back + next; + } + } elsif ($mode eq "SVM") { + } elsif ($mode eq "RVM") { + } elsif ($mode eq "C") { + } + +} + + diff --git a/src/benchmark.c b/src/benchmark.c index 41d9164fce1457cb1ff116598fcc54226cb7942a..918f87c1be4d370fbba8cf0d40426e5a1da8b37c 100644 --- a/src/benchmark.c +++ b/src/benchmark.c @@ -21,11 +21,11 @@ */ -HBenchmarkResults *h_benchmark(const HParser* parser, HParserTestcase* testcases) { +HBenchmarkResults *h_benchmark(HParser* parser, HParserTestcase* testcases) { return h_benchmark__m(&system_allocator, parser, testcases); } -HBenchmarkResults *h_benchmark__m(HAllocator* mm__, const HParser* parser, HParserTestcase* testcases) { +HBenchmarkResults *h_benchmark__m(HAllocator* mm__, HParser* parser, HParserTestcase* testcases) { // For now, just output the results to stderr HParserTestcase* tc = testcases; HParserBackend backend = PB_MIN; @@ -33,7 +33,7 @@ HBenchmarkResults *h_benchmark__m(HAllocator* mm__, const HParser* parser, HPars - ret->len = PB_MAX-PB_MIN; + ret->len = PB_MAX-PB_MIN+1; ret->results = h_new(HBackendResults, ret->len); - for (backend = PB_MIN; backend < PB_MAX; backend++) { + for (backend = PB_MIN; backend <= PB_MAX; backend++) { ret->results[backend].backend = backend; // Step 1: Compile grammar for given parser... 
if (h_compile(parser, backend, NULL) == -1) { diff --git a/src/compile.c b/src/compile.c deleted file mode 100644 index 305db3e9f9a0d8c7bec66b5df3ff4caf358b8eed..0000000000000000000000000000000000000000 --- a/src/compile.c +++ /dev/null @@ -1,17 +0,0 @@ -// This file contains functions related to managing multiple parse backends -#include "hammer.h" -#include "internal.h" - -static HParserBackendVTable *backends[PB_MAX] = { - &h__packrat_backend_vtable, - &h__ll_backend_vtable, -}; - -int h_compile(const HParser* parser, HParserBackend backend, const void* params) { - return h_compile__m(&system_allocator, parser, backend, params); -} - -int h_compile__m(HAllocator* mm__, const HParser* parser, HParserBackend backend, const void* params) { - // be naughty and cast off the const - return backends[backend]->compile(mm__, (HParser *)parser, params); -} diff --git a/src/hammer.c b/src/hammer.c index c33f6c8bc0b7bd388b6cc6ee6427ef6f8bf81526..72da96d601b384c554194d0e0fdb3a5b6ab7207c 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -26,13 +26,12 @@ #include "allocator.h" #include "parsers/parser_internal.h" -static uint32_t djbhash(const uint8_t *buf, size_t len) { - uint32_t hash = 5381; - while (len--) { - hash = hash * 33 + *buf++; - } - return hash; -} +static HParserBackendVTable *backends[PB_MAX + 1] = { + &h__packrat_backend_vtable, + &h__regex_backend_vtable, + &h__llk_backend_vtable, +}; + /* Helper function, since these lines appear in every parser */ @@ -42,46 +41,52 @@ typedef struct { } HTwoParsers; -static uint32_t cache_key_hash(const void* key) { - return djbhash(key, sizeof(HParserCacheKey)); -} -static bool cache_key_equal(const void* key1, const void* key2) { - return memcmp(key1, key2, sizeof(HParserCacheKey)) == 0; -} HParseResult* h_parse(const HParser* parser, const uint8_t* input, size_t length) { return h_parse__m(&system_allocator, parser, input, length); } -HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* input, 
size_t length) { +HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t* input, size_t length) { + // TODO: split the creation of the parse state into h_packrat_parse // Set up a parse state... - HArena * arena = h_new_arena(mm__, 0); - HParseState *parse_state = a_new_(arena, HParseState, 1); - parse_state->cache = h_hashtable_new(arena, cache_key_equal, // key_equal_func - cache_key_hash); // hash_func - parse_state->input_stream.input = input; - parse_state->input_stream.index = 0; - parse_state->input_stream.bit_offset = 8; // bit big endian - parse_state->input_stream.overrun = 0; - parse_state->input_stream.endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN; - parse_state->input_stream.length = length; - parse_state->lr_stack = h_slist_new(arena); - parse_state->recursion_heads = h_hashtable_new(arena, cache_key_equal, - cache_key_hash); - parse_state->arena = arena; - HParseResult *res = h_do_parse(parser, parse_state); - h_slist_free(parse_state->lr_stack); - h_hashtable_free(parse_state->recursion_heads); - // tear down the parse state - h_hashtable_free(parse_state->cache); - if (!res) - h_delete_arena(parse_state->arena); - - return res; + HInputStream input_stream = { + .index = 0, + .bit_offset = 8, + .overrun = 0, + .endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN, + .length = length, + .input = input + }; + + return backends[parser->backend]->parse(mm__, parser, &input_stream); } void h_parse_result_free(HParseResult *result) { h_delete_arena(result->arena); } +bool h_false(void* env) { + (void)env; + return false; +} +bool h_true(void* env) { + (void)env; + return true; +} + +bool h_not_regular(HRVMProg *prog, void *env) { + (void)env; + return false; +} + +int h_compile(HParser* parser, HParserBackend backend, const void* params) { + return h_compile__m(&system_allocator, parser, backend, params); +} + +int h_compile__m(HAllocator* mm__, HParser* parser, HParserBackend backend, const void* params) { + int ret = 
backends[backend]->compile(mm__, parser, params); + if (!ret) + parser->backend = backend; + return ret; +} diff --git a/src/hammer.h b/src/hammer.h index e75a41fd4dc7d5b283018916a0a7c5957f081807..918977559dc252e618e264a7dd2856235db9b51b 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -34,8 +34,11 @@ typedef struct HParseState_ HParseState; typedef enum HParserBackend_ { PB_MIN = 0, PB_PACKRAT = PB_MIN, // PB_MIN is always the default. - PB_LL, - PB_MAX + PB_REGULAR, // + PB_LLk, // + PB_LALR, // Not Implemented + PB_GLR, // Not Implemented + PB_MAX = PB_LLk } HParserBackend; typedef enum HTokenType_ { @@ -44,6 +47,7 @@ typedef enum HTokenType_ { TT_SINT, TT_UINT, TT_SEQUENCE, + TT_RESERVED_1, // reserved for backend-specific internal use TT_USER = 64, TT_ERR, TT_MAX @@ -75,7 +79,9 @@ typedef struct HParsedToken_ { } HParsedToken; /** - * The result of a successful parse. + * The result of a successful parse. Note that this may reference the + * input string. + * * If a parse fails, the parse result will be NULL. * If a parse is successful but there's nothing there (i.e., if end_p * succeeds) then there's a parse result but its ast is NULL. 
@@ -111,12 +117,14 @@ typedef const HParsedToken* (*HAction)(const HParseResult *p); */ typedef bool (*HPredicate)(HParseResult *p); -typedef struct HParserVtable_ HParserVtable; - typedef struct HCFChoice_ HCFChoice; +typedef struct HRVMProg_ HRVMProg; +typedef struct HParserVtable_ HParserVtable; typedef struct HParser_ { const HParserVtable *vtable; + HParserBackend backend; + void* backend_data; void *env; void *data; /* e.g., parse tables */ HCFChoice *desugared; /* if the parser can be desugared, its desugared form */ @@ -191,7 +199,7 @@ HAMMER_FN_DECL(HParseResult*, h_parse, const HParser* parser, const uint8_t* inp * * Result token type: TT_BYTES */ -HAMMER_FN_DECL(const HParser*, h_token, const uint8_t *str, const size_t len); +HAMMER_FN_DECL(HParser*, h_token, const uint8_t *str, const size_t len); /** * Given a single character, returns a parser that parses that @@ -199,7 +207,7 @@ HAMMER_FN_DECL(const HParser*, h_token, const uint8_t *str, const size_t len); * * Result token type: TT_UINT */ -HAMMER_FN_DECL(const HParser*, h_ch, const uint8_t c); +HAMMER_FN_DECL(HParser*, h_ch, const uint8_t c); /** * Given two single-character bounds, lower and upper, returns a parser @@ -208,14 +216,14 @@ HAMMER_FN_DECL(const HParser*, h_ch, const uint8_t c); * * Result token type: TT_UINT */ -HAMMER_FN_DECL(const HParser*, h_ch_range, const uint8_t lower, const uint8_t upper); +HAMMER_FN_DECL(HParser*, h_ch_range, const uint8_t lower, const uint8_t upper); /** * Given an integer parser, p, and two integer bounds, lower and upper, * returns a parser that parses an integral value within the range * [lower, upper] (inclusive). */ -HAMMER_FN_DECL(const HParser*, h_int_range, const HParser *p, const int64_t lower, const int64_t upper); +HAMMER_FN_DECL(HParser*, h_int_range, const HParser *p, const int64_t lower, const int64_t upper); /** * Returns a parser that parses the specified number of bits. 
sign == @@ -223,63 +231,63 @@ HAMMER_FN_DECL(const HParser*, h_int_range, const HParser *p, const int64_t lowe * * Result token type: TT_SINT if sign == true, TT_UINT if sign == false */ -HAMMER_FN_DECL(const HParser*, h_bits, size_t len, bool sign); +HAMMER_FN_DECL(HParser*, h_bits, size_t len, bool sign); /** * Returns a parser that parses a signed 8-byte integer value. * * Result token type: TT_SINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_int64); +HAMMER_FN_DECL_NOARG(HParser*, h_int64); /** * Returns a parser that parses a signed 4-byte integer value. * * Result token type: TT_SINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_int32); +HAMMER_FN_DECL_NOARG(HParser*, h_int32); /** * Returns a parser that parses a signed 2-byte integer value. * * Result token type: TT_SINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_int16); +HAMMER_FN_DECL_NOARG(HParser*, h_int16); /** * Returns a parser that parses a signed 1-byte integer value. * * Result token type: TT_SINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_int8); +HAMMER_FN_DECL_NOARG(HParser*, h_int8); /** * Returns a parser that parses an unsigned 8-byte integer value. * * Result token type: TT_UINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_uint64); +HAMMER_FN_DECL_NOARG(HParser*, h_uint64); /** * Returns a parser that parses an unsigned 4-byte integer value. * * Result token type: TT_UINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_uint32); +HAMMER_FN_DECL_NOARG(HParser*, h_uint32); /** * Returns a parser that parses an unsigned 2-byte integer value. * * Result token type: TT_UINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_uint16); +HAMMER_FN_DECL_NOARG(HParser*, h_uint16); /** * Returns a parser that parses an unsigned 1-byte integer value. 
* * Result token type: TT_UINT */ -HAMMER_FN_DECL_NOARG(const HParser*, h_uint8); +HAMMER_FN_DECL_NOARG(HParser*, h_uint8); /** * Given another parser, p, returns a parser that skips any whitespace @@ -287,7 +295,7 @@ HAMMER_FN_DECL_NOARG(const HParser*, h_uint8); * * Result token type: p's result type */ -HAMMER_FN_DECL(const HParser*, h_whitespace, const HParser* p); +HAMMER_FN_DECL(HParser*, h_whitespace, const HParser* p); /** * Given two parsers, p and q, returns a parser that parses them in @@ -295,7 +303,7 @@ HAMMER_FN_DECL(const HParser*, h_whitespace, const HParser* p); * * Result token type: p's result type */ -HAMMER_FN_DECL(const HParser*, h_left, const HParser* p, const HParser* q); +HAMMER_FN_DECL(HParser*, h_left, const HParser* p, const HParser* q); /** * Given two parsers, p and q, returns a parser that parses them in @@ -303,7 +311,7 @@ HAMMER_FN_DECL(const HParser*, h_left, const HParser* p, const HParser* q); * * Result token type: q's result type */ -HAMMER_FN_DECL(const HParser*, h_right, const HParser* p, const HParser* q); +HAMMER_FN_DECL(HParser*, h_right, const HParser* p, const HParser* q); /** * Given three parsers, p, x, and q, returns a parser that parses them in @@ -311,7 +319,7 @@ HAMMER_FN_DECL(const HParser*, h_right, const HParser* p, const HParser* q); * * Result token type: x's result type */ -HAMMER_FN_DECL(const HParser*, h_middle, const HParser* p, const HParser* x, const HParser* q); +HAMMER_FN_DECL(HParser*, h_middle, const HParser* p, const HParser* x, const HParser* q); /** * Given another parser, p, and a function f, returns a parser that @@ -319,21 +327,21 @@ HAMMER_FN_DECL(const HParser*, h_middle, const HParser* p, const HParser* x, con * * Result token type: any */ -HAMMER_FN_DECL(const HParser*, h_action, const HParser* p, const HAction a); +HAMMER_FN_DECL(HParser*, h_action, const HParser* p, const HAction a); /** * Parse a single character in the given charset. 
* * Result token type: TT_UINT */ -HAMMER_FN_DECL(const HParser*, h_in, const uint8_t *charset, size_t length); +HAMMER_FN_DECL(HParser*, h_in, const uint8_t *charset, size_t length); /** * Parse a single character *NOT* in the given charset. * * Result token type: TT_UINT */ -HAMMER_FN_DECL(const HParser*, h_not_in, const uint8_t *charset, size_t length); +HAMMER_FN_DECL(HParser*, h_not_in, const uint8_t *charset, size_t length); /** * A no-argument parser that succeeds if there is no more input to @@ -341,14 +349,14 @@ HAMMER_FN_DECL(const HParser*, h_not_in, const uint8_t *charset, size_t length); * * Result token type: None. The HParseResult exists but its AST is NULL. */ -HAMMER_FN_DECL_NOARG(const HParser*, h_end_p); +HAMMER_FN_DECL_NOARG(HParser*, h_end_p); /** * This parser always fails. * * Result token type: NULL. Always. */ -HAMMER_FN_DECL_NOARG(const HParser*, h_nothing_p); +HAMMER_FN_DECL_NOARG(HParser*, h_nothing_p); /** * Given a null-terminated list of parsers, apply each parser in order. @@ -356,7 +364,7 @@ HAMMER_FN_DECL_NOARG(const HParser*, h_nothing_p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), const HParser*, h_sequence, const HParser* p); +HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_sequence, const HParser* p); /** * Given an array of parsers, p_array, apply each parser in order. The @@ -365,7 +373,7 @@ HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), const HParser*, h_sequenc * * Result token type: The type of the first successful parser's result. */ -HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), const HParser*, h_choice, const HParser* p); +HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_choice, const HParser* p); /** * Given two parsers, p1 and p2, this parser succeeds in the following @@ -375,7 +383,7 @@ HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), const HParser*, h_choice, * * Result token type: p1's result type. 
*/ -HAMMER_FN_DECL(const HParser*, h_butnot, const HParser* p1, const HParser* p2); +HAMMER_FN_DECL(HParser*, h_butnot, const HParser* p1, const HParser* p2); /** * Given two parsers, p1 and p2, this parser succeeds in the following @@ -385,7 +393,7 @@ HAMMER_FN_DECL(const HParser*, h_butnot, const HParser* p1, const HParser* p2); * * Result token type: p1's result type. */ -HAMMER_FN_DECL(const HParser*, h_difference, const HParser* p1, const HParser* p2); +HAMMER_FN_DECL(HParser*, h_difference, const HParser* p1, const HParser* p2); /** * Given two parsers, p1 and p2, this parser succeeds if *either* p1 or @@ -393,7 +401,7 @@ HAMMER_FN_DECL(const HParser*, h_difference, const HParser* p1, const HParser* p * * Result token type: The type of the result of whichever parser succeeded. */ -HAMMER_FN_DECL(const HParser*, h_xor, const HParser* p1, const HParser* p2); +HAMMER_FN_DECL(HParser*, h_xor, const HParser* p1, const HParser* p2); /** * Given a parser, p, this parser succeeds for zero or more repetitions @@ -401,7 +409,7 @@ HAMMER_FN_DECL(const HParser*, h_xor, const HParser* p1, const HParser* p2); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_many, const HParser* p); +HAMMER_FN_DECL(HParser*, h_many, const HParser* p); /** * Given a parser, p, this parser succeeds for one or more repetitions @@ -409,7 +417,7 @@ HAMMER_FN_DECL(const HParser*, h_many, const HParser* p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_many1, const HParser* p); +HAMMER_FN_DECL(HParser*, h_many1, const HParser* p); /** * Given a parser, p, this parser succeeds for exactly N repetitions @@ -417,7 +425,7 @@ HAMMER_FN_DECL(const HParser*, h_many1, const HParser* p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_repeat_n, const HParser* p, const size_t n); +HAMMER_FN_DECL(HParser*, h_repeat_n, const HParser* p, const size_t n); /** * Given a parser, p, this parser succeeds with the value p parsed or @@ -425,7 
+433,7 @@ HAMMER_FN_DECL(const HParser*, h_repeat_n, const HParser* p, const size_t n); * * Result token type: If p succeeded, the type of its result; if not, TT_NONE. */ -HAMMER_FN_DECL(const HParser*, h_optional, const HParser* p); +HAMMER_FN_DECL(HParser*, h_optional, const HParser* p); /** * Given a parser, p, this parser succeeds if p succeeds, but doesn't @@ -433,7 +441,7 @@ HAMMER_FN_DECL(const HParser*, h_optional, const HParser* p); * * Result token type: None. The HParseResult exists but its AST is NULL. */ -HAMMER_FN_DECL(const HParser*, h_ignore, const HParser* p); +HAMMER_FN_DECL(HParser*, h_ignore, const HParser* p); /** * Given a parser, p, and a parser for a separator, sep, this parser @@ -444,7 +452,7 @@ HAMMER_FN_DECL(const HParser*, h_ignore, const HParser* p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_sepBy, const HParser* p, const HParser* sep); +HAMMER_FN_DECL(HParser*, h_sepBy, const HParser* p, const HParser* sep); /** * Given a parser, p, and a parser for a separator, sep, this parser matches a list of things that p can parse, separated by sep. Unlike sepBy, this ensures that the result has at least one element. @@ -452,14 +460,14 @@ HAMMER_FN_DECL(const HParser*, h_sepBy, const HParser* p, const HParser* sep); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_sepBy1, const HParser* p, const HParser* sep); +HAMMER_FN_DECL(HParser*, h_sepBy1, const HParser* p, const HParser* sep); /** * This parser always returns a zero length match, i.e., empty string. * * Result token type: None. The HParseResult exists but its AST is NULL. 
*/ -HAMMER_FN_DECL_NOARG(const HParser*, h_epsilon_p); +HAMMER_FN_DECL_NOARG(HParser*, h_epsilon_p); /** * This parser applies its first argument to read an unsigned integer @@ -470,7 +478,7 @@ HAMMER_FN_DECL_NOARG(const HParser*, h_epsilon_p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL(const HParser*, h_length_value, const HParser* length, const HParser* value); +HAMMER_FN_DECL(HParser*, h_length_value, const HParser* length, const HParser* value); /** * This parser attaches a predicate function, which returns true or @@ -485,7 +493,7 @@ HAMMER_FN_DECL(const HParser*, h_length_value, const HParser* length, const HPar * * Result token type: p's result type if pred succeeded, NULL otherwise. */ -HAMMER_FN_DECL(const HParser*, h_attr_bool, const HParser* p, HPredicate pred); +HAMMER_FN_DECL(HParser*, h_attr_bool, const HParser* p, HPredicate pred); /** * The 'and' parser asserts that a conditional syntax is satisfied, @@ -502,7 +510,7 @@ HAMMER_FN_DECL(const HParser*, h_attr_bool, const HParser* p, HPredicate pred); * * Result token type: None. The HParseResult exists but its AST is NULL. */ -HAMMER_FN_DECL(const HParser*, h_and, const HParser* p); +HAMMER_FN_DECL(HParser*, h_and, const HParser* p); /** * The 'not' parser asserts that a conditional syntax is *not* @@ -522,7 +530,7 @@ HAMMER_FN_DECL(const HParser*, h_and, const HParser* p); * * Result token type: None. The HParseResult exists but its AST is NULL. */ -HAMMER_FN_DECL(const HParser*, h_not, const HParser* p); +HAMMER_FN_DECL(HParser*, h_not, const HParser* p); /** * Create a parser that just calls out to another, as yet unknown, @@ -565,7 +573,7 @@ HAMMER_FN_DECL(void, h_pprint, FILE* stream, const HParsedToken* tok, int indent * * Returns -1 if grammar cannot be compiled with the specified options; 0 otherwise. 
*/ -HAMMER_FN_DECL(int, h_compile, const HParser* parser, HParserBackend backend, const void* params); +HAMMER_FN_DECL(int, h_compile, HParser* parser, HParserBackend backend, const void* params); /** * TODO: Document me @@ -590,7 +598,7 @@ const uint8_t* h_bit_writer_get_buffer(HBitWriter* w, size_t *len); void h_bit_writer_free(HBitWriter* w); // {{{ Benchmark functions -HAMMER_FN_DECL(HBenchmarkResults *, h_benchmark, const HParser* parser, HParserTestcase* testcases); +HAMMER_FN_DECL(HBenchmarkResults *, h_benchmark, HParser* parser, HParserTestcase* testcases); void h_benchmark_report(FILE* stream, HBenchmarkResults* results); void h_benchmark_dump_optimized_code(FILE* stream, HBenchmarkResults* results); // }}} diff --git a/src/internal.h b/src/internal.h index ee4281fc4804b4638c4e72f170f0fafc627393b8..e50466281441827988ad5dda5a8268a98d32bef0 100644 --- a/src/internal.h +++ b/src/internal.h @@ -48,7 +48,7 @@ static inline void h_generic_free(HAllocator *allocator, void* ptr) { allocator->free(allocator, ptr); } -HAllocator system_allocator; +extern HAllocator system_allocator; typedef struct HInputStream_ { @@ -131,7 +131,8 @@ struct HParseState_ { typedef struct HParserBackendVTable_ { int (*compile)(HAllocator *mm__, HParser* parser, const void* params); - HParseResult* (*parse)(HAllocator *mm__, const HParser* parser, HParseState* parse_state); + HParseResult* (*parse)(HAllocator *mm__, const HParser* parser, HInputStream* parse_state); + void (*free)(HParser* parser); } HParserBackendVTable; @@ -213,9 +214,10 @@ struct HBitWriter_ { // }}} + // Backends {{{ extern HParserBackendVTable h__packrat_backend_vtable; -extern HParserBackendVTable h__ll_backend_vtable; +extern HParserBackendVTable h__llk_backend_vtable; // }}} // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. 
@@ -295,9 +297,14 @@ struct HParserVtable_ { HParseResult* (*parse)(void *env, HParseState *state); bool (*isValidRegular)(void *env); bool (*isValidCF)(void *env); + bool (*compile_to_rvm)(HRVMProg *prog, void* env); // FIXME: forgot what the bool return value was supposed to mean. HCFChoice* (*desugar)(HAllocator *mm__, void *env); }; +bool h_false(void*); +bool h_true(void*); +bool h_not_regular(HRVMProg*, void*); + #if 0 #include <stdlib.h> #define h_arena_malloc(a, s) malloc(s) diff --git a/src/parsers/action.c b/src/parsers/action.c index d6bcc0c984b26acdac12bbb42bad6f28647bc999..f708881cdc980023b1f0fb81e821ec7678507795 100644 --- a/src/parsers/action.c +++ b/src/parsers/action.c @@ -12,7 +12,7 @@ static HParseResult* parse_action(void *env, HParseState *state) { //HParsedToken *tok = a->action(h_do_parse(a->p, state)); if(tmp) { const HParsedToken *tok = a->action(tmp); - return make_result(state, (HParsedToken*)tok); + return make_result(state->arena, (HParsedToken*)tok); } else return NULL; } else // either the parser's missing or the action's missing @@ -44,18 +44,24 @@ static bool action_isValidCF(void *env) { return a->p->vtable->isValidCF(a->p->env); } +static bool action_ctrvm(HRVMProg *prog, void* env) { + HParseAction *a = (HParseAction*)env; + return a->p->vtable->compile_to_rvm(prog, a->p->env); +} + static const HParserVtable action_vt = { .parse = parse_action, .isValidRegular = action_isValidRegular, .isValidCF = action_isValidCF, .desugar = desugar_action, + .compile_to_rvm = action_ctrvm, }; -const HParser* h_action(const HParser* p, const HAction a) { +HParser* h_action(const HParser* p, const HAction a) { return h_action__m(&system_allocator, p, a); } -const HParser* h_action__m(HAllocator* mm__, const HParser* p, const HAction a) { +HParser* h_action__m(HAllocator* mm__, const HParser* p, const HAction a) { HParseAction *env = h_new(HParseAction, 1); env->p = p; env->action = a; diff --git a/src/parsers/and.c b/src/parsers/and.c index 
f5fadb71790fb57e06265bd6f3a6eb89f9e6a58d..49d43870be4a130655d303c66d10eb7487bb4d40 100644 --- a/src/parsers/and.c +++ b/src/parsers/and.c @@ -5,7 +5,7 @@ static HParseResult *parse_and(void* env, HParseState* state) { HParseResult *res = h_do_parse((HParser*)env, state); state->input_stream = bak; if (res) - return make_result(state, NULL); + return make_result(state->arena, NULL); return NULL; } @@ -22,13 +22,14 @@ static const HParserVtable and_vt = { revision. --mlp, 18/12/12 */ .isValidCF = h_false, /* despite TODO above, this remains false. */ .desugar = desugar_and, + .compile_to_rvm = h_not_regular, }; -const HParser* h_and(const HParser* p) { +HParser* h_and(const HParser* p) { return h_and__m(&system_allocator, p); } -const HParser* h_and__m(HAllocator* mm__, const HParser* p) { +HParser* h_and__m(HAllocator* mm__, const HParser* p) { // zero-width postive lookahead return h_new_parser(mm__, &and_vt, (void *)p); } diff --git a/src/parsers/attr_bool.c b/src/parsers/attr_bool.c index c365f00da125da188ee616d15251e227d194b60a..c63273b62c51e72867591a314c2b279a5ccb7a88 100644 --- a/src/parsers/attr_bool.c +++ b/src/parsers/attr_bool.c @@ -47,18 +47,24 @@ static HCFChoice* desugar_ab(HAllocator *mm__, void *env) { return ret; } +static bool ab_ctrvm(HRVMProg *prog, void *env) { + HAttrBool *ab = (HAttrBool*)env; + return h_compile_regex(prog, ab->p); +} + static const HParserVtable attr_bool_vt = { .parse = parse_attr_bool, .isValidRegular = ab_isValidRegular, .isValidCF = ab_isValidCF, .desugar = desugar_ab, + .compile_to_rvm = ab_ctrvm, }; -const HParser* h_attr_bool(const HParser* p, HPredicate pred) { +HParser* h_attr_bool(const HParser* p, HPredicate pred) { return h_attr_bool__m(&system_allocator, p, pred); } -const HParser* h_attr_bool__m(HAllocator* mm__, const HParser* p, HPredicate pred) { +HParser* h_attr_bool__m(HAllocator* mm__, const HParser* p, HPredicate pred) { HAttrBool *env = h_new(HAttrBool, 1); env->p = p; env->pred = pred; diff --git 
a/src/parsers/bits.c b/src/parsers/bits.c index 54602eaffd742a6fa86b4045b8c6a3e0af763782..ebc95541d47990f37362af24bb1b32532d570892 100644 --- a/src/parsers/bits.c +++ b/src/parsers/bits.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" struct bits_env { @@ -13,7 +14,7 @@ static HParseResult* parse_bits(void* env, HParseState *state) { result->sint = h_read_bits(&state->input_stream, env_->length, true); else result->uint = h_read_bits(&state->input_stream, env_->length, false); - return make_result(state, result); + return make_result(state->arena, result); } static HCFChoice* desugar_bits(HAllocator *mm__, void *env) { @@ -41,16 +42,43 @@ static HCFChoice* desugar_bits(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) { + // BUG: relies un undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit + struct bits_env *env_ = env; + HParsedToken *top = ctx->stack[ctx->stack_count-1]; + assert(top->token_type == TT_BYTES); + uint64_t res = 0; + for (size_t i = 0; i < top->bytes.len; i++) + res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + top->uint = res; // possibly cast to signed through union + top->token_type = (env_->signedp ? 
TT_SINT : TT_UINT); + return true; +} + +static bool bits_ctrvm(HRVMProg *prog, void* env) { + struct bits_env *env_ = (struct bits_env*)env; + h_rvm_insert_insn(prog, RVM_PUSH, 0); + for (size_t i=0; i < env_->length/8; ++i) { // one MATCH/STEP pair per whole byte. FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different + h_rvm_insert_insn(prog, RVM_MATCH, 0xFF00); + h_rvm_insert_insn(prog, RVM_STEP, 0); + } + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_bits, env)); + return true; +} + static const HParserVtable bits_vt = { .parse = parse_bits, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_bits, + .compile_to_rvm = bits_ctrvm, }; -const HParser* h_bits(size_t len, bool sign) { + +HParser* h_bits(size_t len, bool sign) { return h_bits__m(&system_allocator, len, sign); } -const HParser* h_bits__m(HAllocator* mm__, size_t len, bool sign) { +HParser* h_bits__m(HAllocator* mm__, size_t len, bool sign) { struct bits_env *env = h_new(struct bits_env, 1); env->length = len; env->signedp = sign; @@ -58,10 +86,10 @@ const HParser* h_bits__m(HAllocator* mm__, size_t len, bool sign) { } #define SIZED_BITS(name_pre, len, signedp) \ - const HParser* h_##name_pre##len () { \ + HParser* h_##name_pre##len () { \ return h_bits__m(&system_allocator, len, signedp); \ } \ - const HParser* h_##name_pre##len##__m(HAllocator* mm__) { \ + HParser* h_##name_pre##len##__m(HAllocator* mm__) { \ return h_bits__m(mm__, len, signedp); \ } SIZED_BITS(int, 8, true) diff --git a/src/parsers/butnot.c b/src/parsers/butnot.c index 3dec32272ecf1a4e62c88aa3f6556d4c3b4b5cd6..1400e3652d98bd72e932725ebd83c57b63b98e0b 100644 --- a/src/parsers/butnot.c +++ b/src/parsers/butnot.c @@ -43,14 +43,15 @@ static HCFChoice* desugar_butnot(HAllocator *mm__, void *env) { static const HParserVtable butnot_vt = { .parse = parse_butnot, .isValidRegular = h_false, - .isValidCF = h_false, + .isValidCF = h_false, // XXX should this 
be true if both p1 and p2 are CF? .desugar = desugar_butnot, + .compile_to_rvm = h_not_regular, }; -const HParser* h_butnot(const HParser* p1, const HParser* p2) { +HParser* h_butnot(const HParser* p1, const HParser* p2) { return h_butnot__m(&system_allocator, p1, p2); } -const HParser* h_butnot__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { +HParser* h_butnot__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { HTwoParsers *env = h_new(HTwoParsers, 1); env->p1 = p1; env->p2 = p2; diff --git a/src/parsers/ch.c b/src/parsers/ch.c index 82b5e9ac115c96af4e744137864e86d3d86d1ddc..9621869c01286ea7e7664e3d1f6549e94d0e1937 100644 --- a/src/parsers/ch.c +++ b/src/parsers/ch.c @@ -6,7 +6,7 @@ static HParseResult* parse_ch(void* env, HParseState *state) { if (c == r) { HParsedToken *tok = a_new(HParsedToken, 1); tok->token_type = TT_UINT; tok->uint = r; - return make_result(state, tok); + return make_result(state->arena, tok); } else { return NULL; } @@ -20,16 +20,25 @@ static HCFChoice* desugar_ch(HAllocator *mm__, void *env) { return ret; } +static bool ch_ctrvm(HRVMProg *prog, void* env) { + uint8_t c = (uint8_t)(unsigned long)(env); + // TODO: Does this capture anything? 
+ h_rvm_insert_insn(prog, RVM_MATCH, c | (c << 8)); // single-character range: both bound bytes are c (c & c << 8 is always 0) + h_rvm_insert_insn(prog, RVM_STEP, 0); + return true; +} + static const HParserVtable ch_vt = { .parse = parse_ch, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_ch, + .compile_to_rvm = ch_ctrvm, }; -const HParser* h_ch(const uint8_t c) { +HParser* h_ch(const uint8_t c) { return h_ch__m(&system_allocator, c); } -const HParser* h_ch__m(HAllocator* mm__, const uint8_t c) { +HParser* h_ch__m(HAllocator* mm__, const uint8_t c) { return h_new_parser(mm__, &ch_vt, (void *)(uintptr_t)c); } diff --git a/src/parsers/charset.c b/src/parsers/charset.c index 858d299cde14e4b28e3318f99d031e104d7b1445..1b06cce6f044f6045c5c8920f14ddf6039bc0674 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -9,7 +9,7 @@ static HParseResult* parse_charset(void *env, HParseState *state) { if (charset_isset(cs, in)) { HParsedToken *tok = a_new(HParsedToken, 1); tok->token_type = TT_UINT; tok->uint = in; - return make_result(state, tok); + return make_result(state->arena, tok); } else return NULL; } @@ -22,17 +22,38 @@ static HCFChoice* desugar_charset(HAllocator *mm__, void *env) { return ret; } +// FUTURE: this is horribly inefficient +static bool cs_ctrvm(HRVMProg *prog, void *env) { + HCharset cs = (HCharset)env; + uint16_t start = h_rvm_get_ip(prog); + for (size_t i=0; i<256; ++i) { + if (charset_isset(cs, i)) { + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + h_rvm_insert_insn(prog, RVM_MATCH, i | (i << 8)); // single-character range: both bound bytes are i + h_rvm_insert_insn(prog, RVM_GOTO, 0); + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + } + } + uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); + for (size_t i=start; i<jump; ++i) { + if (RVM_GOTO == prog->insns[i].op) + h_rvm_patch_arg(prog, i, jump); + } + return true; +} + static const HParserVtable charset_vt = { .parse = parse_charset, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_charset, + .compile_to_rvm = cs_ctrvm, }; -const HParser* h_ch_range(const uint8_t 
lower, const uint8_t upper) { +HParser* h_ch_range(const uint8_t lower, const uint8_t upper) { return h_ch_range__m(&system_allocator, lower, upper); } -const HParser* h_ch_range__m(HAllocator* mm__, const uint8_t lower, const uint8_t upper) { +HParser* h_ch_range__m(HAllocator* mm__, const uint8_t lower, const uint8_t upper) { HCharset cs = new_charset(mm__); for (int i = 0; i < 256; i++) charset_set(cs, i, (lower <= i) && (i <= upper)); @@ -40,7 +61,7 @@ const HParser* h_ch_range__m(HAllocator* mm__, const uint8_t lower, const uint8_ } -static const HParser* h_in_or_not__m(HAllocator* mm__, const uint8_t *options, size_t count, int val) { +static HParser* h_in_or_not__m(HAllocator* mm__, const uint8_t *options, size_t count, int val) { HCharset cs = new_charset(mm__); for (size_t i = 0; i < 256; i++) charset_set(cs, i, 1-val); @@ -50,19 +71,19 @@ static const HParser* h_in_or_not__m(HAllocator* mm__, const uint8_t *options, s return h_new_parser(mm__, &charset_vt, cs); } -const HParser* h_in(const uint8_t *options, size_t count) { +HParser* h_in(const uint8_t *options, size_t count) { return h_in_or_not__m(&system_allocator, options, count, 1); } -const HParser* h_in__m(HAllocator* mm__, const uint8_t *options, size_t count) { +HParser* h_in__m(HAllocator* mm__, const uint8_t *options, size_t count) { return h_in_or_not__m(mm__, options, count, 1); } -const HParser* h_not_in(const uint8_t *options, size_t count) { +HParser* h_not_in(const uint8_t *options, size_t count) { return h_in_or_not__m(&system_allocator, options, count, 0); } -const HParser* h_not_in__m(HAllocator* mm__, const uint8_t *options, size_t count) { +HParser* h_not_in__m(HAllocator* mm__, const uint8_t *options, size_t count) { return h_in_or_not__m(mm__, options, count, 0); } diff --git a/src/parsers/choice.c b/src/parsers/choice.c index 28394058b9c9e531b02a7e7a5ae1c5e203218a54..9a186252b008f9b9de86f57804277c98080e7ccc 100644 --- a/src/parsers/choice.c +++ b/src/parsers/choice.c @@ -54,34 
+54,53 @@ static HCFChoice* desugar_choice(HAllocator *mm__, void *env) { return ret; } +static bool choice_ctrvm(HRVMProg *prog, void* env) { + HSequence *s = (HSequence*)env; + uint16_t gotos[s->len]; + uint16_t start = h_rvm_get_ip(prog); + for (size_t i=0; i<s->len; ++i) { + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + if (!h_compile_regex(prog, s->p_array[i]->env)) + return false; + gotos[i] = h_rvm_insert_insn(prog, RVM_GOTO, 0); + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + } + uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); + for (size_t i=start; i<s->len; ++i) { + h_rvm_patch_arg(prog, gotos[i], jump); + } + return true; +} + static const HParserVtable choice_vt = { .parse = parse_choice, .isValidRegular = choice_isValidRegular, .isValidCF = choice_isValidCF, .desugar = desugar_choice, + .compile_to_rvm = choice_ctrvm, }; -const HParser* h_choice(const HParser* p, ...) { +HParser* h_choice(const HParser* p, ...) { va_list ap; va_start(ap, p); - const HParser* ret = h_choice__mv(&system_allocator, p, ap); + HParser* ret = h_choice__mv(&system_allocator, p, ap); va_end(ap); return ret; } -const HParser* h_choice__m(HAllocator* mm__, const HParser* p, ...) { +HParser* h_choice__m(HAllocator* mm__, const HParser* p, ...) 
{ va_list ap; va_start(ap, p); - const HParser* ret = h_choice__mv(mm__, p, ap); + HParser* ret = h_choice__mv(mm__, p, ap); va_end(ap); return ret; } -const HParser* h_choice__v(const HParser* p, va_list ap) { +HParser* h_choice__v(const HParser* p, va_list ap) { return h_choice__mv(&system_allocator, p, ap); } -const HParser* h_choice__mv(HAllocator* mm__, const HParser* p, va_list ap_) { +HParser* h_choice__mv(HAllocator* mm__, const HParser* p, va_list ap_) { va_list ap; size_t len = 0; HSequence *s = h_new(HSequence, 1); diff --git a/src/parsers/difference.c b/src/parsers/difference.c index 9302490793f859f85c69759ab832d899bbe8678d..4da0521495a8fc39f415389becf8ba4a3a816b2f 100644 --- a/src/parsers/difference.c +++ b/src/parsers/difference.c @@ -42,14 +42,15 @@ static HCFChoice* desugar_difference(HAllocator *mm__, void *env) { static HParserVtable difference_vt = { .parse = parse_difference, .isValidRegular = h_false, - .isValidCF = h_false, + .isValidCF = h_false, // XXX should this be true if both p1 and p2 are CF? 
.desugar = desugar_difference, + .compile_to_rvm = h_not_regular, }; -const HParser* h_difference(const HParser* p1, const HParser* p2) { +HParser* h_difference(const HParser* p1, const HParser* p2) { return h_difference__m(&system_allocator, p1, p2); } -const HParser* h_difference__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { +HParser* h_difference__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { HTwoParsers *env = h_new(HTwoParsers, 1); env->p1 = p1; env->p2 = p2; diff --git a/src/parsers/end.c b/src/parsers/end.c index 05aeaafb3eee5928f84619249f43bb33b8823063..fa8ab8b35c5d3b016a79769b04f405b175532d57 100644 --- a/src/parsers/end.c +++ b/src/parsers/end.c @@ -17,17 +17,23 @@ static HCFChoice* desugar_end(HAllocator *mm__, void *env) { return &ret; } +static bool end_ctrvm(HRVMProg *prog, void *env) { + h_rvm_insert_insn(prog, RVM_EOF, 0); + return true; +} + static const HParserVtable end_vt = { .parse = parse_end, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_end, + .compile_to_rvm = end_ctrvm, }; -const HParser* h_end_p() { +HParser* h_end_p() { return h_end_p__m(&system_allocator); } -const HParser* h_end_p__m(HAllocator* mm__) { +HParser* h_end_p__m(HAllocator* mm__) { return h_new_parser(mm__, &end_vt, NULL); } diff --git a/src/parsers/epsilon.c b/src/parsers/epsilon.c index 7582ce043fdd0b731a6507004fcbae54ddc50c6f..92e394779ee007105387391d6c348ffe55da5c8a 100644 --- a/src/parsers/epsilon.c +++ b/src/parsers/epsilon.c @@ -8,21 +8,23 @@ static HParseResult* parse_epsilon(void* env, HParseState* state) { return res; } +static bool epsilon_ctrvm(HRVMProg *prog, void* env) { + return true; +} + static const HParserVtable epsilon_vt = { .parse = parse_epsilon, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_epsilon, + .compile_to_rvm = epsilon_ctrvm, }; -static HParser epsilon_p = { - .vtable = &epsilon_vt, - .env = NULL -}; - -const HParser* h_epsilon_p() { - return &epsilon_p; +HParser* 
h_epsilon_p() { + return h_epsilon_p__m(&system_allocator); } -const HParser* h_epsilon_p__m(HAllocator* mm__) { - return &epsilon_p; +HParser* h_epsilon_p__m(HAllocator* mm__) { + HParser *epsilon_p = h_new(HParser, 1); + epsilon_p->vtable = &epsilon_vt; + return epsilon_p; } diff --git a/src/parsers/ignore.c b/src/parsers/ignore.c index 33bccd0154c000f1bea5316b7bde3d5fd5513828..e6768c1c8fa9fd4225adbb67c5a5d37e9eaeb07b 100644 --- a/src/parsers/ignore.c +++ b/src/parsers/ignore.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" static HParseResult* parse_ignore(void* env, HParseState* state) { @@ -25,16 +26,30 @@ static HCFChoice* desugar_ignore(HAllocator *mm__, void *env) { return (h_desugar(mm__, p)); } +static bool h_svm_action_pop(HArena *arena, HSVMContext *ctx, void* arg) { + assert(ctx->stack_count > 0); + ctx->stack_count--; + return true; +} + +static bool ignore_ctrvm(HRVMProg *prog, void *env) { + HParser *p = (HParser*)env; + h_compile_regex(prog, p->env); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_pop, NULL)); + return true; +} + static const HParserVtable ignore_vt = { .parse = parse_ignore, .isValidRegular = ignore_isValidRegular, .isValidCF = ignore_isValidCF, .desugar = desugar_ignore, + .compile_to_rvm = ignore_ctrvm, }; -const HParser* h_ignore(const HParser* p) { +HParser* h_ignore(const HParser* p) { return h_ignore__m(&system_allocator, p); } -const HParser* h_ignore__m(HAllocator* mm__, const HParser* p) { +HParser* h_ignore__m(HAllocator* mm__, const HParser* p) { return h_new_parser(mm__, &ignore_vt, (void *)p); } diff --git a/src/parsers/ignoreseq.c b/src/parsers/ignoreseq.c index 822920b1b1ad18450addb04aa7f63ba4ebd2f332..d7f4743be1cba3e71970b5f33c1b9c7894e5a6dc 100644 --- a/src/parsers/ignoreseq.c +++ b/src/parsers/ignoreseq.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" @@ -5,7 +6,7 @@ // general case: parse sequence, pick one result // -typedef struct { +typedef 
struct HIgnoreSeq_ { const HParser **parsers; size_t len; // how many parsers in 'ps' size_t which; // whose result to return @@ -61,11 +62,40 @@ static bool is_isValidCF(void *env) { return true; } +static bool h_svm_action_ignoreseq(HArena *arena, HSVMContext *ctx, void* env) { + HIgnoreSeq *seq = (HIgnoreSeq*)env; + HParsedToken* save; + // We can assume that each subitem generated at most one item on the + // stack. + assert(seq->len >= 1); + for (int i = seq->len - 1; i>=0; i--) { + if (i == (int)seq->which && ctx->stack[ctx->stack_count]->token_type != TT_MARK) + save = ctx->stack[ctx->stack_count-1]; + // skip over everything up to and including the mark. + while (ctx->stack[--ctx->stack_count]->token_type != TT_MARK) + ; + } + ctx->stack[ctx->stack_count++] = save; + return true; +} + +static bool is_ctrvm(HRVMProg *prog, void* env) { + HIgnoreSeq *seq = (HIgnoreSeq*)env; + for (size_t i=0; i<seq->len; ++i) { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, seq->parsers[i]->env)) + return false; + } + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ignoreseq, env)); + return true; +} + static const HParserVtable ignoreseq_vt = { .parse = parse_ignoreseq, .isValidRegular = is_isValidRegular, .isValidCF = is_isValidCF, .desugar = desugar_ignoreseq, + .compile_to_rvm = is_ctrvm, }; @@ -73,7 +103,7 @@ static const HParserVtable ignoreseq_vt = { // API frontends // -static const HParser* h_leftright__m(HAllocator* mm__, const HParser* p, const HParser* q, size_t which) { +static HParser* h_leftright__m(HAllocator* mm__, const HParser* p, const HParser* q, size_t which) { HIgnoreSeq *seq = h_new(HIgnoreSeq, 1); seq->parsers = h_new(const HParser*, 2); seq->parsers[0] = p; @@ -84,25 +114,25 @@ static const HParser* h_leftright__m(HAllocator* mm__, const HParser* p, const H return h_new_parser(mm__, &ignoreseq_vt, seq); } -const HParser* h_left(const HParser* p, const HParser* q) { +HParser* h_left(const HParser* p, 
const HParser* q) { return h_leftright__m(&system_allocator, p, q, 0); } -const HParser* h_left__m(HAllocator* mm__, const HParser* p, const HParser* q) { +HParser* h_left__m(HAllocator* mm__, const HParser* p, const HParser* q) { return h_leftright__m(mm__, p, q, 0); } -const HParser* h_right(const HParser* p, const HParser* q) { +HParser* h_right(const HParser* p, const HParser* q) { return h_leftright__m(&system_allocator, p, q, 1); } -const HParser* h_right__m(HAllocator* mm__, const HParser* p, const HParser* q) { +HParser* h_right__m(HAllocator* mm__, const HParser* p, const HParser* q) { return h_leftright__m(mm__, p, q, 1); } -const HParser* h_middle(const HParser* p, const HParser* x, const HParser* q) { +HParser* h_middle(const HParser* p, const HParser* x, const HParser* q) { return h_middle__m(&system_allocator, p, x, q); } -const HParser* h_middle__m(HAllocator* mm__, const HParser* p, const HParser* x, const HParser* q) { +HParser* h_middle__m(HAllocator* mm__, const HParser* p, const HParser* x, const HParser* q) { HIgnoreSeq *seq = h_new(HIgnoreSeq, 1); seq->parsers = h_new(const HParser*, 3); seq->parsers[0] = p; diff --git a/src/parsers/indirect.c b/src/parsers/indirect.c index 77227d2c501d49db84599106d41a066b05aab940..cacfd04195b2f9a71f8b0d255032f5bde27fa150 100644 --- a/src/parsers/indirect.c +++ b/src/parsers/indirect.c @@ -21,6 +21,7 @@ static const HParserVtable indirect_vt = { .isValidRegular = h_false, .isValidCF = indirect_isValidCF, .desugar = desugar_indirect, + .compile_to_rvm = h_not_regular, }; void h_bind_indirect(HParser* indirect, const HParser* inner) { diff --git a/src/parsers/int_range.c b/src/parsers/int_range.c index 50149b2e5e271368501b9355a33d0b7ef94bc4d4..d67a786f4bad8314bcc643f3ee7be12999713a2a 100644 --- a/src/parsers/int_range.c +++ b/src/parsers/int_range.c @@ -121,17 +121,39 @@ static HCFChoice* desugar_int_range(HAllocator *mm__, void *env) { return gen_int_range(mm__, r->lower, r->upper, bytes); } +bool 
h_svm_action_validate_int_range(HArena *arena, HSVMContext *ctx, void* env) { + HRange *r_env = (HRange*)env; + HParsedToken *head = ctx->stack[ctx->stack_count-1]; + switch (head-> token_type) { + case TT_SINT: + return head->sint >= r_env->lower && head->sint <= r_env->upper; + case TT_UINT: + return head->uint >= (uint64_t)r_env->lower && head->uint <= (uint64_t)r_env->upper; + default: + return false; + } +} + +static bool ir_ctrvm(HRVMProg *prog, void *env) { + HRange *r_env = (HRange*)env; + + h_compile_regex(prog, r_env->p); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_validate_int_range, env)); + return false; +} + static const HParserVtable int_range_vt = { .parse = parse_int_range, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_int_range, + .compile_to_rvm = ir_ctrvm, }; -const HParser* h_int_range(const HParser *p, const int64_t lower, const int64_t upper) { +HParser* h_int_range(const HParser *p, const int64_t lower, const int64_t upper) { return h_int_range__m(&system_allocator, p, lower, upper); } -const HParser* h_int_range__m(HAllocator* mm__, const HParser *p, const int64_t lower, const int64_t upper) { +HParser* h_int_range__m(HAllocator* mm__, const HParser *p, const int64_t lower, const int64_t upper) { // p must be an integer parser, which means it's using parse_bits // TODO: re-add this check //assert_message(p->vtable == &bits_vt, "int_range requires an integer parser"); diff --git a/src/parsers/many.c b/src/parsers/many.c index 2ce89f13071fccb3bf00a7e933fbb6257753f696..1e3850f0ba78be8abc8aca657cfc67807f8253cb 100644 --- a/src/parsers/many.c +++ b/src/parsers/many.c @@ -33,7 +33,7 @@ static HParseResult *parse_many(void* env, HParseState *state) { HParsedToken *res = a_new(HParsedToken, 1); res->token_type = TT_SEQUENCE; res->seq = seq; - return make_result(state, res); + return make_result(state->arena, res); err0: if (count >= env_->count) { state->input_stream = bak; @@ -112,17 +112,40 @@ 
static HCFChoice* desugar_many(HAllocator *mm__, void *env) { return ma; } +static bool many_ctrvm(HRVMProg *prog, void *env) { + HRepeat *repeat = (HRepeat*)env; + // FIXME: Implement clear_to_mark + uint16_t clear_to_mark = h_rvm_create_action(prog, h_svm_action_clear_to_mark, NULL); + h_rvm_insert_insn(prog, RVM_PUSH, 0); + // TODO: implement min and max properly. Right now, it's always min==0, max==inf + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + if (!h_compile_regex(prog, repeat->p)) + return false; + if (repeat->sep != NULL) { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, repeat->sep)) + return false; + h_rvm_insert_insn(prog, RVM_ACTION, clear_to_mark); + } + h_rvm_insert_insn(prog, RVM_GOTO, insn); + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); + return true; +} + static const HParserVtable many_vt = { .parse = parse_many, .isValidRegular = many_isValidRegular, .isValidCF = many_isValidCF, .desugar = desugar_many, + .compile_to_rvm = many_ctrvm, }; -const HParser* h_many(const HParser* p) { +HParser* h_many(const HParser* p) { return h_many__m(&system_allocator, p); } -const HParser* h_many__m(HAllocator* mm__, const HParser* p) { +HParser* h_many__m(HAllocator* mm__, const HParser* p) { HRepeat *env = h_new(HRepeat, 1); env->p = p; env->sep = h_epsilon_p__m(mm__); @@ -131,10 +154,10 @@ const HParser* h_many__m(HAllocator* mm__, const HParser* p) { return h_new_parser(mm__, &many_vt, env); } -const HParser* h_many1(const HParser* p) { +HParser* h_many1(const HParser* p) { return h_many1__m(&system_allocator, p); } -const HParser* h_many1__m(HAllocator* mm__, const HParser* p) { +HParser* h_many1__m(HAllocator* mm__, const HParser* p) { HRepeat *env = h_new(HRepeat, 1); env->p = p; env->sep = h_epsilon_p__m(mm__); @@ -143,10 +166,10 @@ const HParser* h_many1__m(HAllocator* mm__, const HParser* p) { return 
h_new_parser(mm__, &many_vt, env); } -const HParser* h_repeat_n(const HParser* p, const size_t n) { +HParser* h_repeat_n(const HParser* p, const size_t n) { return h_repeat_n__m(&system_allocator, p, n); } -const HParser* h_repeat_n__m(HAllocator* mm__, const HParser* p, const size_t n) { +HParser* h_repeat_n__m(HAllocator* mm__, const HParser* p, const size_t n) { HRepeat *env = h_new(HRepeat, 1); env->p = p; env->sep = h_epsilon_p__m(mm__); @@ -155,10 +178,10 @@ const HParser* h_repeat_n__m(HAllocator* mm__, const HParser* p, const size_t n) return h_new_parser(mm__, &many_vt, env); } -const HParser* h_sepBy(const HParser* p, const HParser* sep) { +HParser* h_sepBy(const HParser* p, const HParser* sep) { return h_sepBy__m(&system_allocator, p, sep); } -const HParser* h_sepBy__m(HAllocator* mm__, const HParser* p, const HParser* sep) { +HParser* h_sepBy__m(HAllocator* mm__, const HParser* p, const HParser* sep) { HRepeat *env = h_new(HRepeat, 1); env->p = p; env->sep = sep; @@ -167,10 +190,10 @@ const HParser* h_sepBy__m(HAllocator* mm__, const HParser* p, const HParser* sep return h_new_parser(mm__, &many_vt, env); } -const HParser* h_sepBy1(const HParser* p, const HParser* sep) { +HParser* h_sepBy1(const HParser* p, const HParser* sep) { return h_sepBy1__m(&system_allocator, p, sep); } -const HParser* h_sepBy1__m(HAllocator* mm__, const HParser* p, const HParser* sep) { +HParser* h_sepBy1__m(HAllocator* mm__, const HParser* p, const HParser* sep) { HRepeat *env = h_new(HRepeat, 1); env->p = p; env->sep = sep; @@ -213,10 +236,10 @@ static const HParserVtable length_value_vt = { .desugar = desugar_length_value, }; -const HParser* h_length_value(const HParser* length, const HParser* value) { +HParser* h_length_value(const HParser* length, const HParser* value) { return h_length_value__m(&system_allocator, length, value); } -const HParser* h_length_value__m(HAllocator* mm__, const HParser* length, const HParser* value) { +HParser* h_length_value__m(HAllocator* mm__, 
const HParser* length, const HParser* value) { HLenVal *env = h_new(HLenVal, 1); env->length = length; env->value = value; diff --git a/src/parsers/not.c b/src/parsers/not.c index 523b92a60d6c9ecc9bc46c0119dbee81554cd61c..61b632214f60e7b9664e661241b507e98366a77c 100644 --- a/src/parsers/not.c +++ b/src/parsers/not.c @@ -6,7 +6,7 @@ static HParseResult* parse_not(void* env, HParseState* state) { return NULL; else { state->input_stream = bak; - return make_result(state, NULL); + return make_result(state->arena, NULL); } } @@ -20,11 +20,12 @@ static const HParserVtable not_vt = { .isValidRegular = h_false, /* see and.c for why */ .isValidCF = h_false, /* also see and.c for why */ .desugar = desugar_not, + .compile_to_rvm = h_not_regular, // Is actually regular, but the generation step is currently unable to handle it. TODO: fix this. }; -const HParser* h_not(const HParser* p) { +HParser* h_not(const HParser* p) { return h_not__m(&system_allocator, p); } -const HParser* h_not__m(HAllocator* mm__, const HParser* p) { +HParser* h_not__m(HAllocator* mm__, const HParser* p) { return h_new_parser(mm__, &not_vt, (void *)p); } diff --git a/src/parsers/nothing.c b/src/parsers/nothing.c index b204acaef409d063e81d4c1ac075db6990b415a7..645a2137cbdec111c94839848a2217395d58459e 100644 --- a/src/parsers/nothing.c +++ b/src/parsers/nothing.c @@ -1,6 +1,5 @@ #include "parser_internal.h" - static HParseResult* parse_nothing() { // not a mistake, this parser always fails return NULL; @@ -15,16 +14,23 @@ static HCFChoice *desugar_nothing(HAllocator *mm__, void *env) { return ret; } +static bool nothing_ctrvm(HRVMProg *prog, void* env) { + h_rvm_insert_insn(prog, RVM_MATCH, 0x0000); + h_rvm_insert_insn(prog, RVM_MATCH, 0xFFFF); + return true; +} + static const HParserVtable nothing_vt = { .parse = parse_nothing, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_nothing, + .compile_to_rvm = nothing_ctrvm, }; -const HParser* h_nothing_p() { +HParser* h_nothing_p() { return 
h_nothing_p__m(&system_allocator); } -const HParser* h_nothing_p__m(HAllocator* mm__) { +HParser* h_nothing_p__m(HAllocator* mm__) { return h_new_parser(mm__, &nothing_vt, NULL); } diff --git a/src/parsers/optional.c b/src/parsers/optional.c index 27199ab2f5aefceb8cade25e7c2fa1c7d4a0dbd2..9c8c991a41527a2147082c1816f07406c7da393f 100644 --- a/src/parsers/optional.c +++ b/src/parsers/optional.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" static HParseResult* parse_optional(void* env, HParseState* state) { @@ -8,7 +9,7 @@ static HParseResult* parse_optional(void* env, HParseState* state) { state->input_stream = bak; HParsedToken *ast = a_new(HParsedToken, 1); ast->token_type = TT_NONE; - return make_result(state, ast); + return make_result(state->arena, ast); } static bool opt_isValidRegular(void *env) { @@ -26,17 +27,40 @@ static HCFChoice* desugar_optional(HAllocator *mm__, void *env) { return h_desugar(mm__, p); } +static bool h_svm_action_optional(HArena *arena, HSVMContext *ctx, void *env) { + if (ctx->stack[ctx->stack_count-1]->token_type == TT_MARK) { + ctx->stack[ctx->stack_count-1]->token_type = TT_NONE; + } else { + ctx->stack_count--; + assert(ctx->stack[ctx->stack_count-1]->token_type == TT_MARK); + ctx->stack[ctx->stack_count-1] = ctx->stack[ctx->stack_count]; + } + return true; +} + +static bool opt_ctrvm(HRVMProg *prog, void* env) { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + HParser *p = (HParser*) env; + if (!h_compile_regex(prog, p->env)) + return false; + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_optional, NULL)); + return true; +} + static const HParserVtable optional_vt = { .parse = parse_optional, .isValidRegular = opt_isValidRegular, .isValidCF = opt_isValidCF, .desugar = desugar_optional, + .compile_to_rvm = opt_ctrvm, }; -const HParser* h_optional(const HParser* p) { +HParser* 
h_optional(const HParser* p) { return h_optional__m(&system_allocator, p); } -const HParser* h_optional__m(HAllocator* mm__, const HParser* p) { +HParser* h_optional__m(HAllocator* mm__, const HParser* p) { // TODO: re-add this //assert_message(p->vtable != &ignore_vt, "Thou shalt ignore an option, rather than the other way 'round."); return h_new_parser(mm__, &optional_vt, (void *)p); diff --git a/src/parsers/parser_internal.h b/src/parsers/parser_internal.h index 7e5bb066ee469357f8792311782e64a2d71b5959..3a09cd426073ade6ca11177293a434d97d37807c 100644 --- a/src/parsers/parser_internal.h +++ b/src/parsers/parser_internal.h @@ -2,15 +2,16 @@ #define HAMMER_PARSE_INTERNAL__H #include "../hammer.h" #include "../internal.h" +#include "../backends/regex.h" #define a_new_(arena, typ, count) ((typ*)h_arena_malloc((arena), sizeof(typ)*(count))) #define a_new(typ, count) a_new_(state->arena, typ, count) // we can create a_new0 if necessary. It would allocate some memory and immediately zero it out. -static inline HParseResult* make_result(HParseState *state, HParsedToken *tok) { - HParseResult *ret = a_new(HParseResult, 1); +static inline HParseResult* make_result(HArena *arena, HParsedToken *tok) { + HParseResult *ret = h_arena_malloc(arena, sizeof(HParseResult)); ret->ast = tok; - ret->arena = state->arena; + ret->arena = arena; return ret; } @@ -23,9 +24,6 @@ static inline size_t token_length(HParseResult *pr) { } } -static inline bool h_true(void *env) { return true; } -static inline bool h_false(void *env) { return false; } - /* Epsilon rules happen during desugaring. This handles them. 
*/ static inline HCFChoice* desugar_epsilon(HAllocator *mm__, void *env) { static HCFChoice *res_seq_l[] = {NULL}; diff --git a/src/parsers/sequence.c b/src/parsers/sequence.c index e5d1da985e7e26921fb4244e6c10777a92209dcd..088e2c401bc4fcaa22349184d243e902eb07ed39 100644 --- a/src/parsers/sequence.c +++ b/src/parsers/sequence.c @@ -21,7 +21,7 @@ static HParseResult* parse_sequence(void *env, HParseState *state) { } HParsedToken *tok = a_new(HParsedToken, 1); tok->token_type = TT_SEQUENCE; tok->seq = seq; - return make_result(state, tok); + return make_result(state->arena, tok); } static bool sequence_isValidRegular(void *env) { @@ -59,34 +59,46 @@ static HCFChoice* desugar_sequence(HAllocator *mm__, void *env) { return ret; } +static bool sequence_ctrvm(HRVMProg *prog, void *env) { + HSequence *s = (HSequence*)env; + h_rvm_insert_insn(prog, RVM_PUSH, 0); + for (size_t i=0; i<s->len; ++i) { + if (!s->p_array[i]->vtable->compile_to_rvm(prog, s->p_array[i]->env)) + return false; + } + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); + return true; +} + static const HParserVtable sequence_vt = { .parse = parse_sequence, .isValidRegular = sequence_isValidRegular, .isValidCF = sequence_isValidCF, .desugar = desugar_sequence, + .compile_to_rvm = sequence_ctrvm, }; -const HParser* h_sequence(const HParser* p, ...) { +HParser* h_sequence(const HParser* p, ...) { va_list ap; va_start(ap, p); - const HParser* ret = h_sequence__mv(&system_allocator, p, ap); + HParser* ret = h_sequence__mv(&system_allocator, p, ap); va_end(ap); return ret; } -const HParser* h_sequence__m(HAllocator* mm__, const HParser* p, ...) { +HParser* h_sequence__m(HAllocator* mm__, const HParser* p, ...) 
{ va_list ap; va_start(ap, p); - const HParser* ret = h_sequence__mv(mm__, p, ap); + HParser* ret = h_sequence__mv(mm__, p, ap); va_end(ap); return ret; } -const HParser* h_sequence__v(const HParser* p, va_list ap) { +HParser* h_sequence__v(const HParser* p, va_list ap) { return h_sequence__mv(&system_allocator, p, ap); } -const HParser* h_sequence__mv(HAllocator* mm__, const HParser *p, va_list ap_) { +HParser* h_sequence__mv(HAllocator* mm__, const HParser *p, va_list ap_) { va_list ap; size_t len = 0; const HParser *arg; diff --git a/src/parsers/token.c b/src/parsers/token.c index 610a276a49886ae5350749979a0b6a8da7d25d80..9eed0233d4966e96ed98bf2858f4a80e45b749dc 100644 --- a/src/parsers/token.c +++ b/src/parsers/token.c @@ -15,9 +15,10 @@ static HParseResult* parse_token(void *env, HParseState *state) { } HParsedToken *tok = a_new(HParsedToken, 1); tok->token_type = TT_BYTES; tok->bytes.token = t->str; tok->bytes.len = t->len; - return make_result(state, tok); + return make_result(state->arena, tok); } + static HCFChoice* desugar_token(HAllocator *mm__, void *env) { HToken *tok = (HToken*)env; HCFSequence *seq = h_new(HCFSequence, 1); @@ -37,17 +38,29 @@ static HCFChoice* desugar_token(HAllocator *mm__, void *env) { return ret; } +static bool token_ctrvm(HRVMProg *prog, void *env) { + HToken *t = (HToken*)env; + h_rvm_insert_insn(prog, RVM_PUSH, 0); + for (int i=0; i<t->len; ++i) { + h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] & t->str[i] << 8); + h_rvm_insert_insn(prog, RVM_STEP, 0); + } + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + return true; +} + const HParserVtable token_vt = { .parse = parse_token, .isValidRegular = h_true, .isValidCF = h_true, .desugar = desugar_token, + .compile_to_rvm = token_ctrvm, }; -const HParser* h_token(const uint8_t *str, const size_t len) { +HParser* h_token(const uint8_t *str, const size_t len) { return h_token__m(&system_allocator, str, len); } -const HParser* h_token__m(HAllocator* mm__, const uint8_t *str, const size_t len) 
{ +HParser* h_token__m(HAllocator* mm__, const uint8_t *str, const size_t len) { HToken *t = h_new(HToken, 1); t->str = (uint8_t*)str, t->len = len; return h_new_parser(mm__, &token_vt, t); diff --git a/src/parsers/unimplemented.c b/src/parsers/unimplemented.c index fcfe821c60e2a8072c9b977edc482237c26cc5ca..3b8a8236f05e8a2c9912faabddfe8d46715f7e6a 100644 --- a/src/parsers/unimplemented.c +++ b/src/parsers/unimplemented.c @@ -22,6 +22,7 @@ static const HParserVtable unimplemented_vt = { .isValidRegular = h_false, .isValidCF = h_false, .desugar = desugar_unimplemented, + .compile_to_rvm = h_not_regular, }; static HParser unimplemented = { diff --git a/src/parsers/whitespace.c b/src/parsers/whitespace.c index 29e4ec70ea045ab842a5abb855f3325fd373bc43..b2547010bc9a19137ac4f6043f58e02dfea2a8e2 100644 --- a/src/parsers/whitespace.c +++ b/src/parsers/whitespace.c @@ -49,16 +49,32 @@ static bool ws_isValidCF(void *env) { return p->vtable->isValidCF(p->env); } +static bool ws_ctrvm(HRVMProg *prog, void *env) { + HParser *p = (HParser*)env; + uint16_t start = h_rvm_get_ip(prog); + uint16_t next; + const char SPACE_CHRS[6] = {' ', '\f', '\n', '\r', '\t', '\v'}; + + for (int i = 0; i < 6; i++) { + next = h_rvm_insert_insn(prog, RVM_FORK, 0); + h_rvm_insert_insn(prog, RVM_MATCH, (SPACE_CHRS[i] << 8) | (SPACE_CHRS[i])); + h_rvm_insert_insn(prog, RVM_GOTO, start); + h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog)); + } + return h_compile_regex(prog, p->env); +} + static const HParserVtable whitespace_vt = { .parse = parse_whitespace, .isValidRegular = ws_isValidRegular, .isValidCF = ws_isValidCF, .desugar = desugar_whitespace, + .compile_to_rvm = ws_ctrvm, }; -const HParser* h_whitespace(const HParser* p) { +HParser* h_whitespace(const HParser* p) { return h_whitespace__m(&system_allocator, p); } -const HParser* h_whitespace__m(HAllocator* mm__, const HParser* p) { +HParser* h_whitespace__m(HAllocator* mm__, const HParser* p) { return h_new_parser(mm__, &whitespace_vt, (void *)p); 
} diff --git a/src/parsers/xor.c b/src/parsers/xor.c index 006fd9cbc5d3cb80a6f2f028c13bba4ea404fbe8..ccd37fdcdf19e934f75f6bac1195d064ec7eba72 100644 --- a/src/parsers/xor.c +++ b/src/parsers/xor.c @@ -39,14 +39,15 @@ static HCFChoice* desugar_xor(HAllocator *mm__, void *env) { static const HParserVtable xor_vt = { .parse = parse_xor, .isValidRegular = h_false, - .isValidCF = h_false, + .isValidCF = h_false, // XXX should this be true if both p1 and p2 are CF? .desugar = desugar_xor, + .compile_to_rvm = h_not_regular, }; -const HParser* h_xor(const HParser* p1, const HParser* p2) { +HParser* h_xor(const HParser* p1, const HParser* p2) { return h_xor__m(&system_allocator, p1, p2); } -const HParser* h_xor__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { +HParser* h_xor__m(HAllocator* mm__, const HParser* p1, const HParser* p2) { HTwoParsers *env = h_new(HTwoParsers, 1); env->p1 = p1; env->p2 = p2; diff --git a/src/system_allocator.c b/src/system_allocator.c index 7248fd2ff35efd24729766c1ba394cbdff4ab2e2..80d7acf2be1d7dacd250d60fbd3a4338ed99c1af 100644 --- a/src/system_allocator.c +++ b/src/system_allocator.c @@ -1,16 +1,27 @@ +#include <string.h> #include <stdlib.h> #include "internal.h" static void* system_alloc(HAllocator *allocator, size_t size) { - return malloc(size); + + void* ptr = calloc(size + sizeof(size_t), 1); + *(size_t*)ptr = size; + return ptr + sizeof(size_t); } static void* system_realloc(HAllocator *allocator, void* ptr, size_t size) { - return realloc(ptr, size); + if (ptr == NULL) + return system_alloc(allocator, size); + ptr = realloc(ptr - sizeof(size_t), size + sizeof(size_t)); + size_t old_size = *(size_t*)ptr; + *(size_t*)ptr = size; + if (size > old_size) + memset(ptr+sizeof(size_t)+old_size, 0, size - old_size); + return ptr + sizeof(size_t); } static void system_free(HAllocator *allocator, void* ptr) { - free(ptr); + free(ptr - sizeof(size_t)); } HAllocator system_allocator = { diff --git a/src/t_benchmark.c b/src/t_benchmark.c 
index 60d22c5131ef2a458c4a1faa4ca08e3e3788a033..5cabbe58cc1e6b241c7c7d5ece71e0ac654359d2 100644 --- a/src/t_benchmark.c +++ b/src/t_benchmark.c @@ -11,7 +11,7 @@ HParserTestcase testcases[] = { }; static void test_benchmark_1() { - const HParser *parser = h_sepBy1(h_choice(h_ch('1'), h_ch('2'), h_ch('3'), NULL), h_ch(',')); + HParser *parser = h_sepBy1(h_choice(h_ch('1'), h_ch('2'), h_ch('3'), NULL), h_ch(',')); HBenchmarkResults *res = h_benchmark(parser, testcases); h_benchmark_report(stderr, res); diff --git a/src/test_suite.h b/src/test_suite.h index c4212b255b04228b899d70d95b11af82e895779a..7c2834f5c61b9bcc57339e2e36a0922e40de00f2 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -52,6 +52,28 @@ } \ } while(0) +#define g_check_regular(lang) do { \ + if (!lang->isValidRegular(lang->env)) { \ + g_test_message("Language is not regular"); \ + g_test_fail(); \ + } \ + } while(0) + +#define g_check_contextfree(lang) do { \ + if (!lang->isValidCF(lang->env)) { \ + g_test_message("Language is not context-free"); \ + g_test_fail(); \ + } \ + } while(0) + +#define g_check_compilable(lang, backend, params) do { \ + if (!h_compile(lang, backend, params)) { \ + g_test_message("Language is not %s(%s)", #backend, params); \ + g_test_fail(); \ + } \ + } while(0) + + // TODO: replace uses of this with g_check_parse_failed #define g_check_failed(res) do { \ const HParseResult *result = (res); \ @@ -77,14 +99,14 @@ } else { \ char* cres = h_write_result_unamb(res->ast); \ g_check_string(cres, ==, result); \ - g_free(cres); \ + system_allocator.free(&system_allocator, cres); \ HArenaStats stats; \ h_allocator_stats(res->arena, &stats); \ g_test_message("Parse used %zd bytes, wasted %zd bytes. " \ "Inefficiency: %5f%%", \ stats.used, stats.wasted, \ stats.wasted * 100. 
/ (stats.used+stats.wasted)); \ - h_delete_arena(res->arena); \ + h_delete_arena(res->arena); \ } \ } while(0) @@ -149,4 +171,5 @@ #define g_check_cmpdouble(n1, op, n2) g_check_inttype("%g", double, n1, op, n2) + #endif // #ifndef HAMMER_TEST_SUITE__H