diff --git a/Makefile b/Makefile index 6c8f38633fde1688eec11b88611940852922c3b3..09aa037b487ff0c210810246275a77a76c882fdd 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,8 @@ SUBDIRS = src examples jni include config.mk +TOPLEVEL=. +include common.mk CONFIG_VARS= INCLUDE_TESTS diff --git a/README.md b/README.md index 91ee36c02e214e266e0d4a333abbd9b7823f637c..c1c1293c2c5ff954bf589c9b93885640c285174a 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,8 @@ Features * Parsing backends: * Packrat parsing * LL(k) - * GLR (not yet implemented) - * LALR(8) (not yet implemented) + * GLR + * LALR * Regular expressions * Language bindings: * C++ (not yet implemented) @@ -28,7 +28,7 @@ Features Installing ========== ### Prerequisites -* make +* SCons * a JDK ### Optional Dependencies @@ -36,11 +36,15 @@ Installing * glib-2.0 (>= 2.29) (for `make test`) * glib-2.0-dev (for `make test`) -To install, type `make`. To run the built-in test suite, type `make test`. +To build, type `scons`. To run the built-in test suite, type `scons test`. For a debug build, add `--variant=debug`. -If jni.h and jni_md.h aren't already somewhere on your include path, prepend `C_INCLUDE_PATH=/path/to/jdk/include` to that. +If jni.h and jni_md.h aren't already somewhere on your include path, prepend +`C_INCLUDE_PATH=/path/to/jdk/include` to that. -There is not currently a `make install` target; to make Hammer available system-wide, copy `libhammer.a` to `/usr/lib/` (or `/usr/local/lib/`, or wherever ld will find it) and `hammer.h` to `/usr/include/`. +There is currently no `install` target; to make Hammer available system-wide, +copy `libhammer.a` and `libhammer.so` from `build/opt/src` to `/usr/lib/` (or +`/usr/local/lib/`, or wherever ld will find them) and `hammer.h` to +`/usr/include/`. 
Usage ===== diff --git a/SConstruct b/SConstruct index 10bcdecd437b0842162dc6a643fe487aa13242cc..c652b808abd60ac09dcb2fa4e41160ef4240708e 100644 --- a/SConstruct +++ b/SConstruct @@ -8,7 +8,7 @@ AddOption("--variant", dest="variant", nargs=1, type="choice", choices=["debug", "opt"], - default="debug", + default="opt", action="store", help="Build variant (debug or opt)") @@ -28,3 +28,5 @@ Export('env') env.SConscript(["src/SConscript"], variant_dir='build/$VARIANT/src') env.SConscript(["examples/SConscript"], variant_dir='build/$VARIANT/examples') + +env.Command('test', 'build/$VARIANT/src/test_suite', 'env LD_LIBRARY_PATH=build/$VARIANT/src $SOURCE') \ No newline at end of file diff --git a/common.mk b/common.mk index 26734952f3cde74e9b9b42c9c1330653ad4a3ef1..e98d3a2b2330f38ce1922e2d227269e55f18688d 100644 --- a/common.mk +++ b/common.mk @@ -1,3 +1,7 @@ +ifneq ($(REALLY_USE_OBSOLETE_BUILD_SYSTEM),yes) +$(error This is the old build system. Use "scons" to build, or use $(MAKE) REALLY_USE_OBSOLETE_BUILD_SYSTEM=yes) +endif + # Check to make sure variables are properly set ifeq ($(TOPLEVEL),) $(error $$TOPLEVEL is unset) diff --git a/examples/rr.c b/examples/rr.c index 2ba85341d0f444924f9801656eeb8fa94728ac3e..dd250637a7eaccf1074f9441bb6d273452cc130d 100644 --- a/examples/rr.c +++ b/examples/rr.c @@ -181,7 +181,7 @@ HParser* init_rdata(uint16_t type) { parsers[16] = txt; // All parsers must consume their input exactly. 
- for(uint16_t i; i<sizeof(parsers); i++) { + for(uint16_t i = 0; i<RDATA_TYPE_MAX+1; i++) { if(parsers[i]) { parsers[i] = h_action(h_sequence(parsers[i], h_end_p(), NULL), act_index0); diff --git a/src/Makefile b/src/Makefile index 7fac881ae0882fb0d436005d65dd63125914af4c..9ce6d9f8314ddbf22f1083c880dad86a349b82d9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -27,8 +27,10 @@ PARSERS := \ BACKENDS := \ packrat \ + regex \ llk \ - regex + lalr \ + glr HAMMER_PARTS := \ bitreader.o \ @@ -42,6 +44,8 @@ HAMMER_PARTS := \ benchmark.o \ cfgrammar.o \ glue.o \ + backends/lr.o \ + backends/lr0.o \ $(PARSERS:%=parsers/%.o) \ $(BACKENDS:%=backends/%.o) diff --git a/src/SConscript b/src/SConscript index 70868a41aa6e354b18ff972a1b6d02fda0302a18..e87a038cb85568a2d23843be1c52ec0a7ee46c46 100644 --- a/src/SConscript +++ b/src/SConscript @@ -1,3 +1,4 @@ +# -*- python -*- Import('env') parsers = ['parsers/%s.c'%s for s in @@ -27,7 +28,7 @@ parsers = ['parsers/%s.c'%s for s in 'xor']] backends = ['backends/%s.c' % s for s in - ['packrat', 'llk', 'regex']] + ['packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0']] misc_hammer_parts = [ 'allocator.c', diff --git a/src/allocator.c b/src/allocator.c index e345c875317716da362e5b8ca9ed35298dc32e59..80fa92172eb9f0c785fdc4f94960d34cf6d0ff75 100644 --- a/src/allocator.c +++ b/src/allocator.c @@ -65,10 +65,10 @@ void* h_arena_malloc(HArena *arena, size_t size) { if (size <= arena->head->free) { // fast path.. void* ret = arena->head->rest + arena->head->used; - arena->used += size + 1; + arena->used += size; arena->wasted -= size; - arena->head->used += size + 1; - arena->head->free -= size + 1; + arena->head->used += size; + arena->head->free -= size; return ret; } else if (size > arena->block_size) { // We need a new, dedicated block for it, because it won't fit in a standard sized one. 
diff --git a/src/backends/contextfree.h b/src/backends/contextfree.h index 9c2ec4598cb703ba773c4d515b90a2165a73c336..b387e55df21387d4be137d7ff159889de50985ba 100644 --- a/src/backends/contextfree.h +++ b/src/backends/contextfree.h @@ -11,6 +11,8 @@ struct HCFStack_ { int count; int cap; HCFChoice *last_completed; // Last completed choice. + // XXX is last_completed still needed? + HCFChoice *prealloc; // If not NULL, will be used for the outermost choice. }; #ifndef UNUSED @@ -25,11 +27,13 @@ static HCFStack* h_cfstack_new(HAllocator *mm__) { stack->count = 0; stack->cap = 4; stack->stack = h_new(HCFChoice*, stack->cap); + stack->prealloc = NULL; return stack; } static void h_cfstack_free(HAllocator *mm__, HCFStack *stk__) UNUSED; static void h_cfstack_free(HAllocator *mm__, HCFStack *stk__) { + h_free(stk__->prealloc); h_free(stk__->stack); h_free(stk__); } @@ -56,7 +60,9 @@ static inline void h_cfstack_add_to_seq(HAllocator *mm__, HCFStack *stk__, HCFCh } static inline HCFChoice* h_cfstack_new_choice_raw(HAllocator *mm__, HCFStack *stk__) { - HCFChoice *ret = h_new(HCFChoice, 1); + HCFChoice *ret = stk__->prealloc? stk__->prealloc : h_new(HCFChoice, 1); + stk__->prealloc = NULL; + ret->reshape = NULL; ret->action = NULL; ret->pred = NULL; diff --git a/src/backends/glr.c b/src/backends/glr.c new file mode 100644 index 0000000000000000000000000000000000000000..353d0e6433100357287e40487299aa8327baad07 --- /dev/null +++ b/src/backends/glr.c @@ -0,0 +1,294 @@ +#include <assert.h> +#include "lr.h" + +static bool glr_step(HParseResult **result, HSlist *engines, + HLREngine *engine, const HLRAction *action); + + +/* GLR compilation (LALR w/o failing on conflict) */ + +int h_glr_compile(HAllocator* mm__, HParser* parser, const void* params) +{ + int result = h_lalr_compile(mm__, parser, params); + + if(result == -1 && parser->backend_data) { + // table is there, just has conflicts? nevermind, that's okay. 
+ result = 0; + } + + return result; +} + +void h_glr_free(HParser *parser) +{ + h_lalr_free(parser); +} + + +/* Merging engines (when they converge on the same state) */ + +static HLREngine *lrengine_merge(HLREngine *old, HLREngine *new) +{ + HArena *arena = old->arena; + + HLREngine *ret = h_arena_malloc(arena, sizeof(HLREngine)); + + assert(old->state == new->state); + assert(old->input.input == new->input.input); + + *ret = *old; + ret->stack = h_slist_new(arena); + ret->merged[0] = old; + ret->merged[1] = new; + + return ret; +} + +static HSlist *demerge_stack(HSlistNode *bottom, HSlist *stack) +{ + HArena *arena = stack->arena; + + HSlist *ret = h_slist_new(arena); + + // copy the stack from the top + HSlistNode **y = &ret->head; + for(HSlistNode *x=stack->head; x; x=x->next) { + HSlistNode *node = h_arena_malloc(arena, sizeof(HSlistNode)); + node->elem = x->elem; + node->next = NULL; + *y = node; + y = &node->next; + } + *y = bottom; // attach the ancestor stack + + return ret; +} + +static inline HLREngine *respawn(HLREngine *eng, HSlist *stack) +{ + // NB: this can be a destructive update because an engine is not used for + // anything after it is merged. + eng->stack = demerge_stack(eng->stack->head, stack); + return eng; +} + +static HLREngine * +demerge(HParseResult **result, HSlist *engines, + HLREngine *engine, const HLRAction *action, size_t depth) +{ + // no-op on engines that are not merged + if(!engine->merged[0]) + return engine; + + HSlistNode *p = engine->stack->head; + for(size_t i=0; i<depth; i++) { + // if stack hits bottom, respawn ancestors + if(p == NULL) { + HLREngine *a = respawn(engine->merged[0], engine->stack); + HLREngine *b = respawn(engine->merged[1], engine->stack); + + // continue demerge until final depth reached + a = demerge(result, engines, a, action, depth-i); + b = demerge(result, engines, b, action, depth-i); + + // step and stow one ancestor... 
+ glr_step(result, engines, a, action); + + // ...and return the other + return b; + } + p = p->next; + } + + return engine; // there is enough stack before the merge point +} + + +/* Forking engines (on conflicts) */ + +HLREngine *fork_engine(const HLREngine *engine) +{ + HLREngine *eng2 = h_arena_malloc(engine->tarena, sizeof(HLREngine)); + eng2->table = engine->table; + eng2->state = engine->state; + eng2->input = engine->input; + + // shallow-copy the stack + // this works because h_slist_push and h_slist_drop never modify + // the underlying structure of HSlistNodes, only the head pointer. + // in fact, this gives us prefix sharing for free. + eng2->stack = h_arena_malloc(engine->tarena, sizeof(HSlist)); + *eng2->stack = *engine->stack; + + eng2->arena = engine->arena; + eng2->tarena = engine->tarena; + return eng2; +} + +static const HLRAction * +handle_conflict(HParseResult **result, HSlist *engines, + const HLREngine *engine, const HSlist *branches) +{ + // there should be at least two conflicting actions + assert(branches->head); + assert(branches->head->next); // this is just a consistency check + + // fork a new engine for all but the first action + for(HSlistNode *x=branches->head->next; x; x=x->next) { + HLRAction *act = x->elem; + HLREngine *eng = fork_engine(engine); + + // perform one step and add to engines + glr_step(result, engines, eng, act); + } + + // return first action for use with original engine + return branches->head->elem; +} + + +/* GLR driver */ + +static bool glr_step(HParseResult **result, HSlist *engines, + HLREngine *engine, const HLRAction *action) +{ + // handle forks and demerges (~> spawn engines) + if(action) { + if(action->type == HLR_CONFLICT) { + // fork engine on conflicts + action = handle_conflict(result, engines, engine, action->branches); + } else if(action->type == HLR_REDUCE) { + // demerge/respawn as needed + size_t depth = action->production.length; + engine = demerge(result, engines, engine, action, depth); + } + 
} + + bool run = h_lrengine_step(engine, action); + + if(run) { + // store engine in the list, merge if necessary + HSlistNode *x; + for(x=engines->head; x; x=x->next) { + HLREngine *eng = x->elem; + if(eng->state == engine->state) { + x->elem = lrengine_merge(eng, engine); + break; + } + } + if(!x) // no merge happened + h_slist_push(engines, engine); + } else if(engine->state == HLR_SUCCESS) { + // save the result + *result = h_lrengine_result(engine); + } + + return run; +} + +HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + HLRTable *table = parser->backend_data; + if(!table) + return NULL; + + HArena *arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 0); // tmp, deleted after parse + + // allocate engine lists (will hold one engine per state) + // these are swapped each iteration + HSlist *engines = h_slist_new(tarena); + HSlist *engback = h_slist_new(tarena); + + // create initial engine + h_slist_push(engines, h_lrengine_new(arena, tarena, table, stream)); + + HParseResult *result = NULL; + while(result == NULL && !h_slist_empty(engines)) { + assert(h_slist_empty(engback)); + + // step all engines + while(!h_slist_empty(engines)) { + HLREngine *engine = h_slist_pop(engines); + const HLRAction *action = h_lrengine_action(engine); + glr_step(&result, engback, engine, action); + } + + // swap the lists + HSlist *tmp = engines; + engines = engback; + engback = tmp; + } + + if(!result) + h_delete_arena(arena); + h_delete_arena(tarena); + return result; +} + + + +HParserBackendVTable h__glr_backend_vtable = { + .compile = h_glr_compile, + .parse = h_glr_parse, + .free = h_glr_free +}; + + + + +// dummy! 
+int test_glr(void) +{ + HAllocator *mm__ = &system_allocator; + + /* + E -> E '+' E + | 'd' + */ + + HParser *d = h_ch('d'); + HParser *E = h_indirect(); + HParser *E_ = h_choice(h_sequence(E, h_ch('+'), E, NULL), d, NULL); + h_bind_indirect(E, E_); + HParser *p = E; + + printf("\n==== G R A M M A R ====\n"); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); + if(g == NULL) { + fprintf(stderr, "h_cfgrammar failed\n"); + return 1; + } + h_pprint_grammar(stdout, g, 0); + + printf("\n==== D F A ====\n"); + HLRDFA *dfa = h_lr0_dfa(g); + if(dfa) + h_pprint_lrdfa(stdout, g, dfa, 0); + else + fprintf(stderr, "h_lalr_dfa failed\n"); + + printf("\n==== L R ( 0 ) T A B L E ====\n"); + HLRTable *table0 = h_lr0_table(g, dfa); + if(table0) + h_pprint_lrtable(stdout, g, table0, 0); + else + fprintf(stderr, "h_lr0_table failed\n"); + h_lrtable_free(table0); + + printf("\n==== L A L R T A B L E ====\n"); + if(h_compile(p, PB_GLR, NULL)) { + fprintf(stderr, "does not compile\n"); + return 2; + } + h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); + + printf("\n==== P A R S E R E S U L T ====\n"); + HParseResult *res = h_parse(p, (uint8_t *)"d+d+d", 5); + if(res) + h_pprint(stdout, res->ast, 0, 2); + else + printf("no parse\n"); + + return 0; +} diff --git a/src/backends/lalr.c b/src/backends/lalr.c new file mode 100644 index 0000000000000000000000000000000000000000..93becf31b23f0ba5a1204441442c96622aa55fe7 --- /dev/null +++ b/src/backends/lalr.c @@ -0,0 +1,389 @@ +#include <assert.h> +#include "contextfree.h" +#include "lr.h" + + + +/* LALR-via-SLR grammar transformation */ + +static inline size_t seqsize(void *p_) +{ + size_t n=0; + for(void **p=p_; *p; p++) n++; + return n+1; +} + +static HLRAction * +lrtable_lookup(const HLRTable *table, size_t state, const HCFChoice *symbol) +{ + switch(symbol->type) { + case HCF_END: + return table->tmap[state]->end_branch; + case HCF_CHAR: + return h_stringmap_get(table->tmap[state], &symbol->chr, 1, false); + 
default: + // nonterminal case + return h_hashtable_get(table->ntmap[state], symbol); + } +} + +static size_t follow_transition(const HLRTable *table, size_t x, HCFChoice *A) +{ + HLRAction *action = lrtable_lookup(table, x, A); + assert(action != NULL); + assert(action->type == HLR_SHIFT); + return action->nextstate; +} + +static inline HLRTransition *transition(HArena *arena, + size_t x, const HCFChoice *A, size_t y) +{ + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = x; + t->symbol = A; + t->to = y; + return t; +} + +// no-op on terminal symbols +static void transform_productions(const HLRTable *table, HLREnhGrammar *eg, + size_t x, HCFChoice *xAy) +{ + if(xAy->type != HCF_CHOICE) + return; + // XXX CHARSET? + + HArena *arena = eg->arena; + + HCFSequence **seq = h_arena_malloc(arena, seqsize(xAy->seq) + * sizeof(HCFSequence *)); + HCFSequence **p, **q; + for(p=xAy->seq, q=seq; *p; p++, q++) { + // trace rhs starting in state x and following the transitions + // xAy -> ... iBj ... 
+ + size_t i = x; + HCFChoice **B = (*p)->items; + HCFChoice **items = h_arena_malloc(arena, seqsize(B) * sizeof(HCFChoice *)); + HCFChoice **iBj = items; + for(; *B; B++, iBj++) { + size_t j = follow_transition(table, i, *B); + HLRTransition *i_B_j = transition(arena, i, *B, j); + *iBj = h_hashtable_get(eg->tmap, i_B_j); + assert(*iBj != NULL); + i = j; + } + *iBj = NULL; + + *q = h_arena_malloc(arena, sizeof(HCFSequence)); + (*q)->items = items; + } + *q = NULL; + xAy->seq = seq; +} + +static HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) +{ + HArena *arena = eg->arena; + HCFChoice *esym = h_arena_malloc(arena, sizeof(HCFChoice)); + *esym = *sym; + + HHashSet *cs = h_hashtable_get(eg->corr, sym); + if(!cs) { + cs = h_hashset_new(arena, h_eq_symbol, h_hash_symbol); + h_hashtable_put(eg->corr, sym, cs); + } + h_hashset_put(cs, esym); + + return esym; +} + +static HLREnhGrammar *enhance_grammar(const HCFGrammar *g, const HLRDFA *dfa, + const HLRTable *table) +{ + HAllocator *mm__ = g->mm__; + HArena *arena = g->arena; + + HLREnhGrammar *eg = h_arena_malloc(arena, sizeof(HLREnhGrammar)); + eg->tmap = h_hashtable_new(arena, h_eq_transition, h_hash_transition); + eg->smap = h_hashtable_new(arena, h_eq_ptr, h_hash_ptr); + eg->corr = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + // XXX must use h_eq/hash_ptr for symbols! 
so enhanced CHARs are different + eg->arena = arena; + + // establish mapping between transitions and symbols + for(HSlistNode *x=dfa->transitions->head; x; x=x->next) { + HLRTransition *t = x->elem; + + assert(!h_hashtable_present(eg->tmap, t)); + + HCFChoice *sym = new_enhanced_symbol(eg, t->symbol); + h_hashtable_put(eg->tmap, t, sym); + h_hashtable_put(eg->smap, sym, t); + } + + // transform the productions + H_FOREACH(eg->tmap, HLRTransition *t, HCFChoice *sym) + transform_productions(table, eg, t->from, sym); + H_END_FOREACH + + // add the start symbol + HCFChoice *start = new_enhanced_symbol(eg, g->start); + transform_productions(table, eg, 0, start); + + eg->grammar = h_cfgrammar_(mm__, start); + return eg; +} + + + +/* LALR table generation */ + +static inline bool has_conflicts(HLRTable *table) +{ + return !h_slist_empty(table->inadeq); +} + +// for each lookahead symbol (fs), put action into tmap +// returns 0 on success, -1 on conflict +// ignores forall entries +static int terminals_put(HStringMap *tmap, const HStringMap *fs, HLRAction *action) +{ + int ret = 0; + + if(fs->epsilon_branch) { + HLRAction *prev = tmap->epsilon_branch; + if(prev && prev != action) { + // conflict + tmap->epsilon_branch = h_lr_conflict(tmap->arena, prev, action); + ret = -1; + } else { + tmap->epsilon_branch = action; + } + } + + if(fs->end_branch) { + HLRAction *prev = tmap->end_branch; + if(prev && prev != action) { + // conflict + tmap->end_branch = h_lr_conflict(tmap->arena, prev, action); + ret = -1; + } else { + tmap->end_branch = action; + } + } + + H_FOREACH(fs->char_branches, void *key, HStringMap *fs_) + HStringMap *tmap_ = h_hashtable_get(tmap->char_branches, key); + + if(!tmap_) { + tmap_ = h_stringmap_new(tmap->arena); + h_hashtable_put(tmap->char_branches, key, tmap_); + } + + if(terminals_put(tmap_, fs_, action) < 0) + ret = -1; + H_END_FOREACH + + return ret; +} + +// check whether a sequence of enhanced-grammar symbols (p) matches the given +// 
(original-grammar) production rhs and terminates in the given end state. +static bool match_production(HLREnhGrammar *eg, HCFChoice **p, + HCFChoice **rhs, size_t endstate) +{ + size_t state = endstate; // initialized to end in case of empty rhs + for(; *p && *rhs; p++, rhs++) { + HLRTransition *t = h_hashtable_get(eg->smap, *p); + assert(t != NULL); + if(!h_eq_symbol(t->symbol, *rhs)) + return false; + state = t->to; + } + return (*p == *rhs // both NULL + && state == endstate); +} + +// desugar parser with a fresh start symbol +// this guarantees that the start symbol will not occur in any productions +HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser) +{ + HCFChoice *augmented = h_new(HCFChoice, 1); + + HCFStack *stk__ = h_cfstack_new(mm__); + stk__->prealloc = augmented; + HCFS_BEGIN_CHOICE() { + HCFS_BEGIN_SEQ() { + HCFS_DESUGAR(parser); + } HCFS_END_SEQ(); + HCFS_THIS_CHOICE->reshape = h_act_first; + } HCFS_END_CHOICE(); + h_cfstack_free(mm__, stk__); + + return augmented; +} + +int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) +{ + // generate (augmented) CFG from parser + // construct LR(0) DFA + // build LR(0) table + // if necessary, resolve conflicts "by conversion to SLR" + + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, parser)); + if(g == NULL) // backend not suitable (language not context-free) + return -1; + + HLRDFA *dfa = h_lr0_dfa(g); + if(dfa == NULL) { // this should normally not happen + h_cfgrammar_free(g); + return -1; + } + + HLRTable *table = h_lr0_table(g, dfa); + if(table == NULL) { // this should normally not happen + h_cfgrammar_free(g); + return -1; + } + + if(has_conflicts(table)) { + HArena *arena = table->arena; + + HLREnhGrammar *eg = enhance_grammar(g, dfa, table); + if(eg == NULL) { // this should normally not happen + h_cfgrammar_free(g); + h_lrtable_free(table); + return -1; + } + + // go through the inadequate states; replace inadeq with a new list + HSlist *inadeq = 
table->inadeq; + table->inadeq = h_slist_new(arena); + + for(HSlistNode *x=inadeq->head; x; x=x->next) { + size_t state = (uintptr_t)x->elem; + bool inadeq = false; + + // clear old forall entry, it's being replaced by more fine-grained ones + table->forall[state] = NULL; + + // go through each reducible item of state + H_FOREACH_KEY(dfa->states[state], HLRItem *item) + if(item->mark < item->len) + continue; + + // action to place in the table cells indicated by lookahead + HLRAction *action = h_reduce_action(arena, item); + + // find all LR(0)-enhanced productions matching item + HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs); + assert(lhss != NULL); + H_FOREACH_KEY(lhss, HCFChoice *lhs) + assert(lhs->type == HCF_CHOICE); // XXX could be CHARSET? + + for(HCFSequence **p=lhs->seq; *p; p++) { + HCFChoice **rhs = (*p)->items; + if(!match_production(eg, rhs, item->rhs, state)) + continue; + + // the left-hand symbol's follow set is this production's + // contribution to the lookahead + const HStringMap *fs = h_follow(1, eg->grammar, lhs); + assert(fs != NULL); + assert(fs->epsilon_branch == NULL); + assert(!h_stringmap_empty(fs)); + + // for each lookahead symbol, put action into table cell + if(terminals_put(table->tmap[state], fs, action) < 0) + inadeq = true; + } H_END_FOREACH // enhanced production + H_END_FOREACH // reducible item + + if(inadeq) + h_slist_push(table->inadeq, (void *)(uintptr_t)state); + } + } + + h_cfgrammar_free(g); + parser->backend_data = table; + return has_conflicts(table)? -1 : 0; +} + +void h_lalr_free(HParser *parser) +{ + HLRTable *table = parser->backend_data; + h_lrtable_free(table); + parser->backend_data = NULL; + parser->backend = PB_PACKRAT; +} + + + +HParserBackendVTable h__lalr_backend_vtable = { + .compile = h_lalr_compile, + .parse = h_lr_parse, + .free = h_lalr_free +}; + + + + +// dummy! 
+int test_lalr(void) +{ + HAllocator *mm__ = &system_allocator; + + /* + E -> E '-' T + | T + T -> '(' E ')' + | 'n' -- also try [0-9] for the charset paths + */ + + HParser *n = h_ch('n'); + HParser *E = h_indirect(); + HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL); + HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL); + h_bind_indirect(E, E_); + HParser *p = E; + + printf("\n==== G R A M M A R ====\n"); + HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p)); + if(g == NULL) { + fprintf(stderr, "h_cfgrammar failed\n"); + return 1; + } + h_pprint_grammar(stdout, g, 0); + + printf("\n==== D F A ====\n"); + HLRDFA *dfa = h_lr0_dfa(g); + if(dfa) + h_pprint_lrdfa(stdout, g, dfa, 0); + else + fprintf(stderr, "h_lalr_dfa failed\n"); + + printf("\n==== L R ( 0 ) T A B L E ====\n"); + HLRTable *table0 = h_lr0_table(g, dfa); + if(table0) + h_pprint_lrtable(stdout, g, table0, 0); + else + fprintf(stderr, "h_lr0_table failed\n"); + h_lrtable_free(table0); + + printf("\n==== L A L R T A B L E ====\n"); + if(h_compile(p, PB_LALR, NULL)) { + fprintf(stderr, "does not compile\n"); + return 2; + } + h_pprint_lrtable(stdout, g, (HLRTable *)p->backend_data, 0); + + printf("\n==== P A R S E R E S U L T ====\n"); + HParseResult *res = h_parse(p, (uint8_t *)"n-(n-((n)))-n", 13); + if(res) + h_pprint(stdout, res->ast, 0, 2); + else + printf("no parse\n"); + + return 0; +} diff --git a/src/backends/llk.c b/src/backends/llk.c index 50e11bfa059591c03403e75242fffa3b7649b029..c0cf6afef75aae37c9b9479cdc44223244326ab7 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -3,13 +3,13 @@ #include "../cfgrammar.h" #include "../parsers/parser_internal.h" -// XXX despite the names, this is all LL(1) right now. TODO +static const size_t DEFAULT_KMAX = 1; /* Generating the LL(k) parse table */ -/* Maps each nonterminal (HCFChoice) of the grammar to another hash table that - * maps lookahead tokens (HCFToken) to productions (HCFSequence). 
+/* Maps each nonterminal (HCFChoice) of the grammar to a HStringMap that + * maps lookahead strings to productions (HCFSequence). */ typedef struct HLLkTable_ { HHashTable *rows; @@ -19,29 +19,17 @@ typedef struct HLLkTable_ { } HLLkTable; -// XXX adaptation to LL(1), to be removed -typedef HCharKey HCFToken; -static const HCFToken end_token = 0x200; -#define char_token char_key - /* Interface to look up an entry in the parse table. */ const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, - HInputStream lookahead) + const HInputStream *stream) { - // note the lookahead stream is passed by value, i.e. a copy. - // reading bits from it does not consume them from the real input. - HCFToken tok; - uint8_t c = h_read_bits(&lookahead, 8, false); - if(lookahead.overrun) - tok = end_token; - else - tok = char_token(c); - - const HHashTable *row = h_hashtable_get(table->rows, x); + const HStringMap *row = h_hashtable_get(table->rows, x); assert(row != NULL); // the table should have one row for each nonterminal - const HCFSequence *production = h_hashtable_get(row, (void *)tok); - return production; + assert(!row->epsilon_branch); // would match without looking at the input + // XXX cases where this could be useful? + + return h_stringmap_get_lookahead(row, *stream); } /* Allocate a new parse table. */ @@ -72,58 +60,131 @@ void h_llktable_free(HLLkTable *table) h_free(table); } -/* Compute the predict set of production "A -> rhs". 
*/ -HHashSet *h_predict(HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs) +void *const CONFLICT = (void *)(uintptr_t)(-1); + +// helper for stringmap_merge +static void *combine_entries(HHashSet *workset, void *dst, const void *src) { - // predict(A -> rhs) = first(rhs) u follow(A) if "" can be derived from rhs - // predict(A -> rhs) = first(rhs) otherwise - const HCFStringMap *first_rhs = h_first_seq(1, g, rhs->items); - const HCFStringMap *follow_A = h_follow(1, g, A); - HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); - - h_hashset_put_all(ret, first_rhs->char_branches); - if(first_rhs->end_branch) - h_hashset_put(ret, (void *)end_token); - - if(h_derives_epsilon_seq(g, rhs->items)) { - h_hashset_put_all(ret, follow_A->char_branches); - if(follow_A->end_branch) - h_hashset_put(ret, (void *)end_token); + assert(dst != NULL); + assert(src != NULL); + + if(dst == CONFLICT) { // previous conflict + h_hashset_put(workset, src); + } else if(dst != src) { // new conflict + h_hashset_put(workset, dst); + h_hashset_put(workset, src); + dst = CONFLICT; } - return ret; + return dst; } -/* Generate entries for the production "A -> rhs" in the given table row. */ -static -int fill_table_row(HCFGrammar *g, HHashTable *row, - const HCFChoice *A, HCFSequence *rhs) +// add the mappings of src to dst, marking conflicts and adding the conflicting +// values to workset. +// note: reuses parts of src to build dst! +static void stringmap_merge(HHashSet *workset, HStringMap *dst, HStringMap *src) { - // iterate over predict(A -> rhs) - HHashSet *pred = h_predict(g, A, rhs); + if(src->epsilon_branch) { + if(dst->epsilon_branch) + dst->epsilon_branch = + combine_entries(workset, dst->epsilon_branch, src->epsilon_branch); + else + dst->epsilon_branch = src->epsilon_branch; + } else { + // if there is a non-conflicting value on the left (dst) side, it means + // that prediction is already unambiguous. 
we can drop the right (src) + // side we were going to extend with. + if(dst->epsilon_branch && dst->epsilon_branch != CONFLICT) + return; + } - size_t i; - HHashTableEntry *hte; - for(i=0; i < pred->capacity; i++) { - for(hte = &pred->contents[i]; hte; hte = hte->next) { + if(src->end_branch) { + if(dst->end_branch) + dst->end_branch = + combine_entries(workset, dst->end_branch, src->end_branch); + else + dst->end_branch = src->end_branch; + } + + // iterate over src->char_branches + const HHashTable *ht = src->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - HCFToken x = (uintptr_t)hte->key; - if(h_hashtable_present(row, (void *)x)) - return -1; // table would be ambiguous + HCharKey c = (HCharKey)hte->key; + HStringMap *src_ = hte->value; - h_hashtable_put(row, (void *)x, rhs); + if(src_) { + HStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c); + if(dst_) + stringmap_merge(workset, dst_, src_); + else + h_hashtable_put(dst->char_branches, (void *)c, src_); + } } } +} - return 0; +/* Generate entries for the productions of A in the given table row. */ +static int fill_table_row(size_t kmax, HCFGrammar *g, HStringMap *row, + const HCFChoice *A) +{ + HHashSet *workset; + + // initialize working set to the productions of A + workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + for(HCFSequence **s = A->seq; *s; s++) + h_hashset_put(workset, *s); + + // run until workset exhausted or kmax hit + size_t k; + for(k=1; k<=kmax; k++) { + // allocate a fresh workset for the next round + HHashSet *nextset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + + // iterate over the productions in workset... 
+ const HHashTable *ht = workset; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + HCFSequence *rhs = (void *)hte->key; + assert(rhs != NULL); + assert(rhs != CONFLICT); // just to be sure there's no mixup + + // calculate predict set; let values map to rhs + HStringMap *pred = h_predict(k, g, A, rhs); + h_stringmap_replace(pred, NULL, rhs); + + // merge predict set into the row + // accumulates conflicts in new workset + stringmap_merge(nextset, row, pred); + } + } + + // switch to the updated workset + h_hashset_free(workset); + workset = nextset; + + // if the workset is empty, row is without conflict; we're done + if(h_hashset_empty(workset)) + break; + + // clear conflict markers for next iteration + h_stringmap_replace(row, CONFLICT, NULL); + } + + h_hashset_free(workset); + return (k>kmax)? -1 : 0; } /* Generate the LL(k) parse table from the given grammar. * Returns -1 on error, 0 on success. */ -static int fill_table(HCFGrammar *g, HLLkTable *table) +static int fill_table(size_t kmax, HCFGrammar *g, HLLkTable *table) { table->start = g->start; @@ -138,18 +199,14 @@ static int fill_table(HCFGrammar *g, HLLkTable *table) assert(a->type == HCF_CHOICE); // create table row for this nonterminal - HHashTable *row = h_hashtable_new(table->arena, h_eq_ptr, h_hash_ptr); + HStringMap *row = h_stringmap_new(table->arena); h_hashtable_put(table->rows, a, row); - // iterate over a's productions - HCFSequence **s; - for(s = a->seq; *s; s++) { - // record this production in row as appropriate - // this can signal an ambiguity conflict. + if(fill_table_row(kmax, g, row, a) < 0) { + // unresolvable conflicts in row // NB we don't worry about deallocating anything, h_llk_compile will // delete the whole arena for us. 
- if(fill_table_row(g, row, a, *s) < 0) - return -1; + return -1; } } } @@ -159,6 +216,9 @@ static int fill_table(HCFGrammar *g, HLLkTable *table) int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) { + size_t kmax = params? (uintptr_t)params : DEFAULT_KMAX; + assert(kmax>0); + // Convert parser to a CFG. This can fail as indicated by a NULL return. HCFGrammar *grammar = h_cfgrammar(mm__, parser); if(grammar == NULL) @@ -170,7 +230,7 @@ int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params) // generate table and store in parser->backend_data. HLLkTable *table = h_llktable_new(mm__); - if(fill_table(grammar, table) < 0) { + if(fill_table(kmax, grammar, table) < 0) { // the table was ambiguous h_cfgrammar_free(grammar); h_llktable_free(table); @@ -240,10 +300,13 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* seq = h_carray_new(arena); // look up applicable production in parse table - const HCFSequence *p = h_llk_lookup(table, x, *stream); + const HCFSequence *p = h_llk_lookup(table, x, stream); if(p == NULL) goto no_parse; + // an infinite loop case that shouldn't happen + assert(!p->items[0] || p->items[0] != x); + // push production's rhs onto the stack (in reverse order) HCFChoice **s; for(s = p->items; *s; s++); @@ -255,10 +318,12 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* // the top of stack is such that there will be a result... HParsedToken *tok; // will hold result token + tok = h_arena_malloc(arena, sizeof(HParsedToken)); + tok->index = stream->index; + tok->bit_offset = stream->bit_offset; if(x == mark) { // hit stack frame boundary... 
// wrap the accumulated parse result, this sequence is finished - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_SEQUENCE; tok->seq = seq; @@ -277,13 +342,13 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* case HCF_END: if(!stream->overrun) goto no_parse; + h_arena_free(arena, tok); tok = NULL; break; case HCF_CHAR: if(input != x->chr) goto no_parse; - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = x->chr; break; @@ -293,7 +358,6 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* goto no_parse; if(!charset_isset(x->charset, input)) goto no_parse; - tok = h_arena_malloc(arena, sizeof(HParsedToken)); tok->token_type = TT_UINT; tok->uint = input; break; @@ -306,8 +370,6 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* // 'tok' has been parsed; process it - // XXX set tok->index and tok->bit_offset (don't take directly from stream, cuz peek!) 
- // perform token reshape if indicated if(x->reshape) tok = (HParsedToken *)x->reshape(make_result(arena, tok)); @@ -328,10 +390,10 @@ HParseResult *h_llk_parse(HAllocator* mm__, const HParser* parser, HInputStream* h_delete_arena(tarena); return make_result(arena, seq->elements[0]); - no_parse: - h_delete_arena(tarena); - h_delete_arena(arena); - return NULL; + no_parse: + h_delete_arena(tarena); + h_delete_arena(arena); + return NULL; } @@ -357,9 +419,11 @@ int test_llk(void) Y -> y -- for k=3 use "yy" */ - HParser *c = h_many(h_ch('x')); - HParser *q = h_sequence(c, h_ch('y'), NULL); - HParser *p = h_choice(q, h_end_p(), NULL); + HParser *X = h_optional(h_ch('x')); + HParser *Y = h_sequence(h_ch('y'), h_ch('y'), NULL); + HParser *A = h_sequence(X, Y, h_ch('a'), NULL); + HParser *B = h_sequence(Y, h_ch('b'), NULL); + HParser *p = h_choice(A, B, NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); @@ -372,13 +436,16 @@ int test_llk(void) printf("derive epsilon: "); h_pprint_symbolset(stdout, g, g->geneps, 0); printf("first(A) = "); - h_pprint_stringset(stdout, g, h_first(2, g, g->start), 0); - printf("follow(C) = "); - h_pprint_stringset(stdout, g, h_follow(2, g, h_desugar(&system_allocator, NULL, c)), 0); + h_pprint_stringset(stdout, h_first(3, g, g->start), 0); + // printf("follow(C) = "); + // h_pprint_stringset(stdout, h_follow(3, g, h_desugar(&system_allocator, NULL, c)), 0); - h_compile(p, PB_LLk, NULL); + if(h_compile(p, PB_LLk, (void *)3)) { + fprintf(stderr, "does not compile\n"); + return 2; + } - HParseResult *res = h_parse(p, (uint8_t *)"xxy", 3); + HParseResult *res = h_parse(p, (uint8_t *)"xyya", 4); if(res) h_pprint(stdout, res->ast, 0, 2); else diff --git a/src/backends/lr.c b/src/backends/lr.c new file mode 100644 index 0000000000000000000000000000000000000000..4c89d19d70030a1475c1a1e3cccc92ed58bdef58 --- /dev/null +++ b/src/backends/lr.c @@ -0,0 +1,538 @@ +#include <assert.h> +#include <ctype.h> +#include "../parsers/parser_internal.h" 
+#include "lr.h" + + + +/* Comparison and hashing functions */ + +// compare symbols - terminals by value, others by pointer +bool h_eq_symbol(const void *p, const void *q) +{ + const HCFChoice *x=p, *y=q; + return (x==y + || (x->type==HCF_END && y->type==HCF_END) + || (x->type==HCF_CHAR && y->type==HCF_CHAR && x->chr==y->chr)); +} + +// hash symbols - terminals by value, others by pointer +HHashValue h_hash_symbol(const void *p) +{ + const HCFChoice *x=p; + if(x->type == HCF_END) + return 0; + else if(x->type == HCF_CHAR) + return x->chr * 33; + else + return h_hash_ptr(p); +} + +// compare LR items by value +static bool eq_lr_item(const void *p, const void *q) +{ + const HLRItem *a=p, *b=q; + + if(!h_eq_symbol(a->lhs, b->lhs)) return false; + if(a->mark != b->mark) return false; + if(a->len != b->len) return false; + + for(size_t i=0; i<a->len; i++) + if(!h_eq_symbol(a->rhs[i], b->rhs[i])) return false; + + return true; +} + +// hash LALR items +static inline HHashValue hash_lr_item(const void *p) +{ + const HLRItem *x = p; + HHashValue hash = 0; + + hash += h_hash_symbol(x->lhs); + for(HCFChoice **p=x->rhs; *p; p++) + hash += h_hash_symbol(*p); + hash += x->mark; + + return hash; +} + +// compare item sets (DFA states) +bool h_eq_lr_itemset(const void *p, const void *q) +{ + return h_hashset_equal(p, q); +} + +// hash LR item sets (DFA states) - hash the elements and sum +HHashValue h_hash_lr_itemset(const void *p) +{ + HHashValue hash = 0; + + H_FOREACH_KEY((const HHashSet *)p, HLRItem *item) + hash += hash_lr_item(item); + H_END_FOREACH + + return hash; +} + +bool h_eq_transition(const void *p, const void *q) +{ + const HLRTransition *a=p, *b=q; + return (a->from == b->from && a->to == b->to && h_eq_symbol(a->symbol, b->symbol)); +} + +HHashValue h_hash_transition(const void *p) +{ + const HLRTransition *t = p; + return (h_hash_symbol(t->symbol) + t->from + t->to); // XXX ? 
+} + + + +/* Constructors */ + +HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark) +{ + HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem)); + + size_t len = 0; + for(HCFChoice **p=rhs; *p; p++) len++; + assert(mark <= len); + + ret->lhs = lhs; + ret->rhs = rhs; + ret->len = len; + ret->mark = mark; + + return ret; +} + +HLRState *h_lrstate_new(HArena *arena) +{ + return h_hashset_new(arena, eq_lr_item, hash_lr_item); +} + +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows) +{ + HArena *arena = h_new_arena(mm__, 0); // default blocksize + assert(arena != NULL); + + HLRTable *ret = h_new(HLRTable, 1); + ret->nrows = nrows; + ret->ntmap = h_arena_malloc(arena, nrows * sizeof(HHashTable *)); + ret->tmap = h_arena_malloc(arena, nrows * sizeof(HStringMap *)); + ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *)); + ret->inadeq = h_slist_new(arena); + ret->arena = arena; + ret->mm__ = mm__; + + for(size_t i=0; i<nrows; i++) { + ret->ntmap[i] = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + ret->tmap[i] = h_stringmap_new(arena); + ret->forall[i] = NULL; + } + + return ret; +} + +void h_lrtable_free(HLRTable *table) +{ + HAllocator *mm__ = table->mm__; + h_delete_arena(table->arena); + h_free(table); +} + +HLRAction *h_shift_action(HArena *arena, size_t nextstate) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_SHIFT; + action->nextstate = nextstate; + return action; +} + +HLRAction *h_reduce_action(HArena *arena, const HLRItem *item) +{ + HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_REDUCE; + action->production.lhs = item->lhs; + action->production.length = item->len; +#ifndef NDEBUG + action->production.rhs = item->rhs; +#endif + return action; +} + +// adds 'new' to the branches of 'action' +// returns 'action' itself if it is already of type HLR_CONFLICT +// allocates a new HLRAction otherwise +HLRAction *h_lr_conflict(HArena *arena, HLRAction
*action, HLRAction *new) +{ + if(action->type != HLR_CONFLICT) { + HLRAction *old = action; + action = h_arena_malloc(arena, sizeof(HLRAction)); + action->type = HLR_CONFLICT; + action->branches = h_slist_new(arena); + h_slist_push(action->branches, old); + h_slist_push(action->branches, new); + } else { + // check if 'new' is already among branches + HSlistNode *x; + for(x=action->branches->head; x; x=x->next) { + if(x->elem == new) + break; + } + // add 'new' if it is not already in list + if(x == NULL) + h_slist_push(action->branches, new); + } + + return action; +} + +bool h_lrtable_row_empty(const HLRTable *table, size_t i) +{ + return (h_hashtable_empty(table->ntmap[i]) + && h_stringmap_empty(table->tmap[i])); +} + + + +/* LR driver */ + +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, + const HInputStream *stream) +{ + HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine)); + + engine->table = table; + engine->state = 0; + engine->stack = h_slist_new(tarena); + engine->input = *stream; + engine->merged[0] = NULL; + engine->merged[1] = NULL; + engine->arena = arena; + engine->tarena = tarena; + + return engine; +} + +static const HLRAction * +terminal_lookup(const HLREngine *engine, const HInputStream *stream) +{ + const HLRTable *table = engine->table; + size_t state = engine->state; + + assert(state < table->nrows); + if(table->forall[state]) { + assert(h_lrtable_row_empty(table, state)); // that would be a conflict + return table->forall[state]; + } else { + return h_stringmap_get_lookahead(table->tmap[state], *stream); + } +} + +static const HLRAction * +nonterminal_lookup(const HLREngine *engine, const HCFChoice *symbol) +{ + const HLRTable *table = engine->table; + size_t state = engine->state; + + assert(state < table->nrows); + assert(!table->forall[state]); // contains only reduce entries + // we are only looking for shifts + return h_hashtable_get(table->ntmap[state], symbol); +} + +const HLRAction 
*h_lrengine_action(const HLREngine *engine) +{ + return terminal_lookup(engine, &engine->input); +} + +static HParsedToken *consume_input(HLREngine *engine) +{ + HParsedToken *v; + + uint8_t c = h_read_bits(&engine->input, 8, false); + + if(engine->input.overrun) { // end of input + v = NULL; + } else { + v = h_arena_malloc(engine->arena, sizeof(HParsedToken)); + v->token_type = TT_UINT; + v->uint = c; + } + + return v; +} + +// run LR parser for one round; returns false when finished +bool h_lrengine_step(HLREngine *engine, const HLRAction *action) +{ + // short-hand names + HSlist *stack = engine->stack; + HArena *arena = engine->arena; + HArena *tarena = engine->tarena; + + if(action == NULL) + return false; // no handle recognizable in input, terminate + + assert(action->type == HLR_SHIFT || action->type == HLR_REDUCE); + + if(action->type == HLR_REDUCE) { + size_t len = action->production.length; + HCFChoice *symbol = action->production.lhs; + + // semantic value of the reduction result + HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken)); + value->token_type = TT_SEQUENCE; + value->seq = h_carray_new_sized(arena, len); + + // pull values off the stack, rewinding state accordingly + HParsedToken *v = NULL; + for(size_t i=0; i<len; i++) { + v = h_slist_drop(stack); + engine->state = (uintptr_t)h_slist_drop(stack); + + // collect values in result sequence + value->seq->elements[len-1-i] = v; + value->seq->used++; + } + if(v) { + // result position equals position of left-most symbol + value->index = v->index; + value->bit_offset = v->bit_offset; + } else { + // XXX how to get the position in this case? 
+ } + + // perform token reshape if indicated + if(symbol->reshape) + value = (HParsedToken *)symbol->reshape(make_result(arena, value)); + + // call validation and semantic action, if present + if(symbol->pred && !symbol->pred(make_result(tarena, value))) + return false; // validation failed -> no parse; terminate + if(symbol->action) + value = (HParsedToken *)symbol->action(make_result(arena, value)); + + // this is LR, building a right-most derivation bottom-up, so no reduce can + // follow a reduce. we can also assume no conflict follows for GLR if we + // use LALR tables, because only terminal symbols (lookahead) get reduces. + const HLRAction *shift = nonterminal_lookup(engine, symbol); + if(shift == NULL) + return false; // parse error + assert(shift->type == HLR_SHIFT); + + // piggy-back the shift right here, never touching the input + h_slist_push(stack, (void *)(uintptr_t)engine->state); + h_slist_push(stack, value); + engine->state = shift->nextstate; + + // check for success + if(engine->state == HLR_SUCCESS) { + assert(symbol == engine->table->start); + return false; + } + } else { + assert(action->type == HLR_SHIFT); + HParsedToken *value = consume_input(engine); + h_slist_push(stack, (void *)(uintptr_t)engine->state); + h_slist_push(stack, value); + engine->state = action->nextstate; + } + + return true; +} + +HParseResult *h_lrengine_result(HLREngine *engine) +{ + // parsing was successful iff the engine reaches the end state + if(engine->state == HLR_SUCCESS) { + // on top of the stack is the start symbol's semantic value + assert(!h_slist_empty(engine->stack)); + HParsedToken *tok = engine->stack->head->elem; + return make_result(engine->arena, tok); + } else { + return NULL; + } +} + +HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream) +{ + HLRTable *table = parser->backend_data; + if(!table) + return NULL; + + HArena *arena = h_new_arena(mm__, 0); // will hold the results + HArena *tarena = h_new_arena(mm__, 
0); // tmp, deleted after parse + HLREngine *engine = h_lrengine_new(arena, tarena, table, stream); + + // iterate engine to completion + while(h_lrengine_step(engine, h_lrengine_action(engine))); + + HParseResult *result = h_lrengine_result(engine); + if(!result) + h_delete_arena(arena); + h_delete_arena(tarena); + return result; +} + + + +/* Pretty-printers */ + +void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item) +{ + h_pprint_symbol(f, g, item->lhs); + fputs(" ->", f); + + HCFChoice **x = item->rhs; + HCFChoice **mark = item->rhs + item->mark; + if(*x == NULL) { + fputc('.', f); + } else { + while(*x) { + if(x == mark) + fputc('.', f); + else + fputc(' ', f); + + if((*x)->type == HCF_CHAR) { + // condense character strings + fputc('"', f); + h_pprint_char(f, (*x)->chr); + for(x++; *x; x++) { + if(x == mark) + break; + if((*x)->type != HCF_CHAR) + break; + h_pprint_char(f, (*x)->chr); + } + fputc('"', f); + } else { + h_pprint_symbol(f, g, *x); + x++; + } + } + if(x == mark) + fputs(".", f); + } +} + +void h_pprint_lrstate(FILE *f, const HCFGrammar *g, + const HLRState *state, unsigned int indent) +{ + bool first = true; + H_FOREACH_KEY(state, HLRItem *item) + if(!first) + for(unsigned int i=0; i<indent; i++) fputc(' ', f); + first = false; + h_pprint_lritem(f, g, item); + fputc('\n', f); + H_END_FOREACH +} + +static void pprint_transition(FILE *f, const HCFGrammar *g, const HLRTransition *t) +{ + fputs("-", f); + h_pprint_symbol(f, g, t->symbol); + fprintf(f, "->%zu", t->to); +} + +void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, + const HLRDFA *dfa, unsigned int indent) +{ + for(size_t i=0; i<dfa->nstates; i++) { + unsigned int indent2 = indent + fprintf(f, "%4zu: ", i); + h_pprint_lrstate(f, g, dfa->states[i], indent2); + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + const HLRTransition *t = x->elem; + if(t->from == i) { + for(unsigned int j=0; j<indent2-2; j++) fputc(' ', f); + pprint_transition(f, g, t); + fputc('\n',
f); + } + } + } +} + +void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action) +{ + switch(action->type) { + case HLR_SHIFT: + if(action->nextstate == HLR_SUCCESS) + fputs("s~", f); + else + fprintf(f, "s%zu", action->nextstate); + break; + case HLR_REDUCE: + fputs("r(", f); + h_pprint_symbol(f, g, action->production.lhs); + fputs(" -> ", f); +#ifdef NDEBUG + // if we can't print the production, at least print its length + fprintf(f, "[%zu]", action->production.length); +#else + HCFSequence seq = {action->production.rhs}; + h_pprint_sequence(f, g, &seq); +#endif + fputc(')', f); + break; + case HLR_CONFLICT: + fputc('!', f); + for(HSlistNode *x=action->branches->head; x; x=x->next) { + HLRAction *branch = x->elem; + assert(branch->type != HLR_CONFLICT); // no nesting + pprint_lraction(f, g, branch); + if(x->next) fputc('/', f); // separator + } + break; + default: + assert_message(0, "not reached"); + } +} + +static void valprint_lraction(FILE *file, void *env, void *val) +{ + const HLRAction *action = val; + const HCFGrammar *grammar = env; + pprint_lraction(file, grammar, action); +} + +static void pprint_lrtable_terminals(FILE *file, const HCFGrammar *g, + const HStringMap *map) +{ + h_pprint_stringmap(file, ' ', valprint_lraction, (void *)g, map); +} + +void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, + unsigned int indent) +{ + for(size_t i=0; i<table->nrows; i++) { + for(unsigned int j=0; j<indent; j++) fputc(' ', f); + fprintf(f, "%4zu:", i); + if(table->forall[i]) { + fputc(' ', f); + pprint_lraction(f, g, table->forall[i]); + if(!h_lrtable_row_empty(table, i)) + fputs(" !!", f); + } + H_FOREACH(table->ntmap[i], HCFChoice *symbol, HLRAction *action) + fputc(' ', f); // separator + h_pprint_symbol(f, g, symbol); + fputc(':', f); + pprint_lraction(f, g, action); + H_END_FOREACH + fputc(' ', f); // separator + pprint_lrtable_terminals(f, g, table->tmap[i]); + fputc('\n', f); + } + +#if 0 + fputs("inadeq=", f); +
for(HSlistNode *x=table->inadeq->head; x; x=x->next) { + fprintf(f, "%lu ", (uintptr_t)x->elem); + } + fputc('\n', f); +#endif +} diff --git a/src/backends/lr.h b/src/backends/lr.h new file mode 100644 index 0000000000000000000000000000000000000000..8f1eadd9059330b23c77e58aedfd680690b07950 --- /dev/null +++ b/src/backends/lr.h @@ -0,0 +1,147 @@ +#ifndef HAMMER_BACKENDS_LR__H +#define HAMMER_BACKENDS_LR__H + +#include "../hammer.h" +#include "../cfgrammar.h" +#include "../internal.h" + + +typedef HHashSet HLRState; // states are sets of LRItems + +typedef struct HLRDFA_ { + size_t nstates; + const HLRState **states; // array of size nstates + HSlist *transitions; +} HLRDFA; + +typedef struct HLRTransition_ { + size_t from; // index into 'states' array + const HCFChoice *symbol; + size_t to; // index into 'states' array +} HLRTransition; + +typedef struct HLRItem_ { + HCFChoice *lhs; + HCFChoice **rhs; // NULL-terminated + size_t len; // number of elements in rhs + size_t mark; +} HLRItem; + +typedef struct HLRAction_ { + enum {HLR_SHIFT, HLR_REDUCE, HLR_CONFLICT} type; + union { + // used with HLR_SHIFT + size_t nextstate; + + // used with HLR_REDUCE + struct { + HCFChoice *lhs; // symbol carrying semantic actions etc. 
+ size_t length; // # of symbols in rhs +#ifndef NDEBUG + HCFChoice **rhs; // NB: the rhs symbols are not needed for the parse +#endif + } production; + + // used with HLR_CONFLICT + HSlist *branches; // list of possible HLRActions + }; +} HLRAction; + +typedef struct HLRTable_ { + size_t nrows; // dimension of the pointer arrays below + HHashTable **ntmap; // map nonterminal symbols to HLRActions, per row + HStringMap **tmap; // map lookahead strings to HLRActions, per row + HLRAction **forall; // shortcut to set an action for an entire row + HCFChoice *start; // start symbol + HSlist *inadeq; // indices of any inadequate states + HArena *arena; + HAllocator *mm__; +} HLRTable; + +typedef struct HLREnhGrammar_ { + HCFGrammar *grammar; // enhanced grammar + HHashTable *tmap; // maps transitions to enhanced-grammar symbols + HHashTable *smap; // maps enhanced-grammar symbols to transitions + HHashTable *corr; // maps symbols to sets of corresponding e. symbols + HArena *arena; +} HLREnhGrammar; + +typedef struct HLREngine_ { + const HLRTable *table; + size_t state; + + HSlist *stack; // holds pairs: (saved state, semantic value) + HInputStream input; + + struct HLREngine_ *merged[2]; // ancestors merged into this engine + + HArena *arena; // will hold the results + HArena *tarena; // tmp, deleted after parse +} HLREngine; + +#define HLR_SUCCESS ((size_t)~0) // parser end state + + +// XXX move to internal.h or something +// XXX replace other hashtable iterations with this +#define H_FOREACH_(HT) { \ + const HHashTable *ht__ = HT; \ + for(size_t i__=0; i__ < ht__->capacity; i__++) { \ + for(HHashTableEntry *hte__ = &ht__->contents[i__]; \ + hte__; \ + hte__ = hte__->next) { \ + if(hte__->key == NULL) continue; + +#define H_FOREACH_KEY(HT, KEYVAR) H_FOREACH_(HT) \ + const KEYVAR = hte__->key; + +#define H_FOREACH(HT, KEYVAR, VALVAR) H_FOREACH_KEY(HT, KEYVAR) \ + VALVAR = hte__->value; + +#define H_END_FOREACH \ + } \ + } \ + } + + + +HLRItem *h_lritem_new(HArena *a, 
HCFChoice *lhs, HCFChoice **rhs, size_t mark); +HLRState *h_lrstate_new(HArena *arena); +HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows); +void h_lrtable_free(HLRTable *table); +HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table, + const HInputStream *stream); +HLRAction *h_reduce_action(HArena *arena, const HLRItem *item); +HLRAction *h_shift_action(HArena *arena, size_t nextstate); +HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new); +bool h_lrtable_row_empty(const HLRTable *table, size_t i); + +bool h_eq_symbol(const void *p, const void *q); +bool h_eq_lr_itemset(const void *p, const void *q); +bool h_eq_transition(const void *p, const void *q); +HHashValue h_hash_symbol(const void *p); +HHashValue h_hash_lr_itemset(const void *p); +HHashValue h_hash_transition(const void *p); + +HLRDFA *h_lr0_dfa(HCFGrammar *g); +HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa); + +HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser); +int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params); +void h_lalr_free(HParser *parser); + +const HLRAction *h_lrengine_action(const HLREngine *engine); +bool h_lrengine_step(HLREngine *engine, const HLRAction *action); +HParseResult *h_lrengine_result(HLREngine *engine); +HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); +HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream); + +void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item); +void h_pprint_lrstate(FILE *f, const HCFGrammar *g, + const HLRState *state, unsigned int indent); +void h_pprint_lrdfa(FILE *f, const HCFGrammar *g, + const HLRDFA *dfa, unsigned int indent); +void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table, + unsigned int indent); + +#endif diff --git a/src/backends/lr0.c b/src/backends/lr0.c new file mode 100644 index 
0000000000000000000000000000000000000000..1c86484e61300ec40362a9abb47105424ddff2b9 --- /dev/null +++ b/src/backends/lr0.c @@ -0,0 +1,233 @@ +#include <assert.h> +#include "lr.h" + + + +/* Constructing the characteristic automaton (handle recognizer) */ + +static HLRItem *advance_mark(HArena *arena, const HLRItem *item) +{ + assert(item->rhs[item->mark] != NULL); + HLRItem *ret = h_arena_malloc(arena, sizeof(HLRItem)); + *ret = *item; + ret->mark++; + return ret; +} + +static void expand_to_closure(HCFGrammar *g, HHashSet *items) +{ + HAllocator *mm__ = g->mm__; + HArena *arena = g->arena; + HSlist *work = h_slist_new(arena); + + // initialize work list with items + H_FOREACH_KEY(items, HLRItem *item) + h_slist_push(work, (void *)item); + H_END_FOREACH + + while(!h_slist_empty(work)) { + const HLRItem *item = h_slist_pop(work); + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + // if there is a non-terminal after the mark, follow it + // NB: unlike LLk, we do consider HCF_CHARSET a non-terminal here + if(sym != NULL && (sym->type==HCF_CHOICE || sym->type==HCF_CHARSET)) { + // add items corresponding to the productions of sym + if(sym->type == HCF_CHOICE) { + for(HCFSequence **p=sym->seq; *p; p++) { + HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); + if(!h_hashset_present(items, it)) { + h_hashset_put(items, it); + h_slist_push(work, it); + } + } + } else { // HCF_CHARSET + for(unsigned int i=0; i<256; i++) { + if(charset_isset(sym->charset, i)) { + // XXX allocate these single-character symbols statically somewhere + HCFChoice **rhs = h_new(HCFChoice *, 2); + rhs[0] = h_new(HCFChoice, 1); + rhs[0]->type = HCF_CHAR; + rhs[0]->chr = i; + rhs[1] = NULL; + HLRItem *it = h_lritem_new(arena, sym, rhs, 0); + h_hashset_put(items, it); + // single-character item needs no further work + } + } + // if sym is a non-terminal, we need a reshape on it + // this seems as good a place as any to set it + sym->reshape = h_act_first; + } + } + } +} + +HLRDFA 
*h_lr0_dfa(HCFGrammar *g) +{ + HArena *arena = g->arena; + + HHashSet *states = h_hashset_new(arena, h_eq_lr_itemset, h_hash_lr_itemset); + // maps itemsets to assigned array indices + HSlist *transitions = h_slist_new(arena); + + // list of states that need to be processed + // to save lookups, we push two elements per state, the itemset and its + // assigned index. + HSlist *work = h_slist_new(arena); + + // make initial state (kernel) + HLRState *start = h_lrstate_new(arena); + assert(g->start->type == HCF_CHOICE); + for(HCFSequence **p=g->start->seq; *p; p++) + h_hashset_put(start, h_lritem_new(arena, g->start, (*p)->items, 0)); + expand_to_closure(g, start); + h_hashtable_put(states, start, 0); + h_slist_push(work, start); + h_slist_push(work, 0); + + // while work to do (on some state) + // determine edge symbols + // for each edge symbol: + // advance respective items -> destination state (kernel) + // compute closure + // if destination is a new state: + // add it to state set + // add transition to it + // add it to the work list + + while(!h_slist_empty(work)) { + size_t state_idx = (uintptr_t)h_slist_pop(work); + HLRState *state = h_slist_pop(work); + + // maps edge symbols to neighbor states (item sets) of s + HHashTable *neighbors = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol); + + // iterate over state (closure) and generate neighboring sets + H_FOREACH_KEY(state, HLRItem *item) + HCFChoice *sym = item->rhs[item->mark]; // symbol after mark + + if(sym != NULL) { // mark was not at the end + // find or create prospective neighbor set + HLRState *neighbor = h_hashtable_get(neighbors, sym); + if(neighbor == NULL) { + neighbor = h_lrstate_new(arena); + h_hashtable_put(neighbors, sym, neighbor); + } + + // ...and add the advanced item to it + h_hashset_put(neighbor, advance_mark(arena, item)); + } + H_END_FOREACH + + // merge expanded neighbor sets into the set of existing states + H_FOREACH(neighbors, HCFChoice *symbol, HLRState *neighbor) + 
expand_to_closure(g, neighbor); + + // look up existing state, allocate new if not found + size_t neighbor_idx; + if(!h_hashset_present(states, neighbor)) { + neighbor_idx = states->used; + h_hashtable_put(states, neighbor, (void *)(uintptr_t)neighbor_idx); + h_slist_push(work, neighbor); + h_slist_push(work, (void *)(uintptr_t)neighbor_idx); + } else { + neighbor_idx = (uintptr_t)h_hashtable_get(states, neighbor); + } + + // add transition "state --symbol--> neighbor" + HLRTransition *t = h_arena_malloc(arena, sizeof(HLRTransition)); + t->from = state_idx; + t->to = neighbor_idx; + t->symbol = symbol; + h_slist_push(transitions, t); + H_END_FOREACH + } // end while(work) + + // fill DFA struct + HLRDFA *dfa = h_arena_malloc(arena, sizeof(HLRDFA)); + dfa->nstates = states->used; + dfa->states = h_arena_malloc(arena, dfa->nstates*sizeof(HLRState *)); + H_FOREACH(states, HLRState *state, void *v) + size_t idx = (uintptr_t)v; + dfa->states[idx] = state; + H_END_FOREACH + dfa->transitions = transitions; + + return dfa; +} + + + +/* LR(0) table generation */ + +static inline +void put_shift(HLRTable *table, size_t state, const HCFChoice *symbol, + size_t nextstate) +{ + HLRAction *action = h_shift_action(table->arena, nextstate); + + switch(symbol->type) { + case HCF_END: + h_stringmap_put_end(table->tmap[state], action); + break; + case HCF_CHAR: + h_stringmap_put_char(table->tmap[state], symbol->chr, action); + break; + default: + // nonterminal case + h_hashtable_put(table->ntmap[state], symbol, action); + } +} + +HLRTable *h_lr0_table(HCFGrammar *g, const HLRDFA *dfa) +{ + HAllocator *mm__ = g->mm__; + + HLRTable *table = h_lrtable_new(mm__, dfa->nstates); + HArena *arena = table->arena; + + // remember start symbol + table->start = g->start; + + // shift to the accepting end state for the start symbol + put_shift(table, 0, g->start, HLR_SUCCESS); + + // add shift entries + for(HSlistNode *x = dfa->transitions->head; x; x = x->next) { + // for each transition 
x-A->y, add "shift, goto y" to table entry (x,A) + HLRTransition *t = x->elem; + + put_shift(table, t->from, t->symbol, t->to); + } + + // add reduce entries, record inadequate states + for(size_t i=0; i<dfa->nstates; i++) { + bool inadeq = false; + + // find reducible items in state + H_FOREACH_KEY(dfa->states[i], HLRItem *item) + if(item->mark == item->len) { // mark at the end + HLRAction *reduce = h_reduce_action(arena, item); + + // check for reduce/reduce conflict on forall + if(table->forall[i]) { + reduce = h_lr_conflict(arena, table->forall[i], reduce); + inadeq = true; + } + table->forall[i] = reduce; + + // check for shift/reduce conflict with other entries + // NOTE: these are not recorded as HLR_CONFLICTs at this point + + if(!h_lrtable_row_empty(table, i)) + inadeq = true; + } + H_END_FOREACH + + if(inadeq) + h_slist_push(table->inadeq, (void *)(uintptr_t)i); + } + + return table; +} diff --git a/src/backends/packrat.c b/src/backends/packrat.c index c5c9565f272caab47aeab2f59592bf93dd40d524..8aa1f8ed670502f4b59e9be6498d22eaa74723ad 100644 --- a/src/backends/packrat.c +++ b/src/backends/packrat.c @@ -3,14 +3,6 @@ #include "../internal.h" #include "../parsers/parser_internal.h" -static uint32_t djbhash(const uint8_t *buf, size_t len) { - uint32_t hash = 5381; - while (len--) { - hash = hash * 33 + *buf++; - } - return hash; -} - // short-hand for constructing HCachedResult's static HCachedResult *cached_result(const HParseState *state, HParseResult *result) { HCachedResult *ret = a_new(HCachedResult, 1); @@ -214,7 +206,7 @@ void h_packrat_free(HParser *parser) { } static uint32_t cache_key_hash(const void* key) { - return djbhash(key, sizeof(HParserCacheKey)); + return h_djbhash(key, sizeof(HParserCacheKey)); } static bool cache_key_equal(const void* key1, const void* key2) { return memcmp(key1, key2, sizeof(HParserCacheKey)) == 0; diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 
d3168adc948ca3951b579666d2a3ae9c93ced4fe..196d9d3c8b6ee9cb77b24a98ff365b8b4634ac45 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -18,12 +18,13 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->mm__ = mm__; g->arena = h_new_arena(mm__, 0); // default blocksize g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + g->start = NULL; g->geneps = NULL; g->first = NULL; g->follow = NULL; g->kmax = 0; // will be increased as needed by ensure_k - HCFStringMap *eps = h_stringmap_new(g->arena); + HStringMap *eps = h_stringmap_new(g->arena); h_stringmap_put_epsilon(eps, INSET); g->singleton_epsilon = eps; @@ -50,6 +51,11 @@ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser) if(desugared == NULL) return NULL; // -> backend not suitable for this parser + return h_cfgrammar_(mm__, desugared); +} + +HCFGrammar *h_cfgrammar_(HAllocator* mm__, HCFChoice *desugared) +{ HCFGrammar *g = h_cfgrammar_new(mm__); // recursively traverse the desugared form and collect all HCFChoices that @@ -219,39 +225,52 @@ static void collect_geneps(HCFGrammar *g) } -HCFStringMap *h_stringmap_new(HArena *a) +HStringMap *h_stringmap_new(HArena *a) { - HCFStringMap *m = h_arena_malloc(a, sizeof(HCFStringMap)); + HStringMap *m = h_arena_malloc(a, sizeof(HStringMap)); + m->epsilon_branch = NULL; + m->end_branch = NULL; m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr); m->arena = a; return m; } -void h_stringmap_put_end(HCFStringMap *m, void *v) +void h_stringmap_put_end(HStringMap *m, void *v) { m->end_branch = v; } -void h_stringmap_put_epsilon(HCFStringMap *m, void *v) +void h_stringmap_put_epsilon(HStringMap *m, void *v) { m->epsilon_branch = v; } -void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) +void h_stringmap_put_after(HStringMap *m, uint8_t c, HStringMap *ends) { - HCFStringMap *node = h_stringmap_new(m->arena); + h_hashtable_put(m->char_branches, (void *)char_key(c), ends); +} + +void h_stringmap_put_char(HStringMap *m, uint8_t c, void *v) +{ + 
HStringMap *node = h_stringmap_new(m->arena); h_stringmap_put_epsilon(node, v); - h_hashtable_put(m->char_branches, (void *)char_key(c), node); + h_stringmap_put_after(m, c, node); } // helper for h_stringmap_update -void *combine_stringmap(void *v1, void *v2) +static void *combine_stringmap(void *v1, const void *v2) { - h_stringmap_update((HCFStringMap *)v1, (HCFStringMap *)v2); - return v1; + HStringMap *m1 = v1; + const HStringMap *m2 = v2; + if(!m1) + m1 = h_stringmap_new(m2->arena); + h_stringmap_update(m1, m2); + + return m1; } -void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) +/* Note: Does *not* reuse submaps from n in building m. */ +void h_stringmap_update(HStringMap *m, const HStringMap *n) { if(n->epsilon_branch) m->epsilon_branch = n->epsilon_branch; @@ -262,32 +281,91 @@ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) h_hashtable_merge(combine_stringmap, m->char_branches, n->char_branches); } -void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +/* Replace all occurrences of old in m with new. + * If old is NULL, replace all values in m with new. + * If new is NULL, remove the respective values.
+ */ +void h_stringmap_replace(HStringMap *m, void *old, void *new) +{ + if(!old) { + if(m->epsilon_branch) m->epsilon_branch = new; + if(m->end_branch) m->end_branch = new; + } else { + if(m->epsilon_branch == old) m->epsilon_branch = new; + if(m->end_branch == old) m->end_branch = new; + } + + // iterate over m->char_branches + const HHashTable *ht = m->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + + HStringMap *m_ = hte->value; + if(m_) + h_stringmap_replace(m_, old, new); + } + } +} + +void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool end) { for(size_t i=0; i<n; i++) { if(i==n-1 && end && m->end_branch) return m->end_branch; - m = h_hashtable_get(m->char_branches, (void *)char_key(str[i])); + m = h_stringmap_get_char(m, str[i]); if(!m) return NULL; } return m->epsilon_branch; } -bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead) +{ + while(m) { + if(m->epsilon_branch) { // input matched + // assert: another lookahead would not bring a more specific match. + // this is for the table generator to ensure. (LLk) + return m->epsilon_branch; + } + + // note the lookahead stream is passed by value, i.e. a copy. + // reading bits from it does not consume them from the real input. 
+ uint8_t c = h_read_bits(&lookahead, 8, false); + + if(lookahead.overrun) { // end of input + // XXX assumption of byte-wise grammar and input + return m->end_branch; + } + + // no match yet, descend + m = h_stringmap_get_char(m, c); + } + + return NULL; +} + +bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end) { return (h_stringmap_get(m, str, n, end) != NULL); } -bool h_stringmap_present_epsilon(const HCFStringMap *m) +bool h_stringmap_present_epsilon(const HStringMap *m) { return (m->epsilon_branch != NULL); } +bool h_stringmap_empty(const HStringMap *m) +{ + return (m->epsilon_branch == NULL + && m->end_branch == NULL + && h_hashtable_empty(m->char_branches)); +} -const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) { - HCFStringMap *ret; + HStringMap *ret; HCFSequence **p; uint8_t c; @@ -333,18 +411,18 @@ const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) } // helpers for h_first_seq, definitions below -static bool is_singleton_epsilon(const HCFStringMap *m); -static bool any_string_shorter(size_t k, const HCFStringMap *m); +static bool is_singleton_epsilon(const HStringMap *m); +static bool any_string_shorter(size_t k, const HStringMap *m); // pointer to functions like h_first_seq -typedef const HCFStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); +typedef const HStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); // helper for h_first_seq and h_follow -static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, - size_t k, const HCFStringMap *as, +static void stringset_extend(HCFGrammar *g, HStringMap *ret, + size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail); -const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) { // shortcut: the first set of the empty sequence, for any k, is 
{""} if(*s == NULL) @@ -355,7 +433,7 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HCFChoice *x = s[0]; HCFChoice **tail = s+1; - const HCFStringMap *first_x = h_first(k, g, x); + const HStringMap *first_x = h_first(k, g, x); // shortcut: if first_k(X) = {""}, just return first_k(tail) if(is_singleton_epsilon(first_x)) @@ -366,7 +444,7 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) return first_x; // create a new result set and build up the set described above - HCFStringMap *ret = h_stringmap_new(g->arena); + HStringMap *ret = h_stringmap_new(g->arena); // extend the elements of first_k(X) up to length k from tail stringset_extend(g, ret, k, first_x, h_first_seq, tail); @@ -374,14 +452,14 @@ const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) return ret; } -static bool is_singleton_epsilon(const HCFStringMap *m) +static bool is_singleton_epsilon(const HStringMap *m) { return ( m->epsilon_branch && !m->end_branch && h_hashtable_empty(m->char_branches) ); } -static bool any_string_shorter(size_t k, const HCFStringMap *m) +static bool any_string_shorter(size_t k, const HStringMap *m) { if(k==0) return false; @@ -395,7 +473,7 @@ static bool any_string_shorter(size_t k, const HCFStringMap *m) for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - HCFStringMap *m_ = hte->value; + HStringMap *m_ = hte->value; // check subtree for strings shorter than k-1 if(any_string_shorter(k-1, m_)) @@ -406,15 +484,32 @@ static bool any_string_shorter(size_t k, const HCFStringMap *m) return false; } -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); +// helper for h_predict +static void remove_all_shorter(size_t k, HStringMap *m) +{ + if(k==0) return; + m->epsilon_branch = NULL; + if(k==1) return; + + // iterate over m->char_branches + const HHashTable *ht = m->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + 
for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + remove_all_shorter(k-1, hte->value); // recursion into subtree + } + } +} // h_follow adapted to the signature of StringSetFun -static inline const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) +static inline +const HStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) { return h_follow(k, g, *s); } -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) +const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) { // consider all occurances of X in g // the follow set of X is the union of: @@ -425,7 +520,7 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) // first_k(tail follow_k(A)) = // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } - HCFStringMap *ret; + HStringMap *ret; // shortcut: follow_0(X) is always {""} if(k==0) @@ -463,9 +558,7 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) if(*s == x) { // occurance found HCFChoice **tail = s+1; - const HCFStringMap *first_tail = h_first_seq(k, g, tail); - - //h_stringmap_update(ret, first_tail); + const HStringMap *first_tail = h_first_seq(k, g, tail); // extend the elems of first_k(tail) up to length k from follow(A) stringset_extend(g, ret, k, first_tail, h_follow_, &a); @@ -478,9 +571,30 @@ const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) return ret; } +HStringMap *h_predict(size_t k, HCFGrammar *g, + const HCFChoice *A, const HCFSequence *rhs) +{ + HStringMap *ret = h_stringmap_new(g->arena); + + // predict_k(A -> rhs) = + // { ab | a <- first_k(rhs), b <- follow_k(A), |ab|=k } + + const HStringMap *first_rhs = h_first_seq(k, g, rhs->items); + + // casting the const off of A below. note: stringset_extend does + // not touch this argument, only passes it through to h_follow + // in this case, which accepts it, once again, as const. 
+ stringset_extend(g, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); + + // make sure there are only strings of length _exactly_ k + remove_all_shorter(k, ret); + + return ret; +} + // add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret -static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, - size_t k, const HCFStringMap *as, +static void stringset_extend(HCFGrammar *g, HStringMap *ret, + size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail) { if(as->epsilon_branch) { @@ -503,13 +617,13 @@ static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, uint8_t c = key_char((HCharKey)hte->key); // follow the branch to find the set { a' | t a' <- as } - HCFStringMap *as_ = (HCFStringMap *)hte->value; + HStringMap *as_ = (HStringMap *)hte->value; // now the elements of ret that begin with t are given by // t { a b | a <- as_, b <- f_l(tail), l=k-|a|-1 } // so we can use recursion over k - HCFStringMap *ret_ = h_stringmap_new(g->arena); - h_stringmap_put_char(ret, c, ret_); + HStringMap *ret_ = h_stringmap_new(g->arena); + h_stringmap_put_after(ret, c, ret_); stringset_extend(g, ret_, k-1, as_, f, tail); } @@ -517,7 +631,7 @@ static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, } -static void pprint_char(FILE *f, char c) +void h_pprint_char(FILE *f, char c) { switch(c) { case '"': fputs("\\\"", f); break; @@ -541,7 +655,7 @@ static void pprint_charset_char(FILE *f, char c) case '"': fputc(c, f); break; case '-': fputs("\\-", f); break; case ']': fputs("\\-", f); break; - default: pprint_char(f, c); + default: h_pprint_char(f, c); } } @@ -589,18 +703,18 @@ static HCFChoice **pprint_string(FILE *f, HCFChoice **x) for(; *x; x++) { if((*x)->type != HCF_CHAR) break; - pprint_char(f, (*x)->chr); + h_pprint_char(f, (*x)->chr); } fputc('"', f); return x; } -static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) +void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) { switch(x->type) { case HCF_CHAR: 
fputc('"', f); - pprint_char(f, x->chr); + h_pprint_char(f, x->chr); fputc('"', f); break; case HCF_END: @@ -614,32 +728,37 @@ static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) } } -static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) +void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) { HCFChoice **x = seq->items; if(*x == NULL) { // the empty sequence - fputs(" \"\"", f); + fputs("\"\"", f); } else { while(*x) { - fputc(' ', f); // separator + if(x != seq->items) fputc(' ', f); // internal separator if((*x)->type == HCF_CHAR) { // condense character strings x = pprint_string(f, x); } else { - pprint_symbol(f, g, *x); + h_pprint_symbol(f, g, *x); x++; } } } +} +// adds some separators expected below +static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq) +{ + fputc(' ', f); + h_pprint_sequence(f, g, seq); fputc('\n', f); } -static -void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, - int indent, int len) +static void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, + int indent, int len) { int i; int column = indent + len; @@ -709,7 +828,7 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in a = hte->key; // production's left-hand symbol - pprint_symbol(file, g, a); + h_pprint_symbol(file, g, a); } } @@ -718,26 +837,44 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in #define BUFSIZE 512 -void pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, const HCFStringMap *set) +static bool +pprint_stringmap_elems(FILE *file, bool first, char *prefix, size_t n, char sep, + void (*valprint)(FILE *f, void *env, void *val), void *env, + const HStringMap *map) { assert(n < BUFSIZE-4); - if(set->epsilon_branch) { - if(!first) fputc(',', file); first=false; - if(n==0) - fputs("''", file); - else + if(map->epsilon_branch) { + if(!first) fputc(sep, file); 
first=false; + if(n==0) { + fputs("\"\"", file); + } else { + fputs("\"", file); fwrite(prefix, 1, n, file); + fputs("\"", file); + } + + if(valprint) { + fputc(':', file); + valprint(file, env, map->epsilon_branch); + } } - if(set->end_branch) { - if(!first) fputc(',', file); first=false; + if(map->end_branch) { + if(!first) fputs(",\"", file); first=false; + if(n>0) fputs("\"\"", file); fwrite(prefix, 1, n, file); - fputc('$', file); + if(n>0) fputs("\"\"", file); + fputs("$", file); + + if(valprint) { + fputc(':', file); + valprint(file, env, map->end_branch); + } } - // iterate over set->char_branches - HHashTable *ht = set->char_branches; + // iterate over map->char_branches + HHashTable *ht = map->char_branches; size_t i; HHashTableEntry *hte; for(i=0; i < ht->capacity; i++) { @@ -745,7 +882,7 @@ void pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, cons if(hte->key == NULL) continue; uint8_t c = key_char((HCharKey)hte->key); - HCFStringMap *ends = hte->value; + HStringMap *ends = hte->value; size_t n_ = n; switch(c) { @@ -763,18 +900,28 @@ void pprint_stringset_elems(FILE *file, bool first, char *prefix, size_t n, cons n_ += sprintf(prefix+n_, "\\x%.2X", c); } - pprint_stringset_elems(file, first, prefix, n_, ends); + first = pprint_stringmap_elems(file, first, prefix, n_, + sep, valprint, env, ends); } } + + return first; +} + +void h_pprint_stringmap(FILE *file, char sep, + void (*valprint)(FILE *f, void *env, void *val), void *env, + const HStringMap *map) +{ + char buf[BUFSIZE]; + pprint_stringmap_elems(file, true, buf, 0, sep, valprint, env, map); } -void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent) +void h_pprint_stringset(FILE *file, const HStringMap *set, int indent) { int j; for(j=0; j<indent; j++) fputc(' ', file); - char buf[BUFSIZE]; fputc('{', file); - pprint_stringset_elems(file, true, buf, 0, set); + h_pprint_stringmap(file, ',', NULL, NULL, set); fputs("}\n", file); } diff 
--git a/src/cfgrammar.h b/src/cfgrammar.h index cec5d6e9b4b7ea011fe896b7f6d433826ebefcc9..193f8ca327d2f9c0b74518b9942b5fe3f37c407b 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -16,7 +16,7 @@ typedef struct HCFGrammar_ { // constant set containing only the empty string. // this is only a member of HCFGrammar because it needs a pointer to arena. - const struct HCFStringMap_ *singleton_epsilon; + const struct HStringMap_ *singleton_epsilon; } HCFGrammar; @@ -32,21 +32,28 @@ static inline uint8_t key_char(HCharKey k) { return (0xFF & k); } * input tokens. * Each path through the tree represents the string along its branches. */ -typedef struct HCFStringMap_ { +typedef struct HStringMap_ { void *epsilon_branch; // points to leaf value void *end_branch; // points to leaf value - HHashTable *char_branches; // maps to inner nodes (HCFStringMaps) + HHashTable *char_branches; // maps to inner nodes (HStringMaps) HArena *arena; -} HCFStringMap; +} HStringMap; -HCFStringMap *h_stringmap_new(HArena *a); -void h_stringmap_put_end(HCFStringMap *m, void *v); -void h_stringmap_put_epsilon(HCFStringMap *m, void *v); -void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v); -void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n); -void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); -bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); -bool h_stringmap_present_epsilon(const HCFStringMap *m); +HStringMap *h_stringmap_new(HArena *a); +void h_stringmap_put_end(HStringMap *m, void *v); +void h_stringmap_put_epsilon(HStringMap *m, void *v); +void h_stringmap_put_after(HStringMap *m, uint8_t c, HStringMap *ends); +void h_stringmap_put_char(HStringMap *m, uint8_t c, void *v); +void h_stringmap_update(HStringMap *m, const HStringMap *n); +void h_stringmap_replace(HStringMap *m, void *old, void *new); +void *h_stringmap_get(const HStringMap *m, const uint8_t *str, size_t n, bool end); +void 
*h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead); +bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end); +bool h_stringmap_present_epsilon(const HStringMap *m); +bool h_stringmap_empty(const HStringMap *m); + +static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_t c) + { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } /* Convert 'parser' into CFG representation by desugaring and compiling the set @@ -54,6 +61,9 @@ bool h_stringmap_present_epsilon(const HCFStringMap *m); * A NULL return means we are unable to represent the parser as a CFG. */ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser); +HCFGrammar *h_cfgrammar_(HAllocator* mm__, HCFChoice *start); + +HCFGrammar *h_cfgrammar_new(HAllocator *mm__); /* Frees the given grammar and associated data. * Does *not* free parsers' CFG forms as created by h_desugar. @@ -67,16 +77,28 @@ bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol); bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s); /* Compute first_k set of symbol x. Memoized. */ -const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x); +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x); /* Compute first_k set of sentential form s. s NULL-terminated. */ -const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s); +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s); /* Compute follow_k set of symbol x. Memoized. */ -const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); +const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); + +/* Compute the predict_k set of production "A -> rhs". + * Always returns a newly-allocated HStringMap. + */ +HStringMap *h_predict(size_t k, HCFGrammar *g, + const HCFChoice *A, const HCFSequence *rhs); /* Pretty-printers for grammars and associated data. 
*/ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent); +void h_pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq); +void h_pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); -void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent); +void h_pprint_stringset(FILE *file, const HStringMap *set, int indent); +void h_pprint_stringmap(FILE *file, char sep, + void (*valprint)(FILE *f, void *env, void *val), void *env, + const HStringMap *map); +void h_pprint_char(FILE *file, char c); diff --git a/src/datastructures.c b/src/datastructures.c index 1ddd6203cb25dd8db748e3ead3a15db3d06f27bb..45a7eba768e4fa0fa25b42aca1eebc5b8946e740 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -62,6 +62,16 @@ HSlist* h_slist_copy(HSlist *slist) { return ret; } +// like h_slist_pop, but does not deallocate the head node +void* h_slist_drop(HSlist *slist) { + HSlistNode *head = slist->head; + if (!head) + return NULL; + void* ret = head->elem; + slist->head = head->next; + return ret; +} + void* h_slist_pop(HSlist *slist) { HSlistNode *head = slist->head; if (!head) @@ -147,6 +157,8 @@ void* h_hashtable_get(const HHashTable* ht, const void* key) { for (hte = &ht->contents[hashval & (ht->capacity - 1)]; hte != NULL; hte = hte->next) { + if (hte->key == NULL) + continue; if (hte->hashval != hashval) continue; if (ht->equalFunc(key, hte->key)) @@ -201,7 +213,7 @@ void h_hashtable_update(HHashTable *dst, const HHashTable *src) { } } -void h_hashtable_merge(void *(*combine)(void *v1, void *v2), +void h_hashtable_merge(void *(*combine)(void *v1, const void *v2), HHashTable *dst, const HHashTable *src) { size_t i; HHashTableEntry *hte; @@ -209,13 +221,9 @@ void h_hashtable_merge(void *(*combine)(void *v1, void *v2), for(hte = &src->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - void 
*oldvalue = h_hashtable_get(dst, hte->key); - void *newvalue; - if(oldvalue) - newvalue = combine(oldvalue, hte->value); - else - newvalue = hte->value; - h_hashtable_put(dst, hte->key, newvalue); + void *dstvalue = h_hashtable_get(dst, hte->key); + void *srcvalue = hte->value; + h_hashtable_put(dst, hte->key, combine(dstvalue, srcvalue)); } } } @@ -236,6 +244,7 @@ int h_hashtable_present(const HHashTable* ht, const void* key) { } return false; } + void h_hashtable_del(HHashTable* ht, const void* key) { HHashValue hashval = ht->hashFunc(key); #ifdef CONSISTENCY_CHECK @@ -261,6 +270,7 @@ void h_hashtable_del(HHashTable* ht, const void* key) { } } } + void h_hashtable_free(HHashTable* ht) { for (size_t i = 0; i < ht->capacity; i++) { HHashTableEntry *hten, *hte = &ht->contents[i]; @@ -276,15 +286,76 @@ void h_hashtable_free(HHashTable* ht) { h_arena_free(ht->arena, ht->contents); } +// helper for hte_equal +static bool hte_same_length(HHashTableEntry *xs, HHashTableEntry *ys) { + while(xs && ys) { + xs=xs->next; + ys=ys->next; + // skip NULL keys (= element not present) + while(xs && xs->key == NULL) xs=xs->next; + while(ys && ys->key == NULL) ys=ys->next; + } + return (xs == ys); // both NULL +} + +// helper for hte_equal: are all elements of xs present in ys? +static bool hte_subset(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) +{ + for(; xs; xs=xs->next) { + if(xs->key == NULL) continue; // element not present + + HHashTableEntry *hte; + for(hte=ys; hte; hte=hte->next) { + if(hte->key == xs->key) break; // assume an element is equal to itself + if(hte->hashval != xs->hashval) continue; // shortcut + if(eq(hte->key, xs->key)) break; + } + if(hte == NULL) return false; // element not found + } + return true; // all found +} + +// compare two lists of HHashTableEntries +static inline bool hte_equal(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) { + return (hte_same_length(xs, ys) && hte_subset(eq, xs, ys)); +} + +/* Set equality of HHashSets. 
+ * Obviously, 'a' and 'b' must use the same equality function. + * Not strictly necessary, but we also assume the same hash function. + */ +bool h_hashset_equal(const HHashSet *a, const HHashSet *b) { + if(a->capacity == b->capacity) { + // iterate over the buckets in parallel + for(size_t i=0; i < a->capacity; i++) { + if(!hte_equal(a->equalFunc, &a->contents[i], &b->contents[i])) + return false; + } + } else { + assert_message(0, "h_hashset_equal called on sets of different capacity"); + // TODO implement general case + } + return true; +} + bool h_eq_ptr(const void *p, const void *q) { return (p==q); } HHashValue h_hash_ptr(const void *p) { - // XXX just djbhash it + // XXX just djbhash it? it does make the benchmark ~7% slower. + //return h_djbhash((const uint8_t *)&p, sizeof(void *)); return (uintptr_t)p >> 4; } +uint32_t h_djbhash(const uint8_t *buf, size_t len) { + uint32_t hash = 5381; + while (len--) { + hash = hash * 33 + *buf++; + } + return hash; +} + HSArray *h_sarray_new(HAllocator *mm__, size_t size) { HSArray *ret = h_new(HSArray, 1); ret->capacity = size; diff --git a/src/desugar.c b/src/desugar.c index a613644e9c5f5dab017b761000a3c6856b8393a5..5ef8f9b95deb9c440c54fc568b73453ad5c69946 100644 --- a/src/desugar.c +++ b/src/desugar.c @@ -8,9 +8,12 @@ HCFChoice *h_desugar(HAllocator *mm__, HCFStack *stk__, const HParser *parser) { if (nstk__ == NULL) { nstk__ = h_cfstack_new(mm__); } + if(nstk__->prealloc == NULL) + nstk__->prealloc = h_new(HCFChoice, 1); + // we're going to do something naughty and cast away the const to memoize assert(parser->vtable->desugar != NULL); + ((HParser *)parser)->desugared = nstk__->prealloc; parser->vtable->desugar(mm__, nstk__, parser->env); - ((HParser *)parser)->desugared = nstk__->last_completed; if (stk__ == NULL) h_cfstack_free(mm__, nstk__); } else if (stk__ != NULL) { diff --git a/src/hammer.c b/src/hammer.c index 5f94142908f48f86a0dde79ccd376c2625063635..7fc80dba0c86ec76a2376d0d69914f235bf08afc 100644 --- 
a/src/hammer.c +++ b/src/hammer.c @@ -30,6 +30,8 @@ static HParserBackendVTable *backends[PB_MAX + 1] = { &h__packrat_backend_vtable, &h__regex_backend_vtable, &h__llk_backend_vtable, + &h__lalr_backend_vtable, + &h__glr_backend_vtable, }; diff --git a/src/hammer.h b/src/hammer.h index 455684cc92edbfbf9b9352625e373ca408f61261..67fb8e4bff34eb741d7f14d3c64fcba160598369 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -34,11 +34,11 @@ typedef struct HParseState_ HParseState; typedef enum HParserBackend_ { PB_MIN = 0, PB_PACKRAT = PB_MIN, // PB_MIN is always the default. - PB_REGULAR, // - PB_LLk, // - PB_LALR, // Not Implemented - PB_GLR, // Not Implemented - PB_MAX = PB_LLk + PB_REGULAR, + PB_LLk, + PB_LALR, + PB_GLR, + PB_MAX = PB_GLR } HParserBackend; typedef enum HTokenType_ { diff --git a/src/internal.h b/src/internal.h index b7fe6213da5c5749f159956345de5bbc755704eb..02ee7482e2066e91e605c5a34cbd216f951db348 100644 --- a/src/internal.h +++ b/src/internal.h @@ -279,6 +279,8 @@ struct HBitWriter_ { // Backends {{{ extern HParserBackendVTable h__packrat_backend_vtable; extern HParserBackendVTable h__llk_backend_vtable; +extern HParserBackendVTable h__lalr_backend_vtable; +extern HParserBackendVTable h__glr_backend_vtable; // }}} // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. 
@@ -306,6 +308,7 @@ void h_carray_append(HCountedArray *array, void* item); HSlist* h_slist_new(HArena *arena); HSlist* h_slist_copy(HSlist *slist); void* h_slist_pop(HSlist *slist); +void* h_slist_drop(HSlist *slist); void h_slist_push(HSlist *slist, void* item); bool h_slist_find(HSlist *slist, const void* item); HSlist* h_slist_remove_all(HSlist *slist, const void* item); @@ -316,7 +319,7 @@ HHashTable* h_hashtable_new(HArena *arena, HEqualFunc equalFunc, HHashFunc hashF void* h_hashtable_get(const HHashTable* ht, const void* key); void h_hashtable_put(HHashTable* ht, const void* key, void* value); void h_hashtable_update(HHashTable* dst, const HHashTable *src); -void h_hashtable_merge(void *(*combine)(void *v1, void *v2), +void h_hashtable_merge(void *(*combine)(void *v1, const void *v2), HHashTable *dst, const HHashTable *src); int h_hashtable_present(const HHashTable* ht, const void* key); void h_hashtable_del(HHashTable* ht, const void* key); @@ -331,9 +334,11 @@ typedef HHashTable HHashSet; #define h_hashset_empty(ht) h_hashtable_empty(ht) #define h_hashset_del(ht,el) h_hashtable_del(ht,el) #define h_hashset_free(ht) h_hashtable_free(ht) +bool h_hashset_equal(const HHashSet *a, const HHashSet *b); bool h_eq_ptr(const void *p, const void *q); HHashValue h_hash_ptr(const void *p); +uint32_t h_djbhash(const uint8_t *buf, size_t len); typedef struct HCFSequence_ HCFSequence; diff --git a/src/parsers/indirect.c b/src/parsers/indirect.c index 746f1a9ee37a0fbcdf6558cf7670290d34a76972..2217a202968f2a11306c60ccea34c9e3126186c1 100644 --- a/src/parsers/indirect.c +++ b/src/parsers/indirect.c @@ -10,7 +10,7 @@ static bool indirect_isValidCF(void *env) { } static void desugar_indirect(HAllocator *mm__, HCFStack *stk__, void *env) { - HCFS_DESUGAR( (HParser*)env ); + HCFS_DESUGAR( (HParser *)env ); } static const HParserVtable indirect_vt = { diff --git a/src/parsers/unimplemented.c b/src/parsers/unimplemented.c index 
7c3c6671f8946fc61dd7f33ccd5971cf00e88ed8..e3f3039407eacaa1d24689767a4a1038fce66a93 100644 --- a/src/parsers/unimplemented.c +++ b/src/parsers/unimplemented.c @@ -16,6 +16,7 @@ static const HParserVtable unimplemented_vt = { .parse = parse_unimplemented, .isValidRegular = h_false, .isValidCF = h_false, + .desugar = NULL, .compile_to_rvm = h_not_regular, }; diff --git a/src/t_parser.c b/src/t_parser.c index 8aab7bb38e4b950e60da93e1c362b4a09ef0bbb0..59adf36a69d1e34f9a460088a7021d8d14ff6df5 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -405,9 +405,9 @@ static void test_not(gconstpointer backend) { g_check_parse_ok(not_2, (HParserBackend)GPOINTER_TO_INT(backend), "a+b", 3, "(u0x61 (u0x2b) u0x62)"); g_check_parse_ok(not_2, (HParserBackend)GPOINTER_TO_INT(backend), "a++b", 4, "(u0x61 <2b.2b> u0x62)"); } -/* + static void test_leftrec(gconstpointer backend) { - const HParser *a_ = h_ch('a'); + HParser *a_ = h_ch('a'); HParser *lr_ = h_indirect(); h_bind_indirect(lr_, h_choice(h_sequence(lr_, a_, NULL), a_, NULL)); @@ -416,7 +416,31 @@ static void test_leftrec(gconstpointer backend) { g_check_parse_ok(lr_, (HParserBackend)GPOINTER_TO_INT(backend), "aa", 2, "(u0x61 u0x61)"); g_check_parse_ok(lr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "((u0x61 u0x61) u0x61)"); } -*/ + +static void test_rightrec(gconstpointer backend) { + HParser *a_ = h_ch('a'); + + HParser *rr_ = h_indirect(); + h_bind_indirect(rr_, h_choice(h_sequence(a_, rr_, NULL), h_epsilon_p(), NULL)); + + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "a", 1, "(u0x61)"); + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aa", 2, "(u0x61 (u0x61))"); + g_check_parse_ok(rr_, (HParserBackend)GPOINTER_TO_INT(backend), "aaa", 3, "(u0x61 (u0x61 (u0x61)))"); +} + +static void test_ambiguous(gconstpointer backend) { + HParser *d_ = h_ch('d'); + HParser *p_ = h_ch('+'); + HParser *E_ = h_indirect(); + h_bind_indirect(E_, h_choice(h_sequence(E_, p_, E_, NULL), d_, NULL)); + 
HParser *expr_ = h_action(E_, h_act_flatten); + + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d", 1, "(u0x64)"); + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d", 3, "(u0x64 u0x2b u0x64)"); + g_check_parse_ok(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+d+d", 5, "(u0x64 u0x2b u0x64 u0x2b u0x64)"); + g_check_parse_failed(expr_, (HParserBackend)GPOINTER_TO_INT(backend), "d+", 2); +} + void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/token", GINT_TO_POINTER(PB_PACKRAT), test_token); g_test_add_data_func("/core/parser/packrat/ch", GINT_TO_POINTER(PB_PACKRAT), test_ch); @@ -460,6 +484,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/not", GINT_TO_POINTER(PB_PACKRAT), test_not); g_test_add_data_func("/core/parser/packrat/ignore", GINT_TO_POINTER(PB_PACKRAT), test_ignore); // g_test_add_data_func("/core/parser/packrat/leftrec", GINT_TO_POINTER(PB_PACKRAT), test_leftrec); + g_test_add_data_func("/core/parser/packrat/rightrec", GINT_TO_POINTER(PB_PACKRAT), test_rightrec); g_test_add_data_func("/core/parser/llk/token", GINT_TO_POINTER(PB_LLk), test_token); g_test_add_data_func("/core/parser/llk/ch", GINT_TO_POINTER(PB_LLk), test_ch); @@ -496,6 +521,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/epsilon_p", GINT_TO_POINTER(PB_LLk), test_epsilon_p); g_test_add_data_func("/core/parser/llk/attr_bool", GINT_TO_POINTER(PB_LLk), test_attr_bool); g_test_add_data_func("/core/parser/llk/ignore", GINT_TO_POINTER(PB_LLk), test_ignore); + //g_test_add_data_func("/core/parser/llk/leftrec", GINT_TO_POINTER(PB_LLk), test_leftrec); + g_test_add_data_func("/core/parser/llk/rightrec", GINT_TO_POINTER(PB_LLk), test_rightrec); g_test_add_data_func("/core/parser/regex/token", GINT_TO_POINTER(PB_REGULAR), test_token); g_test_add_data_func("/core/parser/regex/ch", GINT_TO_POINTER(PB_REGULAR), test_ch); @@ -533,4 +560,81 @@ void 
register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/epsilon_p", GINT_TO_POINTER(PB_REGULAR), test_epsilon_p); g_test_add_data_func("/core/parser/regex/attr_bool", GINT_TO_POINTER(PB_REGULAR), test_attr_bool); g_test_add_data_func("/core/parser/regex/ignore", GINT_TO_POINTER(PB_REGULAR), test_ignore); + + g_test_add_data_func("/core/parser/lalr/token", GINT_TO_POINTER(PB_LALR), test_token); + g_test_add_data_func("/core/parser/lalr/ch", GINT_TO_POINTER(PB_LALR), test_ch); + g_test_add_data_func("/core/parser/lalr/ch_range", GINT_TO_POINTER(PB_LALR), test_ch_range); + g_test_add_data_func("/core/parser/lalr/int64", GINT_TO_POINTER(PB_LALR), test_int64); + g_test_add_data_func("/core/parser/lalr/int32", GINT_TO_POINTER(PB_LALR), test_int32); + g_test_add_data_func("/core/parser/lalr/int16", GINT_TO_POINTER(PB_LALR), test_int16); + g_test_add_data_func("/core/parser/lalr/int8", GINT_TO_POINTER(PB_LALR), test_int8); + g_test_add_data_func("/core/parser/lalr/uint64", GINT_TO_POINTER(PB_LALR), test_uint64); + g_test_add_data_func("/core/parser/lalr/uint32", GINT_TO_POINTER(PB_LALR), test_uint32); + g_test_add_data_func("/core/parser/lalr/uint16", GINT_TO_POINTER(PB_LALR), test_uint16); + g_test_add_data_func("/core/parser/lalr/uint8", GINT_TO_POINTER(PB_LALR), test_uint8); + g_test_add_data_func("/core/parser/lalr/int_range", GINT_TO_POINTER(PB_LALR), test_int_range); +#if 0 + g_test_add_data_func("/core/parser/lalr/float64", GINT_TO_POINTER(PB_LALR), test_float64); + g_test_add_data_func("/core/parser/lalr/float32", GINT_TO_POINTER(PB_LALR), test_float32); +#endif + g_test_add_data_func("/core/parser/lalr/whitespace", GINT_TO_POINTER(PB_LALR), test_whitespace); + g_test_add_data_func("/core/parser/lalr/left", GINT_TO_POINTER(PB_LALR), test_left); + g_test_add_data_func("/core/parser/lalr/right", GINT_TO_POINTER(PB_LALR), test_right); + g_test_add_data_func("/core/parser/lalr/middle", GINT_TO_POINTER(PB_LALR), test_middle); + 
g_test_add_data_func("/core/parser/lalr/action", GINT_TO_POINTER(PB_LALR), test_action); + g_test_add_data_func("/core/parser/lalr/in", GINT_TO_POINTER(PB_LALR), test_in); + g_test_add_data_func("/core/parser/lalr/not_in", GINT_TO_POINTER(PB_LALR), test_not_in); + g_test_add_data_func("/core/parser/lalr/end_p", GINT_TO_POINTER(PB_LALR), test_end_p); + g_test_add_data_func("/core/parser/lalr/nothing_p", GINT_TO_POINTER(PB_LALR), test_nothing_p); + g_test_add_data_func("/core/parser/lalr/sequence", GINT_TO_POINTER(PB_LALR), test_sequence); + g_test_add_data_func("/core/parser/lalr/choice", GINT_TO_POINTER(PB_LALR), test_choice); + g_test_add_data_func("/core/parser/lalr/many", GINT_TO_POINTER(PB_LALR), test_many); + g_test_add_data_func("/core/parser/lalr/many1", GINT_TO_POINTER(PB_LALR), test_many1); + g_test_add_data_func("/core/parser/lalr/optional", GINT_TO_POINTER(PB_LALR), test_optional); + g_test_add_data_func("/core/parser/lalr/sepBy", GINT_TO_POINTER(PB_LALR), test_sepBy); + g_test_add_data_func("/core/parser/lalr/sepBy1", GINT_TO_POINTER(PB_LALR), test_sepBy1); + g_test_add_data_func("/core/parser/lalr/epsilon_p", GINT_TO_POINTER(PB_LALR), test_epsilon_p); + g_test_add_data_func("/core/parser/lalr/attr_bool", GINT_TO_POINTER(PB_LALR), test_attr_bool); + g_test_add_data_func("/core/parser/lalr/ignore", GINT_TO_POINTER(PB_LALR), test_ignore); + g_test_add_data_func("/core/parser/lalr/leftrec", GINT_TO_POINTER(PB_LALR), test_leftrec); + g_test_add_data_func("/core/parser/lalr/rightrec", GINT_TO_POINTER(PB_LALR), test_rightrec); + + g_test_add_data_func("/core/parser/glr/token", GINT_TO_POINTER(PB_GLR), test_token); + g_test_add_data_func("/core/parser/glr/ch", GINT_TO_POINTER(PB_GLR), test_ch); + g_test_add_data_func("/core/parser/glr/ch_range", GINT_TO_POINTER(PB_GLR), test_ch_range); + g_test_add_data_func("/core/parser/glr/int64", GINT_TO_POINTER(PB_GLR), test_int64); + g_test_add_data_func("/core/parser/glr/int32", GINT_TO_POINTER(PB_GLR), test_int32); + 
g_test_add_data_func("/core/parser/glr/int16", GINT_TO_POINTER(PB_GLR), test_int16); + g_test_add_data_func("/core/parser/glr/int8", GINT_TO_POINTER(PB_GLR), test_int8); + g_test_add_data_func("/core/parser/glr/uint64", GINT_TO_POINTER(PB_GLR), test_uint64); + g_test_add_data_func("/core/parser/glr/uint32", GINT_TO_POINTER(PB_GLR), test_uint32); + g_test_add_data_func("/core/parser/glr/uint16", GINT_TO_POINTER(PB_GLR), test_uint16); + g_test_add_data_func("/core/parser/glr/uint8", GINT_TO_POINTER(PB_GLR), test_uint8); + g_test_add_data_func("/core/parser/glr/int_range", GINT_TO_POINTER(PB_GLR), test_int_range); +#if 0 + g_test_add_data_func("/core/parser/glr/float64", GINT_TO_POINTER(PB_GLR), test_float64); + g_test_add_data_func("/core/parser/glr/float32", GINT_TO_POINTER(PB_GLR), test_float32); +#endif + g_test_add_data_func("/core/parser/glr/whitespace", GINT_TO_POINTER(PB_GLR), test_whitespace); + g_test_add_data_func("/core/parser/glr/left", GINT_TO_POINTER(PB_GLR), test_left); + g_test_add_data_func("/core/parser/glr/right", GINT_TO_POINTER(PB_GLR), test_right); + g_test_add_data_func("/core/parser/glr/middle", GINT_TO_POINTER(PB_GLR), test_middle); + g_test_add_data_func("/core/parser/glr/action", GINT_TO_POINTER(PB_GLR), test_action); + g_test_add_data_func("/core/parser/glr/in", GINT_TO_POINTER(PB_GLR), test_in); + g_test_add_data_func("/core/parser/glr/not_in", GINT_TO_POINTER(PB_GLR), test_not_in); + g_test_add_data_func("/core/parser/glr/end_p", GINT_TO_POINTER(PB_GLR), test_end_p); + g_test_add_data_func("/core/parser/glr/nothing_p", GINT_TO_POINTER(PB_GLR), test_nothing_p); + g_test_add_data_func("/core/parser/glr/sequence", GINT_TO_POINTER(PB_GLR), test_sequence); + g_test_add_data_func("/core/parser/glr/choice", GINT_TO_POINTER(PB_GLR), test_choice); + g_test_add_data_func("/core/parser/glr/many", GINT_TO_POINTER(PB_GLR), test_many); + g_test_add_data_func("/core/parser/glr/many1", GINT_TO_POINTER(PB_GLR), test_many1); + 
g_test_add_data_func("/core/parser/glr/optional", GINT_TO_POINTER(PB_GLR), test_optional); + g_test_add_data_func("/core/parser/glr/sepBy", GINT_TO_POINTER(PB_GLR), test_sepBy); + g_test_add_data_func("/core/parser/glr/sepBy1", GINT_TO_POINTER(PB_GLR), test_sepBy1); + g_test_add_data_func("/core/parser/glr/epsilon_p", GINT_TO_POINTER(PB_GLR), test_epsilon_p); + g_test_add_data_func("/core/parser/glr/attr_bool", GINT_TO_POINTER(PB_GLR), test_attr_bool); + g_test_add_data_func("/core/parser/glr/ignore", GINT_TO_POINTER(PB_GLR), test_ignore); + g_test_add_data_func("/core/parser/glr/leftrec", GINT_TO_POINTER(PB_GLR), test_leftrec); + g_test_add_data_func("/core/parser/glr/rightrec", GINT_TO_POINTER(PB_GLR), test_rightrec); + g_test_add_data_func("/core/parser/glr/ambiguous", GINT_TO_POINTER(PB_GLR), test_ambiguous); } diff --git a/src/test_suite.h b/src/test_suite.h index 168ab641ba7968730deea69ad8aa0df09b47650c..fc008e7fb96b6524b6298f6d27e7b45e4c7c5b3a 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -153,7 +153,7 @@ } while(0) #define g_check_stringmap_absent(table, key) do { \ - bool end = (key[strlen(key)-2] == '$'); \ + bool end = (key[strlen(key)-1] == '$'); \ if(h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) { \ g_test_message("Check failed: \"%s\" shouldn't have been in map, but was", key); \ g_test_fail(); \