diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 975735a16dab016576c625d5bb541d8028b776f7..b82ef71c477128728db39d4ac72ef8d4ab0dc56c 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -91,7 +91,7 @@ static HCFChoice *new_enhanced_symbol(HLREnhGrammar *eg, const HCFChoice *sym) HHashSet *cs = h_hashtable_get(eg->corr, sym); if (!cs) { - cs = h_hashset_new(arena, h_eq_symbol, h_hash_symbol); + cs = h_hashset_new(arena, h_eq_ptr, h_hash_ptr); h_hashtable_put(eg->corr, sym, cs); } h_hashset_put(cs, esym); @@ -208,6 +208,46 @@ static bool match_production(HLREnhGrammar *eg, HCFChoice **p, && state == endstate); } +// variant of match_production where the production lhs is a charset +// [..x..] -> x +static bool match_charset_production(const HLRTable *table, HLREnhGrammar *eg, + const HCFChoice *lhs, HCFChoice *rhs, + size_t endstate) +{ + assert(lhs->type == HCF_CHARSET); + assert(rhs->type == HCF_CHAR); + + if(!charset_isset(lhs->charset, rhs->chr)) + return false; + + // determine the enhanced-grammar right-hand side and check end state + HLRTransition *t = h_hashtable_get(eg->smap, lhs); + assert(t != NULL); + return (follow_transition(table, t->from, rhs) == endstate); +} + +// check whether any production for sym (enhanced-grammar) matches the given +// (original-grammar) rhs and terminates in the given end state. 
+static bool match_any_production(const HLRTable *table, HLREnhGrammar *eg, + const HCFChoice *sym, HCFChoice **rhs, + size_t endstate) +{ + assert(sym->type == HCF_CHOICE || sym->type == HCF_CHARSET); + + if(sym->type == HCF_CHOICE) { + for(HCFSequence **p=sym->seq; *p; p++) { + if(match_production(eg, (*p)->items, rhs, endstate)) + return true; + } + } else { // HCF_CHARSET + assert(rhs[0] != NULL); + assert(rhs[1] == NULL); + return match_charset_production(table, eg, sym, rhs[0], endstate); + } + + return false; +} + // desugar parser with a fresh start symbol // this guarantees that the start symbol will not occur in any productions HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser) @@ -286,28 +326,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs); assert(lhss != NULL); H_FOREACH_KEY(lhss, HCFChoice *lhs) - assert(lhs->type == HCF_CHOICE || lhs->type == HCF_CHARSET); - - bool match = false; - if(lhs->type == HCF_CHOICE) { - for(HCFSequence **p=lhs->seq; *p; p++) { - HCFChoice **rhs = (*p)->items; - if(match_production(eg, rhs, item->rhs, state)) { - match = true; - break; - } - } - } else { // HCF_CHARSET - assert(item->rhs[0] != NULL); - assert(item->rhs[1] == NULL); - assert(item->rhs[0]->type == HCF_CHAR); - HLRTransition *t = h_hashtable_get(eg->smap, lhs); - assert(t != NULL); - match = (t->to == state - && charset_isset(lhs->charset, item->rhs[0]->chr)); - } - - if(match) { + if(match_any_production(table, eg, lhs, item->rhs, state)) { // the left-hand symbol's follow set is this production's // contribution to the lookahead const HStringMap *fs = h_follow(1, eg->grammar, lhs); diff --git a/src/backends/lr.c b/src/backends/lr.c index d9aaee72dcca6b290c486680511b0f1100d80069..fb256c0bfafa0b6c53b32307bea64f61d4885919 100644 --- a/src/backends/lr.c +++ b/src/backends/lr.c @@ -163,7 +163,7 @@ HLRAction *h_reduce_action(HArena *arena, const HLRItem *item) } // 
adds 'new' to the branches of 'action' -// returns a 'action' if it is already of type HLR_CONFLICT +// returns 'action' if it is already of type HLR_CONFLICT // allocates a new HLRAction otherwise HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new) { diff --git a/src/backends/lr0.c b/src/backends/lr0.c index 1c86484e61300ec40362a9abb47105424ddff2b9..a02df9e11da52ca3835e390487062a5a76ea0a31 100644 --- a/src/backends/lr0.c +++ b/src/backends/lr0.c @@ -30,9 +30,9 @@ static void expand_to_closure(HCFGrammar *g, HHashSet *items) HCFChoice *sym = item->rhs[item->mark]; // symbol after mark // if there is a non-terminal after the mark, follow it + // and add items corresponding to the productions of sym // NB: unlike LLk, we do consider HCF_CHARSET a non-terminal here - if(sym != NULL && (sym->type==HCF_CHOICE || sym->type==HCF_CHARSET)) { - // add items corresponding to the productions of sym + if(sym != NULL) { if(sym->type == HCF_CHOICE) { for(HCFSequence **p=sym->seq; *p; p++) { HLRItem *it = h_lritem_new(arena, sym, (*p)->items, 0); @@ -41,7 +41,7 @@ static void expand_to_closure(HCFGrammar *g, HHashSet *items) h_slist_push(work, it); } } - } else { // HCF_CHARSET + } else if(sym->type == HCF_CHARSET) { for(unsigned int i=0; i<256; i++) { if(charset_isset(sym->charset, i)) { // XXX allocate these single-character symbols statically somewhere @@ -93,8 +93,8 @@ HLRDFA *h_lr0_dfa(HCFGrammar *g) // compute closure // if destination is a new state: // add it to state set - // add transition to it // add it to the work list + // add transition to it while(!h_slist_empty(work)) { size_t state_idx = (uintptr_t)h_slist_pop(work); diff --git a/src/t_regression.c b/src/t_regression.c index 4097fb6cf1a64392438e9dddb4ac17427e5e0065..faa953c8451073e556acb2eda1f1fcf48ecc2d79 100644 --- a/src/t_regression.c +++ b/src/t_regression.c @@ -121,13 +121,24 @@ static void test_llk_zero_end(void) { static void test_lalr_charset_lhs(void) { HParserBackend be = 
PB_LALR; - HParser *p = h_choice(h_ch('A'), h_uint8(), NULL); + HParser *p = h_many(h_choice(h_sequence(h_ch('A'), h_ch('B'), NULL), + h_in((uint8_t*)"AB",2), NULL)); - // the above would fail to compile because of an unhandled case in trying - // to resolve a conflict where an item's left-hand-side was an HCF_CHARSET. + // the above would abort because of an unhandled case in trying to resolve + // a conflict where an item's left-hand-side was an HCF_CHARSET. + // however, the compile should fail - the conflict cannot be resolved. - g_check_parse_match(p, be, "A",1, "u0x41"); - g_check_parse_match(p, be, "B",1, "u0x42"); + if(h_compile(p, be, NULL) == 0) { + g_test_message("LALR compile didn't detect ambiguous grammar"); + + // it says it compiled it - well, then it should parse it! + // (this helps us see what it thinks it should be doing.) + g_check_parse_match(p, be, "AA",2, "(u0x41 u0x41)"); + g_check_parse_match(p, be, "AB",2, "((u0x41 u0x42))"); + + g_test_fail(); + return; + } } static void test_cfg_many_seq(void) {