From df179eba43b7e93e5ace1e4d54194adc709cd07b Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Fri, 25 Sep 2015 01:28:52 +0200 Subject: [PATCH] fix conflict resolution with charset productions --- src/backends/lalr.c | 63 +++++++++++++++++++++++++++++---------------- src/t_regression.c | 21 +++++++++++---- 2 files changed, 57 insertions(+), 27 deletions(-) diff --git a/src/backends/lalr.c b/src/backends/lalr.c index 1e629422..b82ef71c 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -208,6 +208,46 @@ static bool match_production(HLREnhGrammar *eg, HCFChoice **p, && state == endstate); } +// variant of match_production where the production lhs is a charset +// [..x..] -> x +static bool match_charset_production(const HLRTable *table, HLREnhGrammar *eg, + const HCFChoice *lhs, HCFChoice *rhs, + size_t endstate) +{ + assert(lhs->type == HCF_CHARSET); + assert(rhs->type == HCF_CHAR); + + if(!charset_isset(lhs->charset, rhs->chr)) + return false; + + // determine the enhanced-grammar right-hand side and check end state + HLRTransition *t = h_hashtable_get(eg->smap, lhs); + assert(t != NULL); + return (follow_transition(table, t->from, rhs) == endstate); +} + +// check whether any production for sym (enhanced-grammar) matches the given +// (original-grammar) rhs and terminates in the given end state. 
+static bool match_any_production(const HLRTable *table, HLREnhGrammar *eg, + const HCFChoice *sym, HCFChoice **rhs, + size_t endstate) +{ + assert(sym->type == HCF_CHOICE || sym->type == HCF_CHARSET); + + if(sym->type == HCF_CHOICE) { + for(HCFSequence **p=sym->seq; *p; p++) { + if(match_production(eg, (*p)->items, rhs, endstate)) + return true; + } + } else { // HCF_CHARSET + assert(rhs[0] != NULL); + assert(rhs[1] == NULL); + return match_charset_production(table, eg, sym, rhs[0], endstate); + } + + return false; +} + // desugar parser with a fresh start symbol // this guarantees that the start symbol will not occur in any productions HCFChoice *h_desugar_augmented(HAllocator *mm__, HParser *parser) @@ -286,28 +326,7 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) HHashSet *lhss = h_hashtable_get(eg->corr, item->lhs); assert(lhss != NULL); H_FOREACH_KEY(lhss, HCFChoice *lhs) - assert(lhs->type == HCF_CHOICE || lhs->type == HCF_CHARSET); - - bool match = false; - if(lhs->type == HCF_CHOICE) { - for(HCFSequence **p=lhs->seq; *p; p++) { - HCFChoice **rhs = (*p)->items; - if(match_production(eg, rhs, item->rhs, state)) { - match = true; - break; - } - } - } else { // HCF_CHARSET - assert(item->rhs[0] != NULL); - assert(item->rhs[1] == NULL); - assert(item->rhs[0]->type == HCF_CHAR); - HLRTransition *t = h_hashtable_get(eg->smap, lhs); - assert(t != NULL); - match = (t->to == state - && charset_isset(lhs->charset, item->rhs[0]->chr)); - } - - if(match) { + if(match_any_production(table, eg, lhs, item->rhs, state)) { // the left-hand symbol's follow set is this production's // contribution to the lookahead const HStringMap *fs = h_follow(1, eg->grammar, lhs); diff --git a/src/t_regression.c b/src/t_regression.c index 4097fb6c..faa953c8 100644 --- a/src/t_regression.c +++ b/src/t_regression.c @@ -121,13 +121,24 @@ static void test_llk_zero_end(void) { static void test_lalr_charset_lhs(void) { HParserBackend be = PB_LALR; - HParser *p = 
h_choice(h_ch('A'), h_uint8(), NULL); + HParser *p = h_many(h_choice(h_sequence(h_ch('A'), h_ch('B'), NULL), + h_in((uint8_t*)"AB",2), NULL)); - // the above would fail to compile because of an unhandled case in trying - // to resolve a conflict where an item's left-hand-side was an HCF_CHARSET. + // the above would abort because of an unhandled case in trying to resolve + // a conflict where an item's left-hand-side was an HCF_CHARSET. + // however, the compile should fail - the conflict cannot be resolved. - g_check_parse_match(p, be, "A",1, "u0x41"); - g_check_parse_match(p, be, "B",1, "u0x42"); + if(h_compile(p, be, NULL) == 0) { + g_test_message("LALR compile didn't detect ambiguous grammar"); + + // it says it compiled it - well, then it should parse it! + // (this helps us see what it thinks it should be doing.) + g_check_parse_match(p, be, "AA",2, "(u0x41 u0x41)"); + g_check_parse_match(p, be, "AB",2, "((u0x41 u0x42))"); + + g_test_fail(); + return; + } } static void test_cfg_many_seq(void) { -- GitLab