diff --git a/SConstruct b/SConstruct index 3120df44a885b706cc57534e9a0f548bcd1a0562..149951b026e8d86f9e298da7f8d59084763d8b0a 100644 --- a/SConstruct +++ b/SConstruct @@ -119,7 +119,7 @@ if env['CC'] == 'cl': ) else: # -Wno-clobbered only really works with gcc >= 4.2.x, but ... scons - env.MergeFlags('-std=c99 -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -Wno-unused-variable') + env.MergeFlags('-std=c99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -Wno-unused-variable') # Linker options if env['PLATFORM'] == 'darwin': diff --git a/src/backends/glr.c b/src/backends/glr.c index 44b0c50cafd08486866eedf17e29c50236434f9b..ea69ea37ebb9275387b2eb67cf0e21fc64ac8960 100644 --- a/src/backends/glr.c +++ b/src/backends/glr.c @@ -225,6 +225,8 @@ HParseResult *h_glr_parse(HAllocator* mm__, const HParser* parser, HInputStream* HLREngine *engine = h_slist_pop(engines); const HLRAction *action = h_lrengine_action(engine); glr_step(&result, engback, engine, action); + // XXX detect ambiguous results - two engines terminating at the same pos + // -> kill both engines, i.e. ignore if there is a later unamb. success } // swap the lists diff --git a/src/backends/lalr.c b/src/backends/lalr.c index ba10e0ca8587c866b72e96d03c1351a9a037d8a2..79a2eca50c1690fab329870bf3e8f7e994927ad3 100644 --- a/src/backends/lalr.c +++ b/src/backends/lalr.c @@ -338,7 +338,11 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) const HStringMap *fs = h_follow(1, eg->grammar, lhs); assert(fs != NULL); assert(fs->epsilon_branch == NULL); - assert(!h_stringmap_empty(fs)); + // NB: there is a case where fs can be empty: when reducing by lhs + // would lead to certain parse failure, by means of h_nothing_p() + // for instance. in that case, the below code correctly adds no + // reduce action. 
+ assert(!h_stringmap_empty(fs)); // XXX // for each lookahead symbol, put action into table cell if(terminals_put(table->tmap[state], fs, action) < 0) @@ -351,6 +355,8 @@ int h_lalr_compile(HAllocator* mm__, HParser* parser, const void* params) h_slist_push(table->inadeq, (void *)(uintptr_t)state); } } + + h_cfgrammar_free(eg->grammar); } h_cfgrammar_free(g); diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 77e7ecad7ea1a70597a4c7c70ee21d9184a6c672..bd69588a70395e007cd8cf83eaa5a3cce69e771e 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -6,10 +6,25 @@ #include <ctype.h> +// type of pairs used as memoization keys by h_follow and h_first +struct k_nt {size_t k; const HCFChoice *nt;}; + // a special map value for use when the map is used to represent a set static void * const INSET = (void *)(uintptr_t)1; +static bool eq_k_nt(const void *p, const void *q) +{ + const struct k_nt *a=p, *b=q; + return a->k == b->k && a->nt == b->nt; +} + +static HHashValue hash_k_nt(const void *p) +{ + const struct k_nt *x = p; + return h_hash_ptr(x->nt) * x->k; +} + HCFGrammar *h_cfgrammar_new(HAllocator *mm__) { HCFGrammar *g = h_new(HCFGrammar, 1); @@ -20,14 +35,17 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); g->start = NULL; g->geneps = NULL; - g->first = NULL; - g->follow = NULL; - g->kmax = 0; // will be increased as needed by ensure_k + g->first = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + g->follow = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); HStringMap *eps = h_stringmap_new(g->arena); h_stringmap_put_epsilon(eps, INSET); g->singleton_epsilon = eps; + HStringMap *end = h_stringmap_new(g->arena); + h_stringmap_put_end(end, INSET); + g->singleton_end = end; + return g; } @@ -42,6 +60,7 @@ void h_cfgrammar_free(HCFGrammar *g) // helpers static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol); static void collect_geneps(HCFGrammar *grammar); +static void eliminate_dead_rules(HCFGrammar *g); 
HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser) @@ -83,6 +102,9 @@ HCFGrammar *h_cfgrammar_(HAllocator* mm__, HCFChoice *desugared) g->start = desugared; } + // simplifications + eliminate_dead_rules(g); + // determine which nonterminals generate epsilon collect_geneps(g); @@ -128,42 +150,6 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol) } } -/* Increase g->kmax if needed, allocating enough first/follow slots. */ -static void ensure_k(HCFGrammar *g, size_t k) -{ - if (k <= g->kmax) { - return; - } - // NB: we don't actually use first/follow[0] but allocate it anyway - // so indices of the array correspond neatly to values of k - - // allocate the new arrays - HHashTable **first = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); - HHashTable **follow = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); - - if (g->kmax > 0) { - // we are resizing, copy the old tables over - for(size_t i=0; i<=g->kmax; i++) { - first[i] = g->first[i]; - follow[i] = g->follow[i]; - } - } else { - // we are initializing, allocate the first (in fact, dummy) tables - first[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - follow[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - } - - // allocate the new tables - for(size_t i=g->kmax+1; i<=k; i++) { - first[i] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - follow[i] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - } - - g->first = first; - g->follow = follow; - g->kmax = k; -} - bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol) { // XXX this can now also be implemented in terms of h_first: @@ -232,6 +218,76 @@ static void collect_geneps(HCFGrammar *g) } while(g->geneps->used != prevused); } +static bool mentions_symbol(HCFChoice **s, const HCFChoice *x) +{ + for(; *s; s++) { + if (*s == x) + return true; + } + return false; +} + +static void remove_productions_with(HCFGrammar *g, const HCFChoice *x) +{ + HHashTableEntry *hte; + const HCFChoice *symbol; + size_t 
i; + + for(i=0; i < g->nts->capacity; i++) { + for(hte = &g->nts->contents[i]; hte; hte = hte->next) { + if (hte->key == NULL) + continue; + symbol = hte->key; + assert(symbol->type == HCF_CHOICE); + + HCFSequence **p, **q; + for(p = symbol->seq; *p != NULL; ) { + if (mentions_symbol((*p)->items, x)) { + // remove production p + for(q=p; *(q+1) != NULL; q++); // q = last production + *p = *q; // move q over p + *q = NULL; // delete old q + } else { + p++; + } + } + } + } +} + +static void eliminate_dead_rules(HCFGrammar *g) +{ + HHashTableEntry *hte; + const HCFChoice *symbol; + size_t i; + bool found; + + do { + found = false; + for(i=0; !found && i < g->nts->capacity; i++) { + for(hte = &g->nts->contents[i]; !found && hte; hte = hte->next) { + if (hte->key == NULL) + continue; + symbol = hte->key; + assert(symbol->type == HCF_CHOICE); + + // this NT is dead if it has no productions + if (*symbol->seq == NULL) + found = true; + } + } + if (found) { + h_hashtable_del(g->nts, symbol); + remove_productions_with(g, symbol); + } + } while(found); // until nothing left to remove + + // rebuild g->nts. there may now be symbols that no longer appear in any + // productions. we also might have removed g->start. 
+ g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + collect_nts(g, g->start); +} + HStringMap *h_stringmap_new(HArena *a) { @@ -240,6 +296,7 @@ HStringMap *h_stringmap_new(HArena *a) m->end_branch = NULL; m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr); m->arena = a; + m->taint = false; return m; } @@ -396,30 +453,65 @@ bool h_stringmap_empty(const HStringMap *m) && h_hashtable_empty(m->char_branches)); } -const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +static bool eq_stringmap(const void *a, const void *b) +{ + return h_stringmap_equal(a, b); +} + +bool h_stringmap_equal(const HStringMap *a, const HStringMap *b) +{ + if (a->epsilon_branch != b->epsilon_branch) + return false; + if (a->end_branch != b->end_branch) + return false; + return h_hashtable_equal(a->char_branches, b->char_branches, eq_stringmap); +} + +// helper for h_follow and h_first +bool workset_equal(HHashTable *a, HHashTable *b) { + if (a == NULL || b == NULL) + return (a == b); + else + return h_hashtable_equal(a, b, eq_stringmap); +} + +static const HStringMap * +h_first_seq_work(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s); + +static const HStringMap * +h_first_work(size_t k, HCFGrammar *g, HHashTable **pws, const HCFChoice *x) +{ + HHashTable *ws = *pws; HStringMap *ret; HCFSequence **p; uint8_t c; + struct k_nt kx = {k,x}; + struct k_nt *pkx = NULL; + bool taint = false; // shortcut: first_0(X) is always {""} if (k==0) { return g->singleton_epsilon; } - // memoize via g->first - ensure_k(g, k); - ret = h_hashtable_get(g->first[k], x); + // shortcut: first_k($) is always {$} + if (x->type == HCF_END) { + return g->singleton_end; + } + + // check memoization and workset + ret = h_hashtable_get(g->first, &kx); + if (ret == NULL && ws != NULL) + ret = h_hashtable_get(ws, &kx); if (ret != NULL) { return ret; } + + // not found, create result ret = h_stringmap_new(g->arena); assert(ret != NULL); - h_hashtable_put(g->first[k], x, ret); 
switch(x->type) { - case HCF_END: - h_stringmap_put_end(ret, INSET); - break; case HCF_CHAR: h_stringmap_put_char(ret, x->chr, INSET); break; @@ -433,30 +525,75 @@ const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) break; case HCF_CHOICE: // this is a nonterminal + + // to avoid recursive loops, taint ret and place it in workset + ret->taint = true; + if (ws == NULL) + ws = *pws = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + pkx = h_arena_malloc(g->arena, sizeof kx); + *pkx = kx; + h_hashtable_put(ws, pkx, ret); + // return the union of the first sets of all productions - for(p=x->seq; *p; ++p) - h_stringmap_update(ret, h_first_seq(k, g, (*p)->items)); + for(p=x->seq; *p; ++p) { + const HStringMap *first_rhs = h_first_seq_work(k, g, pws, (*p)->items); + assert(ws == *pws); // call above did not change the workset pointer + taint |= first_rhs->taint; + h_stringmap_update(ret, first_rhs); + } break; default: // should not be reached - assert_message(0, "unknown HCFChoice type"); + assert_message(0, "unexpected HCFChoice type"); + } + + // immediately memoize ret and remove it from ws if untainted by recursion + if (!taint) { + if (pkx == NULL) { + pkx = h_arena_malloc(g->arena, sizeof kx); + *pkx = kx; + } else if (ws != NULL) { + // we already had a key, so ret might (will) be in ws; remove it. 
+ h_hashtable_del(ws, pkx); + } + ret->taint = false; + h_hashtable_put(g->first, pkx, ret); } return ret; } +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +{ + HHashTable *ws, *bak; + const HStringMap *ret; + + // fixpoint iteration on workset + ws = NULL; + do { + bak = ws; + ws = NULL; + ret = h_first_work(k, g, &ws, x); + } while(!workset_equal(ws, bak)); + + assert(ret != NULL); + return ret; +} + // helpers for h_first_seq, definitions below static bool is_singleton_epsilon(const HStringMap *m); static bool any_string_shorter(size_t k, const HStringMap *m); // pointer to functions like h_first_seq -typedef const HStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); +typedef const HStringMap * + (*StringSetFun)(size_t, HCFGrammar *, HHashTable **, HCFChoice **); // helper for h_first_seq and h_follow -static void stringset_extend(HCFGrammar *g, HStringMap *ret, +static bool stringset_extend(HCFGrammar *g, HHashTable **pws, HStringMap *ret, size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail); -const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +static const HStringMap * +h_first_seq_work(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) { // shortcut: the first set of the empty sequence, for any k, is {""} if (*s == NULL) { @@ -467,11 +604,11 @@ const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HCFChoice *x = s[0]; HCFChoice **tail = s+1; - const HStringMap *first_x = h_first(k, g, x); + const HStringMap *first_x = h_first_work(k, g, pws, x); // shortcut: if first_k(X) = {""}, just return first_k(tail) if (is_singleton_epsilon(first_x)) { - return h_first_seq(k, g, tail); + return h_first_seq_work(k, g, pws, tail); } // shortcut: if no elements of first_k(X) have length <k, just return first_k(X) @@ -483,8 +620,25 @@ const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HStringMap *ret = h_stringmap_new(g->arena); // extend the elements of first_k(X) 
up to length k from tail - stringset_extend(g, ret, k, first_x, h_first_seq, tail); + ret->taint = stringset_extend(g, pws, ret, k, first_x, h_first_seq_work, tail); + + return ret; +} + +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +{ + HHashTable *ws, *bak; + const HStringMap *ret; + // fixpoint iteration on workset + ws = NULL; + do { + bak = ws; + ws = NULL; + ret = h_first_seq_work(k, g, &ws, s); + } while(!workset_equal(ws, bak)); + + assert(ret != NULL); return ret; } @@ -546,13 +700,25 @@ static void remove_all_shorter(size_t k, HStringMap *m) } // h_follow adapted to the signature of StringSetFun -static inline -const HStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) +static const HStringMap * +h_follow_(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) { + assert(pws == NULL); return h_follow(k, g, *s); } -const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) +static const HStringMap * +h_follow_work(size_t k, HCFGrammar *g, HHashTable **pws, const HCFChoice *x); + +// h_follow_work adapted to the signature of StringSetFun +static const HStringMap * +h_follow_work_(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) +{ + return h_follow_work(k, g, pws, *s); +} + +static const HStringMap * +h_follow_work(size_t k, HCFGrammar *g, HHashTable **pws, const HCFChoice *x) { // consider all occurances of X in g // the follow set of X is the union of: @@ -564,28 +730,45 @@ const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } HStringMap *ret; + HHashTable *ws = *pws; + struct k_nt kx = {k,x}; + struct k_nt *pkx; + bool taint = false; // shortcut: follow_0(X) is always {""} if (k==0) { return g->singleton_epsilon; } - // memoize via g->follow - ensure_k(g, k); - ret = h_hashtable_get(g->follow[k], x); + + // check memoization and workset + ret = h_hashtable_get(g->follow, &kx); + if (ret == NULL && ws != NULL) + ret = 
h_hashtable_get(ws, &kx); if (ret != NULL) { return ret; } + + // not found, create result ret = h_stringmap_new(g->arena); assert(ret != NULL); - h_hashtable_put(g->follow[k], x, ret); + + // to avoid recursive loops, taint ret and place it in workset + ret->taint = true; + if (ws == NULL) + ws = *pws = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + pkx = h_arena_malloc(g->arena, sizeof kx); + *pkx = kx; + h_hashtable_put(ws, pkx, ret); // if X is the start symbol, the end token is in its follow set if (x == g->start) { h_stringmap_put_end(ret, INSET); } - // iterate over g->nts + + // iterate over g->nts, looking for X size_t i; HHashTableEntry *hte; + int x_found=0; for (i=0; i < g->nts->capacity; i++) { for (hte = &g->nts->contents[i]; hte; hte = hte->next) { if (hte->key == NULL) { @@ -600,19 +783,46 @@ const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) HCFChoice **s = (*p)->items; // production's right-hand side for (; *s; s++) { - if (*s == x) { // occurance found + if (*s == x) { // occurrence found + x_found=1; HCFChoice **tail = s+1; const HStringMap *first_tail = h_first_seq(k, g, tail); // extend the elems of first_k(tail) up to length k from follow(A) - stringset_extend(g, ret, k, first_tail, h_follow_, &a); + taint |= stringset_extend(g, pws, ret, k, + first_tail, h_follow_work_, &a); } } } } } + assert(x_found || x == g->start); // no orphan non-terminals + + // immediately memoize ret and remove it from ws if untainted by recursion + if (!taint) { + ret->taint = false; + h_hashtable_del(ws, pkx); + h_hashtable_put(g->follow, pkx, ret); + } + + return ret; +} + +const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) +{ + HHashTable *ws, *bak; + const HStringMap *ret; + // fixpoint iteration on workset + ws = NULL; + do { + bak = ws; + ws = NULL; + ret = h_follow_work(k, g, &ws, x); + } while(!workset_equal(ws, bak)); + + assert(ret != NULL); return ret; } @@ -629,7 +839,7 @@ HStringMap *h_predict(size_t k, 
HCFGrammar *g, // casting the const off of A below. note: stringset_extend does // not touch this argument, only passes it through to h_follow // in this case, which accepts it, once again, as const. - stringset_extend(g, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); + stringset_extend(g, NULL, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); // make sure there are only strings of length _exactly_ k remove_all_shorter(k, ret); @@ -638,13 +848,17 @@ HStringMap *h_predict(size_t k, HCFGrammar *g, } // add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret -static void stringset_extend(HCFGrammar *g, HStringMap *ret, +static bool stringset_extend(HCFGrammar *g, HHashTable **pws, HStringMap *ret, size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail) { + bool taint = false; + if (as->epsilon_branch) { // for a="", add f_k(tail) to ret - h_stringmap_update(ret, f(k, g, tail)); + const HStringMap *f_tail = f(k, g, pws, tail); + taint |= f_tail->taint; + h_stringmap_update(ret, f_tail); } if (as->end_branch) { @@ -671,9 +885,11 @@ static void stringset_extend(HCFGrammar *g, HStringMap *ret, HStringMap *ret_ = h_stringmap_new(g->arena); h_stringmap_put_after(ret, c, ret_); - stringset_extend(g, ret_, k-1, as_, f, tail); + taint |= stringset_extend(g, pws, ret_, k-1, as_, f, tail); } } + + return taint; } @@ -818,13 +1034,15 @@ static void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, fputs(name, f); i += strlen(name); for(; i<column; i++) fputc(' ', f); - fputs(" ->", f); assert(nt->type == HCF_CHOICE); HCFSequence **p = nt->seq; if (*p == NULL) { - return; // shouldn't happen + fputs(" -x\n", f); // empty choice, e.g. 
h_nothing_p() + return; } + + fputs(" ->", f); pprint_sequence(f, g, *p++); // print first production on the same line for(; *p; p++) { // print the rest below with "or" bars for(i=0; i<column; i++) fputc(' ', f); // indent @@ -835,6 +1053,8 @@ static void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt, void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent) { + HAllocator *mm__ = g->mm__; + if (g->nts->used < 1) { return; } @@ -842,11 +1062,12 @@ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent) // determine maximum string length of symbol names int len; size_t s; - for(len=1, s=26; s < g->nts->used; len++, s*=26); + for(len=1, s=26; s < g->nts->used; len++, s*=26); - // iterate over g->nts + // iterate over g->nts and collect its entries in an ordered array size_t i; HHashTableEntry *hte; + const HCFChoice **arr = h_new(const HCFChoice *, g->nts->used); for(i=0; i < g->nts->capacity; i++) { for(hte = &g->nts->contents[i]; hte; hte = hte->next) { if (hte->key == NULL) { @@ -855,9 +1076,16 @@ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent) const HCFChoice *a = hte->key; // production's left-hand symbol assert(a->type == HCF_CHOICE); - pprint_ntrules(file, g, a, indent, len); + size_t id = (uintptr_t)hte->value; // nonterminal id + assert(id < g->nts->used); + arr[id] = a; } } + + // print rules in alphabetical order + for(i=0; i < g->nts->used; i++) + pprint_ntrules(file, g, arr[i], indent, len); + h_free(arr); } void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent) diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 2e8ba83cee5c152baae1177ed7b99d45cf11042c..8945ecb97d0adc1aa1f69391f54726a156c91211 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -8,15 +8,15 @@ typedef struct HCFGrammar_ { HHashSet *nts; // HCFChoices, each representing the alternative // productions for one nonterminal HHashSet *geneps; // set of NTs that can generate the empty string - 
HHashTable **first; // memoized first sets of the grammar's symbols - HHashTable **follow; // memoized follow sets of the grammar's NTs - size_t kmax; // maximum lookahead depth allocated + HHashTable *first; // memoized first sets of the grammar's symbols + HHashTable *follow; // memoized follow sets of the grammar's NTs HArena *arena; HAllocator *mm__; - // constant set containing only the empty string. - // this is only a member of HCFGrammar because it needs a pointer to arena. + // constant sets containing only the empty string or end symbol. + // these are only members of HCFGrammar because they need a pointer to arena. const struct HStringMap_ *singleton_epsilon; + const struct HStringMap_ *singleton_end; } HCFGrammar; @@ -37,6 +37,7 @@ typedef struct HStringMap_ { void *end_branch; // points to leaf value HHashTable *char_branches; // maps to inner nodes (HStringMaps) HArena *arena; + bool taint; // for use by h_follow() and h_first() } HStringMap; HStringMap *h_stringmap_new(HArena *a); @@ -52,6 +53,7 @@ void *h_stringmap_get_lookahead(const HStringMap *m, HInputStream lookahead); bool h_stringmap_present(const HStringMap *m, const uint8_t *str, size_t n, bool end); bool h_stringmap_present_epsilon(const HStringMap *m); bool h_stringmap_empty(const HStringMap *m); +bool h_stringmap_equal(const HStringMap *a, const HStringMap *b); static inline HStringMap *h_stringmap_get_char(const HStringMap *m, const uint8_t c) { return h_hashtable_get(m->char_branches, (void *)char_key(c)); } diff --git a/src/datastructures.c b/src/datastructures.c index 6971e0e0bee2fc8bbc644a0c6d9f9967d4ab6a01..8a09b5ce755e8880542f02c82b3b3e0db4f2fa48 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -387,16 +387,18 @@ static bool hte_same_length(HHashTableEntry *xs, HHashTableEntry *ys) { } // helper for hte_equal: are all elements of xs present in ys? 
-static bool hte_subset(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) +static bool hte_subset(HEqualFunc eq, HEqualFunc value_eq, + HHashTableEntry *xs, HHashTableEntry *ys) { for(; xs; xs=xs->next) { if(xs->key == NULL) continue; // element not present HHashTableEntry *hte; for(hte=ys; hte; hte=hte->next) { - if(hte->key == xs->key) break; // assume an element is equal to itself + // assume an element is equal to itself + if(hte->key == xs->key && hte->value == xs->value) break; if(hte->hashval != xs->hashval) continue; // shortcut - if(eq(hte->key, xs->key)) break; + if(eq(hte->key, xs->key) && value_eq(hte->value, xs->value)) break; } if(hte == NULL) return false; // element not found } @@ -404,19 +406,20 @@ static bool hte_subset(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) } // compare two lists of HHashTableEntries -static inline bool hte_equal(HEqualFunc eq, HHashTableEntry *xs, HHashTableEntry *ys) { - return (hte_same_length(xs, ys) && hte_subset(eq, xs, ys)); +static inline bool hte_equal(HEqualFunc eq, HEqualFunc value_eq, + HHashTableEntry *xs, HHashTableEntry *ys) { + return (hte_same_length(xs, ys) && hte_subset(eq, value_eq, xs, ys)); } -/* Set equality of HHashSets. +/* Equality of HHashTables. * Obviously, 'a' and 'b' must use the same equality function. * Not strictly necessary, but we also assume the same hash function. 
*/ -bool h_hashset_equal(const HHashSet *a, const HHashSet *b) { +bool h_hashtable_equal(const HHashSet *a, const HHashSet *b, HEqualFunc value_eq) { if(a->capacity == b->capacity) { // iterate over the buckets in parallel for(size_t i=0; i < a->capacity; i++) { - if(!hte_equal(a->equalFunc, &a->contents[i], &b->contents[i])) + if(!hte_equal(a->equalFunc, value_eq, &a->contents[i], &b->contents[i])) return false; } } else { @@ -426,6 +429,18 @@ bool h_hashset_equal(const HHashSet *a, const HHashSet *b) { return true; } +static bool eq_dontcare(const void *p, const void *q) { + return true; +} + +/* Set equality of HHashSets. + * Obviously, 'a' and 'b' must use the same equality function. + * Not strictly necessary, but we also assume the same hash function. + */ +bool h_hashset_equal(const HHashSet *a, const HHashSet *b) { + return h_hashtable_equal(a, b, eq_dontcare); +} + bool h_eq_ptr(const void *p, const void *q) { return (p==q); } diff --git a/src/glue.c b/src/glue.c index 1df8173195c7090f3ca861f7263e15f84522766b..79e106c8bd902d6fe13ac485781222d202ea9dcc 100644 --- a/src/glue.c +++ b/src/glue.c @@ -121,6 +121,20 @@ HParsedToken *h_make_uint(HArena *arena, uint64_t val) return ret; } +HParsedToken *h_make_double(HArena *arena, double val) +{ + HParsedToken *ret = h_make_(arena, TT_DOUBLE); + ret->dbl = val; + return ret; +} + +HParsedToken *h_make_float(HArena *arena, float val) +{ + HParsedToken *ret = h_make_(arena, TT_FLOAT); + ret->flt = val; + return ret; +} + // XXX -> internal HParsedToken *h_carray_index(const HCountedArray *a, size_t i) { diff --git a/src/glue.h b/src/glue.h index 1de285823b3f2ae6fd956c04c9bc81d92445719a..08e5255ab2572d5d828943cd98331081ab02fc02 100644 --- a/src/glue.h +++ b/src/glue.h @@ -198,6 +198,8 @@ HParsedToken *h_make_seqn(HArena *arena, size_t n); // Makes empty sequence of HParsedToken *h_make_bytes(HArena *arena, const uint8_t *array, size_t len); HParsedToken *h_make_sint(HArena *arena, int64_t val); HParsedToken 
*h_make_uint(HArena *arena, uint64_t val); +HParsedToken *h_make_double(HArena *arena, double val); +HParsedToken *h_make_float(HArena *arena, float val); // Standard short-hands to make tokens in an action. #define H_MAKE(TYP, VAL) h_make(p->arena, (HTokenType)TT_ ## TYP, VAL) @@ -206,6 +208,8 @@ HParsedToken *h_make_uint(HArena *arena, uint64_t val); #define H_MAKE_BYTES(VAL, LEN) h_make_bytes(p->arena, VAL, LEN) #define H_MAKE_SINT(VAL) h_make_sint(p->arena, VAL) #define H_MAKE_UINT(VAL) h_make_uint(p->arena, VAL) +#define H_MAKE_DOUBLE(VAL) h_make_double(p->arena, VAL) +#define H_MAKE_FLOAT(VAL) h_make_float(p->arena, VAL) // Extract (cast) type-specific value back from HParsedTokens... @@ -218,6 +222,8 @@ HParsedToken *h_make_uint(HArena *arena, uint64_t val); #define H_ASSERT_BYTES(TOK) h_assert_type(TT_BYTES, TOK) #define H_ASSERT_SINT(TOK) h_assert_type(TT_SINT, TOK) #define H_ASSERT_UINT(TOK) h_assert_type(TT_UINT, TOK) +#define H_ASSERT_DOUBLE(TOK) h_assert_type(TT_DOUBLE, TOK) +#define H_ASSERT_FLOAT(TOK) h_assert_type(TT_FLOAT, TOK) // Assert expected type and return contained value. #define H_CAST(TYP, TOK) ((TYP *) H_ASSERT(TYP, TOK)->user) @@ -225,6 +231,8 @@ HParsedToken *h_make_uint(HArena *arena, uint64_t val); #define H_CAST_BYTES(TOK) (H_ASSERT_BYTES(TOK)->bytes) #define H_CAST_SINT(TOK) (H_ASSERT_SINT(TOK)->sint) #define H_CAST_UINT(TOK) (H_ASSERT_UINT(TOK)->uint) +#define H_CAST_DOUBLE(TOK) (H_ASSERT_DOUBLE(TOK)->dbl) +#define H_CAST_FLOAT(TOK) (H_ASSERT_FLOAT(TOK)->flt) // Sequence access... @@ -247,6 +255,8 @@ HParsedToken *h_seq_index_vpath(const HParsedToken *p, size_t i, va_list va); #define H_INDEX_BYTES(SEQ, ...) H_CAST_BYTES(H_INDEX_TOKEN(SEQ, __VA_ARGS__)) #define H_INDEX_SINT(SEQ, ...) H_CAST_SINT(H_INDEX_TOKEN(SEQ, __VA_ARGS__)) #define H_INDEX_UINT(SEQ, ...) H_CAST_UINT(H_INDEX_TOKEN(SEQ, __VA_ARGS__)) +#define H_INDEX_DOUBLE(SEQ, ...) H_CAST_DOUBLE(H_INDEX_TOKEN(SEQ, __VA_ARGS__)) +#define H_INDEX_FLOAT(SEQ, ...) 
H_CAST_FLOAT(H_INDEX_TOKEN(SEQ, __VA_ARGS__)) #define H_INDEX_TOKEN(SEQ, ...) h_seq_index_path(H_ASSERT_SEQ(SEQ), __VA_ARGS__, -1) // Standard short-hand to access and cast elements on a sequence token. @@ -255,6 +265,8 @@ HParsedToken *h_seq_index_vpath(const HParsedToken *p, size_t i, va_list va); #define H_FIELD_BYTES(...) H_INDEX_BYTES(p->ast, __VA_ARGS__) #define H_FIELD_SINT(...) H_INDEX_SINT(p->ast, __VA_ARGS__) #define H_FIELD_UINT(...) H_INDEX_UINT(p->ast, __VA_ARGS__) +#define H_FIELD_DOUBLE(...) H_INDEX_DOUBLE(p->ast, __VA_ARGS__) +#define H_FIELD_FLOAT(...) H_INDEX_FLOAT(p->ast, __VA_ARGS__) #define H_FIELD_TOKEN(...) H_INDEX_TOKEN(p->ast, __VA_ARGS__) // Lower-level helper for h_seq_index. diff --git a/src/hammer.h b/src/hammer.h index 6cd2660d3cfd29a9b4d34e1e054d2613ca4260a2..787af0b254a969226eeda985fc75d470796bd7cc 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -56,6 +56,8 @@ typedef enum HTokenType_ { TT_BYTES = 2, TT_SINT = 4, TT_UINT = 8, + TT_DOUBLE = 12, + TT_FLOAT = 13, TT_SEQUENCE = 16, TT_RESERVED_1, // reserved for backend-specific internal use TT_ERR = 32, diff --git a/src/internal.h b/src/internal.h index 07420681275a989925a08f6c596e3bc4a59202c1..f25d18ba4d1f42df96f77f79b53115be302c0490 100644 --- a/src/internal.h +++ b/src/internal.h @@ -382,6 +382,7 @@ int h_hashtable_present(const HHashTable *ht, const void *key); void h_hashtable_del(HHashTable *ht, const void *key); void h_hashtable_free(HHashTable *ht); static inline bool h_hashtable_empty(const HHashTable *ht) { return (ht->used == 0); } +bool h_hashtable_equal(const HHashTable *a, const HHashTable *b, HEqualFunc value_eq); typedef HHashTable HHashSet; #define h_hashset_new(a,eq,hash) h_hashtable_new(a,eq,hash) diff --git a/src/pprint.c b/src/pprint.c index 145bf5237ae98e7db240aa1540bf8b242801edd1..5f6e1e2c5a6d98869be764218cc2e4f191c0e669 100644 --- a/src/pprint.c +++ b/src/pprint.c @@ -63,6 +63,12 @@ void h_pprint(FILE* stream, const HParsedToken* tok, int indent, int delta) { 
case TT_UINT: fprintf(stream, "%" PRIu64, tok->uint); break; + case TT_DOUBLE: + fprintf(stream, "%f", tok->dbl); + break; + case TT_FLOAT: + fprintf(stream, "%f", (double)tok->flt); + break; case TT_SEQUENCE: if (tok->seq->used == 0) fprintf(stream, "[ ]"); @@ -183,6 +189,12 @@ static void unamb_sub(const HParsedToken* tok, struct result_buf *buf) { case TT_UINT: h_append_buf_formatted(buf, "u%#" PRIx64, tok->uint); break; + case TT_DOUBLE: + h_append_buf_formatted(buf, "d%a", tok->dbl); + break; + case TT_FLOAT: + h_append_buf_formatted(buf, "f%a", (double)tok->flt); + break; case TT_ERR: h_append_buf(buf, "ERR", 3); break; diff --git a/src/registry.c b/src/registry.c index 5486fd7bdb8022c65a296205b0dfd562a20a0572..15cf41a6c6eeb1ec8b9849b06a1e1ead2a5d204a 100644 --- a/src/registry.c +++ b/src/registry.c @@ -15,7 +15,6 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ -#include <search.h> #include <stdlib.h> #include "hammer.h" #include "internal.h" diff --git a/src/t_parser.c b/src/t_parser.c index 2d933ef1d3a025fd15fa8a1e247dc8ced3ba63ea..356c38f1674d6d3f90e3b0da672646455437f7a2 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -129,6 +129,29 @@ static void test_uint8(gconstpointer backend) { } //@MARK_END +// XXX implement h_double() and h_float(). these just test the pretty-printer... 
+static HParsedToken *act_double(const HParseResult *p, void *u) { + return H_MAKE_DOUBLE((double)H_FIELD_UINT(0) + (double)H_FIELD_UINT(1)/10); +} +static void test_double(gconstpointer backend) { + HParser *b = h_uint8(); + HParser *dbl = h_action(h_sequence(b, b, NULL), act_double, NULL); + uint8_t input[] = {4,2}; + + g_check_parse_match(dbl, (HParserBackend)GPOINTER_TO_INT(backend), input, 2, "d0x1.0cccccccccccdp+2"); +} + +static HParsedToken *act_float(const HParseResult *p, void *u) { + return H_MAKE_FLOAT((float)H_FIELD_UINT(0) + (float)H_FIELD_UINT(1)/10); +} +static void test_float(gconstpointer backend) { + HParser *b = h_uint8(); + HParser *flt = h_action(h_sequence(b, b, NULL), act_float, NULL); + uint8_t input[] = {4,2}; + + g_check_parse_match(flt, (HParserBackend)GPOINTER_TO_INT(backend), input, 2, "f0x1.0cccccp+2"); +} + static void test_int_range(gconstpointer backend) { const HParser *int_range_ = h_int_range(h_uint8(), 3, 10); @@ -873,10 +896,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/packrat/uint16", GINT_TO_POINTER(PB_PACKRAT), test_uint16); g_test_add_data_func("/core/parser/packrat/uint8", GINT_TO_POINTER(PB_PACKRAT), test_uint8); g_test_add_data_func("/core/parser/packrat/int_range", GINT_TO_POINTER(PB_PACKRAT), test_int_range); -#if 0 - g_test_add_data_func("/core/parser/packrat/float64", GINT_TO_POINTER(PB_PACKRAT), test_float64); - g_test_add_data_func("/core/parser/packrat/float32", GINT_TO_POINTER(PB_PACKRAT), test_float32); -#endif + g_test_add_data_func("/core/parser/packrat/double", GINT_TO_POINTER(PB_PACKRAT), test_double); + g_test_add_data_func("/core/parser/packrat/float", GINT_TO_POINTER(PB_PACKRAT), test_float); g_test_add_data_func("/core/parser/packrat/whitespace", GINT_TO_POINTER(PB_PACKRAT), test_whitespace); g_test_add_data_func("/core/parser/packrat/left", GINT_TO_POINTER(PB_PACKRAT), test_left); g_test_add_data_func("/core/parser/packrat/right", GINT_TO_POINTER(PB_PACKRAT), test_right); 
@@ -931,10 +952,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/llk/uint16", GINT_TO_POINTER(PB_LLk), test_uint16); g_test_add_data_func("/core/parser/llk/uint8", GINT_TO_POINTER(PB_LLk), test_uint8); g_test_add_data_func("/core/parser/llk/int_range", GINT_TO_POINTER(PB_LLk), test_int_range); -#if 0 - g_test_add_data_func("/core/parser/llk/float64", GINT_TO_POINTER(PB_LLk), test_float64); - g_test_add_data_func("/core/parser/llk/float32", GINT_TO_POINTER(PB_LLk), test_float32); -#endif + g_test_add_data_func("/core/parser/llk/double", GINT_TO_POINTER(PB_LLk), test_double); + g_test_add_data_func("/core/parser/llk/float", GINT_TO_POINTER(PB_LLk), test_float); g_test_add_data_func("/core/parser/llk/whitespace", GINT_TO_POINTER(PB_LLk), test_whitespace); g_test_add_data_func("/core/parser/llk/left", GINT_TO_POINTER(PB_LLk), test_left); g_test_add_data_func("/core/parser/llk/right", GINT_TO_POINTER(PB_LLk), test_right); @@ -977,11 +996,9 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/uint32", GINT_TO_POINTER(PB_REGULAR), test_uint32); g_test_add_data_func("/core/parser/regex/uint16", GINT_TO_POINTER(PB_REGULAR), test_uint16); g_test_add_data_func("/core/parser/regex/uint8", GINT_TO_POINTER(PB_REGULAR), test_uint8); -#if 0 - g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range); - g_test_add_data_func("/core/parser/regex/float64", GINT_TO_POINTER(PB_REGULAR), test_float64); - g_test_add_data_func("/core/parser/regex/float32", GINT_TO_POINTER(PB_REGULAR), test_float32); -#endif + //g_test_add_data_func("/core/parser/regex/int_range", GINT_TO_POINTER(PB_REGULAR), test_int_range); + g_test_add_data_func("/core/parser/regex/double", GINT_TO_POINTER(PB_REGULAR), test_double); + g_test_add_data_func("/core/parser/regex/float", GINT_TO_POINTER(PB_REGULAR), test_float); g_test_add_data_func("/core/parser/regex/whitespace", GINT_TO_POINTER(PB_REGULAR), test_whitespace); 
g_test_add_data_func("/core/parser/regex/left", GINT_TO_POINTER(PB_REGULAR), test_left); g_test_add_data_func("/core/parser/regex/right", GINT_TO_POINTER(PB_REGULAR), test_right); @@ -1020,10 +1037,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/lalr/uint16", GINT_TO_POINTER(PB_LALR), test_uint16); g_test_add_data_func("/core/parser/lalr/uint8", GINT_TO_POINTER(PB_LALR), test_uint8); g_test_add_data_func("/core/parser/lalr/int_range", GINT_TO_POINTER(PB_LALR), test_int_range); -#if 0 - g_test_add_data_func("/core/parser/lalr/float64", GINT_TO_POINTER(PB_LALR), test_float64); - g_test_add_data_func("/core/parser/lalr/float32", GINT_TO_POINTER(PB_LALR), test_float32); -#endif + g_test_add_data_func("/core/parser/lalr/double", GINT_TO_POINTER(PB_LALR), test_double); + g_test_add_data_func("/core/parser/lalr/float", GINT_TO_POINTER(PB_LALR), test_float); g_test_add_data_func("/core/parser/lalr/whitespace", GINT_TO_POINTER(PB_LALR), test_whitespace); g_test_add_data_func("/core/parser/lalr/left", GINT_TO_POINTER(PB_LALR), test_left); g_test_add_data_func("/core/parser/lalr/right", GINT_TO_POINTER(PB_LALR), test_right); @@ -1068,10 +1083,8 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/glr/uint16", GINT_TO_POINTER(PB_GLR), test_uint16); g_test_add_data_func("/core/parser/glr/uint8", GINT_TO_POINTER(PB_GLR), test_uint8); g_test_add_data_func("/core/parser/glr/int_range", GINT_TO_POINTER(PB_GLR), test_int_range); -#if 0 - g_test_add_data_func("/core/parser/glr/float64", GINT_TO_POINTER(PB_GLR), test_float64); - g_test_add_data_func("/core/parser/glr/float32", GINT_TO_POINTER(PB_GLR), test_float32); -#endif + g_test_add_data_func("/core/parser/glr/double", GINT_TO_POINTER(PB_GLR), test_double); + g_test_add_data_func("/core/parser/glr/float", GINT_TO_POINTER(PB_GLR), test_float); g_test_add_data_func("/core/parser/glr/whitespace", GINT_TO_POINTER(PB_GLR), test_whitespace); g_test_add_data_func("/core/parser/glr/left", 
GINT_TO_POINTER(PB_GLR), test_left); g_test_add_data_func("/core/parser/glr/right", GINT_TO_POINTER(PB_GLR), test_right); diff --git a/src/t_regression.c b/src/t_regression.c index 587c268be1a7d5d6d7726d24292d9e1daee24145..2c28b99efe6a36e69e5831044dadcbcc381f4d18 100644 --- a/src/t_regression.c +++ b/src/t_regression.c @@ -394,6 +394,83 @@ static void test_issue87() { g_check_cmp_int(r, ==, -2); } +static void test_issue92() { + HParser *a = h_ch('a'); + HParser *b = h_ch('b'); + + HParser *str_a = h_indirect(); + HParser *str_b = h_choice(h_sequence(b, str_a, NULL), str_a, NULL); + //h_sequence(h_optional(b), str_a, NULL); // this works + HParser *str_a_ = h_optional(h_sequence(a, str_b, NULL)); + HParser *str = str_a; + h_bind_indirect(str_a, str_a_); + /* + * grammar generated from the above: + * + * A -> B -- "augmented" with a fresh start symbol + * B -> C -- B = str_a + * | "" + * C -> "a" D -- C = h_sequence(a, str_b) + * D -> E -- D = str_b + * | B + * E -> "b" B -- E = h_sequence(b, str_a) + * + * transformed to the following "enhanced grammar": + * + * S -> 0B3 + * 0B3 -> 0C2 + * | "" + * 1B4 -> 1C2 + * | "" + * 6B8 -> 6C2 + * | "" (*) here + * 0C2 -> "a" 1D7 + * 1C2 -> "a" 1D7 + * 6C2 -> "a" 1D7 + * 1D7 -> 1E5 + * | 1B4 + * 1E5 -> "b" 6B8 + */ + + /* + * the following call would cause an assertion failure. + * + * assertion "!h_stringmap_empty(fs)" failed: file + * "src/backends/lalr.c", line 341, function "h_lalr_compile" + * + * the bug happens when trying to compute h_follow() for 6B8 in state 6, + * production "" (*). intermediate results could end up in the memoization + * table and be treated as final by later calls to h_follow(). the problem + * could appear or not depending on the order of nonterminals (i.e. pointers) + * in a hashtable. 
+ */ + int r = h_compile(str, PB_LALR, NULL); + g_check_cmp_int(r, ==, 0); +} + +static void test_issue83() { + HParser *p = h_sequence(h_sequence(NULL, NULL), h_nothing_p(), NULL); + /* + * A -> B + * B -> C D + * C -> "" + * D -x + * + * (S) -> 0B1 + * 0B1 -> 0C2 2D3 + * 0C2 -> "" (*) h_follow() + * 2D3 -x + */ + + /* + * similar to issue 92, this would cause the same assertion failure, but for + * a different reason. the follow set of 0C2 above is equal to the first set + * of 2D3, but 2D3 is an empty choice. The first set of an empty choice + * is legitimately empty. the assertion in h_lalr_compile() missed this case. + */ + int r = h_compile(p, PB_LALR, NULL); + g_check_cmp_int(r, ==, 0); +} void register_regression_tests(void) { g_test_add_func("/core/regression/bug118", test_bug118); @@ -409,4 +486,6 @@ void register_regression_tests(void) { //XXX g_test_add_func("/core/regression/ast_length_index", test_ast_length_index); g_test_add_func("/core/regression/issue91", test_issue91); g_test_add_func("/core/regression/issue87", test_issue87); + g_test_add_func("/core/regression/issue92", test_issue92); + g_test_add_func("/core/regression/issue83", test_issue83); } diff --git a/src/tsearch.h b/src/tsearch.h index 7b297db7c7ea425f350f0d2c3350d55a630fb97b..7ba71d97502ca34908284560d0c156d2d29d9c37 100644 --- a/src/tsearch.h +++ b/src/tsearch.h @@ -20,6 +20,7 @@ void *tfind(const void *vkey, void * const *vrootp, int (*compar)(const void *, const void *)); #else +#define _POSIX_C_SOURCE 200809L #include <search.h> #endif