From ad7eb52bf8b8b8db311ce3dcd0b8fabfbe8bfccd Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Fri, 21 Feb 2020 18:49:51 +0100 Subject: [PATCH] track tainted entries in an explicit workset --- src/cfgrammar.c | 313 ++++++++++++++++++++++++++---------------------- src/cfgrammar.h | 10 +- 2 files changed, 172 insertions(+), 151 deletions(-) diff --git a/src/cfgrammar.c b/src/cfgrammar.c index 1e43b79..369e49b 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -6,10 +6,25 @@ #include <ctype.h> +// type of pairs used as memoization keys by h_follow and h_first +struct k_nt {size_t k; const HCFChoice *nt;}; + // a special map value for use when the map is used to represent a set static void * const INSET = (void *)(uintptr_t)1; +static bool eq_k_nt(const void *p, const void *q) +{ + const struct k_nt *a=p, *b=q; + return a->k == b->k && a->nt == b->nt; +} + +static HHashValue hash_k_nt(const void *p) +{ + const struct k_nt *x = p; + return h_hash_ptr(x->nt) * x->k; +} + HCFGrammar *h_cfgrammar_new(HAllocator *mm__) { HCFGrammar *g = h_new(HCFGrammar, 1); @@ -20,14 +35,17 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); g->start = NULL; g->geneps = NULL; - g->first = NULL; - g->follow = NULL; - g->kmax = 0; // will be increased as needed by ensure_k + g->first = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + g->follow = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); HStringMap *eps = h_stringmap_new(g->arena); h_stringmap_put_epsilon(eps, INSET); g->singleton_epsilon = eps; + HStringMap *end = h_stringmap_new(g->arena); + h_stringmap_put_end(end, INSET); + g->singleton_end = end; + return g; } @@ -128,42 +146,6 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol) } } -/* Increase g->kmax if needed, allocating enough first/follow slots. 
*/ -static void ensure_k(HCFGrammar *g, size_t k) -{ - if (k <= g->kmax) { - return; - } - // NB: we don't actually use first/follow[0] but allocate it anyway - // so indices of the array correspond neatly to values of k - - // allocate the new arrays - HHashTable **first = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); - HHashTable **follow = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); - - if (g->kmax > 0) { - // we are resizing, copy the old tables over - for(size_t i=0; i<=g->kmax; i++) { - first[i] = g->first[i]; - follow[i] = g->follow[i]; - } - } else { - // we are initializing, allocate the first (in fact, dummy) tables - first[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - follow[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - } - - // allocate the new tables - for(size_t i=g->kmax+1; i<=k; i++) { - first[i] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - follow[i] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - } - - g->first = first; - g->follow = follow; - g->kmax = k; -} - bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol) { // XXX this can now also be implemented in terms of h_first: @@ -397,56 +379,42 @@ bool h_stringmap_empty(const HStringMap *m) && h_hashtable_empty(m->char_branches)); } -// helper: remove all tainted stringmap entries from a hashtable -void remove_tainted(HHashTable *ht) -{ - HHashTableEntry *hte; - HStringMap *s; - - for (size_t i=0; i < ht->capacity; i++) { - for (hte = &ht->contents[i]; hte; hte = hte->next) { - if (hte->key == NULL) - continue; - s = hte->value; - assert(s != NULL); - - if (s->taint) { - // remove the entry - if (hte->next != NULL) - *hte = *hte->next; - else { - hte->key = hte->value = NULL; - hte->hashval = 0; - } - } - } - } -} +static const HStringMap * +h_first_seq_work(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s); -const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +static const HStringMap * +h_first_work(size_t k, 
HCFGrammar *g, HHashTable **pws, const HCFChoice *x) { + HHashTable *ws = *pws; HStringMap *ret; HCFSequence **p; uint8_t c; + struct k_nt kx = {k,x}; + struct k_nt *pkx = NULL; + bool taint = false; // shortcut: first_0(X) is always {""} if (k==0) { return g->singleton_epsilon; } - // memoize via g->first - ensure_k(g, k); - ret = h_hashtable_get(g->first[k], x); + // shortcut: first_k($) is always {$} + if (x->type == HCF_END) { + return g->singleton_end; + } + + // check memoization and workset + ret = h_hashtable_get(g->first, &kx); + if (ret == NULL && ws != NULL) + ret = h_hashtable_get(ws, &kx); if (ret != NULL) { return ret; } + + // not found, create result ret = h_stringmap_new(g->arena); assert(ret != NULL); - h_hashtable_put(g->first[k], x, ret); switch(x->type) { - case HCF_END: - h_stringmap_put_end(ret, INSET); - break; case HCF_CHAR: h_stringmap_put_char(ret, x->chr, INSET); break; @@ -460,30 +428,70 @@ const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) break; case HCF_CHOICE: // this is a nonterminal + + // to avoid recursive loops, taint ret and place it in workset + ret->taint = true; + if (ws == NULL) + ws = *pws = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + pkx = h_arena_malloc(g->arena, sizeof kx); + *pkx = kx; + h_hashtable_put(ws, pkx, ret); + // return the union of the first sets of all productions - for(p=x->seq; *p; ++p) - h_stringmap_update(ret, h_first_seq(k, g, (*p)->items)); + for(p=x->seq; *p; ++p) { + const HStringMap *first_rhs = h_first_seq_work(k, g, pws, (*p)->items); + assert(ws == *pws); // call above did not change the workset pointer + taint |= first_rhs->taint; + h_stringmap_update(ret, first_rhs); + } break; default: // should not be reached - assert_message(0, "unknown HCFChoice type"); + assert_message(0, "unexpected HCFChoice type"); + } + + // immediately memoize ret and remove it from ws if untainted by recursion + if (!taint) { + if (pkx == NULL) { + pkx = h_arena_malloc(g->arena, sizeof kx); 
+ *pkx = kx; + } else if (ws != NULL) { + // we already had a key, so ret might (will) be in ws; remove it. + h_hashtable_del(ws, pkx); + } + ret->taint = false; + h_hashtable_put(g->first, pkx, ret); } return ret; } +const HStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) +{ + HHashTable *ws = NULL; + const HStringMap *ret; + + // XXX perform fixpoint iteration on workset + ret = h_first_work(k, g, &ws, x); + assert(ret != NULL); + + return ret; +} + // helpers for h_first_seq, definitions below static bool is_singleton_epsilon(const HStringMap *m); static bool any_string_shorter(size_t k, const HStringMap *m); // pointer to functions like h_first_seq -typedef const HStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice **); +typedef const HStringMap * + (*StringSetFun)(size_t, HCFGrammar *, HHashTable **, HCFChoice **); // helper for h_first_seq and h_follow -static bool stringset_extend(HCFGrammar *g, HStringMap *ret, +static bool stringset_extend(HCFGrammar *g, HHashTable **pws, HStringMap *ret, size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail); -const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +static const HStringMap * +h_first_seq_work(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) { // shortcut: the first set of the empty sequence, for any k, is {""} if (*s == NULL) { @@ -494,11 +502,11 @@ const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HCFChoice *x = s[0]; HCFChoice **tail = s+1; - const HStringMap *first_x = h_first(k, g, x); + const HStringMap *first_x = h_first_work(k, g, pws, x); // shortcut: if first_k(X) = {""}, just return first_k(tail) if (is_singleton_epsilon(first_x)) { - return h_first_seq(k, g, tail); + return h_first_seq_work(k, g, pws, tail); } // shortcut: if no elements of first_k(X) have length <k, just return first_k(X) @@ -510,7 +518,19 @@ const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) HStringMap *ret = 
h_stringmap_new(g->arena); // extend the elements of first_k(X) up to length k from tail - stringset_extend(g, ret, k, first_x, h_first_seq, tail); + ret->taint = stringset_extend(g, pws, ret, k, first_x, h_first_seq_work, tail); + + return ret; +} + +const HStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) +{ + HHashTable *ws = NULL; + const HStringMap *ret; + + // XXX perform fixpoint iteration on workset + ret = h_first_seq_work(k, g, &ws, s); + assert(ret != NULL); return ret; } @@ -574,18 +594,65 @@ static void remove_all_shorter(size_t k, HStringMap *m) // h_follow adapted to the signature of StringSetFun static const HStringMap * -h_follow_(size_t k, HCFGrammar *g, HCFChoice **s) +h_follow_(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) { + assert(pws == NULL); return h_follow(k, g, *s); } -static const HStringMap *h_follow_rec(size_t k, HCFGrammar *g, HCFChoice **s); +static const HStringMap * +h_follow_work(size_t k, HCFGrammar *g, HHashTable **pws, const HCFChoice *x); -static bool -follow_work(size_t k, HCFGrammar *g, const HCFChoice *x, HStringMap *ret) +// h_follow_work adapted to the signature of StringSetFun +static const HStringMap * +h_follow_work_(size_t k, HCFGrammar *g, HHashTable **pws, HCFChoice **s) +{ + return h_follow_work(k, g, pws, *s); +} + +static const HStringMap * +h_follow_work(size_t k, HCFGrammar *g, HHashTable **pws, const HCFChoice *x) { + // consider all occurrences of X in g + // the follow set of X is the union of: + // {$} if X is the start symbol + // given a production "A -> alpha X tail": + // first_k(tail follow_k(A)) + + // first_k(tail follow_k(A)) = + // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } + + HStringMap *ret; + HHashTable *ws = *pws; + struct k_nt kx = {k,x}; + struct k_nt *pkx; bool taint = false; + // shortcut: follow_0(X) is always {""} + if (k==0) { + return g->singleton_epsilon; + } + + // check memoization and workset + ret = h_hashtable_get(g->follow, &kx); + if (ret
== NULL && ws != NULL) + ret = h_hashtable_get(ws, &kx); + if (ret != NULL) { + return ret; + } + + // not found, create result + ret = h_stringmap_new(g->arena); + assert(ret != NULL); + + // to avoid recursive loops, taint ret and place it in workset + ret->taint = true; + if (ws == NULL) + ws = *pws = h_hashtable_new(g->arena, eq_k_nt, hash_k_nt); + pkx = h_arena_malloc(g->arena, sizeof kx); + *pkx = kx; + h_hashtable_put(ws, pkx, ret); + // if X is the start symbol, the end token is in its follow set if (x == g->start) { h_stringmap_put_end(ret, INSET); @@ -616,7 +683,8 @@ follow_work(size_t k, HCFGrammar *g, const HCFChoice *x, HStringMap *ret) const HStringMap *first_tail = h_first_seq(k, g, tail); // extend the elems of first_k(tail) up to length k from follow(A) - taint |= stringset_extend(g, ret, k, first_tail, h_follow_rec, &a); + taint |= stringset_extend(g, pws, ret, k, + first_tail, h_follow_work_, &a); } } } @@ -624,70 +692,23 @@ follow_work(size_t k, HCFGrammar *g, const HCFChoice *x, HStringMap *ret) } assert(x_found || x == g->start); // no orphan non-terminals - return taint; -} - -// inner (recursion) variant of h_follow -static const HStringMap *h_follow_rec(size_t k, HCFGrammar *g, HCFChoice **s) -{ - HStringMap *ret; - HCFChoice *x = *s; - - // shortcut: follow_0(X) is always {""} - if (k==0) { - return g->singleton_epsilon; + // immediately memoize ret and remove it from ws if untainted by recursion + if (!taint) { + ret->taint = false; + h_hashtable_del(ws, pkx); + h_hashtable_put(g->follow, pkx, ret); } - // memoize via g->follow - assert(k <= g->kmax); - ret = h_hashtable_get(g->follow[k], x); - if (ret != NULL) { // return regardless of taint - return ret; - } - ret = h_stringmap_new(g->arena); - assert(ret != NULL); - h_hashtable_put(g->follow[k], x, ret); - - ret->taint = true; - ret->taint = follow_work(k, g, x, ret); return ret; } const HStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) { - // consider all occurances of 
X in g - // the follow set of X is the union of: - // {$} if X is the start symbol - // given a production "A -> alpha X tail": - // first_k(tail follow_k(A)) - - // first_k(tail follow_k(A)) = - // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } - - HStringMap *ret; - - // shortcut: follow_0(X) is always {""} - if (k==0) { - return g->singleton_epsilon; - } - - // memoize via g->follow - ensure_k(g, k); - ret = h_hashtable_get(g->follow[k], x); - if (ret != NULL && !ret->taint) { - return ret; - } - ret = h_stringmap_new(g->arena); - assert(ret != NULL); - h_hashtable_put(g->follow[k], x, ret); - - ret->taint = true; - follow_work(k, g, x, ret); - ret->taint = false; + HHashTable *ws = NULL; + const HStringMap *ret; - // finished - clear the temporaries (tainted entries) - for (size_t i=0; i <= g->kmax; i++) - remove_tainted(g->follow[i]); + // XXX perform fixpoint iteration on workset + ret = h_follow_work(k, g, &ws, x); return ret; } @@ -705,7 +726,7 @@ HStringMap *h_predict(size_t k, HCFGrammar *g, // casting the const off of A below. note: stringset_extend does // not touch this argument, only passes it through to h_follow // in this case, which accepts it, once again, as const. 
- stringset_extend(g, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); + stringset_extend(g, NULL, ret, k, first_rhs, h_follow_, (HCFChoice **)&A); // make sure there are only strings of length _exactly_ k remove_all_shorter(k, ret); @@ -714,7 +735,7 @@ HStringMap *h_predict(size_t k, HCFGrammar *g, } // add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret -static bool stringset_extend(HCFGrammar *g, HStringMap *ret, +static bool stringset_extend(HCFGrammar *g, HHashTable **pws, HStringMap *ret, size_t k, const HStringMap *as, StringSetFun f, HCFChoice **tail) { @@ -722,7 +743,7 @@ static bool stringset_extend(HCFGrammar *g, HStringMap *ret, if (as->epsilon_branch) { // for a="", add f_k(tail) to ret - const HStringMap *f_tail = f(k, g, tail); + const HStringMap *f_tail = f(k, g, pws, tail); taint |= f_tail->taint; h_stringmap_update(ret, f_tail); } @@ -751,7 +772,7 @@ static bool stringset_extend(HCFGrammar *g, HStringMap *ret, HStringMap *ret_ = h_stringmap_new(g->arena); h_stringmap_put_after(ret, c, ret_); - taint |= stringset_extend(g, ret_, k-1, as_, f, tail); + taint |= stringset_extend(g, pws, ret_, k-1, as_, f, tail); } } diff --git a/src/cfgrammar.h b/src/cfgrammar.h index 6068a2b..fb96e09 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -8,15 +8,15 @@ typedef struct HCFGrammar_ { HHashSet *nts; // HCFChoices, each representing the alternative // productions for one nonterminal HHashSet *geneps; // set of NTs that can generate the empty string - HHashTable **first; // memoized first sets of the grammar's symbols - HHashTable **follow; // memoized follow sets of the grammar's NTs - size_t kmax; // maximum lookahead depth allocated + HHashTable *first; // memoized first sets of the grammar's symbols + HHashTable *follow; // memoized follow sets of the grammar's NTs HArena *arena; HAllocator *mm__; - // constant set containing only the empty string. - // this is only a member of HCFGrammar because it needs a pointer to arena. 
+ // constant sets containing only the empty string or end symbol. + // these are only members of HCFGrammar because they need a pointer to arena. const struct HStringMap_ *singleton_epsilon; + const struct HStringMap_ *singleton_end; } HCFGrammar; -- GitLab