From f5d5c367561805542ec1b8caaec2615ca1bedd89 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Wed, 22 May 2013 20:38:36 +0200 Subject: [PATCH] generalize grammar analysis to k>1 --- src/backends/llk.c | 36 ++-- src/cfgrammar.c | 434 ++++++++++++++++++++++++++++++++++--------- src/cfgrammar.h | 60 ++++-- src/datastructures.c | 1 + src/t_grammar.c | 12 +- src/test_suite.h | 43 ++++- 6 files changed, 459 insertions(+), 127 deletions(-) diff --git a/src/backends/llk.c b/src/backends/llk.c index d62a3076..1cd95414 100644 --- a/src/backends/llk.c +++ b/src/backends/llk.c @@ -18,12 +18,18 @@ typedef struct HLLkTable_ { HAllocator *mm__; } HLLkTable; + +// XXX adaptation to LL(1), to be removed +typedef HCharKey HCFToken; +static const HCFToken end_token = 0x200; +#define char_token char_key + /* Interface to look up an entry in the parse table. */ const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x, HInputStream lookahead) { - // note the lookahead stream is passed by value, i.e. a copy - // reading bits from it does not consume them from the input + // note the lookahead stream is passed by value, i.e. a copy. + // reading bits from it does not consume them from the real input. HCFToken tok; uint8_t c = h_read_bits(&lookahead, 8, false); if(lookahead.overrun) @@ -71,15 +77,21 @@ HHashSet *h_predict(HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs) { // predict(A -> rhs) = first(rhs) u follow(A) if "" can be derived from rhs // predict(A -> rhs) = first(rhs) otherwise - HHashSet *first_rhs = h_first_sequence(g, rhs->items); - if(h_sequence_derives_epsilon(g, rhs->items)) { - HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); - h_hashset_put_all(ret, first_rhs); - h_hashset_put_all(ret, h_follow(g, A)); - return ret; - } else { - return first_rhs; + const HCFStringMap *first_rhs = h_first_seq(1, g, rhs->items); + const HCFStringMap *follow_A = h_follow(1, g, A); + HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + + h_hashset_put_all(ret, first_rhs->char_branches); + if(first_rhs->end_branch) + h_hashset_put(ret, (void *)end_token); + + if(h_derives_epsilon_seq(g, rhs->items)) { + h_hashset_put_all(ret, follow_A->char_branches); + if(follow_A->end_branch) + h_hashset_put(ret, (void *)end_token); } + + return ret; } /* Generate entries for the production "A -> rhs" in the given table row. */ @@ -360,9 +372,9 @@ int test_llk(void) printf("generate epsilon: "); h_pprint_symbolset(stdout, g, g->geneps, 0); printf("first(A) = "); - h_pprint_tokenset(stdout, g, h_first_symbol(g, g->start), 0); + h_pprint_stringset(stdout, g, h_first(1, g, g->start), 0); printf("follow(C) = "); - h_pprint_tokenset(stdout, g, h_follow(g, h_desugar(&system_allocator, c)), 0); + h_pprint_stringset(stdout, g, h_follow(1, g, h_desugar(&system_allocator, c)), 0); h_compile(p, PB_LLk, NULL); diff --git a/src/cfgrammar.c b/src/cfgrammar.c index d3ee9c64..7cafcb07 100644 --- a/src/cfgrammar.c +++ b/src/cfgrammar.c @@ -6,6 +6,10 @@ #include <ctype.h> +// a special map value for use when the map is used to represent a set +static void * const INSET = (void *)(uintptr_t)1; + + HCFGrammar *h_cfgrammar_new(HAllocator *mm__) { HCFGrammar *g = h_new(HCFGrammar, 1); @@ -15,15 +19,17 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__) g->arena = h_new_arena(mm__, 0); // default blocksize g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); g->geneps = NULL; - g->first = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); - g->follow = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); + g->first = NULL; + g->follow = NULL; + g->kmax = 0; // will be increased as needed by ensure_k + + HCFStringMap *eps = h_stringmap_new(g->arena); + h_stringmap_put_epsilon(eps, INSET); + g->singleton_epsilon = eps; return g; } -/* Frees the given grammar and associated data. - * Does *not* free parsers' CFG forms as created by h_desugar. - */ void h_cfgrammar_free(HCFGrammar *g) { HAllocator *mm__ = g->mm__; @@ -37,10 +43,6 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol); static void collect_geneps(HCFGrammar *grammar); -/* Convert 'parser' into CFG representation by desugaring and compiling the set - * of nonterminals. - * A NULL return means we are unable to represent the parser as a CFG. - */ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser) { // convert parser to CFG form ("desugar"). @@ -114,9 +116,26 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol) } } +/* Increase g->kmax if needed, allocating enough first/follow slots. */ +static void ensure_k(HCFGrammar *g, size_t k) +{ + if(k <= g->kmax) return; + + // NB: we don't actually use first/follow[0] but allocate it anyway + // so indices of the array correspond neatly to values of k + + assert(k==1); // XXX + g->first = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); + g->follow = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *)); + g->first[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); + g->follow[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); + g->first[1] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); + g->follow[1] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr); + g->kmax = k; +} + -/* Does the given symbol derive the empty string (under g)? */ -bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol) +bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol) { assert(g->geneps != NULL); @@ -130,12 +149,11 @@ bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol) } } -/* Does the sentential form s derive the empty string? s NULL-terminated. */ -bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s) +bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s) { // return true iff all symbols in s derive epsilon for(; *s; s++) { - if(!h_symbol_derives_epsilon(g, *s)) + if(!h_derives_epsilon(g, *s)) return false; } return true; @@ -165,10 +183,10 @@ static void collect_geneps(HCFGrammar *g) const HCFChoice *symbol = hte->key; assert(symbol->type == HCF_CHOICE); - // this NT derives epsilon if any of its productions does. + // this NT derives epsilon if any one of its productions does. HCFSequence **p; for(p = symbol->seq; *p != NULL; p++) { - if(h_sequence_derives_epsilon(g, (*p)->items)) { + if(h_derives_epsilon_seq(g, (*p)->items)) { h_hashset_put(g->geneps, symbol); break; } @@ -179,44 +197,118 @@ static void collect_geneps(HCFGrammar *g) } -/* Compute first set of sentential form s. s NULL-terminated. */ -HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s); +HCFStringMap *h_stringmap_new(HArena *a) +{ + HCFStringMap *m = h_arena_malloc(a, sizeof(HCFStringMap)); + m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr); + m->arena = a; + return m; +} + +void h_stringmap_put_end(HCFStringMap *m, void *v) +{ + m->end_branch = v; +} + +void h_stringmap_put_epsilon(HCFStringMap *m, void *v) +{ + m->epsilon_branch = v; +} + +void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v) +{ + HCFStringMap *node = h_stringmap_new(m->arena); + h_stringmap_put_epsilon(node, v); + h_hashtable_put(m->char_branches, (void *)char_key(c), node); +} + +void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n) +{ + if(n->epsilon_branch) + m->epsilon_branch = n->epsilon_branch; + + if(n->end_branch) + m->end_branch = n->end_branch; + + h_hashtable_update(m->char_branches, n->char_branches); +} + +void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +{ + for(size_t i=0; i<n; i++) { + if(i==n-1 && end && m->end_branch) + return m->end_branch; + m = h_hashtable_get(m->char_branches, (void *)char_key(str[i])); + if(!m) + return NULL; + } + return m->epsilon_branch; +} + +bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end) +{ + return (h_stringmap_get(m, str, n, end) != NULL); +} + -/* Compute first set of symbol x. Memoized. */ -HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x) +const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x) { - HHashSet *ret; + HCFStringMap *ret; HCFSequence **p; uint8_t c; + // shortcut: first_0(X) is always {""} + if(k==0) + return g->singleton_epsilon; +#if 0 + // XXX this is bullshit? + // shortcut: first_0(X) is {""} if X derives anything + if(k==0) { + switch(x->type) { + case HCF_END: + case HCF_CHAR: + return g->singleton_epsilon; + case HCF_CHARSET: + c=0; + do { + if(charset_isset(x->charset, c)) + return g->singleton_epsilon; + } while(c++ < 255); + break; + // HCF_CHOICE is handled by the general case below + } + } +#endif + // memoize via g->first - assert(g->first != NULL); - ret = h_hashtable_get(g->first, x); + ensure_k(g, k); + ret = h_hashtable_get(g->first[k], x); if(ret != NULL) return ret; - ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + ret = h_stringmap_new(g->arena); assert(ret != NULL); - h_hashtable_put(g->first, x, ret); + h_hashtable_put(g->first[k], x, ret); switch(x->type) { case HCF_END: - h_hashset_put(ret, (void *)end_token); + h_stringmap_put_end(ret, INSET); break; case HCF_CHAR: - h_hashset_put(ret, (void *)char_token(x->chr)); + h_stringmap_put_char(ret, x->chr, INSET); break; case HCF_CHARSET: c=0; do { - if(charset_isset(x->charset, c)) - h_hashset_put(ret, (void *)char_token(c)); + if(charset_isset(x->charset, c)) { + h_stringmap_put_char(ret, c, INSET); + } } while(c++ < 255); break; case HCF_CHOICE: // this is a nonterminal // return the union of the first sets of all productions for(p=x->seq; *p; ++p) - h_hashset_put_all(ret, h_first_sequence(g, (*p)->items)); + h_stringmap_update(ret, h_first_seq(k, g, (*p)->items)); break; default: // should not be reached assert_message(0, "unknown HCFChoice type"); @@ -225,58 +317,155 @@ HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x) return ret; } -HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s) +// helpers for h_first_seq, definitions below +static void first_extend(HCFGrammar *g, HCFStringMap *ret, + size_t k, const HCFStringMap *as, HCFChoice **tail); +static bool is_singleton_epsilon(const HCFStringMap *m); +static bool any_string_shorter(size_t k, const HCFStringMap *m); + +const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s) { - // the first set of the empty sequence is empty + // shortcut: the first set of the empty sequence, for any k, is {""} if(*s == NULL) - return h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + return g->singleton_epsilon; - // first(X tail) = first(X) if X does not derive epsilon - // = first(X) u first(tail) otherwise + // first_k(X tail) = { a b | a <- first_k(X), b <- first_l(tail), l=k-|a| } HCFChoice *x = s[0]; HCFChoice **tail = s+1; - HHashSet *first_x = h_first_symbol(g, x); - if(h_symbol_derives_epsilon(g, x)) { - // return the union of first(x) and first(tail) - HHashSet *first_tail = h_first_sequence(g, tail); - HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); - h_hashset_put_all(ret, first_x); - h_hashset_put_all(ret, first_tail); - return ret; - } else { + const HCFStringMap *first_x = h_first(k, g, x); + + // shortcut: if first_k(X) = {""}, just return first_k(tail) + if(is_singleton_epsilon(first_x)) + return h_first_seq(k, g, tail); + + // shortcut: if no elements of first_k(X) have length <k, just return first_k(X) + if(!any_string_shorter(k, first_x)) return first_x; + + // create a new result set and build up the set described above + HCFStringMap *ret = h_stringmap_new(g->arena); + + // extend the elements of first_k(X) up to length k from tail + first_extend(g, ret, k, first_x, tail); + + return ret; +} + +// add the set { a b | a <- as, b <- first_l(tail), l=k-|a| } to ret +static void first_extend(HCFGrammar *g, HCFStringMap *ret, + size_t k, const HCFStringMap *as, HCFChoice **tail) +{ + if(as->epsilon_branch) { + // for a="", add first_k(tail) to ret + h_stringmap_update(ret, h_first_seq(k, g, tail)); + } + + if(as->end_branch) { + // for a="$", nothing can follow; just add "$" to ret + // NB: formally, "$" is considered to be of length k + h_stringmap_put_end(ret, INSET); + } + + // iterate over as->char_branches + const HHashTable *ht = as->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + uint8_t c = key_char((HCharKey)hte->key); + + // follow the branch to find the set { a' | t a' <- as } + HCFStringMap *as_ = (HCFStringMap *)hte->value; + + // now the elements of ret that begin with t are given by + // t { a b | a <- as_, b <- first_l(tail), l=k-|a|-1 } + // so we can use recursion over k + HCFStringMap *ret_ = h_stringmap_new(g->arena); + h_stringmap_put_char(ret, c, ret_); + + first_extend(g, ret_, k-1, as_, tail); + } + } +} + +static bool is_singleton_epsilon(const HCFStringMap *m) +{ + return ( m->epsilon_branch + && !m->end_branch + && h_hashtable_empty(m->char_branches) ); +} + +static bool any_string_shorter(size_t k, const HCFStringMap *m) +{ + if(k==0) + return false; + + if(m->epsilon_branch) + return true; + + // iterate over m->char_branches + const HHashTable *ht = m->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + HCFStringMap *m_ = hte->value; + + // check subtree for strings shorter than k-1 + if(any_string_shorter(k-1, m_)) + return true; + } } + + return false; } +const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); + +// pointer to functions like h_first_seq +typedef const HCFStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice const* const*); + +static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, + size_t k, const HCFStringMap *as, + StringSetFun f, HCFChoice const * const *tail); + +// h_follow adapted to the signature of StringSetFun +static inline const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice const* const*s) +{ + return h_follow(k, g, *s); +} -/* Compute follow set of symbol x. Memoized. */ -HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x) +const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x) { // consider all occurances of X in g // the follow set of X is the union of: // {$} if X is the start symbol // given a production "A -> alpha X tail": - // if tail derives epsilon: - // first(tail) u follow(A) - // else: - // first(tail) + // first_k(tail follow_k(A)) + + // first_k(tail follow_k(A)) = + // { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| } + + HCFStringMap *ret; - HHashSet *ret; + // shortcut: follow_0(X) is always {""} + if(k==0) + return g->singleton_epsilon; // memoize via g->follow - assert(g->follow != NULL); - ret = h_hashtable_get(g->follow, x); + ensure_k(g, k); + ret = h_hashtable_get(g->follow[k], x); if(ret != NULL) return ret; - ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr); + ret = h_stringmap_new(g->arena); assert(ret != NULL); - h_hashtable_put(g->follow, x, ret); + h_hashtable_put(g->follow[k], x, ret); // if X is the start symbol, the end token is in its follow set if(x == g->start) - h_hashset_put(ret, (void *)end_token); + h_stringmap_put_end(ret, INSET); // iterate over g->nts size_t i; @@ -285,7 +474,7 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x) for(hte = &g->nts->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - const HCFChoice *a = hte->key; // production's left-hand symbol + HCFChoice const * const a = hte->key; // production's left-hand symbol assert(a->type == HCF_CHOICE); // iterate over the productions for A @@ -297,9 +486,12 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x) if(*s == x) { // occurance found HCFChoice **tail = s+1; - h_hashset_put_all(ret, h_first_sequence(g, tail)); - if(h_sequence_derives_epsilon(g, tail)) - h_hashset_put_all(ret, h_follow(g, a)); + const HCFStringMap *first_tail = h_first_seq(k, g, tail); + + //h_stringmap_update(ret, first_tail); + + // extend the elems of first_k(tail) up to length k from follow(A) + stringset_extend(g, ret, k, first_tail, h_follow_, &a); } } } @@ -309,6 +501,44 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x) return ret; } +// add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret +static void stringset_extend(HCFGrammar *g, HCFStringMap *ret, + size_t k, const HCFStringMap *as, + StringSetFun f, HCFChoice const * const *tail) +{ + if(as->epsilon_branch) { + // for a="", add f_k(tail) to ret + h_stringmap_update(ret, f(k, g, tail)); + } + + if(as->end_branch) { + // for a="$", nothing can follow; just add "$" to ret + // NB: formally, "$" is considered to be of length k + h_stringmap_put_end(ret, INSET); + } + + // iterate over as->char_branches + const HHashTable *ht = as->char_branches; + for(size_t i=0; i < ht->capacity; i++) { + for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) { + if(hte->key == NULL) + continue; + uint8_t c = key_char((HCharKey)hte->key); + + // follow the branch to find the set { a' | t a' <- as } + HCFStringMap *as_ = (HCFStringMap *)hte->value; + + // now the elements of ret that begin with t are given by + // t { a b | a <- as_, b <- f_l(tail), l=k-|a|-1 } + // so we can use recursion over k + HCFStringMap *ret_ = h_stringmap_new(g->arena); + h_stringmap_put_char(ret, c, ret_); + + stringset_extend(g, ret_, k-1, as_, f, tail); + } + } +} + static void pprint_char(FILE *f, char c) { @@ -344,15 +574,16 @@ static void pprint_charset(FILE *f, const HCharset cs) fputc('[', f); for(i=0; i<256; i++) { - if(charset_isset(cs, i)) + if(charset_isset(cs, i)) { pprint_charset_char(f, i); - // detect ranges - if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) { - fputc('-', f); - for(; i<256 && charset_isset(cs, i); i++); - i--; // back to the last in range - pprint_charset_char(f, i); + // detect ranges + if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) { + fputc('-', f); + for(; i<256 && charset_isset(cs, i); i++); + i--; // back to the last in range + pprint_charset_char(f, i); + } } } fputc(']', f); @@ -400,6 +631,7 @@ static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x) break; case HCF_CHARSET: pprint_charset(f, x->charset); + break; default: fputs(nonterminal_name(g, x), f); } @@ -507,30 +739,66 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in fputs("}\n", file); } -void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent) +#define BUFSIZE 512 + +void pprint_stringset_elems(FILE *file, char *prefix, size_t n, const HCFStringMap *set) { - int j; - for(j=0; j<indent; j++) fputc(' ', file); + assert(n < BUFSIZE-4); + + if(set->epsilon_branch) { + if(n==0) { + fputs("''", file); + } else { + fputc(',', file); + fwrite(prefix, 1, n, file); + } + } - fputc('[', file); + if(set->end_branch) { + fputc(',', file); + fwrite(prefix, 1, n, file); + fputc('$', file); + } - // iterate over set + // iterate over set->char_branches + HHashTable *ht = set->char_branches; size_t i; HHashTableEntry *hte; - for(i=0; i < set->capacity; i++) { - for(hte = &set->contents[i]; hte; hte = hte->next) { + for(i=0; i < ht->capacity; i++) { + for(hte = &ht->contents[i]; hte; hte = hte->next) { if(hte->key == NULL) continue; - HCFToken a = (HCFToken)hte->key; - - if(a == end_token) - fputc('$', file); - else if(token_char(a) == '$') - fputs("\\$", file); - else - pprint_char(file, token_char(a)); + uint8_t c = key_char((HCharKey)hte->key); + HCFStringMap *ends = hte->value; + + size_t n_ = n; + switch(c) { + case '$': prefix[n_++] = '\\'; prefix[n_++] = '$'; break; + case '"': prefix[n_++] = '\\'; prefix[n_++] = '"'; break; + case '\\': prefix[n_++] = '\\'; prefix[n_++] = '\\'; break; + case '\b': prefix[n_++] = '\\'; prefix[n_++] = 'b'; break; + case '\t': prefix[n_++] = '\\'; prefix[n_++] = 't'; break; + case '\n': prefix[n_++] = '\\'; prefix[n_++] = 'n'; break; + case '\r': prefix[n_++] = '\\'; prefix[n_++] = 'r'; break; + default: + if(isprint(c)) + prefix[n_++] = c; + else + n_ += sprintf(prefix+n_, "\\x%.2X", c); + } + + pprint_stringset_elems(file, prefix, n_, ends); } } +} - fputs("]\n", file); +void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent) +{ + int j; + for(j=0; j<indent; j++) fputc(' ', file); + + char buf[BUFSIZE]; + fputc('{', file); + pprint_stringset_elems(file, buf, 0, set); + fputs("}\n", file); } diff --git a/src/cfgrammar.h b/src/cfgrammar.h index fdfa7c5c..f9768c85 100644 --- a/src/cfgrammar.h +++ b/src/cfgrammar.h @@ -8,20 +8,44 @@ typedef struct HCFGrammar_ { HHashSet *nts; // HCFChoices, each representing the alternative // productions for one nonterminal HHashSet *geneps; // set of NTs that can generate the empty string - HHashTable *first; // memoized first sets of the grammar's symbols - HHashTable *follow; // memoized follow sets of the grammar's NTs + HHashTable **first; // memoized first sets of the grammar's symbols + HHashTable **follow; // memoized follow sets of the grammar's NTs + size_t kmax; // maximum lookahead depth allocated HArena *arena; HAllocator *mm__; + + // constant set containing only the empty string. + // this is only a member of HCFGrammar because it needs a pointer to arena. + const struct HCFStringMap_ *singleton_epsilon; } HCFGrammar; -/* mapping input bytes or end to tokens - * we want to use these, cast to void *, as elements in hashsets - * therefore we must avoid 0 as a token value because NULL means "not in set". + +/* Representing input characters (bytes) in HHashTables. + * To use these as keys, we must avoid 0 as because NULL means "not set". */ -typedef uintptr_t HCFToken; -static inline HCFToken char_token(uint8_t c) { return (0x100 | c); } -static inline uint8_t token_char(HCFToken t) { return (0xFF & t); } -static const HCFToken end_token = 0x200; +typedef uintptr_t HCharKey; +static inline HCharKey char_key(uint8_t c) { return (0x100 | c); } +static inline uint8_t key_char(HCharKey k) { return (0xFF & k); } + +/* Mapping strings of input tokens to arbitrary values (or serving as a set). + * Common prefixes are folded into a tree of HHashTables, branches labeled with + * input tokens. + * Each path through the tree represents the string along its branches. + */ +typedef struct HCFStringMap_ { + void *epsilon_branch; // points to leaf value + void *end_branch; // points to leaf value + HHashTable *char_branches; // maps to inner nodes (HCFStringMaps) + HArena *arena; +} HCFStringMap; + +HCFStringMap *h_stringmap_new(HArena *a); +void h_stringmap_put_end(HCFStringMap *m, void *v); +void h_stringmap_put_epsilon(HCFStringMap *m, void *v); +void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v); +void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n); +void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); +bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end); /* Convert 'parser' into CFG representation by desugaring and compiling the set @@ -36,22 +60,22 @@ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser); void h_cfgrammar_free(HCFGrammar *g); /* Does the given symbol derive the empty string (under g)? */ -bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol); +bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol); /* Does the sentential form s derive the empty string? s NULL-terminated. */ -bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s); +bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s); -/* Compute first set of sentential form s. s NULL-terminated. */ -HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s); +/* Compute first_k set of symbol x. Memoized. */ +const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x); -/* Compute first set of symbol x. Memoized. */ -HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x); +/* Compute first_k set of sentential form s. s NULL-terminated. */ +const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s); -/* Compute follow set of symbol x. Memoized. */ -HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x); +/* Compute follow_k set of symbol x. Memoized. */ +const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x); /* Pretty-printers for grammars and associated data. */ void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent); void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); -void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent); +void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent); diff --git a/src/datastructures.c b/src/datastructures.c index 5cb0fb42..ccd4c1d1 100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -241,5 +241,6 @@ bool h_eq_ptr(const void *p, const void *q) { } HHashValue h_hash_ptr(const void *p) { + // XXX just djbhash it return (uintptr_t)p >> 4; } diff --git a/src/t_grammar.c b/src/t_grammar.c index 10a0853c..8003bcf6 100644 --- a/src/t_grammar.c +++ b/src/t_grammar.c @@ -28,13 +28,13 @@ static void test_example_1(void) { g_check_derives_epsilon_not(g, q); g_check_derives_epsilon_not(g, p); - g_check_firstset_present(g, p, end_token); - g_check_firstset_present(g, p, char_token('x')); - g_check_firstset_present(g, p, char_token('y')); + g_check_firstset_present(1, g, p, "$"); + g_check_firstset_present(1, g, p, "x"); + g_check_firstset_present(1, g, p, "y"); - g_check_followset_absent(g, c, end_token); - g_check_followset_absent(g, c, char_token('x')); - g_check_followset_present(g, c, char_token('y')); + g_check_followset_absent(1, g, c, "$"); + g_check_followset_absent(1, g, c, "x"); + g_check_followset_present(1, g, c, "y"); } void register_grammar_tests(void) { diff --git a/src/test_suite.h b/src/test_suite.h index 7c2834f5..35c414a1 100644 --- a/src/test_suite.h +++ b/src/test_suite.h @@ -84,6 +84,11 @@ } while(0) #define g_check_parse_failed(parser, input, inp_len) do { \ + int skip = h_compile((HParser *)(parser), PB_LLk, NULL); \ + if(skip != 0) { \ + g_test_message("Backend not applicable, skipping test"); \ + break; \ + } \ const HParseResult *result = h_parse(parser, (const uint8_t*)input, inp_len); \ if (NULL != result) { \ g_test_message("Check failed: shouldn't have succeeded, but did"); \ @@ -92,6 +97,11 @@ } while(0) #define g_check_parse_ok(parser, input, inp_len, result) do { \ + int skip = h_compile((HParser *)(parser), PB_LLk, NULL); \ + if(skip) { \ + g_test_message("Backend not applicable, skipping test"); \ + break; \ + } \ HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \ if (!res) { \ g_test_message("Parse failed on line %d", __LINE__); \ @@ -134,6 +144,23 @@ } \ } while(0) +#define g_check_stringmap_present(table, key) do { \ + bool end = (key[strlen(key)-1] == '$'); \ + if(!h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) { \ + g_test_message("Check failed: \"%s\" should have been in map, but wasn't", key); \ + g_test_fail(); \ + } \ + } while(0) + +#define g_check_stringmap_absent(table, key) do { \ + bool end = (key[strlen(key)-2] == '$'); \ + if(h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) { \ + g_test_message("Check failed: \"%s\" shouldn't have been in map, but was", key); \ + g_test_fail(); \ + } \ + } while(0) + + #define g_check_terminal(grammar, parser) \ g_check_hashtable_absent(grammar->nts, h_desugar(&system_allocator, parser)) @@ -146,17 +173,17 @@ #define g_check_derives_epsilon_not(grammar, parser) \ g_check_hashtable_absent(grammar->geneps, h_desugar(&system_allocator, parser)) -#define g_check_firstset_present(grammar, parser, token) \ - g_check_hashtable_present(h_first_symbol(grammar, h_desugar(&system_allocator, parser)), (void *)token) +#define g_check_firstset_present(k, grammar, parser, str) \ + g_check_stringmap_present(h_first(k, grammar, h_desugar(&system_allocator, parser)), str) -#define g_check_firstset_absent(grammar, parser, token) \ - g_check_hashtable_absent(h_first_symbol(grammar, h_desugar(&system_allocator, parser)), (void *)token) +#define g_check_firstset_absent(k, grammar, parser, str) \ + g_check_stringmap_absent(h_first(k, grammar, h_desugar(&system_allocator, parser)), str) -#define g_check_followset_present(grammar, parser, token) \ - g_check_hashtable_present(h_follow(grammar, h_desugar(&system_allocator, parser)), (void *)token) +#define g_check_followset_present(k, grammar, parser, str) \ + g_check_stringmap_present(h_follow(k, grammar, h_desugar(&system_allocator, parser)), str) -#define g_check_followset_absent(grammar, parser, token) \ - g_check_hashtable_absent(h_follow(grammar, h_desugar(&system_allocator, parser)), (void *)token) +#define g_check_followset_absent(k, grammar, parser, str) \ + g_check_stringmap_absent(h_follow(k, grammar, h_desugar(&system_allocator, parser)), str) -- GitLab