From f5d5c367561805542ec1b8caaec2615ca1bedd89 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Wed, 22 May 2013 20:38:36 +0200
Subject: [PATCH] generalize grammar analysis to k>1

---
 src/backends/llk.c   |  36 ++--
 src/cfgrammar.c      | 434 ++++++++++++++++++++++++++++++++++---------
 src/cfgrammar.h      |  60 ++++--
 src/datastructures.c |   1 +
 src/t_grammar.c      |  12 +-
 src/test_suite.h     |  43 ++++-
 6 files changed, 459 insertions(+), 127 deletions(-)

diff --git a/src/backends/llk.c b/src/backends/llk.c
index d62a3076..1cd95414 100644
--- a/src/backends/llk.c
+++ b/src/backends/llk.c
@@ -18,12 +18,18 @@ typedef struct HLLkTable_ {
   HAllocator *mm__;
 } HLLkTable;
 
+
+// XXX adaptation to LL(1), to be removed
+typedef HCharKey HCFToken;
+static const HCFToken end_token = 0x200;
+#define char_token char_key
+
 /* Interface to look up an entry in the parse table. */
 const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x,
                                 HInputStream lookahead)
 {
-  // note the lookahead stream is passed by value, i.e. a copy
-  // reading bits from it does not consume them from the input
+  // note the lookahead stream is passed by value, i.e. a copy.
+  // reading bits from it does not consume them from the real input.
   HCFToken tok;
   uint8_t c = h_read_bits(&lookahead, 8, false);
   if(lookahead.overrun)
@@ -71,15 +77,21 @@ HHashSet *h_predict(HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs)
 {
   // predict(A -> rhs) = first(rhs) u follow(A)  if "" can be derived from rhs
   // predict(A -> rhs) = first(rhs)              otherwise
-  HHashSet *first_rhs = h_first_sequence(g, rhs->items);
-  if(h_sequence_derives_epsilon(g, rhs->items)) {
-    HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-    h_hashset_put_all(ret, first_rhs);
-    h_hashset_put_all(ret, h_follow(g, A));
-    return ret;
-  } else {
-    return first_rhs;
+  const HCFStringMap *first_rhs = h_first_seq(1, g, rhs->items);
+  const HCFStringMap *follow_A = h_follow(1, g, A);
+  HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+
+  h_hashset_put_all(ret, first_rhs->char_branches);
+  if(first_rhs->end_branch)
+    h_hashset_put(ret, (void *)end_token);
+
+  if(h_derives_epsilon_seq(g, rhs->items)) {
+    h_hashset_put_all(ret, follow_A->char_branches);
+    if(follow_A->end_branch)
+      h_hashset_put(ret, (void *)end_token);
   }
+
+  return ret;
 }
 
 /* Generate entries for the production "A -> rhs" in the given table row. */
@@ -360,9 +372,9 @@ int test_llk(void)
   printf("generate epsilon: ");
   h_pprint_symbolset(stdout, g, g->geneps, 0);
   printf("first(A) = ");
-  h_pprint_tokenset(stdout, g, h_first_symbol(g, g->start), 0);
+  h_pprint_stringset(stdout, g, h_first(1, g, g->start), 0);
   printf("follow(C) = ");
-  h_pprint_tokenset(stdout, g, h_follow(g, h_desugar(&system_allocator, c)), 0);
+  h_pprint_stringset(stdout, g, h_follow(1, g, h_desugar(&system_allocator, c)), 0);
 
   h_compile(p, PB_LLk, NULL);
 
diff --git a/src/cfgrammar.c b/src/cfgrammar.c
index d3ee9c64..7cafcb07 100644
--- a/src/cfgrammar.c
+++ b/src/cfgrammar.c
@@ -6,6 +6,10 @@
 #include <ctype.h>
 
 
+// a special map value for use when the map is used to represent a set
+static void * const INSET = (void *)(uintptr_t)1;
+
+
 HCFGrammar *h_cfgrammar_new(HAllocator *mm__)
 {
   HCFGrammar *g = h_new(HCFGrammar, 1);
@@ -15,15 +19,17 @@ HCFGrammar *h_cfgrammar_new(HAllocator *mm__)
   g->arena  = h_new_arena(mm__, 0);     // default blocksize
   g->nts    = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
   g->geneps = NULL;
-  g->first  = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
-  g->follow = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->first  = NULL;
+  g->follow = NULL;
+  g->kmax   = 0;    // will be increased as needed by ensure_k
+
+  HCFStringMap *eps = h_stringmap_new(g->arena);
+  h_stringmap_put_epsilon(eps, INSET);
+  g->singleton_epsilon = eps;
 
   return g;
 }
 
-/* Frees the given grammar and associated data.
- * Does *not* free parsers' CFG forms as created by h_desugar.
- */
 void h_cfgrammar_free(HCFGrammar *g)
 {
   HAllocator *mm__ = g->mm__;
@@ -37,10 +43,6 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol);
 static void collect_geneps(HCFGrammar *grammar);
 
 
-/* Convert 'parser' into CFG representation by desugaring and compiling the set
- * of nonterminals.
- * A NULL return means we are unable to represent the parser as a CFG.
- */
 HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser)
 {
   // convert parser to CFG form ("desugar").
@@ -114,9 +116,26 @@ static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol)
   }
 }
 
+/* Increase g->kmax if needed, allocating enough first/follow slots. */
+static void ensure_k(HCFGrammar *g, size_t k)
+{
+  if(k <= g->kmax) return;
+
+  // NB: we don't actually use first/follow[0] but allocate it anyway
+  // so indices of the array correspond neatly to values of k
+
+  assert(k==1);   // XXX
+  g->first  = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *));
+  g->follow = h_arena_malloc(g->arena, (k+1)*sizeof(HHashTable *));
+  g->first[0]  = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->follow[0] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->first[1]  = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->follow[1] = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->kmax = k;
+}
+
 
-/* Does the given symbol derive the empty string (under g)? */
-bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
+bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
 {
   assert(g->geneps != NULL);
 
@@ -130,12 +149,11 @@ bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
   }
 }
 
-/* Does the sentential form s derive the empty string? s NULL-terminated. */
-bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s)
+bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s)
 {
   // return true iff all symbols in s derive epsilon
   for(; *s; s++) {
-    if(!h_symbol_derives_epsilon(g, *s))
+    if(!h_derives_epsilon(g, *s))
       return false;
   }
   return true;
@@ -165,10 +183,10 @@ static void collect_geneps(HCFGrammar *g)
         const HCFChoice *symbol = hte->key;
         assert(symbol->type == HCF_CHOICE);
 
-        // this NT derives epsilon if any of its productions does.
+        // this NT derives epsilon if any one of its productions does.
         HCFSequence **p;
         for(p = symbol->seq; *p != NULL; p++) {
-          if(h_sequence_derives_epsilon(g, (*p)->items)) {
+          if(h_derives_epsilon_seq(g, (*p)->items)) {
             h_hashset_put(g->geneps, symbol);
             break;
           }
@@ -179,44 +197,118 @@ static void collect_geneps(HCFGrammar *g)
 }
 
 
-/* Compute first set of sentential form s. s NULL-terminated. */
-HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
+HCFStringMap *h_stringmap_new(HArena *a)
+{
+  HCFStringMap *m = h_arena_malloc(a, sizeof(HCFStringMap));
+  m->char_branches = h_hashtable_new(a, h_eq_ptr, h_hash_ptr);
+  m->arena = a;
+  return m;
+}
+
+void h_stringmap_put_end(HCFStringMap *m, void *v)
+{
+  m->end_branch = v;
+}
+
+void h_stringmap_put_epsilon(HCFStringMap *m, void *v)
+{
+  m->epsilon_branch = v;
+}
+
+void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v)
+{
+  HCFStringMap *node = h_stringmap_new(m->arena);
+  h_stringmap_put_epsilon(node, v);
+  h_hashtable_put(m->char_branches, (void *)char_key(c), node);
+}
+
+void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n)
+{
+  if(n->epsilon_branch)
+    m->epsilon_branch = n->epsilon_branch;
+
+  if(n->end_branch)
+    m->end_branch = n->end_branch;
+
+  h_hashtable_update(m->char_branches, n->char_branches);
+}
+
+void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end)
+{
+  for(size_t i=0; i<n; i++) {
+    if(i==n-1 && end && m->end_branch)
+      return m->end_branch;
+    m = h_hashtable_get(m->char_branches, (void *)char_key(str[i]));
+    if(!m)
+      return NULL;
+  }
+  return m->epsilon_branch;
+}
+
+bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end)
+{
+  return (h_stringmap_get(m, str, n, end) != NULL);
+}
+
 
-/* Compute first set of symbol x. Memoized. */
-HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x)
+const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x)
 {
-  HHashSet *ret;
+  HCFStringMap *ret;
   HCFSequence **p;
   uint8_t c;
 
+  // shortcut: first_0(X) is always {""}
+  if(k==0)
+    return g->singleton_epsilon;
+#if 0
+  // XXX this is bullshit?
+  // shortcut: first_0(X) is {""} if X derives anything
+  if(k==0) {
+    switch(x->type) {
+    case HCF_END:
+    case HCF_CHAR:
+      return g->singleton_epsilon;
+    case HCF_CHARSET:
+      c=0;
+      do {
+        if(charset_isset(x->charset, c))
+          return g->singleton_epsilon;
+      } while(c++ < 255);
+      break;
+    // HCF_CHOICE is handled by the general case below
+    }
+  }
+#endif
+
   // memoize via g->first
-  assert(g->first != NULL);
-  ret = h_hashtable_get(g->first, x);
+  ensure_k(g, k);
+  ret = h_hashtable_get(g->first[k], x);
   if(ret != NULL)
     return ret;
-  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  ret = h_stringmap_new(g->arena);
   assert(ret != NULL);
-  h_hashtable_put(g->first, x, ret);
+  h_hashtable_put(g->first[k], x, ret);
 
   switch(x->type) {
   case HCF_END:
-    h_hashset_put(ret, (void *)end_token);
+    h_stringmap_put_end(ret, INSET);
     break;
   case HCF_CHAR:
-    h_hashset_put(ret, (void *)char_token(x->chr));
+    h_stringmap_put_char(ret, x->chr, INSET);
     break;
   case HCF_CHARSET:
     c=0;
     do {
-      if(charset_isset(x->charset, c))
-        h_hashset_put(ret, (void *)char_token(c));
+      if(charset_isset(x->charset, c)) {
+        h_stringmap_put_char(ret, c, INSET);
+      }
     } while(c++ < 255);
     break;
   case HCF_CHOICE:
     // this is a nonterminal
     // return the union of the first sets of all productions
     for(p=x->seq; *p; ++p)
-      h_hashset_put_all(ret, h_first_sequence(g, (*p)->items));
+      h_stringmap_update(ret, h_first_seq(k, g, (*p)->items));
     break;
   default:  // should not be reached
     assert_message(0, "unknown HCFChoice type");
@@ -225,58 +317,155 @@ HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x)
   return ret;
 }
 
-HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s)
+// helpers for h_first_seq, definitions below
+static void first_extend(HCFGrammar *g, HCFStringMap *ret,
+                         size_t k, const HCFStringMap *as, HCFChoice **tail);
+static bool is_singleton_epsilon(const HCFStringMap *m);
+static bool any_string_shorter(size_t k, const HCFStringMap *m);
+
+const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s)
 {
-  // the first set of the empty sequence is empty
+  // shortcut: the first set of the empty sequence, for any k, is {""}
   if(*s == NULL)
-    return h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+    return g->singleton_epsilon;
 
-  // first(X tail) = first(X)                if X does not derive epsilon
-  //               = first(X) u first(tail)  otherwise
+  // first_k(X tail) = { a b | a <- first_k(X), b <- first_l(tail), l=k-|a| }
 
   HCFChoice *x = s[0];
   HCFChoice **tail = s+1;
 
-  HHashSet *first_x = h_first_symbol(g, x);
-  if(h_symbol_derives_epsilon(g, x)) {
-    // return the union of first(x) and first(tail)
-    HHashSet *first_tail = h_first_sequence(g, tail);
-    HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-    h_hashset_put_all(ret, first_x);
-    h_hashset_put_all(ret, first_tail);
-    return ret;
-  } else {
+  const HCFStringMap *first_x = h_first(k, g, x);
+
+  // shortcut: if first_k(X) = {""}, just return first_k(tail)
+  if(is_singleton_epsilon(first_x))
+    return h_first_seq(k, g, tail);
+
+  // shortcut: if no elements of first_k(X) have length <k, just return first_k(X)
+  if(!any_string_shorter(k, first_x))
     return first_x;
+
+  // create a new result set and build up the set described above
+  HCFStringMap *ret = h_stringmap_new(g->arena);
+
+  // extend the elements of first_k(X) up to length k from tail
+  first_extend(g, ret, k, first_x, tail);
+
+  return ret;
+}
+
+// add the set { a b | a <- as, b <- first_l(tail), l=k-|a| } to ret
+static void first_extend(HCFGrammar *g, HCFStringMap *ret,
+                         size_t k, const HCFStringMap *as, HCFChoice **tail)
+{
+  if(as->epsilon_branch) {
+    // for a="", add first_k(tail) to ret
+    h_stringmap_update(ret, h_first_seq(k, g, tail));
+  }
+
+  if(as->end_branch) {
+    // for a="$", nothing can follow; just add "$" to ret
+    // NB: formally, "$" is considered to be of length k
+    h_stringmap_put_end(ret, INSET);
+  }
+
+  // iterate over as->char_branches
+  const HHashTable *ht = as->char_branches;
+  for(size_t i=0; i < ht->capacity; i++) {
+    for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      uint8_t c = key_char((HCharKey)hte->key);
+      
+      // follow the branch to find the set { a' | c a' <- as }
+      HCFStringMap *as_ = (HCFStringMap *)hte->value;
+
+      // the elements of ret that begin with c are given by
+      // c { a b | a <- as_, b <- first_l(tail), l=k-|a|-1 }; recurse over k.
+      // ret_ is linked in as the subtree itself, not wrapped as a leaf value.
+      HCFStringMap *ret_ = h_stringmap_new(g->arena);
+      h_hashtable_put(ret->char_branches, (void *)char_key(c), ret_);
+
+      first_extend(g, ret_, k-1, as_, tail);
+    }
+  }
+}
+
+static bool is_singleton_epsilon(const HCFStringMap *m)
+{
+  return ( m->epsilon_branch
+           && !m->end_branch
+           && h_hashtable_empty(m->char_branches) );
+}
+
+static bool any_string_shorter(size_t k, const HCFStringMap *m)
+{
+  if(k==0)
+    return false;
+
+  if(m->epsilon_branch)
+    return true;
+
+  // iterate over m->char_branches
+  const HHashTable *ht = m->char_branches;
+  for(size_t i=0; i < ht->capacity; i++) {
+    for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      HCFStringMap *m_ = hte->value;
+
+      // check subtree for strings shorter than k-1
+      if(any_string_shorter(k-1, m_))
+        return true;
+    }
   }
+
+  return false;
 }
 
+const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x);
+
+// pointer to functions like h_first_seq
+typedef const HCFStringMap *(*StringSetFun)(size_t, HCFGrammar *, HCFChoice const* const*);
+
+static void stringset_extend(HCFGrammar *g, HCFStringMap *ret,
+                             size_t k, const HCFStringMap *as,
+                             StringSetFun f, HCFChoice const * const *tail);
+
+// h_follow adapted to the signature of StringSetFun
+static inline const HCFStringMap *h_follow_(size_t k, HCFGrammar *g, HCFChoice const* const*s)
+{
+  return h_follow(k, g, *s);
+}
 
-/* Compute follow set of symbol x. Memoized. */
-HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
+const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x)
 {
   // consider all occurances of X in g
   // the follow set of X is the union of:
   //   {$} if X is the start symbol
   //   given a production "A -> alpha X tail":
-  //   if tail derives epsilon:
-  //     first(tail) u follow(A)
-  //   else:
-  //     first(tail)
+  //     first_k(tail follow_k(A))
+
+  // first_k(tail follow_k(A)) =
+  //   { a b | a <- first_k(tail), b <- follow_l(A), l=k-|a| }
+
+  HCFStringMap *ret;
 
-  HHashSet *ret;
+  // shortcut: follow_0(X) is always {""}
+  if(k==0)
+    return g->singleton_epsilon;
 
   // memoize via g->follow
-  assert(g->follow != NULL);
-  ret = h_hashtable_get(g->follow, x);
+  ensure_k(g, k);
+  ret = h_hashtable_get(g->follow[k], x);
   if(ret != NULL)
     return ret;
-  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  ret = h_stringmap_new(g->arena);
   assert(ret != NULL);
-  h_hashtable_put(g->follow, x, ret);
+  h_hashtable_put(g->follow[k], x, ret);
 
   // if X is the start symbol, the end token is in its follow set
   if(x == g->start)
-    h_hashset_put(ret, (void *)end_token);
+    h_stringmap_put_end(ret, INSET);
 
   // iterate over g->nts
   size_t i;
@@ -285,7 +474,7 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
     for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
       if(hte->key == NULL)
         continue;
-      const HCFChoice *a = hte->key;        // production's left-hand symbol
+      HCFChoice const * const a = hte->key; // production's left-hand symbol
       assert(a->type == HCF_CHOICE);
 
       // iterate over the productions for A
@@ -297,9 +486,12 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
           if(*s == x) { // occurance found
             HCFChoice **tail = s+1;
 
-            h_hashset_put_all(ret, h_first_sequence(g, tail));
-            if(h_sequence_derives_epsilon(g, tail))
-              h_hashset_put_all(ret, h_follow(g, a));
+            const HCFStringMap *first_tail = h_first_seq(k, g, tail);
+
+            //h_stringmap_update(ret, first_tail);
+
+            // extend the elems of first_k(tail) up to length k from follow(A)
+            stringset_extend(g, ret, k, first_tail, h_follow_, &a);
           }
         }
       }
@@ -309,6 +501,44 @@ HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
   return ret;
 }
 
+// add the set { a b | a <- as, b <- f_l(S), l=k-|a| } to ret
+static void stringset_extend(HCFGrammar *g, HCFStringMap *ret,
+                             size_t k, const HCFStringMap *as,
+                             StringSetFun f, HCFChoice const * const *tail)
+{
+  if(as->epsilon_branch) {
+    // for a="", add f_k(tail) to ret
+    h_stringmap_update(ret, f(k, g, tail));
+  }
+
+  if(as->end_branch) {
+    // for a="$", nothing can follow; just add "$" to ret
+    // NB: formally, "$" is considered to be of length k
+    h_stringmap_put_end(ret, INSET);
+  }
+
+  // iterate over as->char_branches
+  const HHashTable *ht = as->char_branches;
+  for(size_t i=0; i < ht->capacity; i++) {
+    for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      uint8_t c = key_char((HCharKey)hte->key);
+      
+      // follow the branch to find the set { a' | c a' <- as }
+      HCFStringMap *as_ = (HCFStringMap *)hte->value;
+
+      // the elements of ret that begin with c are given by
+      // c { a b | a <- as_, b <- f_l(tail), l=k-|a|-1 }; recurse over k.
+      // ret_ is linked in as the subtree itself, not wrapped as a leaf value.
+      HCFStringMap *ret_ = h_stringmap_new(g->arena);
+      h_hashtable_put(ret->char_branches, (void *)char_key(c), ret_);
+
+      stringset_extend(g, ret_, k-1, as_, f, tail);
+    }
+  }
+}
+
 
 static void pprint_char(FILE *f, char c)
 {
@@ -344,15 +574,16 @@ static void pprint_charset(FILE *f, const HCharset cs)
 
   fputc('[', f);
   for(i=0; i<256; i++) {
-    if(charset_isset(cs, i))
+    if(charset_isset(cs, i)) {
       pprint_charset_char(f, i);
 
-    // detect ranges
-    if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) {
-      fputc('-', f);
-      for(; i<256 && charset_isset(cs, i); i++);
-      i--;  // back to the last in range
-      pprint_charset_char(f, i);
+      // detect ranges
+      if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) {
+        fputc('-', f);
+        for(; i<256 && charset_isset(cs, i); i++);
+        i--;  // back to the last in range
+        pprint_charset_char(f, i);
+      }
     }
   }
   fputc(']', f);
@@ -400,6 +631,7 @@ static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x)
     break;
   case HCF_CHARSET:
     pprint_charset(f, x->charset);
+    break;
   default:
     fputs(nonterminal_name(g, x), f);
   }
@@ -507,30 +739,66 @@ void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, in
   fputs("}\n", file);
 }
 
-void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
+#define BUFSIZE 512
+
+void pprint_stringset_elems(FILE *file, char *prefix, size_t n, const HCFStringMap *set)
 {
-  int j;
-  for(j=0; j<indent; j++) fputc(' ', file);
+  assert(n < BUFSIZE-4);
+
+  if(set->epsilon_branch) {
+    if(n==0) {
+      fputs("''", file);
+    } else {
+      fputc(',', file);
+      fwrite(prefix, 1, n, file);
+    }
+  }
 
-  fputc('[', file);
+  if(set->end_branch) {
+    fputc(',', file);
+    fwrite(prefix, 1, n, file);
+    fputc('$', file);
+  }
 
-  // iterate over set
+  // iterate over set->char_branches
+  HHashTable *ht = set->char_branches;
   size_t i;
   HHashTableEntry *hte;
-  for(i=0; i < set->capacity; i++) {
-    for(hte = &set->contents[i]; hte; hte = hte->next) {
+  for(i=0; i < ht->capacity; i++) {
+    for(hte = &ht->contents[i]; hte; hte = hte->next) {
       if(hte->key == NULL)
         continue;
-      HCFToken a = (HCFToken)hte->key;
-      
-      if(a == end_token)
-        fputc('$', file);
-      else if(token_char(a) == '$')
-        fputs("\\$", file);
-      else
-        pprint_char(file, token_char(a));
+      uint8_t c = key_char((HCharKey)hte->key);
+      HCFStringMap *ends = hte->value;
+
+      size_t n_ = n;
+      switch(c) {
+      case '$':  prefix[n_++] = '\\'; prefix[n_++] = '$'; break;
+      case '"':  prefix[n_++] = '\\'; prefix[n_++] = '"'; break;
+      case '\\': prefix[n_++] = '\\'; prefix[n_++] = '\\'; break;
+      case '\b': prefix[n_++] = '\\'; prefix[n_++] = 'b'; break;
+      case '\t': prefix[n_++] = '\\'; prefix[n_++] = 't'; break;
+      case '\n': prefix[n_++] = '\\'; prefix[n_++] = 'n'; break;
+      case '\r': prefix[n_++] = '\\'; prefix[n_++] = 'r'; break;
+      default:
+        if(isprint(c))
+          prefix[n_++] = c;
+        else
+          n_ += sprintf(prefix+n_, "\\x%.2X", c);
+      }
+
+      pprint_stringset_elems(file, prefix, n_, ends);
     }
   }
+}
 
-  fputs("]\n", file);
+void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent)
+{
+  int j;
+  for(j=0; j<indent; j++) fputc(' ', file);
+
+  char buf[BUFSIZE];
+  fputc('{', file);
+  pprint_stringset_elems(file, buf, 0, set);
+  fputs("}\n", file);
 }
diff --git a/src/cfgrammar.h b/src/cfgrammar.h
index fdfa7c5c..f9768c85 100644
--- a/src/cfgrammar.h
+++ b/src/cfgrammar.h
@@ -8,20 +8,44 @@ typedef struct HCFGrammar_ {
   HHashSet    *nts;     // HCFChoices, each representing the alternative
                         // productions for one nonterminal
   HHashSet    *geneps;  // set of NTs that can generate the empty string
-  HHashTable  *first;   // memoized first sets of the grammar's symbols
-  HHashTable  *follow;  // memoized follow sets of the grammar's NTs
+  HHashTable  **first;  // memoized first sets of the grammar's symbols
+  HHashTable  **follow; // memoized follow sets of the grammar's NTs
+  size_t      kmax;     // maximum lookahead depth allocated
   HArena      *arena;
   HAllocator  *mm__;
+
+  // constant set containing only the empty string.
+  // this is only a member of HCFGrammar because it needs a pointer to arena.
+  const struct HCFStringMap_ *singleton_epsilon;
 } HCFGrammar;
 
-/* mapping input bytes or end to tokens
- * we want to use these, cast to void *, as elements in hashsets
- * therefore we must avoid 0 as a token value because NULL means "not in set".
+
+/* Representing input characters (bytes) in HHashTables.
+ * To use these as keys, we must avoid 0 because NULL means "not set".
  */
-typedef uintptr_t HCFToken;
-static inline HCFToken char_token(uint8_t c) { return (0x100 | c); }
-static inline uint8_t token_char(HCFToken t) { return (0xFF & t); }
-static const HCFToken end_token = 0x200;
+typedef uintptr_t HCharKey;
+static inline HCharKey char_key(uint8_t c) { return (0x100 | c); }
+static inline uint8_t key_char(HCharKey k) { return (0xFF & k); }
+
+/* Mapping strings of input tokens to arbitrary values (or serving as a set).
+ * Common prefixes are folded into a tree of HHashTables, branches labeled with
+ * input tokens.
+ * Each path through the tree represents the string along its branches.
+ */
+typedef struct HCFStringMap_ {
+  void *epsilon_branch;         // points to leaf value
+  void *end_branch;             // points to leaf value
+  HHashTable *char_branches;    // maps to inner nodes (HCFStringMaps)
+  HArena *arena;
+} HCFStringMap;
+
+HCFStringMap *h_stringmap_new(HArena *a);
+void h_stringmap_put_end(HCFStringMap *m, void *v);
+void h_stringmap_put_epsilon(HCFStringMap *m, void *v);
+void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v);
+void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n);
+void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end);
+bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end);
 
 
 /* Convert 'parser' into CFG representation by desugaring and compiling the set
@@ -36,22 +60,22 @@ HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser);
 void h_cfgrammar_free(HCFGrammar *g);
 
 /* Does the given symbol derive the empty string (under g)? */
-bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol);
+bool h_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol);
 
 /* Does the sentential form s derive the empty string? s NULL-terminated. */
-bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s);
+bool h_derives_epsilon_seq(HCFGrammar *g, HCFChoice **s);
 
-/* Compute first set of sentential form s. s NULL-terminated. */
-HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
+/* Compute first_k set of symbol x. Memoized. */
+const HCFStringMap *h_first(size_t k, HCFGrammar *g, const HCFChoice *x);
 
-/* Compute first set of symbol x. Memoized. */
-HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x);
+/* Compute first_k set of sentential form s. s NULL-terminated. */
+const HCFStringMap *h_first_seq(size_t k, HCFGrammar *g, HCFChoice **s);
 
-/* Compute follow set of symbol x. Memoized. */
-HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x);
+/* Compute follow_k set of symbol x. Memoized. */
+const HCFStringMap *h_follow(size_t k, HCFGrammar *g, const HCFChoice *x);
 
 
 /* Pretty-printers for grammars and associated data. */
 void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent);
 void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent);
-void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent);
+void h_pprint_stringset(FILE *file, const HCFGrammar *g, const HCFStringMap *set, int indent);
diff --git a/src/datastructures.c b/src/datastructures.c
index 5cb0fb42..ccd4c1d1 100644
--- a/src/datastructures.c
+++ b/src/datastructures.c
@@ -241,5 +241,6 @@ bool h_eq_ptr(const void *p, const void *q) {
 }
 
 HHashValue h_hash_ptr(const void *p) {
+  // XXX just djbhash it
   return (uintptr_t)p >> 4;
 }
diff --git a/src/t_grammar.c b/src/t_grammar.c
index 10a0853c..8003bcf6 100644
--- a/src/t_grammar.c
+++ b/src/t_grammar.c
@@ -28,13 +28,13 @@ static void test_example_1(void) {
   g_check_derives_epsilon_not(g, q);
   g_check_derives_epsilon_not(g, p);
 
-  g_check_firstset_present(g, p, end_token);
-  g_check_firstset_present(g, p, char_token('x'));
-  g_check_firstset_present(g, p, char_token('y'));
+  g_check_firstset_present(1, g, p, "$");
+  g_check_firstset_present(1, g, p, "x");
+  g_check_firstset_present(1, g, p, "y");
 
-  g_check_followset_absent(g, c, end_token);
-  g_check_followset_absent(g, c, char_token('x'));
-  g_check_followset_present(g, c, char_token('y'));
+  g_check_followset_absent(1, g, c, "$");
+  g_check_followset_absent(1, g, c, "x");
+  g_check_followset_present(1, g, c, "y");
 }
 
 void register_grammar_tests(void) {
diff --git a/src/test_suite.h b/src/test_suite.h
index 7c2834f5..35c414a1 100644
--- a/src/test_suite.h
+++ b/src/test_suite.h
@@ -84,6 +84,11 @@
   } while(0)
 
 #define g_check_parse_failed(parser, input, inp_len) do {		\
+    int skip = h_compile((HParser *)(parser), PB_LLk, NULL); \
+    if(skip != 0) {	\
+      g_test_message("Backend not applicable, skipping test");	\
+      break;	\
+    }	\
     const HParseResult *result = h_parse(parser, (const uint8_t*)input, inp_len); \
     if (NULL != result) {						\
       g_test_message("Check failed: shouldn't have succeeded, but did"); \
@@ -92,6 +97,11 @@
   } while(0)
 
 #define g_check_parse_ok(parser, input, inp_len, result) do {		\
+    int skip = h_compile((HParser *)(parser), PB_LLk, NULL); \
+    if(skip) {	\
+      g_test_message("Backend not applicable, skipping test");	\
+      break;	\
+    }	\
     HParseResult *res = h_parse(parser, (const uint8_t*)input, inp_len); \
     if (!res) {								\
       g_test_message("Parse failed on line %d", __LINE__);		\
@@ -134,6 +144,23 @@
     }									\
   } while(0)
 
+#define g_check_stringmap_present(table, key) do {			\
+    bool end = (key[strlen(key)-1] == '$');				\
+    if(!h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) {	\
+      g_test_message("Check failed: \"%s\" should have been in map, but wasn't", key); \
+      g_test_fail();							\
+    }									\
+  } while(0)
+
+#define g_check_stringmap_absent(table, key) do {			\
+    bool end = (key[strlen(key)-1] == '$'); /* trailing '$' = end of input */ \
+    if(h_stringmap_present(table, (uint8_t *)key, strlen(key), end)) {	\
+      g_test_message("Check failed: \"%s\" shouldn't have been in map, but was", key); \
+      g_test_fail();							\
+    }									\
+  } while(0)
+
+
 #define g_check_terminal(grammar, parser) \
   g_check_hashtable_absent(grammar->nts, h_desugar(&system_allocator, parser))
 
@@ -146,17 +173,17 @@
 #define g_check_derives_epsilon_not(grammar, parser) \
   g_check_hashtable_absent(grammar->geneps, h_desugar(&system_allocator, parser))
 
-#define g_check_firstset_present(grammar, parser, token) \
-  g_check_hashtable_present(h_first_symbol(grammar, h_desugar(&system_allocator, parser)), (void *)token)
+#define g_check_firstset_present(k, grammar, parser, str) \
+  g_check_stringmap_present(h_first(k, grammar, h_desugar(&system_allocator, parser)), str)
 
-#define g_check_firstset_absent(grammar, parser, token) \
-  g_check_hashtable_absent(h_first_symbol(grammar, h_desugar(&system_allocator, parser)), (void *)token)
+#define g_check_firstset_absent(k, grammar, parser, str) \
+  g_check_stringmap_absent(h_first(k, grammar, h_desugar(&system_allocator, parser)), str)
 
-#define g_check_followset_present(grammar, parser, token) \
-  g_check_hashtable_present(h_follow(grammar, h_desugar(&system_allocator, parser)), (void *)token)
+#define g_check_followset_present(k, grammar, parser, str) \
+  g_check_stringmap_present(h_follow(k, grammar, h_desugar(&system_allocator, parser)), str)
 
-#define g_check_followset_absent(grammar, parser, token) \
-  g_check_hashtable_absent(h_follow(grammar, h_desugar(&system_allocator, parser)), (void *)token)
+#define g_check_followset_absent(k, grammar, parser, str) \
+  g_check_stringmap_absent(h_follow(k, grammar, h_desugar(&system_allocator, parser)), str)
 
 
 
-- 
GitLab