diff --git a/src/Makefile b/src/Makefile
index 070e7a81572af44d8dfca05e15c292db4a9ca51b..834a834c27d92b8362228c130fcb363af97af399 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -40,6 +40,7 @@ HAMMER_PARTS := \
 	system_allocator.o \
 	benchmark.o \
 	compile.o \
+	cfgrammar.o \
 	$(PARSERS:%=parsers/%.o) \
 	$(BACKENDS:%=backends/%.o)
 
diff --git a/src/backends/ll.c b/src/backends/ll.c
index fc7e9d5ff1e976d222a8f0fcb03644d1241e8159..591bc2dca099341e2755b75d235d6aa9626a3fb8 100644
--- a/src/backends/ll.c
+++ b/src/backends/ll.c
@@ -1,560 +1,10 @@
 #include <assert.h>
-#include <ctype.h>
 #include "../internal.h"
+#include "../cfgrammar.h"
 #include "../parsers/parser_internal.h"
 
 
 
-/* Grammar representation and analysis */
-
-typedef struct HCFGrammar_ {
-  HCFChoice   *start;   // start symbol (nonterminal)
-  HHashSet    *nts;     // HCFChoices, each representing the alternative
-                        // productions for one nonterminal
-  HHashSet    *geneps;  // set of NTs that can generate the empty string
-  HHashTable  *first;   // memoized first sets of the grammar's symbols
-  HHashTable  *follow;  // memoized follow sets of the grammar's NTs
-  HArena      *arena;
-} HCFGrammar;
-
-// mapping input bytes or end to tokens
-// we want to use these, cast to void *, as elements in hashsets
-// therefore we must avoid 0 as a token value because NULL means "not in set".
-typedef uintptr_t HCFToken;
-static inline HCFToken char_token(char c) { return (0x100 | c); }
-static inline char token_char(HCFToken t) { return (0xFF & t); }
-static HCFToken end_token = 0x200;
-
-bool h_eq_ptr(const void *p, const void *q) { return (p==q); }
-HHashValue h_hash_ptr(const void *p) { return (uintptr_t)p; }
-
-HCFGrammar *h_cfgrammar_new(HAllocator *mm__)
-{
-  HCFGrammar *g = h_new(HCFGrammar, 1);
-  assert(g != NULL);
-
-  g->arena  = h_new_arena(mm__, 0);     // default blocksize
-  g->nts    = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-  g->geneps = NULL;
-  g->first  = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
-  g->follow = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
-
-  return g;
-}
-
-
-// helper
-static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol);
-
-/* Convert 'parser' into CFG representation by desugaring and compiling the set
- * of nonterminals.
- * A NULL return means we are unable to represent the parser as a CFG.
- */
-HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser)
-{
-  // convert parser to CFG form ("desugar").
-  HCFChoice *desugared = h_desugar(mm__, parser);
-  if(desugared == NULL)
-    return NULL;  // -> backend not suitable for this parser
-
-  HCFGrammar *g = h_cfgrammar_new(mm__);
-
-  // recursively traverse the desugared form and collect all HCFChoices that
-  // represent a nonterminal (type HCF_CHOICE or HCF_CHARSET).
-  collect_nts(g, desugared);
-  if(h_hashset_empty(g->nts)) {
-    // desugared is a terminal. wrap it in a singleton HCF_CHOICE.
-    HCFChoice *nt = h_new(HCFChoice, 1);
-    nt->type = HCF_CHOICE;
-    nt->seq = h_new(HCFSequence *, 2);
-    nt->seq[0] = h_new(HCFSequence, 1);
-    nt->seq[0]->items = h_new(HCFChoice *, 2);
-    nt->seq[0]->items[0] = desugared;
-    nt->seq[0]->items[1] = NULL;
-    nt->seq[1] = NULL;
-    h_hashset_put(g->nts, nt);
-    g->start = nt;
-  } else {
-    g->start = desugared;
-  }
-
-  // XXX call collect_geneps here?
-
-  return g;
-}
-
-/* Add all nonterminals reachable from symbol to grammar. */
-static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol)
-{
-  HCFSequence **s;  // for the rhs (sentential form) of a production
-  HCFChoice **x;    // for a symbol in s
-
-  if(h_hashset_present(grammar->nts, symbol))
-    return;     // already visited, get out
-
-  switch(symbol->type) {
-  case HCF_CHAR:
-  case HCF_END:
-    break;  // it's a terminal symbol, nothing to do
-
-  case HCF_CHARSET:
-  case HCF_CHOICE:
-    // exploiting the fact that HHashSet is also a HHashTable to number the
-    // nonterminals.
-    // NB top-level (start) symbol gets 0.
-    h_hashtable_put(grammar->nts, symbol,
-                    (void *)(uintptr_t)grammar->nts->used);
-
-    if(symbol->type == HCF_CHOICE) {
-      // each element s of symbol->seq (HCFSequence) represents the RHS of
-      // a production. call self on all symbols (HCFChoice) in s.
-      for(s = symbol->seq; *s != NULL; s++) {
-        for(x = (*s)->items; *x != NULL; x++) {
-          collect_nts(grammar, *x);
-        }
-      }
-    }
-    break;
-
-  default:  // should not be reachable
-    assert_message(0, "unknown HCFChoice type");
-  }
-}
-
-
-// helper
-static void collect_geneps(HCFGrammar *grammar);
-
-/* Does the given symbol derive the empty string (under g)? */
-bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
-{
-  if(g->geneps == NULL)
-    collect_geneps(g);
-  assert(g->geneps != NULL);
-
-  switch(symbol->type) {
-  case HCF_END:       // the end token doesn't count as empty
-  case HCF_CHAR:
-  case HCF_CHARSET:
-    return false;
-  default:  // HCF_CHOICE
-    return h_hashset_present(g->geneps, symbol);
-  }
-}
-
-/* Does the sentential form s derive the empty string? s NULL-terminated. */
-bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s)
-{
-  // return true iff all symbols in s derive epsilon
-  for(; *s; s++) {
-    if(!h_symbol_derives_epsilon(g, *s))
-      return false;
-  }
-  return true;
-}
-
-/* Populate the geneps member of g; no-op if called multiple times. */
-static void collect_geneps(HCFGrammar *g)
-{
-  if(g->geneps != NULL)
-    return;
-
-  g->geneps = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-  assert(g->geneps != NULL);
-
-  // iterate over the grammar's symbols, the elements of g->nts.
-  // add any we can identify as deriving epsilon to g->geneps.
-  // repeat until g->geneps no longer changes.
-  size_t prevused;
-  do {
-    prevused = g->geneps->used;
-    size_t i;
-    HHashTableEntry *hte;
-    for(i=0; i < g->nts->capacity; i++) {
-      for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
-        if(hte->key == NULL)
-          continue;
-        const HCFChoice *symbol = hte->key;
-
-        // only "choice" nonterminals can derive epsilon.
-        if(symbol->type != HCF_CHOICE)
-          continue;
-
-        // this NT derives epsilon if any of its productions does.
-        HCFSequence **p;
-        for(p = symbol->seq; *p != NULL; p++) {
-          if(h_sequence_derives_epsilon(g, (*p)->items)) {
-            h_hashset_put(g->geneps, symbol);
-            break;
-          }
-        }
-      }
-    }
-  } while(g->geneps->used != prevused);
-}
-
-
-/* Compute first set of sentential form s. s NULL-terminated. */
-HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
-
-/* Compute first set of symbol x. Memoized. */
-HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x)
-{
-  HHashSet *ret;
-  HCFSequence **p;
-  uint8_t c;
-
-  // memoize via g->first
-  assert(g->first != NULL);
-  ret = h_hashtable_get(g->first, x);
-  if(ret != NULL)
-    return ret;
-  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-  assert(ret != NULL);
-  h_hashtable_put(g->first, x, ret);
-
-  switch(x->type) {
-  case HCF_END:
-    h_hashset_put(ret, (void *)end_token);
-    break;
-  case HCF_CHAR:
-    h_hashset_put(ret, (void *)char_token(x->chr));
-    break;
-  case HCF_CHARSET:
-    c=0;
-    do {
-      if(charset_isset(x->charset, c))
-        h_hashset_put(ret, (void *)char_token(c));
-    } while(c++ < 255);
-    break;
-  case HCF_CHOICE:
-    // this is a nonterminal
-    // return the union of the first sets of all productions
-    for(p=x->seq; *p; ++p)
-      h_hashset_put_all(ret, h_first_sequence(g, (*p)->items));
-    break;
-  default:  // should not be reached
-    assert_message(0, "unknown HCFChoice type");
-  }
-  
-  return ret;
-}
-
-HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s)
-{
-  // the first set of the empty sequence is empty
-  if(*s == NULL)
-    return h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-
-  // first(X tail) = first(X)                if X does not derive epsilon
-  //               = first(X) u first(tail)  otherwise
-
-  HCFChoice *x = s[0];
-  HCFChoice **tail = s+1;
-
-  HHashSet *first_x = h_first_symbol(g, x);
-  if(h_symbol_derives_epsilon(g, x)) {
-    // return the union of first(x) and first(tail)
-    HHashSet *first_tail = h_first_sequence(g, tail);
-    HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-    h_hashset_put_all(ret, first_x);
-    h_hashset_put_all(ret, first_tail);
-    return ret;
-  } else {
-    return first_x;
-  }
-}
-
-
-/* Compute follow set of symbol x. Memoized. */
-HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
-{
-  // consider all occurances of X in g
-  // the follow set of X is the union of:
-  //   given a production "A -> alpha X tail":
-  //   if tail derives epsilon:
-  //     first(tail) u follow(A)
-  //   else:
-  //     first(tail)
-
-  HHashSet *ret;
-
-  // memoize via g->follow
-  assert(g->follow != NULL);
-  ret = h_hashtable_get(g->follow, x);
-  if(ret != NULL)
-    return ret;
-  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-  assert(ret != NULL);
-  h_hashtable_put(g->follow, x, ret);
-
-  // iterate over g->nts
-  size_t i;
-  HHashTableEntry *hte;
-  for(i=0; i < g->nts->capacity; i++) {
-    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
-      if(hte->key == NULL)
-        continue;
-      const HCFChoice *a = hte->key;        // production's left-hand symbol
-
-      // X can only occur in a proper HCF_CHOICE
-      if(a->type != HCF_CHOICE) continue;
-
-      // iterate over the productions for A
-      HCFSequence **p;
-      for(p=a->seq; *p; p++) {
-        HCFChoice **s = (*p)->items;        // production's right-hand side
-        
-        for(; *s; s++) {
-          if(*s == x) { // occurance found
-            HCFChoice **tail = s+1;
-
-            h_hashset_put_all(ret, h_first_sequence(g, tail));
-            if(h_sequence_derives_epsilon(g, tail))
-              h_hashset_put_all(ret, h_follow(g, a));
-          }
-        }
-      }
-    }
-  }
-
-  return ret;
-}
-
-
-static void pprint_char(FILE *f, char c)
-{
-  switch(c) {
-  case '"': fputs("\\\"", f); break;
-  case '\\': fputs("\\\\", f); break;
-  case '\b': fputs("\\b", f); break;
-  case '\t': fputs("\\t", f); break;
-  case '\n': fputs("\\n", f); break;
-  case '\r': fputs("\\r", f); break;
-  default:
-    if(isprint(c)) {
-      fputc(c, f);
-    } else {
-      fprintf(f, "\\x%.2X", c);
-    }
-  }
-}
-
-static void pprint_charset_char(FILE *f, char c)
-{
-  switch(c) {
-  case '"': fputc(c, f); break;
-  case '-': fputs("\\-", f); break;
-  case ']': fputs("\\-", f); break;
-  default:  pprint_char(f, c);
-  }
-}
-
-static void pprint_charset(FILE *f, const HCharset cs)
-{
-  int i;
-
-  fputc('[', f);
-  for(i=0; i<256; i++) {
-    if(charset_isset(cs, i))
-      pprint_charset_char(f, i);
-
-    // detect ranges
-    if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) {
-      fputc('-', f);
-      for(; i<256 && charset_isset(cs, i); i++);
-      i--;  // back to the last in range
-      pprint_charset_char(f, i);
-    }
-  }
-  fputc(']', f);
-}
-
-static const char *nonterminal_name(const HCFGrammar *g, const HCFChoice *nt)
-{
-  static char buf[16] = {0}; // 14 characters in base 26 are enough for 64 bits
-
-  // find nt's number in g
-  size_t n = (uintptr_t)h_hashtable_get(g->nts, nt);
-
-  // NB the start symbol (number 0) is always "A".
-  int i;
-  for(i=14; i>=0 && (n>0 || i==14); i--) {
-    buf[i] = 'A' + n%26;
-    n = n/26;   // shift one digit
-  }
-
-  return buf+i+1;
-}
-
-static HCFChoice **pprint_string(FILE *f, HCFChoice **x)
-{
-  fputc('"', f);
-  for(; *x; x++) {
-    if((*x)->type != HCF_CHAR)
-      break;
-    pprint_char(f, (*x)->chr);
-  }
-  fputc('"', f);
-  return x;
-}
-
-static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x)
-{
-  switch(x->type) {
-  case HCF_CHAR:
-    fputc('"', f);
-    pprint_char(f, x->chr);
-    fputc('"', f);
-    break;
-  case HCF_END:
-    fputc('$', f);
-    break;
-  default:
-    fputs(nonterminal_name(g, x), f);
-  }
-}
-
-static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq)
-{
-  HCFChoice **x = seq->items;
-
-  if(*x == NULL) {  // the empty sequence
-    fputs(" \"\"", f);
-  } else {
-    while(*x) {
-      fputc(' ', f);      // separator
-
-      if((*x)->type == HCF_CHAR) {
-        // condense character strings
-        x = pprint_string(f, x);
-      } else {
-        pprint_symbol(f, g, *x);
-        x++;
-      }
-    }
-  }
-
-  fputc('\n', f);
-}
-
-static
-void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt,
-                    int indent, int len)
-{
-  int i;
-  int column = indent + len;
-
-  const char *name = nonterminal_name(g, nt);
-
-  // print rule head (symbol name)
-  for(i=0; i<indent; i++) fputc(' ', f);
-  fputs(name, f);
-  i += strlen(name);
-  for(; i<column; i++) fputc(' ', f);
-  fputs(" ->", f);
-
-  HCFSequence **p;
-  switch(nt->type) {
-  case HCF_CHARSET:
-    pprint_charset(f, nt->charset);
-    break;
-  case HCF_CHOICE:
-    p = nt->seq;
-    if(*p == NULL) break;           // shouldn't happen
-    pprint_sequence(f, g, *p++);    // print first production on the same line
-    for(; *p; p++) {                // print the rest below with "or" bars
-      for(i=0; i<column; i++) fputc(' ', f);    // indent
-      fputs("  |", f);
-      pprint_sequence(f, g, *p);
-    }
-    break;
-  default: // should not be reached
-    fputs(" ???\n", f);
-    assert_message(0, "unexpected nonterminal type");
-  }
-}
-
-void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent)
-{
-  if(g->nts->used < 1)
-    return;
-
-  // determine maximum string length of symbol names
-  int len;
-  size_t s;
-  for(len=1, s=26; s < g->nts->used; len++, s*=26); 
-
-  // iterate over g->nts
-  size_t i;
-  HHashTableEntry *hte;
-  for(i=0; i < g->nts->capacity; i++) {
-    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
-      if(hte->key == NULL)
-        continue;
-      const HCFChoice *a = hte->key;        // production's left-hand symbol
-
-      pprint_ntrules(file, g, a, indent, len);
-    }
-  }
-}
-
-void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
-{
-  int j;
-  for(j=0; j<indent; j++) fputc(' ', file);
-
-  fputc('{', file);
-
-  // iterate over set
-  size_t i;
-  HHashTableEntry *hte;
-  const HCFChoice *a = NULL;
-  for(i=0; i < set->capacity; i++) {
-    for(hte = &set->contents[i]; hte; hte = hte->next) {
-      if(hte->key == NULL)
-        continue;
-      if(a != NULL) // we're not on the first element
-          fputc(',', file);
-
-      a = hte->key;        // production's left-hand symbol
-
-      pprint_symbol(file, g, a);
-    }
-  }
-
-  fputs("}\n", file);
-}
-
-void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
-{
-  int j;
-  for(j=0; j<indent; j++) fputc(' ', file);
-
-  fputc('[', file);
-
-  // iterate over set
-  size_t i;
-  HHashTableEntry *hte;
-  for(i=0; i < set->capacity; i++) {
-    for(hte = &set->contents[i]; hte; hte = hte->next) {
-      if(hte->key == NULL)
-        continue;
-      HCFToken a = (HCFToken)hte->key;
-      
-      if(a == end_token)
-        fputc('$', file);
-      else if(token_char(a) == '$')
-        fputs("\\$", file);
-      else
-        pprint_char(file, token_char(a));
-    }
-  }
-
-  fputs("]\n", file);
-}
-
-
 /* LL parse table and associated data */
 
 typedef struct HLLTable_ {
@@ -630,7 +80,6 @@ int test_ll(void)
   }
 
   h_pprint_grammar(stdout, g, 0);
-  collect_geneps(g);
   printf("generate epsilon: ");
   h_pprint_symbolset(stdout, g, g->geneps, 0);
   printf("first(A) = ");
diff --git a/src/cfgrammar.c b/src/cfgrammar.c
new file mode 100644
index 0000000000000000000000000000000000000000..40e2efa53c9d2eff1bdb1caacdf4cdbf7e8d788d
--- /dev/null
+++ b/src/cfgrammar.c
@@ -0,0 +1,534 @@
+/* Context-free grammar representation and analysis */
+
+#include "cfgrammar.h"
+#include <assert.h>
+#include <ctype.h>
+
+
+bool h_eq_ptr(const void *p, const void *q) { return (p==q); }
+HHashValue h_hash_ptr(const void *p) { return (uintptr_t)p; }
+
+
+HCFGrammar *h_cfgrammar_new(HAllocator *mm__)
+{
+  HCFGrammar *g = h_new(HCFGrammar, 1);
+  assert(g != NULL);
+
+  g->arena  = h_new_arena(mm__, 0);     // default blocksize
+  g->nts    = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->geneps = NULL;
+  g->first  = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+  g->follow = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
+
+  return g;
+}
+
+
+// helpers
+static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol);
+static void collect_geneps(HCFGrammar *grammar);
+
+
+/* Convert 'parser' into CFG representation by desugaring and compiling the set
+ * of nonterminals.
+ * A NULL return means we are unable to represent the parser as a CFG.
+ */
+HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser)
+{
+  // convert parser to CFG form ("desugar").
+  HCFChoice *desugared = h_desugar(mm__, parser);
+  if(desugared == NULL)
+    return NULL;  // -> backend not suitable for this parser
+
+  HCFGrammar *g = h_cfgrammar_new(mm__);
+
+  // recursively traverse the desugared form and collect all HCFChoices that
+  // represent a nonterminal (type HCF_CHOICE or HCF_CHARSET).
+  collect_nts(g, desugared);
+  if(h_hashset_empty(g->nts)) {
+    // desugared is a terminal. wrap it in a singleton HCF_CHOICE.
+    HCFChoice *nt = h_new(HCFChoice, 1);
+    nt->type = HCF_CHOICE;
+    nt->seq = h_new(HCFSequence *, 2);
+    nt->seq[0] = h_new(HCFSequence, 1);
+    nt->seq[0]->items = h_new(HCFChoice *, 2);
+    nt->seq[0]->items[0] = desugared;
+    nt->seq[0]->items[1] = NULL;
+    nt->seq[1] = NULL;
+    h_hashset_put(g->nts, nt);
+    g->start = nt;
+  } else {
+    g->start = desugared;
+  }
+
+  // determine which nonterminals generate epsilon
+  collect_geneps(g);
+
+  return g;
+}
+
+/* Add all nonterminals reachable from symbol to grammar. */
+static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol)
+{
+  HCFSequence **s;  // for the rhs (sentential form) of a production
+  HCFChoice **x;    // for a symbol in s
+
+  if(h_hashset_present(grammar->nts, symbol))
+    return;     // already visited, get out
+
+  switch(symbol->type) {
+  case HCF_CHAR:
+  case HCF_END:
+    break;  // it's a terminal symbol, nothing to do
+
+  case HCF_CHARSET:
+  case HCF_CHOICE:
+    // exploiting the fact that HHashSet is also a HHashTable to number the
+    // nonterminals.
+    // NB top-level (start) symbol gets 0.
+    h_hashtable_put(grammar->nts, symbol,
+                    (void *)(uintptr_t)grammar->nts->used);
+
+    if(symbol->type == HCF_CHOICE) {
+      // each element s of symbol->seq (HCFSequence) represents the RHS of
+      // a production. call self on all symbols (HCFChoice) in s.
+      for(s = symbol->seq; *s != NULL; s++) {
+        for(x = (*s)->items; *x != NULL; x++) {
+          collect_nts(grammar, *x);
+        }
+      }
+    }
+    break;
+
+  default:  // should not be reachable
+    assert_message(0, "unknown HCFChoice type");
+  }
+}
+
+
+/* Does the given symbol derive the empty string (under g)? */
+bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
+{
+  assert(g->geneps != NULL);
+
+  switch(symbol->type) {
+  case HCF_END:       // the end token doesn't count as empty
+  case HCF_CHAR:
+  case HCF_CHARSET:
+    return false;
+  default:  // HCF_CHOICE
+    return h_hashset_present(g->geneps, symbol);
+  }
+}
+
+/* Does the sentential form s derive the empty string? s NULL-terminated. */
+bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s)
+{
+  // return true iff all symbols in s derive epsilon
+  for(; *s; s++) {
+    if(!h_symbol_derives_epsilon(g, *s))
+      return false;
+  }
+  return true;
+}
+
+/* Populate the geneps member of g; no-op if called multiple times. */
+static void collect_geneps(HCFGrammar *g)
+{
+  if(g->geneps != NULL)
+    return;
+
+  g->geneps = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  assert(g->geneps != NULL);
+
+  // iterate over the grammar's symbols, the elements of g->nts.
+  // add any we can identify as deriving epsilon to g->geneps.
+  // repeat until g->geneps no longer changes.
+  size_t prevused;
+  do {
+    prevused = g->geneps->used;
+    size_t i;
+    HHashTableEntry *hte;
+    for(i=0; i < g->nts->capacity; i++) {
+      for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
+        if(hte->key == NULL)
+          continue;
+        const HCFChoice *symbol = hte->key;
+
+        // only "choice" nonterminals can derive epsilon.
+        if(symbol->type != HCF_CHOICE)
+          continue;
+
+        // this NT derives epsilon if any of its productions does.
+        HCFSequence **p;
+        for(p = symbol->seq; *p != NULL; p++) {
+          if(h_sequence_derives_epsilon(g, (*p)->items)) {
+            h_hashset_put(g->geneps, symbol);
+            break;
+          }
+        }
+      }
+    }
+  } while(g->geneps->used != prevused);
+}
+
+
+/* Compute first set of sentential form s. s NULL-terminated. */
+HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
+
+/* Compute first set of symbol x. Memoized. */
+HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x)
+{
+  HHashSet *ret;
+  HCFSequence **p;
+  uint8_t c;
+
+  // memoize via g->first
+  assert(g->first != NULL);
+  ret = h_hashtable_get(g->first, x);
+  if(ret != NULL)
+    return ret;
+  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  assert(ret != NULL);
+  h_hashtable_put(g->first, x, ret);
+
+  switch(x->type) {
+  case HCF_END:
+    h_hashset_put(ret, (void *)end_token);
+    break;
+  case HCF_CHAR:
+    h_hashset_put(ret, (void *)char_token(x->chr));
+    break;
+  case HCF_CHARSET:
+    c=0;
+    do {
+      if(charset_isset(x->charset, c))
+        h_hashset_put(ret, (void *)char_token(c));
+    } while(c++ < 255);
+    break;
+  case HCF_CHOICE:
+    // this is a nonterminal
+    // return the union of the first sets of all productions
+    for(p=x->seq; *p; ++p)
+      h_hashset_put_all(ret, h_first_sequence(g, (*p)->items));
+    break;
+  default:  // should not be reached
+    assert_message(0, "unknown HCFChoice type");
+  }
+  
+  return ret;
+}
+
+HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s)
+{
+  // the first set of the empty sequence is empty
+  if(*s == NULL)
+    return h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+
+  // first(X tail) = first(X)                if X does not derive epsilon
+  //               = first(X) u first(tail)  otherwise
+
+  HCFChoice *x = s[0];
+  HCFChoice **tail = s+1;
+
+  HHashSet *first_x = h_first_symbol(g, x);
+  if(h_symbol_derives_epsilon(g, x)) {
+    // return the union of first(x) and first(tail)
+    HHashSet *first_tail = h_first_sequence(g, tail);
+    HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+    h_hashset_put_all(ret, first_x);
+    h_hashset_put_all(ret, first_tail);
+    return ret;
+  } else {
+    return first_x;
+  }
+}
+
+
+/* Compute follow set of symbol x. Memoized. */
+HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
+{
+  // consider all occurances of X in g
+  // the follow set of X is the union of:
+  //   given a production "A -> alpha X tail":
+  //   if tail derives epsilon:
+  //     first(tail) u follow(A)
+  //   else:
+  //     first(tail)
+
+  HHashSet *ret;
+
+  // memoize via g->follow
+  assert(g->follow != NULL);
+  ret = h_hashtable_get(g->follow, x);
+  if(ret != NULL)
+    return ret;
+  ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+  assert(ret != NULL);
+  h_hashtable_put(g->follow, x, ret);
+
+  // iterate over g->nts
+  size_t i;
+  HHashTableEntry *hte;
+  for(i=0; i < g->nts->capacity; i++) {
+    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      const HCFChoice *a = hte->key;        // production's left-hand symbol
+
+      // X can only occur in a proper HCF_CHOICE
+      if(a->type != HCF_CHOICE) continue;
+
+      // iterate over the productions for A
+      HCFSequence **p;
+      for(p=a->seq; *p; p++) {
+        HCFChoice **s = (*p)->items;        // production's right-hand side
+        
+        for(; *s; s++) {
+          if(*s == x) { // occurance found
+            HCFChoice **tail = s+1;
+
+            h_hashset_put_all(ret, h_first_sequence(g, tail));
+            if(h_sequence_derives_epsilon(g, tail))
+              h_hashset_put_all(ret, h_follow(g, a));
+          }
+        }
+      }
+    }
+  }
+
+  return ret;
+}
+
+
+static void pprint_char(FILE *f, char c)
+{
+  switch(c) {
+  case '"': fputs("\\\"", f); break;
+  case '\\': fputs("\\\\", f); break;
+  case '\b': fputs("\\b", f); break;
+  case '\t': fputs("\\t", f); break;
+  case '\n': fputs("\\n", f); break;
+  case '\r': fputs("\\r", f); break;
+  default:
+    if(isprint(c)) {
+      fputc(c, f);
+    } else {
+      fprintf(f, "\\x%.2X", c);
+    }
+  }
+}
+
+static void pprint_charset_char(FILE *f, char c)
+{
+  switch(c) {
+  case '"': fputc(c, f); break;
+  case '-': fputs("\\-", f); break;
+  case ']': fputs("\\-", f); break;
+  default:  pprint_char(f, c);
+  }
+}
+
+static void pprint_charset(FILE *f, const HCharset cs)
+{
+  int i;
+
+  fputc('[', f);
+  for(i=0; i<256; i++) {
+    if(charset_isset(cs, i))
+      pprint_charset_char(f, i);
+
+    // detect ranges
+    if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) {
+      fputc('-', f);
+      for(; i<256 && charset_isset(cs, i); i++);
+      i--;  // back to the last in range
+      pprint_charset_char(f, i);
+    }
+  }
+  fputc(']', f);
+}
+
+static const char *nonterminal_name(const HCFGrammar *g, const HCFChoice *nt)
+{
+  static char buf[16] = {0}; // 14 characters in base 26 are enough for 64 bits
+
+  // find nt's number in g
+  size_t n = (uintptr_t)h_hashtable_get(g->nts, nt);
+
+  // NB the start symbol (number 0) is always "A".
+  int i;
+  for(i=14; i>=0 && (n>0 || i==14); i--) {
+    buf[i] = 'A' + n%26;
+    n = n/26;   // shift one digit
+  }
+
+  return buf+i+1;
+}
+
+static HCFChoice **pprint_string(FILE *f, HCFChoice **x)
+{
+  fputc('"', f);
+  for(; *x; x++) {
+    if((*x)->type != HCF_CHAR)
+      break;
+    pprint_char(f, (*x)->chr);
+  }
+  fputc('"', f);
+  return x;
+}
+
+static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x)
+{
+  switch(x->type) {
+  case HCF_CHAR:
+    fputc('"', f);
+    pprint_char(f, x->chr);
+    fputc('"', f);
+    break;
+  case HCF_END:
+    fputc('$', f);
+    break;
+  default:
+    fputs(nonterminal_name(g, x), f);
+  }
+}
+
+static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq)
+{
+  HCFChoice **x = seq->items;
+
+  if(*x == NULL) {  // the empty sequence
+    fputs(" \"\"", f);
+  } else {
+    while(*x) {
+      fputc(' ', f);      // separator
+
+      if((*x)->type == HCF_CHAR) {
+        // condense character strings
+        x = pprint_string(f, x);
+      } else {
+        pprint_symbol(f, g, *x);
+        x++;
+      }
+    }
+  }
+
+  fputc('\n', f);
+}
+
+static
+void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt,
+                    int indent, int len)
+{
+  int i;
+  int column = indent + len;
+
+  const char *name = nonterminal_name(g, nt);
+
+  // print rule head (symbol name)
+  for(i=0; i<indent; i++) fputc(' ', f);
+  fputs(name, f);
+  i += strlen(name);
+  for(; i<column; i++) fputc(' ', f);
+  fputs(" ->", f);
+
+  HCFSequence **p;
+  switch(nt->type) {
+  case HCF_CHARSET:
+    pprint_charset(f, nt->charset);
+    break;
+  case HCF_CHOICE:
+    p = nt->seq;
+    if(*p == NULL) break;           // shouldn't happen
+    pprint_sequence(f, g, *p++);    // print first production on the same line
+    for(; *p; p++) {                // print the rest below with "or" bars
+      for(i=0; i<column; i++) fputc(' ', f);    // indent
+      fputs("  |", f);
+      pprint_sequence(f, g, *p);
+    }
+    break;
+  default: // should not be reached
+    fputs(" ???\n", f);
+    assert_message(0, "unexpected nonterminal type");
+  }
+}
+
+void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent)
+{
+  if(g->nts->used < 1)
+    return;
+
+  // determine maximum string length of symbol names
+  int len;
+  size_t s;
+  for(len=1, s=26; s < g->nts->used; len++, s*=26); 
+
+  // iterate over g->nts
+  size_t i;
+  HHashTableEntry *hte;
+  for(i=0; i < g->nts->capacity; i++) {
+    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      const HCFChoice *a = hte->key;        // production's left-hand symbol
+
+      pprint_ntrules(file, g, a, indent, len);
+    }
+  }
+}
+
+void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
+{
+  int j;
+  for(j=0; j<indent; j++) fputc(' ', file);
+
+  fputc('{', file);
+
+  // iterate over set
+  size_t i;
+  HHashTableEntry *hte;
+  const HCFChoice *a = NULL;
+  for(i=0; i < set->capacity; i++) {
+    for(hte = &set->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      if(a != NULL) // we're not on the first element
+          fputc(',', file);
+
+      a = hte->key;        // production's left-hand symbol
+
+      pprint_symbol(file, g, a);
+    }
+  }
+
+  fputs("}\n", file);
+}
+
+void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
+{
+  int j;
+  for(j=0; j<indent; j++) fputc(' ', file);
+
+  fputc('[', file);
+
+  // iterate over set
+  size_t i;
+  HHashTableEntry *hte;
+  for(i=0; i < set->capacity; i++) {
+    for(hte = &set->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+      HCFToken a = (HCFToken)hte->key;
+      
+      if(a == end_token)
+        fputc('$', file);
+      else if(token_char(a) == '$')
+        fputs("\\$", file);
+      else
+        pprint_char(file, token_char(a));
+    }
+  }
+
+  fputs("]\n", file);
+}
diff --git a/src/cfgrammar.h b/src/cfgrammar.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e12ba07b87f03f2375cfabb37b198b14b4c3a85
--- /dev/null
+++ b/src/cfgrammar.h
@@ -0,0 +1,51 @@
+/* Context-free grammar representation and analysis */
+
+#include "internal.h"
+
+
+typedef struct HCFGrammar_ {
+  HCFChoice   *start;   // start symbol (nonterminal)
+  HHashSet    *nts;     // HCFChoices, each representing the alternative
+                        // productions for one nonterminal
+  HHashSet    *geneps;  // set of NTs that can generate the empty string
+  HHashTable  *first;   // memoized first sets of the grammar's symbols
+  HHashTable  *follow;  // memoized follow sets of the grammar's NTs
+  HArena      *arena;
+} HCFGrammar;
+
+/* mapping input bytes or end to tokens
+ * we want to use these, cast to void *, as elements in hashsets
+ * therefore we must avoid 0 as a token value because NULL means "not in set".
+ */
+typedef uintptr_t HCFToken;
+static inline HCFToken char_token(char c) { return (0x100 | c); }
+static inline char token_char(HCFToken t) { return (0xFF & t); }
+static const HCFToken end_token = 0x200;
+
+
+/* Convert 'parser' into CFG representation by desugaring and compiling the set
+ * of nonterminals.
+ * A NULL return means we are unable to represent the parser as a CFG.
+ */
+HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser);
+
+/* Does the given symbol derive the empty string (under g)? */
+bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol);
+
+/* Does the sentential form s derive the empty string? s NULL-terminated. */
+bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s);
+
+/* Compute first set of sentential form s. s NULL-terminated. */
+HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
+
+/* Compute first set of symbol x. Memoized. */
+HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x);
+
+/* Compute follow set of symbol x. Memoized. */
+HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x);
+
+
+/* Pretty-printers for grammars and associated data. */
+void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent);
+void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent);
+void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent);