From 5e3c681dbc0e9e114a0edf5f37a0048ff39f74d6 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Thu, 23 May 2013 21:01:37 +0200
Subject: [PATCH] generalize most of llk.c to arbitrary k (ex. h_predict) -
 still bugged

---
 src/backends/llk.c | 194 ++++++++++++++++++++++++++++++++-------------
 src/cfgrammar.c    |  28 ++++++-
 src/cfgrammar.h    |   4 +
 3 files changed, 169 insertions(+), 57 deletions(-)

diff --git a/src/backends/llk.c b/src/backends/llk.c
index 3337ebcc..bef5aa62 100644
--- a/src/backends/llk.c
+++ b/src/backends/llk.c
@@ -3,17 +3,16 @@
 #include "../cfgrammar.h"
 #include "../parsers/parser_internal.h"
 
-// XXX despite the names, this is all LL(1) right now. TODO
-
 
 /* Generating the LL(k) parse table */
 
-/* Maps each nonterminal (HCFChoice) of the grammar to another hash table that
- * maps lookahead tokens (HCFToken) to productions (HCFSequence).
+/* Maps each nonterminal (HCFChoice) of the grammar to a HCFStringMap that
+ * maps lookahead strings to productions (HCFSequence).
  */
 typedef struct HLLkTable_ {
   HHashTable *rows;
   HCFChoice  *start;    // start symbol
+  size_t     k;         // lookahead depth XXX needed?
   HArena     *arena;
   HAllocator *mm__;
 } HLLkTable;
@@ -28,20 +27,34 @@ static const HCFToken end_token = 0x200;
 const HCFSequence *h_llk_lookup(const HLLkTable *table, const HCFChoice *x,
                                 HInputStream lookahead)
 {
-  // note the lookahead stream is passed by value, i.e. a copy.
-  // reading bits from it does not consume them from the real input.
-  HCFToken tok;
-  uint8_t c = h_read_bits(&lookahead, 8, false);
-  if(lookahead.overrun)
-    tok = end_token;
-  else
-    tok = char_token(c);
-
-  const HHashTable *row = h_hashtable_get(table->rows, x);
+  const HCFStringMap *row = h_hashtable_get(table->rows, x);
   assert(row != NULL);  // the table should have one row for each nonterminal
 
-  const HCFSequence *production = h_hashtable_get(row, (void *)tok);
-  return production;
+  assert(!row->epsilon_branch); // would match without looking at the input
+                                // XXX cases where this could be useful?
+
+  const HCFStringMap *m = row;
+  while(m) {
+    if(m->epsilon_branch) {     // input matched
+      // assert: another lookahead would not bring a more specific match.
+      //         this is for the table generator to ensure.
+      return m->epsilon_branch;
+    }
+
+    // note the lookahead stream is passed by value, i.e. a copy.
+    // reading bits from it does not consume them from the real input.
+    uint8_t c = h_read_bits(&lookahead, 8, false);
+    
+    if(lookahead.overrun) {     // end of input
+      // XXX assumption of byte-wise grammar and input
+      return m->end_branch;
+    }
+
+    // no match yet, descend
+    m = h_stringmap_get_char(m, c);
+  }
+
+  return NULL;
 }
 
 /* Allocate a new parse table. */
@@ -72,58 +85,126 @@ void h_llktable_free(HLLkTable *table)
   h_free(table);
 }
 
-/* Compute the predict set of production "A -> rhs". */
-HHashSet *h_predict(HCFGrammar *g, const HCFChoice *A, const HCFSequence *rhs)
+/* Compute the predict_k set of production "A -> rhs".
+ * Always returns a newly-allocated HCFStringMap.
+ */
+HCFStringMap *h_predict(size_t k, HCFGrammar *g,
+                        const HCFChoice *A, const HCFSequence *rhs)
 {
+  assert(k==1); // XXX
+  HCFStringMap *ret = h_stringmap_new(g->arena);
+
   // predict(A -> rhs) = first(rhs) u follow(A)  if "" can be derived from rhs
   // predict(A -> rhs) = first(rhs)              otherwise
-  const HCFStringMap *first_rhs = h_first_seq(1, g, rhs->items);
-  const HCFStringMap *follow_A = h_follow(1, g, A);
-  HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
-
-  h_hashset_put_all(ret, first_rhs->char_branches);
-  if(first_rhs->end_branch)
-    h_hashset_put(ret, (void *)end_token);
-
-  if(h_derives_epsilon_seq(g, rhs->items)) {
-    h_hashset_put_all(ret, follow_A->char_branches);
-    if(follow_A->end_branch)
-      h_hashset_put(ret, (void *)end_token);
-  }
+
+  h_stringmap_update(ret, h_first_seq(k, g, rhs->items));
+  if(h_derives_epsilon_seq(g, rhs->items))
+    h_stringmap_update(ret, h_follow(k, g, A));
+
+  // make sure there are only strings of length _exactly_ k
+  ret->epsilon_branch = NULL;
 
   return ret;
 }
 
-/* Generate entries for the production "A -> rhs" in the given table row. */
-static
-int fill_table_row(HCFGrammar *g, HHashTable *row,
-                   const HCFChoice *A, HCFSequence *rhs)
+void *const CONFLICT = (void *)(uintptr_t)(-1);
+// collects every value that took part in a collision (see combine_table_entry)
+static HHashSet *cte_workset; // emulating a closure
+static void *combine_table_entry(void *dst, const void *src)
 {
-  // iterate over predict(A -> rhs)
-  HHashSet *pred = h_predict(g, A, rhs);
+  if(dst == CONFLICT) {                 // previous conflict
+    h_hashset_put(cte_workset, src);
+  } else if(dst != src) {               // new conflict
+    h_hashset_put(cte_workset, dst);
+    h_hashset_put(cte_workset, src);
+    dst = CONFLICT;
+  }
+  return dst;   // unchanged if dst == src, otherwise the CONFLICT marker
+}
 
-  size_t i;
-  HHashTableEntry *hte;
-  for(i=0; i < pred->capacity; i++) {
-    for(hte = &pred->contents[i]; hte; hte = hte->next) {
+// add the mappings of src to dst, calling combine if there is a collision
+// note: submaps of src that are missing in dst are linked into dst, not copied!
+static void stringmap_merge(void *(*combine)(void *, const void *),
+                            HCFStringMap *dst, HCFStringMap *src)
+{
+  if(src->epsilon_branch) {
+    if(dst->epsilon_branch)
+      dst->epsilon_branch = combine(dst->epsilon_branch, src->epsilon_branch);
+    else
+      dst->epsilon_branch = src->epsilon_branch;
+  }
+
+  if(src->end_branch) {
+    if(dst->end_branch)
+      dst->end_branch = combine(dst->end_branch, src->end_branch);
+    else
+      dst->end_branch = src->end_branch;
+  }
+
+  // iterate over src->char_branches
+  const HHashTable *ht = src->char_branches;
+  for(size_t i=0; i < ht->capacity; i++) {
+    for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) {
       if(hte->key == NULL)
         continue;
-      HCFToken x = (uintptr_t)hte->key;
 
-      if(h_hashtable_present(row, (void *)x))
-        return -1;  // table would be ambiguous
+      HCharKey c = (HCharKey)hte->key;
+      HCFStringMap *src_ = hte->value;
 
-      h_hashtable_put(row, (void *)x, rhs);
+      if(src_) {
+        HCFStringMap *dst_ = h_hashtable_get(dst->char_branches, (void *)c);
+        if(dst_)
+          stringmap_merge(combine, dst_, src_);
+        else  // branch missing in dst: store src's submap under the same key
+          h_hashtable_put(dst->char_branches, (void *)c, src_);
+      }
     }
   }
+}
+
+/* Generate entries for the production "A -> rhs" in the given table row. */
+static int fill_production_entries(size_t k, HCFGrammar *g, HCFStringMap *row,
+                                   const HCFChoice *A, HCFSequence *rhs)
+{
+  // Returns 0 once some depth i<=k merges without conflict, -1 otherwise.
+  for(size_t i=1; i<=k; i++) {
+    HCFStringMap *pred = h_predict(i, g, A, rhs);  // NB: asserts i==1 for now
+    h_stringmap_replace(pred, NULL, rhs); // make all values in pred map to rhs
+
+    // clear previous conflict markers
+    h_stringmap_replace(row, CONFLICT, NULL);
+
+    // merge predict set into the row, accumulating conflicts in workset
+    cte_workset = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
+                                           // will be deleted after compile
+    stringmap_merge(combine_table_entry, row, pred);
+
+    // if the workset is empty, row is free of conflicts and we are done.
+    if(h_hashset_empty(cte_workset))
+      return 0;
+  }
+
+  // if we reach here, conflicts remain at maximum lookahead
+  return -1;
+}
+
+/* Generate entries for the production "A" in the given table row. */
+static int fill_table_row(size_t k, HCFGrammar *g, HCFStringMap *row,
+                          const HCFChoice *A)
+{
+  // iterate over A's productions
+  for(HCFSequence **s = A->seq; *s; s++) {
+    // record this production in row as appropriate
+    if(fill_production_entries(k, g, row, A, *s) < 0)
+      return -1;
+  }
   return 0;
 }
 
 /* Generate the LL(k) parse table from the given grammar.
  * Returns -1 on error, 0 on success.
  */
-static int fill_table(HCFGrammar *g, HLLkTable *table)
+static int fill_table(size_t k, HCFGrammar *g, HLLkTable *table)
 {
   table->start = g->start;
 
@@ -138,18 +219,14 @@ static int fill_table(HCFGrammar *g, HLLkTable *table)
       assert(a->type == HCF_CHOICE);
 
       // create table row for this nonterminal
-      HHashTable *row = h_hashtable_new(table->arena, h_eq_ptr, h_hash_ptr);
+      HCFStringMap *row = h_stringmap_new(table->arena);
       h_hashtable_put(table->rows, a, row);
 
-      // iterate over a's productions
-      HCFSequence **s;
-      for(s = a->seq; *s; s++) {
-        // record this production in row as appropriate
-        // this can signal an ambiguity conflict.
+      if(fill_table_row(k, g, row, a) < 0) {
+        // unresolvable conflicts in row
         // NB we don't worry about deallocating anything, h_llk_compile will
         //    delete the whole arena for us.
-        if(fill_table_row(g, row, a, *s) < 0)
-          return -1;
+        return -1;
       }
     }
   }
@@ -157,8 +234,13 @@ static int fill_table(HCFGrammar *g, HLLkTable *table)
   return 0;
 }
 
+static const size_t K_DEFAULT = 1;
+
 int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params)
 {
+  size_t k = params? (uintptr_t)params : K_DEFAULT;
+  assert(k>0);
+
   // Convert parser to a CFG. This can fail as indicated by a NULL return.
   HCFGrammar *grammar = h_cfgrammar(mm__, parser);
   if(grammar == NULL)
@@ -170,7 +252,7 @@ int h_llk_compile(HAllocator* mm__, HParser* parser, const void* params)
 
   // generate table and store in parser->backend_data.
   HLLkTable *table = h_llktable_new(mm__);
-  if(fill_table(grammar, table) < 0) {
+  if(fill_table(k, grammar, table) < 0) {
     // the table was ambiguous
     h_cfgrammar_free(grammar);
     h_llktable_free(table);
@@ -358,7 +440,7 @@ int test_llk(void)
   */
 
   HParser *X = h_optional(h_ch('x'));
-  HParser *Y = h_sequence(h_ch('y'), NULL);
+  HParser *Y = h_epsilon_p(); //h_sequence(h_ch('y'), NULL);
   HParser *A = h_sequence(X, Y, h_ch('a'), NULL);
   HParser *B = h_sequence(Y, h_ch('b'), NULL);
   HParser *p = h_choice(A, B, NULL);
diff --git a/src/cfgrammar.c b/src/cfgrammar.c
index 972cc4d4..b6941974 100644
--- a/src/cfgrammar.c
+++ b/src/cfgrammar.c
@@ -265,12 +265,38 @@ void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n)
   h_hashtable_merge(combine_stringmap, m->char_branches, n->char_branches);
 }
 
+/* Replace all occurrences of old in m with new.
+ * If old is NULL, replace every value present in m (absent ones stay absent).
+ * If new is NULL, remove the respective values.
+ */
+void h_stringmap_replace(HCFStringMap *m, void *old, void *new)
+{
+  if(m->epsilon_branch && (!old || m->epsilon_branch == old))
+    m->epsilon_branch = new;
+
+  if(m->end_branch && (!old || m->end_branch == old))
+    m->end_branch = new;
+
+  // recurse into every submap reachable through m->char_branches
+  const HHashTable *ht = m->char_branches;
+  for(size_t i=0; i < ht->capacity; i++) {
+    for(HHashTableEntry *hte = &ht->contents[i]; hte; hte = hte->next) {
+      if(hte->key == NULL)
+        continue;
+
+      HCFStringMap *m_ = hte->value;
+      if(m_)
+        h_stringmap_replace(m_, old, new);
+    }
+  }
+}
+
 void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end)
 {
   for(size_t i=0; i<n; i++) {
     if(i==n-1 && end && m->end_branch)
       return m->end_branch;
-    m = h_hashtable_get(m->char_branches, (void *)char_key(str[i]));
+    m = h_stringmap_get_char(m, str[i]);
     if(!m)
       return NULL;
   }
diff --git a/src/cfgrammar.h b/src/cfgrammar.h
index eb53b014..8dc4449a 100644
--- a/src/cfgrammar.h
+++ b/src/cfgrammar.h
@@ -45,10 +45,14 @@ void h_stringmap_put_epsilon(HCFStringMap *m, void *v);
 void h_stringmap_put_after(HCFStringMap *m, uint8_t c, HCFStringMap *ends);
 void h_stringmap_put_char(HCFStringMap *m, uint8_t c, void *v);
 void h_stringmap_update(HCFStringMap *m, const HCFStringMap *n);
+void h_stringmap_replace(HCFStringMap *m, void *old, void *new);
 void *h_stringmap_get(const HCFStringMap *m, const uint8_t *str, size_t n, bool end);
 bool h_stringmap_present(const HCFStringMap *m, const uint8_t *str, size_t n, bool end);
 bool h_stringmap_present_epsilon(const HCFStringMap *m);
 
+static inline void *h_stringmap_get_char(const HCFStringMap *m, const uint8_t c)
+ { return h_hashtable_get(m->char_branches, (void *)char_key(c)); }
+
 
 /* Convert 'parser' into CFG representation by desugaring and compiling the set
  * of nonterminals.
-- 
GitLab