From e8101f746883cdbf16afbe32776d658f1575115b Mon Sep 17 00:00:00 2001
From: Kia <kia@special-circumstanc.es>
Date: Sat, 15 Aug 2020 20:51:14 -0600
Subject: [PATCH] start implementing the McKenzie paper, because we actually do
 need something like it

---
 cfg_utils.py | 52 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/cfg_utils.py b/cfg_utils.py
index 96d9475..a7fe0a8 100644
--- a/cfg_utils.py
+++ b/cfg_utils.py
@@ -244,27 +244,55 @@ if __name__ == '__main__':
 # Actual Boltzmann sampling goes here.
 
 # We use the "sandwich" technique in https://notebook.drmaciver.com/posts/2020-07-11-10:49.html
-# and https://notebook.drmaciver.com/posts/2020-07-06-09:52.html
+# to estimate the Boltzmann distribution generating function. However, to count the number of
+# words in the grammar with length n, we use the technique described in
+#
+# "Generating Strings at Random from a Context Free Grammar" by Bruce McKenzie, 1997
+
+# The notation below therefore follows that paper:
+
+# The paper uses the notation ||x||_n to mean "the number of strings of length n generated by
+# the symbol x". Since this is plain text and not LaTeX, we write "||x||==n" to mean
+# the same thing.
+
+# Nonterminals are indexed by i, with i ranging over [1, number of nonterminals].
+# Each nonterminal may be generated by more than one production rule, so for
+# each nonterminal i we use a second index j to number the rules that
+# generate it.
+
+# Production rules are thus indexed by:
+# (i = [1, number of nonterminals], j = [1, number of ways to generate nonterminal i]).
+
+# Each production rule is represented by α_ij = x_ij1, x_ij2, ..., x_ijT, with T=T_ij
+# representing the number of symbols in production rule (i, j)
+
+# We have two functions, and we use memoization in order to evaluate them without
+# pathological cost.
+
+
+# The first function, f(i, N), which we will call Fzero, counts the strings
+# of length exactly N that can be generated by the expansion of nonterminal
+# number i. It is a list of integers, with one element (the count for that
+# rule) for each production rule that generates nonterminal i:
+
+# f(i, N) = [||α_ij||==N for j = [1, number of production rules for nonterminal i]]
+
+
+# The second function, f'(i, j, k, N), which we will call Fprim, represents the
+# number of strings with length exactly N that can be generated by the
+# last k symbols of the specific production rule (i, j):
+
+# f'(i, j, k, N) 
+
 
 
 
 
-# Here, c(n) represents the number of total strings in the language that are of length exactly n
 
 
-# C(State_0, K) represents the *number* of strings in the language that can be generated from
-# state State_0 with length exactly K.
 
 
-# R(State_0, K) represents the TOTAL number of strings with lengths (0,1,...,K) that can
-# be generated starting from state State_0:
 
-#                    n = K
-#                   _____
-#                   \
-# R(State_0, K) =   /      C(State_0, n)
-#                   -----
-#                   n = 0
 
 
 
-- 
GitLab