From e8101f746883cdbf16afbe32776d658f1575115b Mon Sep 17 00:00:00 2001 From: Kia <kia@special-circumstanc.es> Date: Sat, 15 Aug 2020 20:51:14 -0600 Subject: [PATCH] start implementing the McKenzie paper, because we actually do need something like it --- cfg_utils.py | 52 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/cfg_utils.py b/cfg_utils.py index 96d9475..a7fe0a8 100644 --- a/cfg_utils.py +++ b/cfg_utils.py @@ -244,27 +244,55 @@ if __name__ == '__main__': # Actual Boltzmann sampling goes here. # We use the "sandwich" technique in https://notebook.drmaciver.com/posts/2020-07-11-10:49.html -# and https://notebook.drmaciver.com/posts/2020-07-06-09:52.html +# to estimate the Boltzmann distribution generating function. However, to count the +# words of length n in the grammar, we use the technique described in +# +# "Generating Strings at Random from a Context Free Grammar" by Bruce McKenzie, 1997 + +# The notation below therefore follows that paper: + +# The paper uses the notation ||x||_n to mean "the number of strings of length n generated by +# the symbol x", but since this is text and not LaTeX, we use the notation "||x||==n" to mean +# the same. + +# Nonterminals are indexed by i, i = [1, number of nonterminals] +# Each nonterminal may have multiple ways of generating it: for each nonterminal i, +# there may be multiple production rules that create it. For each i, the index j ranges over +# these rules. + +# Production rules are thus indexed by: +# (i = [1, number of nonterminals], j = [1, number of ways to generate nonterminal i]). + +# Each production rule is represented by α_ij = x_ij1, x_ij2, ..., x_ijT, with T=T_ij +# representing the number of symbols in the production rule (i, j). + +# We have two functions, and we use memoization in order to evaluate them without +# pathological cost. 
+ + +# The first function, f(i, N), which we will call Fzero, represents the number +# of strings with length exactly N that can be generated by the expansion of +# nonterminal number i. It is a list of integers, with one element for each +# production rule that generates nonterminal i + +# f(i, N) = [||α_ij||==N for j = [1, number of production rules for nonterminal i]] + + +# The second function, f'(i, j, k, N), which we will call Fprim, represents the +# number of strings with length exactly N that can be generated by the +# last k symbols of the specific production rule (i, j): + +# f'(i, j, k, N) + -# Here, c(n) represents the number of total strings in the language that are of length exactly n -# C(State_0, K) represents the *number* of strings in the language that can be generated from -# state State_0 with length exactly K. -# R(State_0, K) represents the TOTAL number of strings with lengths (0,1,...,K) that can -# be generated starting from state State_0: -# n = K -# _____ -# \ -# R(State_0, K) = / C(State_0, n) -# ----- -# n = 0 -- GitLab