diff --git a/cfg_utils.py b/cfg_utils.py index a7fe0a80763189fe8d8a46ad8928dce520355f96..5dab672c18d4b36e6a8e16690904d81abe0b4ff1 100644 --- a/cfg_utils.py +++ b/cfg_utils.py @@ -280,16 +280,50 @@ if __name__ == '__main__': # The second function, f'(i, j, k, N), which we will call Fprim, represents the # number of strings with length exactly N that can be generated by the -# last k symbols of the specific production rule (i, j): +# symbols from the k'th through the last of the specific production rule (i, j). -# f'(i, j, k, N) +# We define it as follows, which leads to a simple "peeling off" way of evaluating it +# by induction. It is a list, with one element for each way to "split" the N characters +# between the first symbol (the k'th symbol in the entire rule) in the relevant subset +# of the rule and the k+1'th symbol to the last symbol of the rule: +# f'(i, j, k, N) = [ (||x_ijk|| == L) * (||x_ij(k+1) x_ij(k+2)...x_ijT|| == N-L) for L = [1, N-T_ij+k]] +# The * is an ordinary multiplication, not a Kleene star. We note that this looks like +# a convolution. +# The number of "ways to split the length N" is N-T_ij+k because the *entire production rule* has +# T_ij symbols, so the smaller subset from k+1 to T_ij has T_ij-k symbols. Therefore, +# this smaller subset will *always* generate a list of terminals with length T_ij-k or +# greater (because we require that there be no epsilon-productions). +# To evaluate Fzero and Fprim, we define them in terms of each other, analyze cases, and +# use memoization: +# Fzero(i, N) = [sum(Fprim(i, j, 1, N)) for all possible j (j = [1, number of rules for nonterminal i])] +# There are two types of special cases for Fprim(i, j, k, N), and they can both occur at +# the same time. We use these special cases to evaluate Fprim efficiently: + +# Special case 1: k is T_ij, and thus x_ijk is the last symbol in the production rule. +# Special case 2: The symbol x_ijk is a terminal symbol. 
+ + +# /----------------------------------------------------------\ +# |X_ijk is a terminal symbol | X_ijk is a nonterminal symbol| +# |---------------------------|------------------------------| +# X_ijk is the last | Case A | Case C | +# symbol in production | | Reduces to analysis for the | +# rule (i,j) | Extremely easy base case | production rule for the | +# | | nonterminal @ X_ijk | +#------------------------|---------------------------|------------------------------| +# X_ijk is not the last | Case B | Case D | +# symbol in the | A terminal always has len | The difficult case. We must | +# production rule (i,j) | 1 and one possibility, so | evaluate the full convolution| +# | this reduces to | | +# | Fprim(i, j, k+1, N-1) | | +# \----------------------------------------------------------/