diff --git a/cfg_utils.py b/cfg_utils.py
index 0cd05ed45758961a0838031ed876305291ee381b..83d7c4c2a008b87501a9c7bd3d727a717ef648f0 100644
--- a/cfg_utils.py
+++ b/cfg_utils.py
@@ -1,3 +1,4 @@
+assert(1 == 0) # do not use me
 import collections
 import functools
 import random
@@ -5,22 +6,6 @@
 from enum import *
-class TreeNode:
-    def __init__(self, language_element, subnodes):
-        self.language_element = language_element
-        self.subnodes = subnodes
-
-    def __str__(self):
-        return str(self.language_element) + " " + str(self.subnodes)
-
-
-def walk_the_tree(tree, level = 1):
-    if tree == None:
-        return
-
-    print(" " * (level -1) + "|---" + str(tree.language_element) + " with " + str(len(tree.subnodes)) + " subnodes")
-    for subnode in tree.subnodes:
-        walk_the_tree(subnode, level + 1)
@@ -237,448 +222,3 @@ if __name__ == '__main__':
-
-
-
-
-# Actual Boltzmann sampling goes here.
-
-# We use the "sandwich" technique in https://notebook.drmaciver.com/posts/2020-07-11-10:49.html
-# to estimate the Boltzmann distribution generating function. However, to enumerate the number
-# of words in the grammar with length n, we use the technique described in:
-#
-# "Generating Strings at Random from a Context Free Grammar" by Bruce McKenzie, 1997
-
-# There is a more effective method in "Automating Grammar Comparison", where the authors
-# devise a slightly more advanced scheme that uses the Cantor pairing function to
-# encode the parse tree of each generated string in a single integer index. I do not think
-# I need to resort to such a technique just yet.
-
-# The notation below follows that paper:
-
-# The paper uses the notation ||X||_n to mean "the number of strings of length n generated by
-# the symbol X", but since this is plain text and not LaTeX, we use the notation "||X||==n" to
-# mean the same.
-
-# Nonterminals are indexed by i, i = [1, number of nonterminals].
-# Each nonterminal may have multiple ways of generating it: for each nonterminal i,
-# there may be multiple rules that produce it, and for each i there is an index j over
-# these ways.
-
-# Production rules are thus indexed by:
-# (i = [1, number of nonterminals], j = [1, number of ways to generate nonterminal i]).
-
-# Each production rule is represented by α_ij = X_ij1, X_ij2, ..., X_ijT, with T=T_ij
-# representing the number of symbols in the production rule (i, j).
-
-# We have two functions, and we use memoization in order to evaluate them without
-# pathological cost.
-
-
-# The first function, f(i, N), which we will call Fzero, represents the number
-# of strings with length exactly N that can be generated by the expansion of
-# nonterminal number i. It is a list of integers, with one element for each
-# production rule that generates nonterminal i:
-
-# f(i, N) = [||α_ij||==N for j = [1, number of production rules for nonterminal i]]
-
-
-# The second function, f'(i, j, k, N), which we will call Fprim, represents the
-# number of strings with length exactly N that can be generated by symbols k
-# through T_ij of the specific production rule (i, j), i.e., by the tail of the
-# rule starting at its k'th symbol.
-
-# We define it as follows, which leads to a simple "peeling off" way of evaluating it
-# by induction.
-# It is a list, with one element for each way to "split" the N characters between
-# the first symbol of that tail (the k'th symbol of the entire rule) and the rest
-# of the tail (the k+1'th symbol through the last symbol of the rule):
-
-# f'(i, j, k, N) = [ (||X_ijk|| == L) * (||X_ij(k+1) X_ij(k+2)...X_ijT|| == N-L) for L = [1, N-T_ij+k] ]
-
-# The * is an ordinary multiplication, not a Kleene star. We note that this looks like
-# a convolution.
-
-# The number of ways to split the length N is N-T_ij+k because the *entire production rule*
-# has T_ij symbols, so the smaller tail from k+1 to T_ij has T_ij-k symbols. Therefore,
-# this smaller tail will *always* generate a string of terminals with length T_ij-k or
-# greater (because we require that there be no epsilon-productions).
-
-
-# To evaluate Fzero and Fprim, we define them in terms of each other, analyze cases, and
-# use memoization:
-
-# Fzero(i, N) = [sum(Fprim(i, j, 1, N)) for all possible j (j = [1, number of rules for nonterminal i])]
-# We index arrays starting at zero, like everyone else, so the 1 is going to be a 0 in the code.
-
-# There are two types of special cases for Fprim(i, j, k, N), and they can both occur at
-# the same time. We use these special cases to evaluate Fprim efficiently:
-
-# Special case 1: k is T_ij, and thus X_ijk is the last symbol in the production rule.
-# Special case 2: The symbol X_ijk is a terminal symbol.
-
-# The exhaustive case analysis is as follows.
-
-#                        /-----------------------------------------------------------\
-#                        | X_ijk is a terminal symbol | X_ijk is a nonterminal symbol|
-#                        |----------------------------|------------------------------|
-# X_ijk is the last      | Case A                     | Case C                       |
-# symbol in production   |                            | Reduces to the analysis for  |
-# rule (i,j)             | Extremely easy base case   | the production rules of the  |
-#                        |                            | nonterminal @ X_ijk          |
-#------------------------|----------------------------|------------------------------|
-# X_ijk is not the last  | Case B                     | Case D                       |
-# symbol in the          | A terminal always has len  | The difficult case. We must  |
-# production rule (i,j)  | 1 and admits no choices,   | evaluate the full convolution|
-#                        | so this reduces to         | for both X_ijk and the       |
-#                        | Fprim(i, j, k+1, N-1)      | remaining tail of the rule   |
-#                        \-----------------------------------------------------------/
-
-# We also note that for the degenerate case N=0, we always can and must return the empty list [].
-
-# In pseudocode (taken from the paper, with the correction that at the top of page 5 the
-# "if N==0 then return [1]" seems like it should be "if N==1 then return [1]"):
-
-# Fprim(i, j, k, N):
-#     if N == 0: return []
-#     if X_ijk is a terminal:
-#         if k == T_ij:
-#             # CASE A
-#             if N == 1 then return [1] else return [0]
-#         if k != T_ij:
-#             # CASE B
-#             return [sum(Fprim(i, j, k+1, N-1))]
-#     else if X_ijk is a nonterminal:
-#         if k == T_ij:
-#             # CASE C
-#             return [sum(Fzero(index of the nonterminal at position k, N))]
-#         if k != T_ij:
-#             # CASE D
-#             return [sum(Fzero(index of the nonterminal at position k, L)) *
-#                     sum(Fprim(i, j, k+1, N-L)) for L = [1, N - T_ij + k]]
-
-
-# First -- after a quick sanity check of the recurrence on a toy grammar below -- we
-# preprocess the grammar by grouping all the rules for each nonterminal together.
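-
-# The following is a minimal sanity-check sketch of the recurrence, separate from the
-# class below. For the toy grammar S -> 'a' S | 'b', every string is a^(n-1) b, so there
-# is exactly one string of each length n >= 1, and the counts should reflect that. The
-# toy grammar and the helper names here (TOY_RULES, TOY_TERMINALS, toy_count,
-# toy_count_tail) are illustrative assumptions, not part of the rest of this module;
-# functools is already imported at the top of the file.
-
-TOY_RULES = {"S": [("a", "S"), ("b",)]}  # one nonterminal, two productions
-TOY_TERMINALS = {"a", "b"}
-
-@functools.lru_cache(maxsize=None)
-def toy_count(symbol, n):
-    # ||X||==n for a single symbol X: a terminal yields exactly one string, of length 1;
-    # a nonterminal sums over its productions, as Fzero does.
-    if symbol in TOY_TERMINALS:
-        return 1 if n == 1 else 0
-    return sum(toy_count_tail(rhs, n) for rhs in TOY_RULES[symbol])
-
-@functools.lru_cache(maxsize=None)
-def toy_count_tail(tail, n):
-    # ||X_k ... X_T||==n: peel off the first symbol of the tail, as Fprim does.
-    if len(tail) == 1:
-        return toy_count(tail[0], n)  # cases A and C
-    # Cases B and D collapse into one convolution over the split point L; the tail
-    # after the first symbol has len(tail)-1 symbols, hence the upper bound on L.
-    return sum(toy_count(tail[0], L) * toy_count_tail(tail[1:], n - L)
-               for L in range(1, n - len(tail) + 2))
-
-assert [toy_count("S", n) for n in range(1, 6)] == [1, 1, 1, 1, 1]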
-
-
-class CFGBoltzmann:
-    def __init__(self, rules, nonterminals, terminals):
-        self.unitary_rules = rules
-        self.nonterminals = nonterminals
-        self.terminals = terminals
-
-    def preprocessor(self):
-        unitary_rules = self.unitary_rules
-        factorized_rulepack = {}
-
-        for rule in unitary_rules:
-            nonterminal = rule[0]
-            rhs = rule[1]
-            if nonterminal not in factorized_rulepack:
-                factorized_rulepack[nonterminal] = [rhs]
-            else:
-                factorized_rulepack[nonterminal].append(rhs)
-
-        # Keep the nonterminals in a list whose order matches processed_rulepack,
-        # so that a nonterminal can be converted to an index and back.
-        rulepack_list = []
-        nonterminals_in_sync_with_rulepack = []
-        for x in factorized_rulepack:
-            rulepack_list.append((x, factorized_rulepack[x]))
-            nonterminals_in_sync_with_rulepack.append(x)
-        self.nonterminals_ordered = nonterminals_in_sync_with_rulepack
-
-        self.processed_rulepack = rulepack_list
-
-    # We recall that:
-
-    # Fzero(i, N) = [sum(Fprim(i, j, 1, N)) for all possible j (j = [1, number of rules for nonterminal i])]
-
-    @functools.lru_cache(maxsize=8192)
-    def Fzero(self, nonterminal_index, exact_length_total):
-        # First, find the nonterminal in the rulepack.
-        possible_RHSes = self.processed_rulepack[nonterminal_index][1]
-        assert possible_RHSes != []
-
-        retlist = []
-
-        for rhs_index in range(len(possible_RHSes)):
-            # We index arrays starting at zero, like everyone else, so the paper's k = 1 becomes 0 here.
-            retlist.append(sum(self.Fprim(nonterminal_index, rhs_index, 0, exact_length_total)))
-        return retlist
-
-
-    # Fprim is where the complicated case analysis lives.
-
-    @functools.lru_cache(maxsize=8192)
-    def Fprim(self, nonterminal_index, which_RHS, how_far_into_the_RHS, exact_length_total):
-        # First, handle the ultimate degenerate case:
-        if exact_length_total == 0:
-            return []
-
-        # The case analysis hinges on what is at X_ijk.
-
-        RHS_in_question = self.processed_rulepack[nonterminal_index][1][which_RHS]
-
-        xijk = RHS_in_question[how_far_into_the_RHS]
-
-        if xijk in self.terminals:
-            # Are we at the end of the rule?
-            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
-                # CASE A
-                if exact_length_total == 1:
-                    return [1]
-                else:
-                    return [0]
-            else:
-                # CASE B
-                reduct = self.Fprim(nonterminal_index, which_RHS, how_far_into_the_RHS + 1, exact_length_total - 1)
-                retlist = [sum(reduct)]
-                return retlist
-
-        else:
-            assert xijk in self.nonterminals
-            # We now calculate the index of the nonterminal at position k; we need it for both cases.
-            new_nonterminal_index = self.nonterminals_ordered.index(xijk)
-
-            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
-                # CASE C
-                reduct = self.Fzero(new_nonterminal_index, exact_length_total)
-                retlist = [sum(reduct)]
-                return retlist
-
-            else:
-                # CASE D
-                # Here we must do the full convolution to account for every
-                # possible way that the total length of the desired string can
-                # be split between the nonterminal at X_ijk and the remaining
-                # segment of the rule (from k+1 to the end of the rule).
-
-                # Here, L is the number of characters produced by X_ijk.
-                # L runs from 1 (we disallow epsilon-productions, so X_ijk must
-                # produce at least one character), INCLUSIVE, up to, INCLUSIVE:
-                #   total characters requested - (minimum number of characters generated by k+1 to end of rule)
-                # In the zero-indexed code, the tail from k+1 to the end of the rule has
-                # len(rule) - k - 1 symbols, and (with no epsilon-productions) it generates
-                # at least that many characters, so the upper bound is:
-                #   exact_length_total - len(RHS_in_question) + how_far_into_the_RHS + 1
-                retlist = []
-                for L in range(1, exact_length_total - len(RHS_in_question) + how_far_into_the_RHS + 2):
-                    nonterminal_possibilities = sum(self.Fzero(new_nonterminal_index, L))
-                    remainder_possibilities = sum(self.Fprim(nonterminal_index, which_RHS, how_far_into_the_RHS + 1, exact_length_total - L))
-                    retlist.append(nonterminal_possibilities * remainder_possibilities)
-                return retlist
-
-
-    # Now we do generation of the strings.
-
-    # This returns an *index* (not the item at the index) from a list, weighted
-    # by the integer at each element.
-
-    def normalized_choice(self, inlist):
-        # random.choices normalizes integer weights itself, so no manual normalization is needed.
-        choice = random.choices(range(len(inlist)), weights=inlist)
-        return choice[0]
-
-
-    # Similar to Fzero and Fprim, we have Gzero and Gprim.
-
-    # The first function, Gzero(nonterminal index, requested length), serves only to
-    # select (weighted appropriately) which production rule for the nonterminal will
-    # be used. It then calls out to Gprim, which actually generates the string.
-
-    def Gzero_shimmed(self, nonterminal, requested_length):
-        nonterminal_index = self.nonterminals_ordered.index(nonterminal)
-        return self.Gzero(nonterminal_index, requested_length, 0)
-
-    def Gzero(self, nonterminal_index, requested_length, depth):
-        print(" " * depth + "ENTERING Gzero")
-        possibilities = self.Fzero(nonterminal_index, requested_length)
-        chosen_production = self.normalized_choice(possibilities)
-        generated_string = self.Gprim(nonterminal_index, chosen_production, 0, requested_length, depth)
-
-        return generated_string
-
-    # Like Fprim, Gprim takes a nonterminal index, a production index for the nonterminal,
-    # and an index into the production rule (and, of course, a requested length).
-    # This lets us walk through the production rule symbol by symbol.
-
-    # As in Fprim, there is a case analysis based on whether the index points to the last
-    # symbol in the rule, and on whether it points to a terminal or to a nonterminal.
-
-    # There are two types of special cases for Gprim(i, j, k, N), and they can both occur
-    # at the same time. We use these special cases to evaluate Gprim efficiently:
-
-    # Special case 1: k is T_ij, and thus X_ijk is the last symbol in the production rule.
-    # Special case 2: The symbol X_ijk is a terminal symbol.
-
-    # The exhaustive case analysis is as follows.
-
-    #                        /-----------------------------------------------------------\
-    #                        | X_ijk is a terminal symbol | X_ijk is a nonterminal symbol|
-    #                        |----------------------------|------------------------------|
-    # X_ijk is the last      | Case A                     | Case C                       |
-    # symbol in production   |                            | Reduces to Gzero for the     |
-    # rule (i,j)             | The easiest base case.     | nonterminal @ X_ijk          |
-    #                        |                            |                              |
-    #------------------------|----------------------------|------------------------------|
-    # X_ijk is not the last  | Case B                     | Case D                       |
-    # symbol in the          | A terminal always has len  | Like the convolution, but use|
-    # production rule (i,j)  | 1 and admits no choices,   | Fprim and weighted-choice on |
-    #                        | so this reduces to         | [X_ijk, last symbol in the   |
-    #                        | Gprim(i, j, k+1, N-1)      | production rule] to find out |
-    #                        |                            | how many symbols we're having|
-    #                        |                            | X_ijk generate. Then we use  |
-    #                        |                            | Gzero on X_ijk and Gprim on  |
-    #                        |                            | (k+1) to end of rule         |
-    #                        \-----------------------------------------------------------/
-
-
-    def Gprim(self, nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total, depth):
-        print(" " * depth + "GPRIM arguments are", nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total)
-
-        # The degenerate case N == 0 must never be requested during generation:
-        assert exact_length_total != 0
-
-        # The case analysis hinges on what is at X_ijk.
-
-        RHS_in_question = self.processed_rulepack[nonterminal_index][1][chosen_production]
-        print(" " * depth + "Our RHS is", RHS_in_question)
-        print(" " * depth + "WE ARE PROCESSING index", how_far_into_the_RHS, "and our remaining length is", exact_length_total)
-
-        xijk = RHS_in_question[how_far_into_the_RHS]
-
-        if xijk in self.terminals:
-            # Are we at the end of the rule?
-            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
-                # CASE A
-                print(" " * depth + "GPRIM CASE A RETURNING", [xijk])
-                return [xijk]
-            else:
-                # CASE B
-                print(" " * depth + "GPRIM CASE B pointing @", [xijk])
-                reduct = self.Gprim(nonterminal_index, chosen_production, how_far_into_the_RHS + 1, exact_length_total - 1, depth + 1)
-                retstring = [xijk] + reduct
-                print(" " * depth + "GPRIM CASE B RETURNING", retstring)
-                return retstring
-
-        else:
-            assert xijk in self.nonterminals
-            # We now calculate the index of the nonterminal at position k; we need it for both cases.
-            new_nonterminal_index = self.nonterminals_ordered.index(xijk)
-
-            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
-                # CASE C
-                print(" " * depth + "CASE C STARTING")
-                retstring = self.Gzero(new_nonterminal_index, exact_length_total, depth + 1)
-                print(" " * depth + "CASE C returning", retstring)
-                return retstring
-
-            else:
-                # CASE D
-                print(" " * depth + "CASE D STARTING")
-
-                splitting_possibilities = self.Fprim(nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total)
-                print(" " * depth + "CASE D; SPLIT POSSIBILITIES ARE", splitting_possibilities)
-
-                # Fprim's list is indexed from L = 1, so shift the chosen index up by one.
-                split_choice = 1 + self.normalized_choice(splitting_possibilities)
-                print(" " * depth + "CASE D; SPLITTING WITH LENGTH", split_choice)
-
-                # Hit the nonterminal X_ijk with Gzero...
-                nonterminal_generates = self.Gzero(new_nonterminal_index, split_choice, depth + 1)
-                print(" " * depth + "CASE D NONTERMINAL @ XIJK generates", nonterminal_generates)
-
-                # ...and then generate from the remaining part of the rule.
-                rest_of_rule_generates = self.Gprim(nonterminal_index, chosen_production, how_far_into_the_RHS + 1, exact_length_total - split_choice, depth + 1)
-                print(" " * depth + "CASE D REST OF RULE generates", rest_of_rule_generates)
-
-                retstring = nonterminal_generates + rest_of_rule_generates
-                print(" " * depth + "CASE D RETURNING", retstring)
-                return retstring
-
-
-
-
-
-
-z = CFGBoltzmann(rules, list_of_nonterminals, list_of_terminals)
-
-z.preprocessor()  # builds processed_rulepack; it does not return anything
-
-
-qq = z.Gzero_shimmed(nonterminals.EXPRESSION, 11)
-print("AND THE FINAL ANSWER IS", "\n\n\n", qq, "\n\n\n")
-
-# Furthermore, we also note that the description of a context-free grammar is *itself*
-# context-free, so if we take the CFG-description grammar (BNF or something isomorphic
-# to it) and use Boltzmann sampling on *it*, we will generate candidate grammars, which
-# will be used to test the FPGA-based parser generator against a reference LR parser
-# generator implementation. Naturally, this procedure may not produce LR-parseable
-# grammars or even deterministic context-free grammars; but if a grammar is rejected by
-# both the FPGA parser generator and the reference parser generator, this is fine: we
-# just move on to the next one.
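-
-# A minimal sketch of that idea, using the CFGBoltzmann class above. Everything here is
-# an illustrative assumption, not the real CFG-description grammar: plain strings stand
-# in for symbols, and the deliberately tiny "grammar of grammars" allows only two
-# nonterminal names and two terminal names. Each sampled string is a flat list of
-# tokens describing one candidate grammar, one "LHS -> symbols ;" rule at a time.
-
-GRAMMAR_TERMINALS = ["NT1", "NT2", "t1", "t2", "->", ";"]
-GRAMMAR_NONTERMINALS = ["GRAMMAR", "RULE", "LHS", "RHS", "SYMBOL"]
-GRAMMAR_RULES = [
-    ("GRAMMAR", ["RULE"]),
-    ("GRAMMAR", ["RULE", "GRAMMAR"]),
-    ("RULE",    ["LHS", "->", "RHS", ";"]),
-    ("LHS",     ["NT1"]),
-    ("LHS",     ["NT2"]),
-    ("RHS",     ["SYMBOL"]),
-    ("RHS",     ["SYMBOL", "RHS"]),
-    ("SYMBOL",  ["NT1"]),
-    ("SYMBOL",  ["NT2"]),
-    ("SYMBOL",  ["t1"]),
-    ("SYMBOL",  ["t2"]),
-]
-
-g = CFGBoltzmann(GRAMMAR_RULES, GRAMMAR_NONTERMINALS, GRAMMAR_TERMINALS)
-g.preprocessor()
-candidate_grammar = g.Gzero_shimmed("GRAMMAR", 20)  # one random 20-token grammar description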
-
-
-# Once we randomly generate a CFG, we randomly generate strings that are valid, but we
-# also generate exemplars that are potentially *invalid* strings (both by mutating valid
-# strings and by generating random sequences ab initio from the terminal symbols). The
-# potentially invalid strings are tested against the reference parser; those that are
-# accidentally valid are discarded, and those that are invalid are used in the automatic
-# equivalence testing.
-
-
-# We fail our candidate parser generator if:
-#
-# 1. During the generation phase, it rejects a grammar the reference generator doesn't
-# reject, or accepts a grammar the reference generator rejects.
-
-# 2. During the execution phase, it rejects a string the reference-generated parser
-# doesn't reject, or accepts a string the reference-generated parser rejects.
-
-# 3. During the execution phase, it accepts a string that the reference-generated parser
-# accepts, but the parse tree it generates is not identical to the parse tree that the
-# reference-generated parser produced.
-
-
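-# A minimal sketch of the execution-phase comparison (criteria 2 and 3 above). The
-# parser interface here is an assumption for illustration: each parser is a callable
-# that returns a parse tree on acceptance and None on rejection; neither parser is
-# implemented in this file.
-
-def differential_check(tokens, reference_parse, candidate_parse):
-    # Hypothetical interface: both arguments are parsers as described above.
-    ref_tree = reference_parse(tokens)
-    cand_tree = candidate_parse(tokens)
-    if (ref_tree is None) != (cand_tree is None):
-        return "FAIL: accept/reject mismatch"  # failure criterion 2
-    if ref_tree is not None and ref_tree != cand_tree:
-        return "FAIL: parse tree mismatch"     # failure criterion 3
-    return "OK"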