diff --git a/CFGBoltzmann.py b/CFGBoltzmann.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed0d9953308971b84639b3f2207a87fef3c5100f
--- /dev/null
+++ b/CFGBoltzmann.py
@@ -0,0 +1,503 @@
+import collections
+import functools
+import random
+from enum import Enum
+
+
+CFG = collections.namedtuple("CFG", "nonterminals terminals rules start")
+# We define a context-free grammar as:
+# 1. A list of nonterminal symbols
+# 2. A list of terminal symbols
+# 3. A list of production rules
+# 4. A start symbol.
+
+# Each symbol is identified by a positive integer.
+
+
+# For the ETF expression grammar:
+
+# The terminal symbols are:
+# INTEGER, ADDOP, MULTOP, OPENPAREN, CLOSEPAREN
+
+# The nonterminal symbols are:
+# EXPRESSION, TERM, FACTOR
+
+# The start symbol is EXPRESSION.
+
+# The rules are:
+
+# EXPRESSION -> EXPRESSION ADDOP TERM
+# EXPRESSION -> TERM
+# TERM       -> TERM MULTOP FACTOR
+# TERM       -> FACTOR
+# FACTOR     -> OPENPAREN EXPRESSION CLOSEPAREN
+# FACTOR     -> INTEGER
+
+nonterminals = Enum("Nonterminals", "EXPRESSION TERM FACTOR")
+
+terminals = Enum("Terminals", "INTEGER ADDOP MULTOP OPENPAREN CLOSEPAREN")
+
+list_of_nonterminals = [e for e in nonterminals]
+list_of_terminals = [e for e in terminals]
+
+start = nonterminals.EXPRESSION
+
+# The rules are a list of tuples, each of which represents one rule, as follows:
+
+# (nonterminal symbol, [ordered list of symbols (terminal or nonterminal) that are
+# reduced to the aforementioned nonterminal])
+
+rules = [
+    (nonterminals.EXPRESSION, [nonterminals.EXPRESSION, terminals.ADDOP, nonterminals.TERM]),
+    (nonterminals.EXPRESSION, [nonterminals.TERM]),
+    (nonterminals.TERM,       [nonterminals.TERM, terminals.MULTOP, nonterminals.FACTOR]),
+    (nonterminals.TERM,       [nonterminals.FACTOR]),
+    (nonterminals.FACTOR,     [terminals.OPENPAREN, nonterminals.EXPRESSION, terminals.CLOSEPAREN]),
+    (nonterminals.FACTOR,     [terminals.INTEGER]),
+]
+
+
+# Once we have a CFG's definition, we can use Boltzmann sampling to generate random
+# strings from that CFG.
+
+# We use the "sandwich" technique from https://notebook.drmaciver.com/posts/2020-07-11-10:49.html
+# to estimate the Boltzmann distribution's generating function. However, to enumerate the
+# number of words in the grammar with length n, we use the technique described in:
+#
+# "Generating Strings at Random from a Context Free Grammar" by Bruce McKenzie, 1997
+
+# There is a more efficient method in "Automating Grammar Comparison", where the authors
+# devise a slightly more advanced scheme that uses the Cantor pairing function to
+# encode the parse tree of each generated string in a single integer index. I do not
+# think I need to resort to such a technique just yet.
+
+# The notation below follows McKenzie's paper:
+
+# The paper uses the notation ||X||_n to mean "the number of strings of length n
+# generated by the symbol X", but since this is plain text and not LaTeX, we write
+# "||X||==n" to mean the same thing.
+
+# Nonterminals are indexed by i, i = [1, number of nonterminals].
+# Each nonterminal may have multiple ways of generating it: for each nonterminal i,
+# there may be several rules that produce it, and these rules are indexed by j.
+
+# Production rules are thus indexed by the pair:
+# (i = [1, number of nonterminals], j = [1, number of ways to generate nonterminal i]).
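+
+# For instance, in the ETF grammar above, taking i to be EXPRESSION: j=1 is
+# EXPRESSION -> EXPRESSION ADDOP TERM, and j=2 is EXPRESSION -> TERM.
+
+# As a concrete check of the counting notation: ||EXPRESSION||==1 is 1 (the single
+# string INTEGER), ||EXPRESSION||==2 is 0 (no two-token string is derivable), and
+# ||EXPRESSION||==3 is 3 (INTEGER ADDOP INTEGER, INTEGER MULTOP INTEGER, and
+# OPENPAREN INTEGER CLOSEPAREN).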
+
+# Each production rule is represented by α_ij = X_ij1, X_ij2, ..., X_ijT, with T=T_ij
+# representing the number of symbols in the reduce rule (i, j).
+
+# We have two functions, and we use memoization in order to evaluate them without
+# pathological cost.
+
+
+# The first function, f(i, N), which we will call Fzero, represents the number
+# of strings with length exactly N that can be generated by the expansion of
+# nonterminal number i. It is a list of integers, with one element for each
+# production rule that generates nonterminal i:
+
+# f(i, N) = [||α_ij||==N for j = [1, number of production rules for nonterminal i]]
+
+
+# The second function, f'(i, j, k, N), which we will call Fprim, represents the
+# number of strings with length exactly N that can be generated by symbols k
+# through T_ij (the tail of the rule starting at the k'th symbol) of the
+# specific production rule (i, j).
+
+# We define it as follows, which leads to a simple "peeling off" way of evaluating it
+# by induction. It is a list, with one element for each way to "split" the N characters
+# between the first symbol of the relevant tail of the rule (the k'th symbol of the
+# entire rule) and the rest of that tail (the k+1'th symbol through the last symbol):
+
+# f'(i, j, k, N) = [ (||X_ijk|| == L) * (||X_ij(k+1) X_ij(k+2)...X_ijT|| == N-L) for L = [1, N-T_ij+k]]
+
+# The * is an ordinary multiplication, not a Kleene star. We note that this looks like
+# a convolution.
+
+# The number of "ways to split the length N" is N-T_ij+k because the *entire* production
+# rule has T_ij symbols, so the smaller tail from k+1 to T_ij has T_ij-k symbols.
+# That tail will therefore *always* generate a list of terminals with length T_ij-k or
+# greater (because we require that there be no epsilon-productions).
+
+
+# To evaluate Fzero and Fprim, we define them in terms of each other, analyze cases, and
+# use memoization:
+
+# Fzero(i, N) = [sum(Fprim(i, j, 1, N)) for all possible j (j = [1, number of rules for nonterminal i])]
+
+# We index arrays starting at zero, like everyone else, so the 1 above becomes a 0 in
+# the code.
+
+# There are two types of special cases for Fprim(i, j, k, N), and they can both occur
+# at the same time. We use these special cases to evaluate Fprim efficiently:
+
+# Special case 1: k is T_ij, and thus X_ijk is the last symbol in the production rule.
+# Special case 2: The symbol X_ijk is a terminal symbol.
+
+# The exhaustive case analysis is as follows.
+
+# /-------------------------------------------------------------------------------------\
+# |                        | X_ijk is a terminal symbol | X_ijk is a nonterminal symbol |
+# |------------------------|----------------------------|-------------------------------|
+# | X_ijk is the last      | Case A                     | Case C                        |
+# | symbol in production   |                            | Reduces to the analysis of    |
+# | rule (i,j)             | Extremely easy base case   | the production rules for      |
+# |                        |                            | the nonterminal @ X_ijk       |
+# |------------------------|----------------------------|-------------------------------|
+# | X_ijk is not the last  | Case B                     | Case D                        |
+# | symbol in production   | A terminal always has      | The difficult case. We must   |
+# | rule (i,j)             | length 1 and offers no     | evaluate the full convolution |
+# |                        | choices, so this reduces   | for X_ijk and the remaining   |
+# |                        | to Fprim(i, j, k+1, N-1)   | symbols of the rule           |
+# \-------------------------------------------------------------------------------------/
+
+# We also note that for the degenerate case N=0, we always can and must return the
+# empty array [].
+
+# In pseudocode (taken from the paper, with the correction that at the top of page 5
+# the "if N==0 then return [1]" seems like it should be "if N==1 then return [1]"):
+
+# Fprim(i, j, k, N):
+#   if N==0: return []
+#   if X_ijk is a terminal:
+#     if k==T_ij:
+#       # CASE A
+#       if N==1 return [1], else return [0]
+#     if k != T_ij:
+#       # CASE B
+#       return [sum(Fprim(i, j, k+1, N-1))]
+#   else if X_ijk is a nonterminal:
+#     if k==T_ij:
+#       # CASE C
+#       return [sum(Fzero(index of the nonterminal at position k, N))]
+#     if k != T_ij:
+#       # CASE D
+#       return [sum(Fzero(index of the nonterminal at position k, L)) *
+#               sum(Fprim(i, j, k+1, N-L)) for L = [1, N - T_ij + k]]
+
+
+# First, we preprocess the grammar by grouping all the rules for each nonterminal
+# together.
+
+
+class CFGBoltzmann:
+    def __init__(self, rules, nonterminals, terminals):
+        self.unitary_rules = rules
+        self.nonterminals = nonterminals
+        self.terminals = terminals
+
+    def preprocessor(self):
+        unitary_rules = self.unitary_rules
+        factorized_rulepack = {}
+
+        for rule in unitary_rules:
+            nonterminal = rule[0]
+            rhs = rule[1]
+            if nonterminal not in factorized_rulepack:
+                factorized_rulepack[nonterminal] = [rhs]
+            else:
+                factorized_rulepack[nonterminal].append(rhs)
+
+        # This list holds the nonterminals, in an order kept in sync with the
+        # rulepack.
+        rulepack_list = []
+        nonterminals_in_sync_with_rulepack = []
+        for x in factorized_rulepack:
+            rulepack_list.append((x, factorized_rulepack[x]))
+            nonterminals_in_sync_with_rulepack.append(x)
+        self.nonterminals_ordered = nonterminals_in_sync_with_rulepack
+
+        self.processed_rulepack = rulepack_list
+
+    # We recall that:
+
+    # Fzero(i, N) = [sum(Fprim(i, j, 1, N)) for all possible j (j = [1, number of rules for nonterminal i])]
+
+    @functools.lru_cache(maxsize=8192)
+    def Fzero(self, nonterminal_index, exact_length_total):
+        # First, find the nonterminal in the rulepack.
+        possible_RHSes = self.processed_rulepack[nonterminal_index][1]
+        assert possible_RHSes != []
+
+        retlist = []
+
+        for rhs_index in range(len(possible_RHSes)):
+            # We index arrays starting at zero, like everyone else, hence k=0 here.
+            retlist.append(sum(self.Fprim(nonterminal_index, rhs_index, 0, exact_length_total)))
+        return retlist
+
+    # Fprim is where the complicated case analysis lives.
+
+    @functools.lru_cache(maxsize=8192)
+    def Fprim(self, nonterminal_index, which_RHS, how_far_into_the_RHS, exact_length_total):
+        # First, handle the ultimate degenerate case:
+        if exact_length_total == 0:
+            return []
+
+        # The case analysis hinges on what is at X_ijk.
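+        # Mapping the paper's notation onto the arguments here: i is
+        # nonterminal_index, j is which_RHS, k is how_far_into_the_RHS
+        # (zero-based in this code), and N is exact_length_total. T_ij then
+        # corresponds to len(RHS_in_question) - 1, the index of the last
+        # symbol of the right-hand side fetched just below.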
+
+        RHS_in_question = self.processed_rulepack[nonterminal_index][1][which_RHS]
+
+        xijk = RHS_in_question[how_far_into_the_RHS]
+
+        if xijk in self.terminals:
+            # Are we at the end of the rule?
+            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
+                # CASE A
+                if exact_length_total == 1:
+                    return [1]
+                else:
+                    return [0]
+            else:
+                # CASE B
+                reduct = self.Fprim(nonterminal_index, which_RHS, how_far_into_the_RHS + 1, exact_length_total - 1)
+                retlist = [sum(reduct)]
+                return retlist
+
+        else:
+            assert xijk in self.nonterminals
+            # We now calculate the index of the nonterminal at position k; we
+            # need it for both remaining cases.
+            new_nonterminal_index = self.nonterminals_ordered.index(xijk)
+
+            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
+                # CASE C
+                reduct = self.Fzero(new_nonterminal_index, exact_length_total)
+                retlist = [sum(reduct)]
+                return retlist
+
+            else:
+                # CASE D
+                # Here we must do the full convolution, to account for every
+                # possible way that the total length of the desired string can
+                # be split between the nonterminal at X_ijk and the remaining
+                # segment of the rule (from k+1 to the end of the rule).
+
+                # Here, L is the number of characters produced by X_ijk.
+                # L therefore goes from 1 (we disallow epsilon-productions, so
+                # X_ijk must produce at least one character), INCLUSIVE, up to,
+                # INCLUSIVE:
+                #   total characters requested - (minimum number of characters
+                #                                 generated by symbols k+1 to
+                #                                 the end of the rule)
+                # which is:
+                #   total characters requested - (T_ij - (k+1) + 1) = TCR - (T_ij - k)
+                # But in this code T_ij is the INDEX of the last element of the
+                # rule, aka len(rule) - 1, so the bound becomes:
+                #   total characters requested - (len(rule) - k - 1)
+                #   = exact_length_total - len(RHS_in_question) + how_far_into_the_RHS + 1
+
+                retlist = []
+                for l in range(1, exact_length_total - len(RHS_in_question) + how_far_into_the_RHS + 2):
+                    nonterminal_possibilities = sum(self.Fzero(new_nonterminal_index, l))
+                    remainder_possibilities = sum(self.Fprim(nonterminal_index, which_RHS, how_far_into_the_RHS + 1, exact_length_total - l))
+                    retlist.append(nonterminal_possibilities * remainder_possibilities)
+                return retlist
+
+    # Now we do the generation of the strings.
+
+    # This returns an *index* into a list (not the item at that index), weighted
+    # by the integer at each element. random.choices accepts unnormalized
+    # weights, so no explicit normalization is needed.
+
+    def normalized_choice(self, inlist):
+        choice = random.choices(range(len(inlist)), weights=inlist)
+        return choice[0]
+
+    # Similar to Fzero and Fprim, we have Gzero and Gprim.
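+    # Gzero and Gprim mirror the case analysis of Fzero and Fprim exactly; where
+    # the F-functions count strings, the G-functions use those counts as weights
+    # for their random choices, so that each derivation of the requested length
+    # is produced with equal probability.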
+
+    # The first function, Gzero(nonterminal index, requested length), serves only
+    # to select (weighted appropriately) which production rule for a nonterminal
+    # will be used. It then calls out to Gprim, which actually generates the string.
+
+    def Gzero_shimmed(self, nonterminal, requested_length):
+        nonterminal_index = self.nonterminals_ordered.index(nonterminal)
+        return self.Gzero(nonterminal_index, requested_length, 0)
+
+    def Gzero(self, nonterminal_index, requested_length, depth):
+        print(" " * depth + "ENTERING Gzero")
+        possibilities = self.Fzero(nonterminal_index, requested_length)
+        chosen_production = self.normalized_choice(possibilities)
+        generated_string = self.Gprim(nonterminal_index, chosen_production, 0, requested_length, depth)
+
+        return generated_string
+
+    # Like Fprim, Gprim takes a nonterminal index, a production index for that
+    # nonterminal, and an index into the production rule (and, of course, a
+    # requested length). This lets us walk through the production rule symbol
+    # by symbol.
+
+    # As in Fprim, there is a case analysis based on whether the index points to
+    # the last symbol in the rule, and on whether it points to a terminal or to
+    # a nonterminal.
+
+    # As before, there are two types of special cases for Gprim(i, j, k, N), and
+    # they can both occur at the same time:
+
+    # Special case 1: k is T_ij, and thus X_ijk is the last symbol in the production rule.
+    # Special case 2: The symbol X_ijk is a terminal symbol.
+
+    # The exhaustive case analysis is as follows.
+
+    # /-------------------------------------------------------------------------------------\
+    # |                        | X_ijk is a terminal symbol | X_ijk is a nonterminal symbol |
+    # |------------------------|----------------------------|-------------------------------|
+    # | X_ijk is the last      | Case A                     | Case C                        |
+    # | symbol in production   |                            | Reduces to Gzero on the       |
+    # | rule (i,j)             | The easiest base case:     | nonterminal @ X_ijk           |
+    # |                        | emit the terminal          |                               |
+    # |------------------------|----------------------------|-------------------------------|
+    # | X_ijk is not the last  | Case B                     | Case D                        |
+    # | symbol in production   | A terminal always has      | Like the convolution: use     |
+    # | rule (i,j)             | length 1 and offers no     | Fprim and a weighted choice   |
+    # |                        | choices, so emit it and    | to decide how many characters |
+    # |                        | reduce to                  | X_ijk generates, then Gzero   |
+    # |                        | Gprim(i, j, k+1, N-1)      | on X_ijk and Gprim on (k+1)   |
+    # |                        |                            | to the end of the rule        |
+    # \-------------------------------------------------------------------------------------/
+
+    def Gprim(self, nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total, depth):
+        print(" " * depth + "GPRIM arguments are", nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total)
+
+        # Unlike in Fprim, the degenerate case cannot arise here:
+        assert exact_length_total != 0
+
+        # The case analysis hinges on what is at X_ijk.
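+        # (The N==0 case cannot occur because production rules and split lengths
+        # are chosen with weights taken from the Fzero/Fprim counts, which, given
+        # that there are no epsilon-productions, assign zero weight to any choice
+        # that would force a symbol to produce zero characters.)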
+
+        RHS_in_question = self.processed_rulepack[nonterminal_index][1][chosen_production]
+        print(" " * depth + "Our RHS is", RHS_in_question)
+        print(" " * depth + "WE ARE PROCESSING index", how_far_into_the_RHS, "and our remaining length is", exact_length_total)
+
+        xijk = RHS_in_question[how_far_into_the_RHS]
+
+        if xijk in self.terminals:
+            # Are we at the end of the rule?
+            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
+                # CASE A
+                print(" " * depth + "GPRIM CASE A RETURNING", [xijk])
+                return [xijk]
+            else:
+                # CASE B
+                print(" " * depth + "GPRIM CASE B pointing @", [xijk])
+                reduct = self.Gprim(nonterminal_index, chosen_production, how_far_into_the_RHS + 1, exact_length_total - 1, depth + 1)
+                retstring = [xijk] + reduct
+                print(" " * depth + "GPRIM CASE B RETURNING", retstring)
+                return retstring
+
+        else:
+            assert xijk in self.nonterminals
+            # We now calculate the index of the nonterminal at position k; we
+            # need it for both remaining cases.
+            new_nonterminal_index = self.nonterminals_ordered.index(xijk)
+
+            if how_far_into_the_RHS == (len(RHS_in_question) - 1):
+                # CASE C
+                print(" " * depth + "CASE C STARTING")
+                retstring = self.Gzero(new_nonterminal_index, exact_length_total, depth + 1)
+                print(" " * depth + "CASE C returning", retstring)
+                return retstring
+
+            else:
+                # CASE D
+                print(" " * depth + "CASE D STARTING")
+
+                splitting_possibilities = self.Fprim(nonterminal_index, chosen_production, how_far_into_the_RHS, exact_length_total)
+                print(" " * depth + "CASE D; SPLIT POSSIBILITIES ARE", splitting_possibilities)
+
+                # Element L-1 of splitting_possibilities counts the strings in
+                # which X_ijk produces exactly L characters, hence the 1+ below.
+                split_choice = 1 + self.normalized_choice(splitting_possibilities)
+                print(" " * depth + "CASE D; SPLITTING WITH LENGTH", split_choice)
+
+                # Hit the nonterminal X_ijk with Gzero...
+                nonterminal_generates = self.Gzero(new_nonterminal_index, split_choice, depth + 1)
+                print(" " * depth + "CASE D NONTERMINAL @ XIJK generates", nonterminal_generates)
+
+                # ...and then the remaining part of the rule.
+                rest_of_rule_generates = self.Gprim(nonterminal_index, chosen_production, how_far_into_the_RHS + 1, exact_length_total - split_choice, depth + 1)
+                print(" " * depth + "CASE D REST OF RULE generates", rest_of_rule_generates)
+
+                retstring = nonterminal_generates + rest_of_rule_generates
+                print(" " * depth + "CASE D RETURNING", retstring)
+                return retstring
+
+
+z = CFGBoltzmann(rules, list_of_nonterminals, list_of_terminals)
+
+# preprocessor() builds the rulepack in place; it has no return value.
+z.preprocessor()
+
+qq = z.Gzero_shimmed(nonterminals.EXPRESSION, 11)
+print("AND THE FINAL ANSWER IS", "\n\n\n", qq, "\n\n\n")
+
+# Furthermore, we note that the description of a context-free grammar is *itself*
+# context-free, so if we take the CFG-description grammar (BNF or something isomorphic
+# to it) and use Boltzmann sampling on *it*, we will generate candidate grammars, which
+# will be used to test the FPGA-based parser generator against a reference LR parser
+# generator implementation. Naturally, this procedure may not produce LR-parseable
+# grammars, or even deterministic context-free grammars; but if a grammar is rejected
+# by both the FPGA parser generator and the reference parser generator, that is fine:
+# we just move on to the next one.
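+# As a quick illustration of the output above: qq is a list of terminal-symbol enum
+# members, not text. A minimal sketch for rendering it as a concrete arithmetic
+# expression (the token-to-text mapping is an arbitrary choice for display, not part
+# of the grammar):
+
+token_text = {
+    terminals.INTEGER: "1",
+    terminals.ADDOP: "+",
+    terminals.MULTOP: "*",
+    terminals.OPENPAREN: "(",
+    terminals.CLOSEPAREN: ")",
+}
+print("RENDERED:", " ".join(token_text[tok] for tok in qq))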
+
+# Once we randomly generate a CFG, we randomly generate strings that are valid, but we
+# also generate exemplars that are potentially *invalid* strings (both by mutating valid
+# strings and by generating random sequences ab initio from the terminal symbols). The
+# potentially invalid strings are tested against the reference parser; those that are
+# accidentally valid are discarded, and those that are invalid are used in the automatic
+# equivalence testing.
+
+
+# We fail our candidate parser generator if:
+#
+# 1. During the generation phase, it rejects a grammar the reference generator doesn't
+# reject, or accepts a grammar the reference generator rejects.
+
+# 2. During the execution phase, it rejects a string the reference-generated parser
+# doesn't reject, or accepts a string the reference-generated parser rejects.
+
+# 3. During the execution phase, it accepts a string that the reference-generated parser
+# accepts, but the parse tree it generates is not identical to the parse tree that the
+# reference-generated parser produced.
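+
+# A minimal sketch of the invalid-exemplar idea above (the mutation operator and the
+# lengths below are placeholder choices, not a settled design): perturb a valid token
+# list, or draw a fully random token sequence, and let the reference parser decide
+# which candidates are actually invalid.
+
+def mutate_tokens(valid_tokens, all_terminals):
+    # Flip one randomly chosen position to a randomly chosen terminal.
+    candidate = list(valid_tokens)
+    position = random.randrange(len(candidate))
+    candidate[position] = random.choice(all_terminals)
+    return candidate
+
+def random_tokens(all_terminals, length):
+    # Generate a token sequence ab initio; most such sequences are ungrammatical.
+    return [random.choice(all_terminals) for _ in range(length)]
+
+print("MUTATED EXEMPLAR:", mutate_tokens(qq, list_of_terminals))
+print("AB INITIO EXEMPLAR:", random_tokens(list_of_terminals, 5))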