From 93d4538aaa6f9bfd6d8706008314f62d6250dbb6 Mon Sep 17 00:00:00 2001
From: pompolic <pompolic@special-circumstanc.es>
Date: Wed, 28 Sep 2022 15:05:42 +0200
Subject: [PATCH] Add large dict for tokens and their boundaries

---
 gdb-port/hammer-breakpoints.py |  4 ++-
 gdb-port/top-level-parse.py    | 60 ++++++++--------------
 2 files changed, 16 insertions(+), 48 deletions(-)

diff --git a/gdb-port/hammer-breakpoints.py b/gdb-port/hammer-breakpoints.py
index 17eff1f..7b3353a 100644
--- a/gdb-port/hammer-breakpoints.py
+++ b/gdb-port/hammer-breakpoints.py
@@ -36,6 +36,8 @@ class HDoParseBreakpoint(gdb.Breakpoint):
 			if val.name == 'arena':
 				arena = int(val.value(frame))
 		parse_state = block['state'].value(frame)
+		token_start = index
+
 		top_level_parse.enter_h_do_parse(parse_state, None, parser)
 		input_chunk = input_ptr + index
 
@@ -113,7 +115,7 @@ class HDoParseRetBreakpoint(gdb.Breakpoint):
 			print(top_level_parse.peek_parserstack().p_stack) #TODO:sometimes the hammer-parser-backtrace gets messed up
 			indices = top_level_parse.get_top_stack_indices()
 			token_map = ast_manager.token_map(indices)
-			top_level_parse.print_input_map(token_map)
+			#top_level_parse.print_input_map(token_map)
 			print(top_level_parse.peek_parserstack().p_stack)
 			# Do not stop at this breakpoint, but stop at the next HDoParseBreakpoint
 			gdb.set_convenience_variable("hammer_step_counter", 1)
diff --git a/gdb-port/top-level-parse.py b/gdb-port/top-level-parse.py
index 010aab5..8ea0332 100644
--- a/gdb-port/top-level-parse.py
+++ b/gdb-port/top-level-parse.py
@@ -5,6 +5,8 @@ class TopLevelParse:
 		self.input_stream_indices = IndexTree() # TODO: turn this into a tree
 		self.input_index_tree_path = [0]
 		self.current_parse_depth = 0 # Used by the AST handling code.
+		self.starting_input_index = []
+		self.input_token_map = {}
 		# Technically, 1 virtual parse function + 1 h_do_parse + 1 perform_lowlevel_parse (if packrat)
 		# TODO: len(self.parser_stacks[-1]) and self.current_parse_depth serve the same purpose. avoid multiple sources of truth
 		# TODO: stack of trees to push/pop when switching parser stacks
@@ -67,7 +69,7 @@ class TopLevelParse:
 			#final_token_bounds = (token_bounds[0], int(parse_state['input_stream']['index']))
 			#self.input_stream_indices[len(parser_stack.p_stack)] = final_token_bounds
 			end_index = int(parse_state['input_stream']['index'])
-			self.finish_token(end_index)
+			self.finish_token(end_index, ret_val)
 		else:
 			# If ret_val is NULL, the parse has failed. We can be sure the (start, end) tuple won't be needed here.
 			self.discard_pending_tokens()
@@ -142,48 +144,6 @@ class TopLevelParse:
 #		self.current_token_end = int(parse_state['input_stream']['index']) #TODO: maybe just have ParseVirtualRetBreakpoint extract the index and pass it into the function
 #		token_bounds = (self.current_token_start, self.current_token_end)
-	# When h_do_parse is called, we take note of where state->input_stream->index is, and use it as the beginning of the token bounds
-	# Each entry in input_stream_indices is a list. It can have multiple elements when the parser before this one in the call graph is a sequence
-	# In that case, each parser it combines will be called in sequence, ending up at the same depth in the call graph.
-	# Unlike the parser stack, returning from h_do_parse doesn't correspond to popping the stack. Instead, if a parser rejects its input, all the token bounds above it (if the call stack grows upwards) are discarded.
-
-	# Examples of how input_stream_indices changes during parsing:
-	#
-	# [(100, 250), (250, 300)] <--- current depth
-	# [(100, None)] <--- sequence containing the tokens above it
-	#
-	# After begin_new_token():
-	# [(100, 250), (250, 300), (300, None)]
-	# [(100, None)]
-	#
-	# Once h_do_parse returns successfully (finish_token() is called):
-	# [(100, 250), (250, 300), (300, 450)]
-	# [(100, None)]
-	#
-	# Once the h_do_parse with the sequence itself returns successfully (finish_token() is called again):
-	# [(100, 250), (250, 300), (300, 450)]
-	# [(100, 450)]
-
-	# In case a parse fails:
-	# [(100, 250), (250, 300), (300, None)] <--- the parser corresponding to the rightmost token here rejects
-	# [(100, None)] <--- h_sequence, every parser it combines must accept
-	# [(100, None)] <--- the combinator that contains the h_sequence()
-	#
-	# First, the incomplete token is discarded:
-	# [(100, 250), (250, 300)]
-	# [(100, None)]
-	# [(100, None)]
-	#
-	# After h_do_parse returns with NULL, the stack is popped:
-	# [(100, None)] <--- now this is the topmost item in input_stream_indices
-	# [(100, None)]
-	#
-	# Since h_sequence rejects too:
-	# [(100, None)]
-	#
-	# state->input_stream->index is restored, and h_do_parse() returns:
-	# [(100, 100)]
-
 	# TODO: difference between zero-length token and HParsedToken without AST
 	def begin_new_token(self, index):
 		token_bounds = (index, None)
 		print("DEBUG: begin_new_token()")
@@ -238,6 +198,12 @@ class TopLevelParse:
 			print("len(self.input_index_tree_path):", len(self.input_index_tree_path), "self.current_parse_depth:", self.current_parse_depth)
 			raise RuntimeError("Unexpected node path length")
 
+		diff = (self.current_parse_depth+1) - len(self.starting_input_index)
+		newindices = [None] * diff
+		self.starting_input_index.extend(newindices)
+		self.starting_input_index[self.current_parse_depth] = index
+		print(self.starting_input_index)
+
 		print("DEBUG: self.current_parse_depth:", self.current_parse_depth)
 		print("DEBUG: path after begin_new_token():", self.input_index_tree_path) # TODO: rename input_index_tree_path to next_node_path
 		print("DEBUG: tokens after begin_new_token():", self.input_stream_indices)
@@ -248,7 +214,7 @@ class TopLevelParse:
 
 	# If a parser successfully returns a token, we take the list of token bounds at the current depth, check that the bounds corresponding to
 	# the most recent parse result aren't filled in yet, and make note of state->input_stream.index
-	def finish_token(self, index):
+	def finish_token(self, index, token_addr):
 		print("DEBUG: finish_token()")
 		print("DEBUG: current parse depth:", self.current_parse_depth)
 		print("DEBUG: finished tokens:", self.input_stream_indices)
@@ -264,6 +230,7 @@ class TopLevelParse:
 		node.finish_token(index)
 		#self.input_index_tree_path[self.current_parse_depth] += 1
 		del self.input_index_tree_path[self.current_parse_depth+1:]
+		self.input_token_map[int(token_addr)] = {'start': self.starting_input_index[self.current_parse_depth], 'end': index}
 		print("DEBUG: current parse depth:", self.current_parse_depth)
 		print("DEBUG: finished tokens:", self.input_stream_indices)
 		print("DEBUG: path:", self.input_index_tree_path)
@@ -286,8 +253,6 @@ class TopLevelParse:
 		if node.indices[1] is not None:
 			raise RuntimeError("DEBUG: failed parse but token still has bounds:", node.indices[1])
 		node.delete_all_children()
-		print("DEBUG: parent1: ", self.input_index_tree_path)
-		print("DEBUG: parent2: ", self.input_index_tree_path[0:self.current_parse_depth])
 		parent = self.input_stream_indices.select_by_path(self.input_index_tree_path[0:self.current_parse_depth])
 		if type(parent) is IndexTreeNode:
 			parent.delete_last_child()
@@ -297,6 +262,7 @@ class TopLevelParse:
 			self.input_index_tree_path[self.current_parse_depth] -= 1
 		else:
 			del self.input_index_tree_path[self.current_parse_depth:]
+		self.starting_input_index[self.current_parse_depth] = None
 
 	def peek_parserstack(self):
 		try:
@@ -424,7 +390,7 @@ class TopLevelParse:
 
 	def print_input_map(self, token):
 		print("input map would be printed here")
-		print(token)
+		print(self.input_token_map)
 
 	# TODO: better name pending
 	def get_current_stack_depth(self):
-- 
GitLab
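
Below is a minimal sketch, not part of the patch, of how the input_token_map populated above might be consumed once finish_token() has recorded an entry. The helper names lookup_token_bounds and token_input_slice and the input_bytes argument are hypothetical; only input_token_map and its {'start': ..., 'end': ...} entries come from the change itself.

    # Hypothetical consumers of TopLevelParse.input_token_map (not in the patch).
    # Keys are HParsedToken addresses as ints; values are {'start': int or None, 'end': int}.

    def lookup_token_bounds(top_level_parse, token_addr):
        # finish_token() keys entries by int(token_addr), so normalize here as well
        entry = top_level_parse.input_token_map.get(int(token_addr))
        if entry is None:
            return None  # no bounds recorded: the token never finished, or was discarded
        return (entry['start'], entry['end'])

    def token_input_slice(top_level_parse, token_addr, input_bytes):
        # input_bytes is assumed to hold the raw parse input, e.g. read out of the
        # inferior with gdb.selected_inferior().read_memory(input_ptr, length)
        bounds = lookup_token_bounds(top_level_parse, token_addr)
        if bounds is None or bounds[0] is None:
            return None  # discard_pending_tokens() can leave a 'start' slot as None
        start, end = bounds
        return input_bytes[start:end]

Keying the map by the HParsedToken's address lets a later pass (e.g. the commented-out print_input_map call in HDoParseRetBreakpoint) resolve any token it encounters in the AST back to the input bytes it covered, without re-walking the IndexTree.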