From 93d4538aaa6f9bfd6d8706008314f62d6250dbb6 Mon Sep 17 00:00:00 2001
From: pompolic <pompolic@special-circumstanc.es>
Date: Wed, 28 Sep 2022 15:05:42 +0200
Subject: [PATCH] Add large dict for tokens and their boundaries

Record the (start, end) input-stream bounds of every successfully
parsed token in a new input_token_map dict, keyed by the address of
the returned HParsedToken. begin_new_token() notes the start index for
the current parse depth in starting_input_index, finish_token() pairs
it with the end index on success, and a failed parse clears the
pending entry. The long inline comment walking through how
input_stream_indices evolves is dropped.

---
 gdb-port/hammer-breakpoints.py |  4 ++-
 gdb-port/top-level-parse.py    | 60 ++++++++--------------------------
 2 files changed, 16 insertions(+), 48 deletions(-)

diff --git a/gdb-port/hammer-breakpoints.py b/gdb-port/hammer-breakpoints.py
index 17eff1f..7b3353a 100644
--- a/gdb-port/hammer-breakpoints.py
+++ b/gdb-port/hammer-breakpoints.py
@@ -36,6 +36,8 @@ class HDoParseBreakpoint(gdb.Breakpoint):
 			if val.name == 'arena':
 				arena = int(val.value(frame))
 		parse_state = block['state'].value(frame)
+		token_start = index
+
 		top_level_parse.enter_h_do_parse(parse_state, None, parser)
 
 		input_chunk = input_ptr + index
@@ -113,7 +115,7 @@ class HDoParseRetBreakpoint(gdb.Breakpoint):
 			print(top_level_parse.peek_parserstack().p_stack) #TODO:sometimes the hammer-parser-backtrace gets messed up
 			indices = top_level_parse.get_top_stack_indices()
 			token_map = ast_manager.token_map(indices)
-			top_level_parse.print_input_map(token_map)
+			#top_level_parse.print_input_map(token_map)
 			print(top_level_parse.peek_parserstack().p_stack)
 			# Do not stop at this breakpoint, but stop at the next HDoParseBreakpoint
 			gdb.set_convenience_variable("hammer_step_counter", 1)
diff --git a/gdb-port/top-level-parse.py b/gdb-port/top-level-parse.py
index 010aab5..8ea0332 100644
--- a/gdb-port/top-level-parse.py
+++ b/gdb-port/top-level-parse.py
@@ -5,6 +5,8 @@ class TopLevelParse:
 		self.input_stream_indices = IndexTree() # TODO: turn this into a tree
 		self.input_index_tree_path = [0]
 		self.current_parse_depth = 0 # Used by the AST handling code.
+		self.starting_input_index = []
+		self.input_token_map = {}
 		# Technically, 1 virtual parse function + 1 h_do_parse + 1 perform_lowlevel_parse (if packrat)
 		# TODO: len(self.parser_stacks[-1]) and self.current_parse_depth serve the same purpose. avoid multiple sources of truth
 		# TODO: stack of trees to push/pop when switching parser stacks
@@ -67,7 +69,7 @@ class TopLevelParse:
 			#final_token_bounds = (token_bounds[0], int(parse_state['input_stream']['index']))
 			#self.input_stream_indices[len(parser_stack.p_stack)] = final_token_bounds
 			end_index = int(parse_state['input_stream']['index'])
-			self.finish_token(end_index)
+			self.finish_token(end_index, ret_val)
 		else:
 			# If ret_val is NULL, the parse has failed. We can be sure the (start, end) tuple won't be needed here.
 			self.discard_pending_tokens()
@@ -142,48 +144,6 @@ class TopLevelParse:
 	#	self.current_token_end = int(parse_state['input_stream']['index']) #TODO: maybe just have ParseVirtualRetBreakpoint extract the index and pass it into the function
 	#	token_bounds = (self.current_token_start, self.current_token_end)
 
-	# When h_do_parse is called, we take note of where state->input_stream->index is, and use it as the beginning of the token bounds
-	# Each entry in input_stream_indices is a list. It can have multiple elements when the parser before this one in the call graph is a sequence
-	# In that case, each parser it combines will be called in sequence, ending up at the same depth in the call graph.
-	# Unlike the parser stack, returning from h_do_parse doesn't correspond to popping the stack. Instead, if a parser rejects its input, all the token bounds above it (if the call stack grows upwards) are discarded.
-
-	# Examples of how input_stream_indices changes during parsing:
-	#
-	# [(100, 250), (250, 300)] <--- current depth
-	# [(100, None)] <--- sequence containing the tokens above it
-	#
-	# After begin_new_token():
-	# [(100, 250), (250, 300), (300, None)]
-	# [(100, None)]
-	#
-	# Once h_do_parse returns successfully (finish_token() is called):
-	# [(100, 250), (250, 300), (300, 450)]
-	# [(100, None)]
-	#
-	# Once the h_do_parse with the sequence itself returns successfully (finish_token() is called again):
-	# [(100, 250), (250, 300), (300, 450)]
-	# [(100, 450)]
-
-	# In case a parse fails:
-	# [(100, 250), (250, 300), (300, None)] <--- the parser corresponding to the rightmost token here rejects
-	# [(100, None)] <--- h_sequence, every parser it combines must accept
-	# [(100, None)] <--- the combinator that contains the h_sequence()
-	#
-	# First, the incomplete token is discarded:
-	# [(100, 250), (250, 300)]
-	# [(100, None)]
-	# [(100, None)]
-	#
-	# After h_do_parse returns with NULL, the stack is popped:
-	# [(100, None)] <--- now this is the topmost item in input_stream_indices
-	# [(100, None)]
-	#
-	# Since h_sequence rejects too:
-	# [(100, None)]
-	#
-	# state->input_stream->index is restored, and h_do_parse() returns:
-	# [(100, 100)]
-	# TODO: difference between zero-length token and HParsedToken without AST
 	def begin_new_token(self, index):
 		token_bounds = (index, None)
 		print("DEBUG: begin_new_token()")
@@ -238,6 +198,12 @@ class TopLevelParse:
 			print("len(self.input_index_tree_path):", len(self.input_index_tree_path), "self.current_parse_depth:", self.current_parse_depth)
 			raise RuntimeError("Unexpected node path length")
 
+		diff = (self.current_parse_depth+1) - len(self.starting_input_index)
+		newindices = [None] * diff
+		self.starting_input_index.extend(newindices)
+		self.starting_input_index[self.current_parse_depth] = index
+		print("DEBUG: self.starting_input_index:", self.starting_input_index)
+
 		print("DEBUG: self.current_parse_depth:", self.current_parse_depth)
 		print("DEBUG: path after begin_new_token():", self.input_index_tree_path) # TODO: rename input_index_tree_path to next_node_path
 		print("DEBUG: tokens after begin_new_token():", self.input_stream_indices)
@@ -248,7 +214,7 @@ class TopLevelParse:
 
 	# If a parser successfully returns a token, we take the list of token bounds at the current depth, check that the bounds corresponding to
 	# the most recent parse result aren't filled in yet, and make note of state->input_stream.index
-	def finish_token(self, index):
+	def finish_token(self, index, token_addr):
 		print("DEBUG: finish_token()")
 		print("DEBUG: current parse depth:", self.current_parse_depth)
 		print("DEBUG: finished tokens:", self.input_stream_indices)
@@ -264,6 +230,7 @@ class TopLevelParse:
 		node.finish_token(index)
 		#self.input_index_tree_path[self.current_parse_depth] += 1
 		del self.input_index_tree_path[self.current_parse_depth+1:]
+		self.input_token_map[int(token_addr)] = {'start': self.starting_input_index[self.current_parse_depth], 'end': index}
 		print("DEBUG: current parse depth:", self.current_parse_depth)
 		print("DEBUG: finished tokens:", self.input_stream_indices)
 		print("DEBUG: path:", self.input_index_tree_path)
@@ -286,8 +253,6 @@ class TopLevelParse:
 		if node.indices[1] is not None:
 			raise RuntimeError("DEBUG: failed parse but token still has bounds:", node.indices[1])
 		node.delete_all_children()
-		print("DEBUG: parent1: ", self.input_index_tree_path)
-		print("DEBUG: parent2: ", self.input_index_tree_path[0:self.current_parse_depth])
 		parent = self.input_stream_indices.select_by_path(self.input_index_tree_path[0:self.current_parse_depth])
 		if type(parent) is IndexTreeNode:
 			parent.delete_last_child()
@@ -297,6 +262,7 @@ class TopLevelParse:
 			self.input_index_tree_path[self.current_parse_depth] -= 1
 		else:
 			del self.input_index_tree_path[self.current_parse_depth:]
+		self.starting_input_index[self.current_parse_depth] = None
 
 	def peek_parserstack(self):
 		try:
@@ -424,7 +390,7 @@ class TopLevelParse:
 
 	def print_input_map(self, token):
 		print("input map would be printed here")
-		print(token)
+		print(self.input_token_map)
 
 	# TODO: better name pending
 	def get_current_stack_depth(self):
-- 
GitLab
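
For illustration, a minimal standalone sketch of the bookkeeping this
patch introduces. Attribute and method names mirror the patch; the
TokenBounds wrapper class, the simplified single-entry discard helper,
and the driver values at the bottom are invented for the example:

# Sketch of the token-bounds tracking added above. starting_input_index
# keeps the index at which the parse at each depth began;
# input_token_map records the final bounds of each token that parsed
# successfully, keyed by the token's address.
class TokenBounds:
    def __init__(self):
        self.current_parse_depth = 0
        self.starting_input_index = []  # start index per parse depth
        self.input_token_map = {}       # token address -> {'start', 'end'}

    def begin_new_token(self, index):
        # Pad with None up to the current depth, as the patch does,
        # then note where the token at this depth starts.
        diff = (self.current_parse_depth + 1) - len(self.starting_input_index)
        self.starting_input_index.extend([None] * diff)
        self.starting_input_index[self.current_parse_depth] = index

    def finish_token(self, index, token_addr):
        # On success, pair the recorded start with the end index.
        self.input_token_map[int(token_addr)] = {
            'start': self.starting_input_index[self.current_parse_depth],
            'end': index,
        }

    def discard_pending_token(self):
        # On failure, clear the pending start index for this depth.
        self.starting_input_index[self.current_parse_depth] = None

bounds = TokenBounds()
bounds.begin_new_token(100)       # h_do_parse enters at index 100
bounds.finish_token(250, 0x7f00)  # token at address 0x7f00 spans 100..250
print(bounds.input_token_map)     # {32512: {'start': 100, 'end': 250}}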