# parser-name-instrumentation-gdb.py
# TODO: handlers for filters
# TODO: postordinate parser fails to get named
# TODO: step-to-parser command
# TODO: "current parser" on GUI is the one on top of the stack, while the argument of h_do_parse is not yet accounted for
# TODO: possible enhancement, caching the vtable type in Parser objects (allows searching by type)
# TODO: The parameter given to h_do_parse is not shown in the backtrace, which is confusing

# quick way to get locals from frame.block()
# {local.name : local for local in block}

class Parser:
	"""Bookkeeping record for one parser, identified by its address.

	Attributes:
		name: Name of the parser, or None while no name has been assigned.
		address: Address of the parser object.
		bytes_used: Maps a parse state to the number of bytes recorded for
			this parser through add_mem_use().
	"""

	def __init__(self, name, address):
		self.name = name
		self.address = address
		self.bytes_used = {}

	def name_parser(self, name):
		"""Set (or overwrite) the name of this parser."""
		self.name = name

	# TODO: remove
	def get_name_or_placeholder(self):
		"""Return the name, or a placeholder text if no name was assigned."""
		if self.name is None:
			return "Wait for it... (if you're reading this, you found a bug)"
		return self.name

	def add_mem_use(self, state, size):
		"""Add size bytes to the counter kept for the given parse state."""
		# A missing entry and an entry holding None both count as zero
		self.bytes_used[state] = (self.bytes_used.get(state) or 0) + size

	def get_mem_use(self, state=None):
		"""Return the recorded memory use.

		With state=None the whole {state: bytes} dict is returned, otherwise
		the byte count for that state (0 if nothing was recorded for it).
		"""
		if state is None:
			return self.bytes_used
		# .get() rather than .setdefault(): a query should not add entries
		return self.bytes_used.get(state, 0)

	def __str__(self):
		return "(" + str(self.name) + ", " + hex(self.address) + ")"

	# Return the highest per-arena allocation count
	def get_arenamax(self):
		"""Return the largest per-state byte count, or 0 if nothing was recorded."""
		return max(self.bytes_used.values()) if self.bytes_used else 0

	def get_arenasum(self):
		"""Return the byte count summed over all states, or 0 if nothing was recorded."""
		return sum(self.bytes_used.values()) if self.bytes_used else 0

class ParserStack:
	"""Stack of parser objects belonging to one parse state.

	Attributes:
		parse_state: The parse state this stack is associated with (may be
			None until set_state() is called).
		arena: Value passed to the constructor; not used by this class.
		p_stack: The parser objects, bottom first, top last.
		unclaimed_mem_use: Bytes that could not be attributed to a parser.
	"""

	def __init__(self, parse_state, arena):
		self.parse_state = parse_state
		self.arena = arena
		self.p_stack = []
		self.unclaimed_mem_use = 0

	def push(self, parser):
		"""Put parser on top of the stack."""
		self.p_stack.append(parser)

	def pop(self):
		"""Remove and return the top parser. Raises IndexError if the stack is empty."""
		return self.p_stack.pop()

	def peek(self):
		"""Return the top parser without removing it, or None if the stack is empty."""
		return self.p_stack[-1] if self.p_stack else None

	def set_state(self, state):
		"""Associate this stack with the given parse state."""
		self.parse_state = state

	# In terms of tracing, *most* calls to a parser look something like this with the packrat backend:
	# h_do_parse()
	#	parse_foo()
	#		perform_lowlevel_parse()
	
	# perform_lowlevel_parse() is called when the memo table at that position is not filled in yet.
	# it calls the corresponding parse_* virtual function via the vtable, but other than that does not have type information
	# it's probably possible to extract type information, by comparing vtable addresses, but that seems painful
	
	# parse_foo() is the parser's corresponding virtual function in the frontend, which does not have the equivalent of a "this" pointer
	
	# So what we do to keep track of parsers is incrementally filling in the details for both
	
	# h_do_parse() is the backend's "actually run the parser" function, but does not get called for some parsers
	# (apparently mostly it's for higher-order parsers)
	# also contains the decision logic about whether to call perform_lowlevel_parse()
	
	# possible scenarios:
	# h_do_parse()
	#	perform_lowlevel_parse()
	#		parse_foo()
	
	# h_do_parse()
	#	perform_lowlevel_parse()

	# h_do_parse()

	# Shortcut for setting the name property of the parser on the top of stack
	def name_top_parser(self, name):
		"""Name the parser on top of the stack; does nothing if the stack is empty."""
		top = self.peek()
		if top is not None:
			top.name_parser(name)

	def add_mem_use_each(self, size):
		"""Record size bytes, under this stack's parse state, for every parser on the stack."""
		# bytes_used is a dict keyed by parse state, so go through add_mem_use()
		# instead of adding the integer to the dict itself
		for p in self.p_stack:
			p.add_mem_use(self.parse_state, size)

	def add_mem_use_top(self, size):
		"""Record size bytes for the top parser, or as unclaimed if the stack is empty."""
		top = self.peek()
		if top is None:
			self.unclaimed_mem_use += size
		else:
			top.add_mem_use(self.parse_state, size)

	def show_stack(self):
		"""Print a placeholder line with the current stack depth."""
		print("stack would be printed here. Depth:", len(self.p_stack))
		#print([(p.get_name_or_placeholder(), hex(p.address)) for p in self.p_stack])

	def depth(self):
		"""Return the number of parsers on the stack."""
		return len(self.p_stack)

# Class that is responsible for bookkeeping throughout the entire parse
# NB, this is slightly different terminology than the hammer API implicitly uses:
# There, a parse is started by h_parse(), and it is associated with a parse state.
# This corresponds to the ParserStack above. TopLevelParse keeps track of all these.
# Subsequent h_do_parse()s with the same parser state are considered to belong to the same parse

# The TopLevelParse class is initialized in trace_begin(), and is used until the end of the trace
class TopLevelParse:
	"""Bookkeeping for the whole trace: parser stacks, known parsers, memory use.

	Attributes:
		parser_stacks: Stack of parser stacks; one is pushed in
			enter_h_packrat_parse() and popped in return_from_h_packrat_parse().
		parser_objs: Maps a parser address to its Parser object.
		unclaimed_mem_use: Bytes allocated while no parser stack existed.
		input_chunk: Input preview last stored with set_input_chunk().
		current_parser_env: Text last stored with set_parser_env().
		vt_types, parser_decombinator: None until init_parser() is called.
	"""

	def __init__(self):
		self.parser_stacks = []
		self.parser_objs = {}
		self.unclaimed_mem_use = 0
		# Holds 32 characters starting at state->input_stream[index], used by the GUI
		self.input_chunk = ''
		# Never updated by this class; kept so that existing readers of the old
		# attribute name do not break
		self.current_input_chunk = ''
		self.current_parser_env = ''
		self.vt_types = None
		self.parser_decombinator = None

	def init_parser(self):
		"""Create the VTTypes and ParserDecombinator helpers."""
		self.vt_types = VTTypes()
		self.parser_decombinator = ParserDecombinator(self.vt_types)

	# Called from h_packrat_parse()'s handler, where the parse state and arena get initialized
	def enter_h_packrat_parse(self, parser):
		"""Push a fresh parser stack with no parse state yet. Returns 0."""
		# TODO: add a parser stack or something?
		self.parser_stacks.append(ParserStack(None, None))
		return 0

	# TODO: arena parameter is useless
	def enter_h_do_parse(self, parse_state, arena, parser):
		"""Attach parse_state to the current parser stack if it has none yet.

		Does nothing when there is no parser stack at all.
		"""
		parser_stack = self.peek_parserstack()
		if parser_stack is None:
			return
		if parser_stack.parse_state is None and parser_stack.parse_state != parse_state:
			self.first_h_do_parse_after_packrat_parse(parse_state, arena)

	# Called from h_do_parse()'s handler, at which point we know the addresses of the state and arena
	def first_h_do_parse_after_packrat_parse(self, parse_state, arena):
		"""Store parse_state on the current parser stack."""
		self.peek_parserstack().set_state(parse_state)

	# Popping the stack of stacks of parsers
	def return_from_h_packrat_parse(self):
		"""Drop the current parser stack, warning if it still holds parsers."""
		old_stack = self.parser_stacks.pop()
		if old_stack.depth() > 0:
			print("Warning: parser stack not empty but parse is successful?")

	# Memoize the parser object for this particular address, then push it on the stack
	# Returns the parser object we just initialized (or the one already existing)
	def enter_perform_lowlevel_parse(self, parser_addr):
		"""Look up or create the Parser for parser_addr and push it on the current stack."""
		# Shares the lookup with add_or_get_parser() so that both use int() keys
		parser_obj = self.add_or_get_parser(parser_addr)

		self.peek_parserstack().push(parser_obj)
		if self.parser_decombinator:
			p_env = self.parser_decombinator.decompose_parser(parser_obj, self)
			self.set_parser_env(type(p_env).__name__ + " - " + str(p_env)) # TODO: pass this as data structure to frontend
		return parser_obj

	def return_from_perform_lowlevel_parse(self):
		"""Pop the top parser off the current parser stack."""
		self.peek_parserstack().pop()
		# debug print here

	def enter_h_arena_malloc_raw(self, alloc_size):
		"""Attribute alloc_size bytes to the current parser, stack, or the global counter."""
		parser_obj = self.peek_parser()
		parser_stack = self.peek_parserstack()
		# This is probably the slowest part of the code, or maybe the overhead adds up over many calls to h_arena_malloc_raw()
		if parser_obj is not None:
			# Caveat: parser_stack is assumed not to be None if we could get a parser_obj
			parser_obj.add_mem_use(parser_stack.parse_state, alloc_size)
		elif parser_stack is not None:
			# No parser on the stack (before the first perform_lowlevel_parse or after its return)
			parser_stack.unclaimed_mem_use += alloc_size
		else:
			# No parser stack (before and after a parse)
			self.unclaimed_mem_use += alloc_size

	def parse_virtual(self, parser_name):
		"""Give the current parser the name parser_name unless it already has one.

		Does nothing when there is no current parser.
		"""
		parser_obj = self.peek_parser()
		if parser_obj is not None and parser_obj.name is None:
			parser_obj.name_parser(parser_name)

	def peek_parserstack(self):
		"""Return the current parser stack, or None if there is none."""
		return self.parser_stacks[-1] if self.parser_stacks else None

	def peek_parser(self):
		"""Return the parser on top of the current stack, or None.

		None is returned both when there is no parser stack and when the
		current parser stack is empty.
		"""
		parser_stack = self.peek_parserstack()
		if parser_stack is None:
			return None
		return parser_stack.peek()

	def parser_by_address(self, parser_addr):
		"""Return the Parser stored for parser_addr; print a message and return None if unknown."""
		address = int(parser_addr)
		try:
			return self.parser_objs[address]
		except KeyError:
			print("Parser with address " + hex(address) + " not found!")
			return None

	def parsers_by_name(self, parser_name):
		"""Return the list of parsers named parser_name, or None if there are none."""
		results = [p for p in self.parser_objs.values() if p.name == parser_name]
		return results if results else None

	def set_input_chunk(self, chunk):
		self.input_chunk = chunk

	def get_input_chunk(self):
		return self.input_chunk

	def set_parser_env(self, parser_env):
		self.current_parser_env = parser_env

	def get_parser_env(self):
		return self.current_parser_env

	def add_or_get_parser(self, parser_addr):
		"""Return the Parser stored for parser_addr, creating an unnamed one if needed."""
		address = int(parser_addr)
		try:
			parser_obj = self.parser_objs[address]
		except KeyError:
			# Create a parser object with no name and the address of the parser
			parser_obj = Parser(None, address)
			self.parser_objs[address] = parser_obj

		return parser_obj

	def get_parser_top_per_arena_mem(self):
		"""Return the parser with the largest single per-state byte count, or None if no parsers are known."""
		if not self.parser_objs:
			return None
		return max(self.parser_objs.values(), key=Parser.get_arenamax)

	def get_parser_top_total_arena_mem(self):
		"""Return the parser with the largest summed byte count, or None if no parsers are known."""
		if not self.parser_objs:
			return None
		return max(self.parser_objs.values(), key=Parser.get_arenasum)

	def get_avg_mem_use_per_arena(self):
		"""Return {key: average bytes} over the parsers that recorded memory under that key.

		The keys are the keys of the parsers' bytes_used dicts.
		"""
		totals = {}
		counts = {}
		# Accumulate byte counts and counts of parsers using that arena
		for p in self.parser_objs.values():
			for arena, mem in p.bytes_used.items():
				totals[arena] = totals.get(arena, 0) + mem
				counts[arena] = counts.get(arena, 0) + 1

		return {arena: total/counts[arena] for arena, total in totals.items()}

	# TODO: get_avg_mem_use_all_arenas, get_total_mem_use

top_level_parse = TopLevelParse()
# Approach 1: load the application, set breakpoints, execute stack commands on breakpoint hit, continue

class HPackratParseBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler registers the start of h_packrat_parse()."""

	def stop(self):
		"""Report the 'parser' local to top_level_parse; never stops execution."""
		frame = gdb.selected_frame()
		# Start from None so that a block without a 'parser' symbol does not
		# leave the name unbound
		parser = None
		for val in frame.block(): # GDB, why did you take away [] for gdb.Block?
			if val.name == 'parser':
				parser = int(val.value(frame))
		top_level_parse.enter_h_packrat_parse(parser)
		return False
		
class HPackratParseRetBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler registers the return from h_packrat_parse()."""

	def stop(self):
		"""Notify top_level_parse and report False (do not stop)."""
		should_stop = False
		top_level_parse.return_from_h_packrat_parse()
		return should_stop

# TODO: frame.older() allows accessing the caller frame. decision logic about whether to call first_h_do_parse_after_packrat_parse()
# could be moved here
class HDoParseBreakpoint(gdb.Breakpoint):
	"""Breakpoint on h_do_parse(): records parse state and input preview, and
	implements the step counter and stop position."""

	def stop(self):
		"""Update the bookkeeping and decide whether execution should stop.

		Returns True when $hammer_step_counter reaches zero or when the input
		index has reached $hammer_stop_pos; False otherwise.
		"""
		frame = gdb.selected_frame()
		parser = None
		state_ptr = None
		for val in frame.block():
			if val.name == 'parser':
				parser = int(val.value(frame))
			elif val.name == 'state':
				state_ptr = val.value(frame)

		if state_ptr is None:
			# Without the parse state there is neither an input position nor
			# anything to record
			return False

		# If you want to printf debug the parse state
		#print(state_ptr.dereference())
		input_stream = state_ptr.dereference()['input_stream']
		index = input_stream['index']
		input_ptr = input_stream['input']

		top_level_parse.enter_h_do_parse(int(state_ptr), None, parser)

		# Preview of the input, starting at the current position
		input_chunk = input_ptr + index
		#print(input_chunk.string('ascii','backslashreplace',10))
		top_level_parse.set_input_chunk(input_chunk.string('UTF-8','replace',32))

		retval = False

		# Check if we need to stop after a number of steps
		step_counter = gdb.convenience_variable("hammer_step_counter")
		if step_counter is not None and step_counter > 0:
			step_counter -= 1
			if step_counter == 0:
				gdb.set_convenience_variable("hammer_step_counter", None) # unset step counter
				retval = True
			else:
				gdb.set_convenience_variable("hammer_step_counter", step_counter)

		# Check if we need to stop at a position
		stop_pos = gdb.convenience_variable("hammer_stop_pos")
		if stop_pos is not None and stop_pos <= index:
			retval = True
			gdb.set_convenience_variable("hammer_stop_pos", None)
			print("Requested stop position: " + str(stop_pos) + " Stopped at: " + str(index))

		return retval

class PerformLowLevelParseBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler registers entry into perform_lowlevel_parse()."""

	def stop(self):
		"""Hand the value of the 'parser' local to top_level_parse; returns False."""
		current_frame = gdb.selected_frame()
		for symbol in current_frame.block():
			if symbol.name != 'parser':
				continue
			parser = int(symbol.value(current_frame))
		top_level_parse.enter_perform_lowlevel_parse(parser)
		return False
		
class PerformLowLevelParseRetBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler registers the return from perform_lowlevel_parse()."""

	def stop(self):
		"""Notify top_level_parse and report False (do not stop)."""
		should_stop = False
		top_level_parse.return_from_perform_lowlevel_parse()
		return should_stop

class ParserVirtualBreakpoint(gdb.Breakpoint):
	"""Breakpoint on a parse_* function; gives the current parser a provisional name."""

	def stop(self):
		"""Derive "(Unnamed <type>)" from the function name and pass it on; returns False."""
		function = gdb.selected_frame().function()
		if function is None:
			# No symbol for this frame, so there is no name to derive
			return False
		# function name is parse_*
		# we extract everything after the first underscore, so that
		# parse_attr_bool yields "attr_bool" and not just "attr"
		# This is pretty much the same as naming it based on which parse_* function is called, so long as foo_vt contains parse_foo , and not e.g. parse_bar
		parser_type = function.name.partition("_")[2]
		parser_name = "(Unnamed " + parser_type + ")"
		top_level_parse.parse_virtual(parser_name)
		return False

class InitParserBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler registers every symbol of the current block as a named parser."""

	def stop(self):
		"""Initialise top_level_parse and store one Parser per block symbol; returns False."""
		current_frame = gdb.selected_frame()
		symbols = current_frame.block()
		top_level_parse.init_parser()

		# This will also catch locals that aren't parsers, but it's not a problem in practice,
		# since h_parse() will never be called on them
		# If it becomes a problem after all, gdb.parse_and_eval() might be used to filter them out
		known_parsers = top_level_parse.parser_objs
		for symbol in symbols:
			address = int(symbol.value(current_frame))
			known_parsers[address] = Parser(symbol.name, address)

		return False

class HArenaMallocRawBreakpoint(gdb.Breakpoint):
	"""Breakpoint whose handler records the size passed to h_arena_malloc_raw()."""

	def stop(self):
		"""Hand the value of the 'size' local to top_level_parse; returns False."""
		current_frame = gdb.selected_frame()
		for symbol in current_frame.block():
			if symbol.name != 'size':
				continue
			alloc_size = int(symbol.value(current_frame))

		top_level_parse.enter_h_arena_malloc_raw(alloc_size)
		return False

# GDB parameters
# TODO: hammer parameter prefix

class ExtendedParseStepInfo(gdb.Parameter):
	"""Controls whether to display parser stack and input preview on stepping the parse."""
	def __init__(self):
		# Boolean parameter, switched on by default
		param_name = "hammer-extended-parse-step-info"
		super().__init__(param_name, gdb.COMMAND_OBSCURE, gdb.PARAM_BOOLEAN)
		self.show_doc = "Show parser stack and input preview after hammer-parse-step:"
		#self.set_doc = "Show parser stack and input preview after hammer-parse-step:"
		self.value = True

# Instantiating the class registers the parameter
ExtendedParseStepInfo()

# GDB commands

# TODO: GDB help strings
# TODO: factor commands out into their own file

class HammerParserBacktrace(gdb.Command):
	"""Print the current parser stack, innermost parser first.

Usage: hammer-parser-backtrace [MAXDEPTH]
With MAXDEPTH, print at most that many entries."""

	def __init__(self):
		super(HammerParserBacktrace, self).__init__ ("hammer-parser-backtrace", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Print "[address] name" for the top entries of the current parser stack."""
		current_stack = top_level_parse.peek_parserstack()
		if current_stack is None:
			print("No parser stack available")
			return

		parserstack = current_stack.p_stack
		# Without a (valid) argument the whole stack is printed
		maxsize = len(parserstack)
		args = gdb.string_to_argv(arg)
		if args:
			try:
				requested = int(args[0])
				if requested < 1:
					raise ValueError
				maxsize = requested
			except ValueError:
				print("Argument must be a positive integer")

		depth = min(len(parserstack), maxsize)
		if depth > 0: # if stack not empty
			# The last `depth` entries, top of stack first
			for p in reversed(parserstack[-depth:]):
				# A parser may not have been named yet
				name = p.name if p.name is not None else "(unnamed)"
				print("[" + hex(p.address) + "] " + name)
			if depth < len(parserstack):
				print("[...]")

HammerParserBacktrace()

# Command "hammer-parser-mem-use <address>": prints the bytes_used dict of the
# parser stored under the given hexadecimal address
class HammerParserMemUse(gdb.Command):
	def __init__(self):
		super().__init__("hammer-parser-mem-use", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Parse the address argument and print that parser's memory use."""
		argv = gdb.string_to_argv(arg)
		if not argv:
			print("Usage: hammer-parser-mem-use <address>")
			return

		try:
			address = int(argv[0], 16)
			found = top_level_parse.parser_by_address(address)
			if found is not None:
				print(found.bytes_used)
		except ValueError:
			print("Address needs to be a hexadecimal number")

HammerParserMemUse()

# Command "hammer-parser-mem-use-name <name>": prints name, address and
# bytes_used of every parser carrying the given name
class HammerParserMemUseName(gdb.Command):
	def __init__(self):
		super().__init__("hammer-parser-mem-use-name", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Look up parsers by the name given as first argument and print them."""
		argv = gdb.string_to_argv(arg)
		if not argv:
			print("Usage: hammer-parser-mem-use-name <name>")
			return

		matches = top_level_parse.parsers_by_name(argv[0])
		if matches is None:
			return
		for parser in matches:
			print((parser.name, hex(parser.address), parser.bytes_used))

HammerParserMemUseName()

class HammerParserTopSingleArenaMem(gdb.Command):
	"""Print the parser with the largest memory use recorded under a single key.

Usage: hammer-parser-top-single-arena-mem"""

	def __init__(self):
		super(HammerParserTopSingleArenaMem, self).__init__("hammer-parser-top-single-arena-mem", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Print (name, address, bytes_used) of the top parser, if any parser is known."""
		# Checked up front: with no parsers recorded there is no top parser to print
		if not top_level_parse.parser_objs:
			print("No parsers recorded yet")
			return

		p = top_level_parse.get_parser_top_per_arena_mem()
		print((p.name, hex(p.address), p.bytes_used))

HammerParserTopSingleArenaMem()

class HammerParserTopTotalArenaMem(gdb.Command):
	"""Print the parser with the largest total recorded memory use.

Usage: hammer-parser-top-total-arena-mem"""

	def __init__(self):
		super(HammerParserTopTotalArenaMem, self).__init__("hammer-parser-top-total-arena-mem", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Print (name, address, bytes_used) and the byte total of the top parser, if any."""
		# Checked up front: with no parsers recorded there is no top parser to print
		if not top_level_parse.parser_objs:
			print("No parsers recorded yet")
			return

		p = top_level_parse.get_parser_top_total_arena_mem()
		print((p.name, hex(p.address), p.bytes_used))
		total_mem_use = p.get_arenasum()
		print("Total: " + str(total_mem_use) + " bytes")

HammerParserTopTotalArenaMem()

# TODO: average memory use, per arena and total

class HammerParserPreviewInput(gdb.Command):
	"""Print the input preview recorded at the last h_do_parse() breakpoint.

Usage: hammer-parser-preview-input"""

	def __init__(self):
		super(HammerParserPreviewInput, self).__init__("hammer-parser-preview-input", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Print the stored input chunk; prints an empty line if none was stored yet."""
		# The attribute may not exist before the first set_input_chunk() call
		print(getattr(top_level_parse, "input_chunk", ""))

HammerParserPreviewInput()

# Command "hammer-parser-average-mem": prints the dict returned by
# top_level_parse.get_avg_mem_use_per_arena()
class HammerParserAverageMem(gdb.Command):
	def __init__(self):
		super().__init__("hammer-parser-average-mem", gdb.COMMAND_OBSCURE)

	def invoke(self, arg, from_tty):
		"""Print the per-arena averages under a heading."""
		# The arguments are parsed but not used
		gdb.string_to_argv(arg)

		averages = top_level_parse.get_avg_mem_use_per_arena()
		print("Bytes used on average in each arena:")
		print(averages)

HammerParserAverageMem()

# Call when execution stopped at breakpoint in main
def _locate_first_ret(function_name, span):
	"""Return the address of the first "ret*" instruction of function_name.

	The address range [start, start + span] is disassembled, where start is
	the address of the function. span is a number of bytes (an address
	offset), not an instruction count. There is probably a way to find out
	where a function ends; until then the callers pass an arbitrary span.

	Raises LookupError if the symbol or a "ret*" instruction is not found.
	"""
	arch = gdb.selected_frame().architecture()
	symbol = gdb.lookup_symbol(function_name)[0]
	if symbol is None:
		raise LookupError("Symbol not found: " + function_name)
	start = int(symbol.value().address)
	for ins in arch.disassemble(start, start + span):
		if ins["asm"].startswith("ret"):
			return ins["addr"]
	raise LookupError("No ret instruction found in " + function_name)

def locate_perform_lowlevel_parse_retq():
	"""Return the address of the first ret instruction in perform_lowlevel_parse()."""
	# The choice of 400 bytes from the start is arbitrary. (This function is 310 bytes long on this particular machine.)
	return _locate_first_ret("perform_lowlevel_parse", 400)

def locate_h_packrat_parse_retq():
	"""Return the address of the first ret instruction in h_packrat_parse()."""
	# Same as with perform_lowlevel_parse, +450 is arbitrary
	return _locate_first_ret("h_packrat_parse", 450)

def locate_init_parser_retq():
	"""Return the address of the first ret instruction in init_parser()."""
	# Same as with perform_lowlevel_parse, +16000 is arbitrary
	return _locate_first_ret("init_parser", 16000)
#TODO: regex match retq, ret, etc

# Break on main so that libhammer.so gets to load
# Plain breakpoint (no Python handler); the "run" further below is expected to stop here
main = gdb.Breakpoint("main")




#init_parser = InitParserBreakpoint("pdf.c:1223")
# Bookkeeping breakpoints on function entry. Their stop() handlers update
# top_level_parse; only HDoParseBreakpoint can actually stop execution.
h_do_parse = HDoParseBreakpoint("h_do_parse")
h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse")
perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse")
h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw")
# TODO: investigate GDB frame filters for rendering backtraces

# One breakpoint per parse_* function. The handler derives a provisional
# parser name from the name of the function that was hit.
parse_action = ParserVirtualBreakpoint("parse_action")
parse_choice = ParserVirtualBreakpoint("parse_choice")
parse_sequence = ParserVirtualBreakpoint("parse_sequence")
parse_difference = ParserVirtualBreakpoint("parse_difference")
parse_many = ParserVirtualBreakpoint("parse_many")
parse_and = ParserVirtualBreakpoint("parse_and")
parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool")
parse_bind = ParserVirtualBreakpoint("parse_bind")
parse_bits = ParserVirtualBreakpoint("parse_bits")
parse_butnot = ParserVirtualBreakpoint("parse_butnot")
parse_charset = ParserVirtualBreakpoint("parse_charset")
parse_ch = ParserVirtualBreakpoint("parse_ch")
parse_end = ParserVirtualBreakpoint("parse_end")
parse_endianness = ParserVirtualBreakpoint("parse_endianness")
parse_epsilon = ParserVirtualBreakpoint("parse_epsilon")
parse_ignore = ParserVirtualBreakpoint("parse_ignore")
parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq")
parse_indirect = ParserVirtualBreakpoint("parse_indirect")
parse_int_range = ParserVirtualBreakpoint("parse_int_range")
parse_not = ParserVirtualBreakpoint("parse_not")
parse_nothing = ParserVirtualBreakpoint("parse_nothing")
parse_optional = ParserVirtualBreakpoint("parse_optional")
parse_permutation = ParserVirtualBreakpoint("parse_permutation")
parse_skip = ParserVirtualBreakpoint("parse_skip")
parse_seek = ParserVirtualBreakpoint("parse_seek")
parse_tell = ParserVirtualBreakpoint("parse_tell")
parse_token = ParserVirtualBreakpoint("parse_token")
parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented")
parse_put = ParserVirtualBreakpoint("parse_put")
parse_get = ParserVirtualBreakpoint("parse_get")
parse_whitespace = ParserVirtualBreakpoint("parse_whitespace")
parse_xor = ParserVirtualBreakpoint("parse_xor")

# Commandline:
# $ gdb -ex "source /path/to/parser-name-instrumentation-gdb.py" --args /path/to/pdf /path/to/input.pdf

# run until main
gdb.execute("run")

# The locate_*_retq() helpers use the selected frame and look up symbols, so
# the return breakpoints are only created now that the program is running.
# Each one is placed on the address of a "ret" instruction ("*0x..." location).
plp_retq = locate_perform_lowlevel_parse_retq()
perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq))
hpp_retq = locate_h_packrat_parse_retq()
h_packrat_parse_ret = HPackratParseRetBreakpoint("*" + hex(hpp_retq))
# init_parser's handler runs at the function's "ret"; it registers the
# symbols of the frame's block as named parsers
i_p_retq = locate_init_parser_retq()
init_parser = InitParserBreakpoint("*" + hex(i_p_retq))

# Run until stop position, if set. Finish parsing otherwise
gdb.execute("continue")

# Dump (name, address, bytes_used) of every parser recorded so far
print([(p.name, hex(p.address), p.bytes_used) for p in top_level_parse.parser_objs.values()])

# Approach 2: capture process trace with gdb, load the trace, execute stack commands on breakpoint hit, etc