# parser-name-instrumentation-gdb.py
# Authored by pompolic
# TODO: handlers for filters
# TODO: postordinate parser fails to get named
# TODO: step-to-parser command
# TODO: "current parser" on GUI is the one on top of the stack, while the argument of h_do_parse is not yet accounted for
# TODO: possible enhancement, caching the vtable type in Parser objects (allows searching by type)
# TODO: The parameter given to h_do_parse is not shown in the backtrace, which is confusing
# quick way to get locals from frame.block()
# {local.name : local for local in block}
class Parser:
    """Bookkeeping record for a single parser, identified by its address.

    Attributes:
        name: Display name of the parser, or None while it is still unnamed.
        address: Address of the parser object; rendered with hex() by __str__.
        bytes_used: Dict mapping a parse-state key to the number of bytes
            attributed to this parser under that key.
    """

    def __init__(self, name, address):
        self.name = name
        self.address = address
        self.bytes_used = {}

    def name_parser(self, name):
        """Set (or overwrite) the parser's display name."""
        self.name = name

    # TODO: remove
    def get_name_or_placeholder(self):
        """Return the name, or a placeholder string when no name is set."""
        if self.name is None:
            return "Wait for it... (if you're reading this, you found a bug)"
        return self.name

    def add_mem_use(self, state, size):
        """Add `size` bytes to the running total kept for `state`."""
        self.bytes_used[state] = self.bytes_used.get(state, 0) + size

    def get_mem_use(self, state=None):
        """Return recorded memory use.

        With no argument the whole state -> bytes dict is returned. With a
        state, the byte count for that state is returned, or 0 when nothing
        was recorded for it. The lookup never inserts a new key, so querying
        does not affect per-arena statistics.
        """
        if state is None:
            return self.bytes_used
        return self.bytes_used.get(state, 0)

    def __str__(self):
        return "(" + str(self.name) + ", " + hex(self.address) + ")"

    # Return the highest per-arena allocation count
    def get_arenamax(self):
        """Return the largest per-state byte count, or 0 if none recorded."""
        if not self.bytes_used:
            return 0
        return max(self.bytes_used.values())

    def get_arenasum(self):
        """Return the byte count summed over all states, or 0 if none recorded."""
        if not self.bytes_used:
            return 0
        return sum(self.bytes_used.values())
class ParserStack:
    """Stack of parser objects associated with one parse state.

    Attributes:
        parse_state: Key identifying the parse state this stack belongs to
            (None until set via set_state()).
        arena: Stored as given to the constructor; not read inside this class.
        p_stack: List used as the stack; the last element is the top.
        unclaimed_mem_use: Byte counter for allocations that could not be
            attributed to a parser on this stack.
    """

    def __init__(self, parse_state, arena):
        self.parse_state = parse_state
        self.arena = arena
        self.p_stack = []
        self.unclaimed_mem_use = 0

    def push(self, parser):
        """Push `parser` on top of the stack."""
        self.p_stack.append(parser)

    def pop(self):
        """Remove and return the top parser. Raises IndexError when empty."""
        return self.p_stack.pop()

    def peek(self):
        """Return the top parser without removing it, or None when empty."""
        if not self.p_stack:
            return None
        return self.p_stack[-1]

    def set_state(self, state):
        """Record the parse state this stack belongs to."""
        self.parse_state = state

    # Shortcut for setting the name property of the parser on the top of stack
    # In terms of tracing, *most* calls to a parser look something like this with the packrat backend:
    # h_do_parse()
    # parse_foo()
    # perform_lowlevel_parse()
    # perform_lowlevel_parse() is called when the memo table at that position is not filled in yet.
    # it calls the corresponding parse_* virtual function via the vtable, but other than that does not have type information
    # it's probably possible to extract type information, by comparing vtable addresses, but that seems painful
    # parse_foo() is the parser's corresponding virtual function in the frontend, which does not have the equivalent of a "this" pointer
    # So what we do to keep track of parsers is incrementally filling in the details for both
    # h_do_parse() is the backend's "actually run the parser" function, but does not get called for some parsers
    # (apparently mostly it's for higher-order parsers)
    # also contains the decision logic about whether to call perform_lowlevel_parse()
    # possible scenarios:
    # h_do_parse()
    # perform_lowlevel_parse()
    # parse_foo()
    # h_do_parse()
    # perform_lowlevel_parse()
    # h_do_parse()
    def name_top_parser(self, name):
        """Name the parser on top of the stack. Raises IndexError when empty."""
        self.p_stack[-1].name_parser(name)

    def add_mem_use_each(self, size):
        """Attribute `size` bytes to every parser currently on the stack.

        The bytes are recorded under this stack's parse_state, because each
        parser keeps its usage in a dict keyed by state rather than in a
        single integer.
        """
        for p in self.p_stack:
            p.add_mem_use(self.parse_state, size)

    def add_mem_use_top(self, size):
        """Attribute `size` bytes to the top parser, keyed by parse_state.

        Raises IndexError when the stack is empty.
        """
        self.p_stack[-1].add_mem_use(self.parse_state, size)

    def show_stack(self):
        """Print a placeholder line with the current stack depth."""
        print("stack would be printed here. Depth:", len(self.p_stack))
        #print([(p.get_name_or_placeholder(), hex(p.address)) for p in self.p_stack])

    def depth(self):
        """Return the number of parsers on the stack."""
        return len(self.p_stack)
# Class that is responsible for bookkeeping throughout the entire parse
# NB, this is slightly different terminology than the hammer API implicitly uses:
# There, a parse is started by h_parse(), and it is associated with a parse state.
# This corresponds to the ParserStack above. TopLevelParse keeps track of all these.
# Subsequent h_do_parse()s with the same parser state are considered to belong to the same parse
# The TopLevelParse class is initialized in trace_begin(), and is used until the end of the trace
class TopLevelParse:
    """Bookkeeping for the whole trace: parser stacks, parser objects, memory.

    Attributes:
        parser_stacks: List used as a stack of ParserStack objects.
        parser_objs: Dict mapping parser address -> Parser object.
        unclaimed_mem_use: Bytes allocated while no parser stack existed.
        input_chunk / current_input_chunk: Most recent input preview string.
            Both names are kept in sync because the accessors and the preview
            command read `input_chunk`, while the constructor historically
            initialised only `current_input_chunk`.
        current_parser_env: String describing the current parser environment.
        vt_types, parser_decombinator: Helper objects created by
            init_parser(); None until then.
    """

    def __init__(self):
        self.parser_stacks = []
        self.parser_objs = {}
        self.unclaimed_mem_use = 0
        # Holds 32 characters starting at state->input_stream[index], used by the GUI
        self.current_input_chunk = ''
        # Same value under the name used by set_input_chunk()/get_input_chunk(),
        # so reading the chunk before the first update does not raise.
        self.input_chunk = ''
        self.current_parser_env = ''
        self.vt_types = None
        self.parser_decombinator = None

    def init_parser(self):
        """Create the vtable-type and decombinator helpers."""
        self.vt_types = VTTypes()
        self.parser_decombinator = ParserDecombinator(self.vt_types)

    # Called from h_packrat_parse()'s handler, where the parse state and arena get initialized
    def enter_h_packrat_parse(self, parser):
        """Push a fresh ParserStack whose state is not yet known. Returns 0."""
        # TODO: add a parser stack or something?
        self.parser_stacks.append(ParserStack(None, None))
        return 0

    # TODO: arena parameter is useless
    def enter_h_do_parse(self, parse_state, arena, parser):
        """Fill in the current stack's parse state on the first h_do_parse().

        Does nothing when there is no parser stack at all.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            return
        if parser_stack.parse_state is None and parser_stack.parse_state != parse_state:
            self.first_h_do_parse_after_packrat_parse(parse_state, arena)

    # Called from h_do_parse()'s handler, at which point we know the addresses of the state and arena
    def first_h_do_parse_after_packrat_parse(self, parse_state, arena):
        """Record `parse_state` on the current parser stack."""
        self.peek_parserstack().set_state(parse_state)

    # Popping the stack of stack of parsers
    def return_from_h_packrat_parse(self):
        """Pop the current parser stack and warn if it still holds parsers."""
        old_stack = self.parser_stacks.pop()
        if old_stack.depth() > 0:
            print("Warning: parser stack not empty but parse is successful?")

    # Memoize the parser object for this particular address, then push it on the stack
    # Returns the parser object we just initalized (or the one already existing)
    def enter_perform_lowlevel_parse(self, parser_addr):
        """Look up or create the Parser for `parser_addr` and push it.

        When a decombinator is available, the parser environment string is
        refreshed from it as well.
        """
        parser_obj = self.add_or_get_parser(parser_addr)
        self.peek_parserstack().push(parser_obj)
        if self.parser_decombinator:
            p_env = self.parser_decombinator.decompose_parser(parser_obj, self)
            self.set_parser_env(type(p_env).__name__ + " - " + str(p_env)) # TODO: pass this as data structure to frontend
        return parser_obj

    def return_from_perform_lowlevel_parse(self):
        """Pop the parser that was pushed on entry to perform_lowlevel_parse()."""
        self.peek_parserstack().pop()
        # debug print here

    def enter_h_arena_malloc_raw(self, alloc_size):
        """Attribute an allocation to the most specific owner available.

        Order of preference: the parser on top of the current stack, then the
        current stack's unclaimed counter, then the global unclaimed counter.
        """
        parser_obj = self.peek_parser()
        parser_stack = self.peek_parserstack()
        # This is probably the slowest part of the code, or maybe the overhead adds up over many calls to h_arena_malloc_raw()
        if parser_obj is not None:
            # Caveat: parser_stack is assumed not to be None if we could get a parser_obj
            parser_obj.add_mem_use(parser_stack.parse_state, alloc_size)
        elif parser_stack is not None:
            #print("Allocation of " + str(alloc_size) + " bytes without a parser on the stack. (Happens before first call perform_lowlevel_parse to or after return from that call)")
            parser_stack.unclaimed_mem_use += alloc_size
        else:
            #print("Allocation of " + str(alloc_size) + " bytes without a parser stack. (This happens before and after parse)")
            self.unclaimed_mem_use += alloc_size

    def parse_virtual(self, parser_name):
        """Name the current parser if it has no name yet.

        Does nothing when there is no current parser.
        """
        parser_obj = self.peek_parser()
        if parser_obj is not None and parser_obj.name is None:
            parser_obj.name_parser(parser_name)
        #else:
        #print("Warning: parser already named! This is a bug. old name: %s, new name: %s" % (parser_obj.name, parser_name))

    def peek_parserstack(self):
        """Return the current ParserStack, or None when there is none."""
        if not self.parser_stacks:
            return None
        return self.parser_stacks[-1]

    def peek_parser(self):
        """Return the parser on top of the current stack.

        None is returned both when there is no parser stack and when the
        current parser stack is empty.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            return None
        return parser_stack.peek()

    def parser_by_address(self, parser_addr):
        """Return the Parser stored for `parser_addr`.

        Prints a message and returns None when the address is unknown.
        """
        address = int(parser_addr)
        try:
            return self.parser_objs[address]
        except KeyError:
            print("Parser with address " + hex(address) + " not found!")
            return None

    def parsers_by_name(self, parser_name):
        """Return a list of parsers named `parser_name`, or None if no match."""
        results = [p for p in self.parser_objs.values() if p.name == parser_name]
        return results if results else None

    def set_input_chunk(self, chunk):
        """Store the latest input preview under both attribute names."""
        self.input_chunk = chunk
        self.current_input_chunk = chunk

    def get_input_chunk(self):
        """Return the latest input preview ('' before the first update)."""
        return self.input_chunk

    def set_parser_env(self, parser_env):
        self.current_parser_env = parser_env

    def get_parser_env(self):
        return self.current_parser_env

    def add_or_get_parser(self, parser_addr):
        """Return the Parser for `parser_addr`, creating an unnamed one if needed."""
        address = int(parser_addr)
        try:
            parser_obj = self.parser_objs[address]
        except KeyError:
            # Create a parser object with no name and the address of the parser
            parser_obj = Parser(None, address)
            self.parser_objs[address] = parser_obj
        return parser_obj

    def get_parser_top_per_arena_mem(self):
        """Return the parser with the largest single-state byte count.

        Returns None when no parsers have been recorded. Among equal maxima
        the earliest-inserted parser is returned.
        """
        if not self.parser_objs:
            return None
        return max(self.parser_objs.values(), key=lambda p: p.get_arenamax())

    def get_parser_top_total_arena_mem(self):
        """Return the parser with the largest byte count summed over states.

        Returns None when no parsers have been recorded. Among equal maxima
        the earliest-inserted parser is returned.
        """
        if not self.parser_objs:
            return None
        return max(self.parser_objs.values(), key=lambda p: p.get_arenasum())

    def get_avg_mem_use_per_arena(self):
        """Return a dict mapping each bytes_used key to the mean bytes per parser.

        The mean is taken over the parsers that have an entry for that key.
        """
        totals = {}
        counts = {}
        # Accumulate byte counts and counts of parsers using that arena
        for p in self.parser_objs.values():
            for arena, mem in p.bytes_used.items():
                counts[arena] = counts.get(arena, 0) + 1
                totals[arena] = totals.get(arena, 0) + mem
        return {arena: total / counts[arena] for arena, total in totals.items()}
    # TODO: get_avg_mem_use_all_arenas, get_total_mem_use
# Module-wide bookkeeping object shared by the breakpoint handlers and commands below
top_level_parse = TopLevelParse()
# Approach 1: load the application, set breakpoints, execute stack commands on breakpoint hit, continue
class HPackratParseBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that notifies the bookkeeping of a new packrat parse."""

    def stop(self):
        """Read the `parser` local and report entry; never stops the inferior.

        If the frame's block has no symbol named `parser`, None is passed on
        instead of failing with an unbound local, so the bookkeeping of entry
        and return stays balanced.
        """
        frame = gdb.selected_frame()
        parser = None
        for val in frame.block(): # GDB, why did you take away [] for gdb.Block?
            if val.name == 'parser':
                parser = int(val.value(frame))
        top_level_parse.enter_h_packrat_parse(parser)
        return False
class HPackratParseRetBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that reports a return from the packrat parse."""

    def stop(self):
        """Notify the bookkeeping and let the inferior continue."""
        top_level_parse.return_from_h_packrat_parse()
        return False
# TODO: frame.older() allows accessing the caller frame. decision logic about whether to call first_h_do_parse_after_packrat_parse()
# could be moved here
class HDoParseBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that tracks parse state, input preview and stop requests."""

    def stop(self):
        """Update bookkeeping and decide whether to stop the inferior.

        Reads the `parser` and `state` locals of the selected frame, reports
        them to the bookkeeping, refreshes the 32-character input preview and
        then evaluates the two stop conditions (step counter and stop
        position). Returns True when either condition asks for a stop.

        If either local cannot be found, a warning is printed and the
        inferior continues without any bookkeeping update.
        """
        frame = gdb.selected_frame()
        parser = None
        state_ptr = None
        for val in frame.block():
            if val.name == 'parser':
                parser = int(val.value(frame))
            elif val.name == 'state':
                state_ptr = val.value(frame)
        if parser is None or state_ptr is None:
            print("Warning: could not read 'parser'/'state' locals in h_do_parse handler")
            return False

        # If you want to printf debug the parse state
        #print(state_ptr.dereference())
        input_stream = state_ptr.dereference()['input_stream']
        index = input_stream['index']
        input_ptr = input_stream['input']

        top_level_parse.enter_h_do_parse(int(state_ptr), None, parser)
        input_chunk = input_ptr + index
        #print(input_chunk.string('ascii','backslashreplace',10))
        top_level_parse.set_input_chunk(input_chunk.string('UTF-8', 'replace', 32))

        # Both conditions are always evaluated, because each one updates its
        # own convenience variable as a side effect.
        should_stop = self._step_budget_exhausted()
        if self._stop_position_reached(index):
            should_stop = True
        return should_stop

    def _step_budget_exhausted(self):
        """Decrement $hammer_step_counter; return True when it reaches zero."""
        step_counter = gdb.convenience_variable("hammer_step_counter")
        if step_counter is not None and step_counter > 0:
            step_counter -= 1
            if step_counter == 0:
                gdb.set_convenience_variable("hammer_step_counter", None) # unset step counter
                return True
            gdb.set_convenience_variable("hammer_step_counter", step_counter)
        return False

    def _stop_position_reached(self, index):
        """Return True (and unset $hammer_stop_pos) once `index` has reached it."""
        stop_pos = gdb.convenience_variable("hammer_stop_pos")
        if stop_pos is not None and stop_pos <= index:
            gdb.set_convenience_variable("hammer_stop_pos", None)
            print("Requested stop position: " + str(stop_pos) + " Stopped at: " + str(index))
            return True
        return False
class PerformLowLevelParseBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that reports entry into perform_lowlevel_parse."""

    def stop(self):
        """Pass the `parser` local's integer value on; never stops the inferior."""
        current = gdb.selected_frame()
        scope = current.block()
        for sym in scope:
            if sym.name != 'parser':
                continue
            parser = int(sym.value(current))
        top_level_parse.enter_perform_lowlevel_parse(parser)
        return False
class PerformLowLevelParseRetBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that reports a return from perform_lowlevel_parse."""

    def stop(self):
        """Notify the bookkeeping and let the inferior continue."""
        top_level_parse.return_from_perform_lowlevel_parse()
        return False
class ParserVirtualBreakpoint(gdb.Breakpoint):
    """Breakpoint handler for the parse_* functions; names the current parser."""

    def stop(self):
        """Derive a placeholder name from the function name; never stops.

        The function name is parse_*, and everything after the first
        underscore is used as the parser type, so `parse_attr_bool` yields
        "(Unnamed attr_bool)" rather than being cut at the second underscore.
        A name without an underscore is used unchanged.
        """
        frame = gdb.selected_frame()
        function_name = frame.function().name
        # This is pretty much the same as naming it based on which parse_* function is called, so long as foo_vt contains parse_foo , and not e.g. parse_bar
        _prefix, separator, remainder = function_name.partition("_")
        parser_type = remainder if separator else function_name
        parser_name = "(Unnamed " + parser_type + ")"
        top_level_parse.parse_virtual(parser_name)
        return False
class InitParserBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that registers every local of the frame as a named parser."""

    def stop(self):
        """Initialise the helpers, then record one Parser per block symbol.

        Each symbol's value is converted with int() and used both as the
        dictionary key and as the Parser's address; the symbol name becomes
        the parser name. Never stops the inferior.
        """
        current = gdb.selected_frame()
        scope = current.block()
        top_level_parse.init_parser()
        # This will also catch locals that aren't parsers, but it's not a problem in practice,
        # since h_parse() will never be called on them
        # If it becomes a problem after all, gdb.parse_and_eval() might be used to filter them out
        for sym in scope:
            address = int(sym.value(current))
            top_level_parse.parser_objs[address] = Parser(sym.name, address)
        return False
class HArenaMallocRawBreakpoint(gdb.Breakpoint):
    """Breakpoint handler that reports the size of each arena allocation."""

    def stop(self):
        """Pass the `size` local's integer value on; never stops the inferior."""
        current = gdb.selected_frame()
        scope = current.block()
        for sym in scope:
            if sym.name != 'size':
                continue
            alloc_size = int(sym.value(current))
        top_level_parse.enter_h_arena_malloc_raw(alloc_size)
        return False
# GDB parameters
# TODO: hammer parameter prefix
class ExtendedParseStepInfo(gdb.Parameter):
    """Controls whether to display parser stack and input preview on stepping the parse."""

    # The help strings are class attributes so that they already exist when
    # gdb.Parameter.__init__ registers the parameter; assigning them on the
    # instance afterwards is too late for GDB to pick them up.
    set_doc = "Set whether to show parser stack and input preview after hammer-parse-step:"
    show_doc = "Show parser stack and input preview after hammer-parse-step:"

    def __init__(self):
        super(ExtendedParseStepInfo, self).__init__("hammer-extended-parse-step-info", gdb.COMMAND_OBSCURE, gdb.PARAM_BOOLEAN)
        # Default: extended info enabled
        self.value = True


ExtendedParseStepInfo()
# GDB commands
# TODO: GDB help strings
# TODO: factor commands out into their own file
class HammerParserBacktrace(gdb.Command):
    """Print the current parser stack, innermost parser first.

    Usage: hammer-parser-backtrace [max-entries]
    """

    def __init__(self):
        super(HammerParserBacktrace, self).__init__ ("hammer-parser-backtrace", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print up to `max-entries` parsers from the top of the stack.

        Without an argument, or with an invalid one (after printing a
        message), the whole stack is printed. "[...]" marks a truncated
        listing. Prints a notice and returns when there is no parser stack.
        """
        current_stack = top_level_parse.peek_parserstack()
        if current_stack is None:
            print("No parser stack available (no parse in progress)")
            return
        parserstack = current_stack.p_stack

        args = gdb.string_to_argv(arg)
        maxsize = len(parserstack)
        if args:
            try:
                requested = int(args[0])
                if requested < 1:
                    raise ValueError
                maxsize = requested
            except ValueError:
                print("Argument must be a positive integer")

        depth = min(len(parserstack), maxsize)
        # The last `depth` entries, walked from the top of the stack downwards
        for p in reversed(parserstack[len(parserstack) - depth:]):
            # str() keeps the listing working for parsers that are still unnamed
            print("[" + hex(p.address) + "] " + str(p.name))
        if depth < len(parserstack):
            print("[...]")


HammerParserBacktrace()
class HammerParserMemUse(gdb.Command):
    """Print the recorded memory use of the parser at a hexadecimal address.

    Usage: hammer-parser-mem-use <address>
    """

    def __init__(self):
        super(HammerParserMemUse, self).__init__("hammer-parser-mem-use", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Parse the address argument as base 16 and print that parser's bytes_used."""
        argv = gdb.string_to_argv(arg)
        if not argv:
            print("Usage: hammer-parser-mem-use <address>")
            return
        try:
            address = int(argv[0], 16)
        except ValueError:
            print("Address needs to be a hexadecimal number")
            return
        found = top_level_parse.parser_by_address(address)
        if found is not None:
            print(found.bytes_used)


HammerParserMemUse()
class HammerParserMemUseName(gdb.Command):
    """Print the recorded memory use of every parser with a given name.

    Usage: hammer-parser-mem-use-name <name>
    """

    def __init__(self):
        super(HammerParserMemUseName, self).__init__("hammer-parser-mem-use-name", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print (name, address, bytes_used) for each parser matching the first argument."""
        argv = gdb.string_to_argv(arg)
        if not argv:
            print("Usage: hammer-parser-mem-use-name <name>")
            return
        matches = top_level_parse.parsers_by_name(argv[0])
        if matches is None:
            return
        for parser in matches:
            print((parser.name, hex(parser.address), parser.bytes_used))


HammerParserMemUseName()
class HammerParserTopSingleArenaMem(gdb.Command):
    """Print the parser with the highest memory use within a single arena."""

    def __init__(self):
        super(HammerParserTopSingleArenaMem, self).__init__("hammer-parser-top-single-arena-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print (name, address, bytes_used) of the top parser, if any.

        Prints a notice instead of failing when no parser has been recorded.
        """
        try:
            p = top_level_parse.get_parser_top_per_arena_mem()
        except IndexError:
            # An empty parser table surfaces as IndexError from the lookup
            p = None
        if p is None:
            print("No parsers recorded yet")
            return
        print((p.name, hex(p.address), p.bytes_used))


HammerParserTopSingleArenaMem()
class HammerParserTopTotalArenaMem(gdb.Command):
    """Print the parser with the highest memory use summed over all arenas."""

    def __init__(self):
        super(HammerParserTopTotalArenaMem, self).__init__("hammer-parser-top-total-arena-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print (name, address, bytes_used) and the total of the top parser.

        Prints a notice instead of failing when no parser has been recorded.
        """
        try:
            p = top_level_parse.get_parser_top_total_arena_mem()
        except IndexError:
            # An empty parser table surfaces as IndexError from the lookup
            p = None
        if p is None:
            print("No parsers recorded yet")
            return
        print((p.name, hex(p.address), p.bytes_used))
        print("Total: " + str(p.get_arenasum()) + " bytes")


HammerParserTopTotalArenaMem()
# TODO: average memory use, per arena and total
class HammerParserPreviewInput(gdb.Command):
    """Print the most recently captured input preview."""

    def __init__(self):
        super(HammerParserPreviewInput, self).__init__("hammer-parser-preview-input", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print the stored input chunk, or an empty line if none was captured yet."""
        # The attribute may not exist before the first chunk has been stored
        print(getattr(top_level_parse, "input_chunk", ""))


HammerParserPreviewInput()
class HammerParserAverageMem(gdb.Command):
    """Print the average number of bytes used per parser in each arena."""

    def __init__(self):
        super(HammerParserAverageMem, self).__init__("hammer-parser-average-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        """Print a header line followed by the per-arena averages dict."""
        # The command takes no arguments; the argument string is still tokenised
        gdb.string_to_argv(arg)
        averages = top_level_parse.get_avg_mem_use_per_arena()
        print("Bytes used on average in each arena:")
        print(averages)


HammerParserAverageMem()
# Call when execution stopped at breakpoint in main
def _locate_first_ret(function_name, window):
    """Return the address of the first `ret*` instruction of a function.

    Disassembles from the function's start address up to `window` bytes past
    it and returns the address of the first instruction whose mnemonic text
    starts with "ret".

    Raises:
        LookupError: the symbol `function_name` could not be found.
        IndexError: no matching instruction inside the disassembled range.
    """
    arch = gdb.selected_frame().architecture()
    symbol = gdb.lookup_symbol(function_name)[0]
    if symbol is None:
        raise LookupError("Symbol " + function_name + " not found")
    start = int(symbol.value().address)
    for ins in arch.disassemble(start, start + window):
        if ins["asm"].startswith("ret"):
            return ins["addr"]
    raise IndexError("No ret instruction found within " + str(window) + " bytes of " + function_name)


# Call when execution stopped at breakpoint in main
def locate_perform_lowlevel_parse_retq():
    """Return the address of the first ret in perform_lowlevel_parse."""
    # The choice of disassembling only 400 instructions from the start is arbitrary. (This function is 310 bytes long on this particular machine.) There is probably a way to find out where a function ends.
    return _locate_first_ret("perform_lowlevel_parse", 400)


def locate_h_packrat_parse_retq():
    """Return the address of the first ret in h_packrat_parse."""
    # Same as with perform_lowlevel_parse, +450 is arbitrary
    return _locate_first_ret("h_packrat_parse", 450)


def locate_init_parser_retq():
    """Return the address of the first ret in init_parser."""
    # Same as with perform_lowlevel_parse, +16000 is arbitrary
    return _locate_first_ret("init_parser", 16000)
#TODO: regex match retq, ret, etc
# Break on main so that libhammer.so gets to load
# Plain (non-handler) breakpoint: execution stops here so the return-address
# breakpoints can be located once the program is loaded.
main = gdb.Breakpoint("main")
#init_parser = InitParserBreakpoint("pdf.c:1223")
# Handler breakpoints on the backend entry points; each handler's stop() does
# its bookkeeping and decides whether the inferior stops.
h_do_parse = HDoParseBreakpoint("h_do_parse")
h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse")
perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse")
h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw")
# todo: investigate GDB frame filters for rendering backtraces
# One naming breakpoint per parse_* function; the handler derives the parser's
# placeholder name from the function name.
parse_action = ParserVirtualBreakpoint("parse_action")
parse_choice = ParserVirtualBreakpoint("parse_choice")
parse_sequence = ParserVirtualBreakpoint("parse_sequence")
parse_difference = ParserVirtualBreakpoint("parse_difference")
parse_many = ParserVirtualBreakpoint("parse_many")
parse_and = ParserVirtualBreakpoint("parse_and")
parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool")
parse_bind = ParserVirtualBreakpoint("parse_bind")
parse_bits = ParserVirtualBreakpoint("parse_bits")
parse_butnot = ParserVirtualBreakpoint("parse_butnot")
parse_charset = ParserVirtualBreakpoint("parse_charset")
parse_ch = ParserVirtualBreakpoint("parse_ch")
parse_end = ParserVirtualBreakpoint("parse_end")
parse_endianness = ParserVirtualBreakpoint("parse_endianness")
parse_epsilon = ParserVirtualBreakpoint("parse_epsilon")
parse_ignore = ParserVirtualBreakpoint("parse_ignore")
parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq")
parse_indirect = ParserVirtualBreakpoint("parse_indirect")
parse_int_range = ParserVirtualBreakpoint("parse_int_range")
parse_not = ParserVirtualBreakpoint("parse_not")
parse_nothing = ParserVirtualBreakpoint("parse_nothing")
parse_optional = ParserVirtualBreakpoint("parse_optional")
parse_permutation = ParserVirtualBreakpoint("parse_permutation")
parse_skip = ParserVirtualBreakpoint("parse_skip")
parse_seek = ParserVirtualBreakpoint("parse_seek")
parse_tell = ParserVirtualBreakpoint("parse_tell")
parse_token = ParserVirtualBreakpoint("parse_token")
parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented")
parse_put = ParserVirtualBreakpoint("parse_put")
parse_get = ParserVirtualBreakpoint("parse_get")
parse_whitespace = ParserVirtualBreakpoint("parse_whitespace")
parse_xor = ParserVirtualBreakpoint("parse_xor")
# Commandline:
# $ gdb -ex "source /path/to/parser-name-instrumentation-gdb.py" --args /path/to/pdf /path/to/input.pdf
# run until main
gdb.execute("run")
# With the program stopped, find the first ret instruction of each function
# and place the matching "return" handler breakpoint on that address.
plp_retq = locate_perform_lowlevel_parse_retq()
perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq))
hpp_retq = locate_h_packrat_parse_retq()
h_packrat_parse_ret = HPackratParseRetBreakpoint("*" + hex(hpp_retq))
# The init_parser handler is placed on the function's ret, not on its entry
i_p_retq = locate_init_parser_retq()
init_parser = InitParserBreakpoint("*" + hex(i_p_retq))
# Run until stop position, if set. Finish parsing otherwise
gdb.execute("continue")
# Dump (name, address, per-state byte counts) for every recorded parser
print([(p.name, hex(p.address), p.bytes_used) for p in top_level_parse.parser_objs.values()])
# Approach 2: capture process trace with gdb, load the trace, execute stack commands on breakpoint hit, etc