"""GDB instrumentation that names hammer parsers and tallies their arena use.

Originally added as gdb-port/parser-name-instrumentation-gdb.py
("GDB port added").

The script sets breakpoints on a handful of hammer functions
(h_packrat_parse, h_do_parse, perform_lowlevel_parse, h_arena_malloc_raw and
the parse_* virtual functions), mirrors the parser call stack in Python, and
attributes every h_arena_malloc_raw() request size to the parser on top of
that stack.  After the program has run, a summary of all known parsers is
printed.

Commandline:
  $ gdb -ex "source /path/to/parser-name-instrumentation-gdb.py" \
        --args /path/to/pdf /path/to/input.pdf
"""

# TODO: handlers for filters
# TODO: postordinate parser fails to get named

try:
    import gdb
except ImportError:
    # Not running inside GDB's embedded interpreter.  The bookkeeping classes
    # below stay importable (e.g. for unit tests); no breakpoints are
    # installed and the program is not run.
    gdb = None

# Base class of the breakpoint handlers; a plain object outside of GDB so the
# class statements below still evaluate.
_BreakpointBase = gdb.Breakpoint if gdb is not None else object


class Parser:
    """Bookkeeping record for one parser, identified by its address.

    Attributes:
        name: display name, or None while the parser is still unnamed.
        address: address of the parser in the debugged process.
        bytes_used: dict mapping a parse state to the number of bytes
            attributed to this parser under that state.
    """

    # TODO: remove
    _parser_names = {}

    def __init__(self, name, address):
        self.name = name
        self.address = address
        self.bytes_used = {}

    def name_parser(self, name):
        """Set (or overwrite) the parser's display name."""
        self.name = name

    # TODO: remove
    def get_name_or_placeholder(self):
        """Return the name, or a placeholder string if none was assigned."""
        if self.name is None:
            return "Wait for it... (if you're reading this, you found a bug)"
        return self.name

    def add_mem_use(self, state, size):
        """Add `size` bytes to the tally kept for parse state `state`."""
        self.bytes_used[state] = self.bytes_used.get(state, 0) + size

    def get_mem_use(self, state=None):
        """Return the recorded memory use.

        With `state` omitted (or None) the whole per-state dict is returned;
        otherwise the byte count for that state, 0 if nothing was recorded.
        Looking up an unknown state does not add it to the dict.
        """
        if state is None:
            return self.bytes_used
        return self.bytes_used.get(state, 0)


class ParserStack:
    """Stack of the Parser objects that are active within one parse.

    Attributes:
        parse_state: address of the parse state this stack belongs to, or
            None until it has been learned (see set_state()).
        arena: stored as passed to the constructor; not used by this class.
        p_stack: list of Parser objects, top of stack last.
        unclaimed_mem_use: bytes allocated while no parser was on the stack.
    """

    def __init__(self, parse_state, arena):
        self.parse_state = parse_state
        self.arena = arena
        self.p_stack = []
        self.unclaimed_mem_use = 0

    def push(self, parser):
        """Push `parser` on top of the stack."""
        self.p_stack.append(parser)

    def pop(self):
        """Remove and return the top parser; IndexError if the stack is empty."""
        return self.p_stack.pop()

    def peek(self):
        """Return the top parser without removing it, or None if empty."""
        if not self.p_stack:
            return None
        return self.p_stack[-1]

    def set_state(self, state):
        """Record the parse state this stack belongs to."""
        self.parse_state = state

    # Shortcut for setting the name property of the parser on the top of stack
    # In terms of tracing, *most* calls to a parser look something like this
    # with the packrat backend:
    # h_do_parse()
    # parse_foo()
    # perform_lowlevel_parse()
    #
    # perform_lowlevel_parse() is called when the memo table at that position
    # is not filled in yet.  It calls the corresponding parse_* virtual
    # function via the vtable, but other than that does not have type
    # information.  It's probably possible to extract type information by
    # comparing vtable addresses, but that seems painful.
    #
    # parse_foo() is the parser's corresponding virtual function in the
    # frontend, which does not have the equivalent of a "this" pointer.
    #
    # So what we do to keep track of parsers is incrementally filling in the
    # details for both.
    #
    # h_do_parse() is the backend's "actually run the parser" function, but
    # does not get called for some parsers (apparently mostly it's for
    # higher-order parsers).  It also contains the decision logic about
    # whether to call perform_lowlevel_parse().
    #
    # possible scenarios:
    # h_do_parse()
    # perform_lowlevel_parse()
    # parse_foo()
    #
    # h_do_parse()
    # perform_lowlevel_parse()
    #
    # h_do_parse()
    def name_top_parser(self, name):
        """Name the parser on top of the stack; IndexError if it is empty."""
        self.p_stack[-1].name_parser(name)

    def add_mem_use_each(self, size):
        """Attribute `size` bytes to every parser currently on the stack."""
        for parser in self.p_stack:
            parser.add_mem_use(self.parse_state, size)

    def add_mem_use_top(self, size):
        """Attribute `size` bytes to the top parser.

        With an empty stack the bytes are counted as unclaimed instead.
        """
        top = self.peek()
        if top is None:
            self.unclaimed_mem_use += size
        else:
            top.add_mem_use(self.parse_state, size)

    def show_stack(self):
        """Print a one-line summary of the stack (currently only its depth)."""
        print("stack would be printed here. Depth:", len(self.p_stack))
        # print([(p.get_name_or_placeholder(), hex(p.address)) for p in self.p_stack])

    def depth(self):
        """Return the number of parsers on the stack."""
        return len(self.p_stack)


# Class that is responsible for bookkeeping throughout the entire parse
# NB, this is slightly different terminology than the hammer API implicitly
# uses: there, a parse is started by h_parse(), and it is associated with a
# parse state.  This corresponds to the ParserStack above.  TopLevelParse
# keeps track of all these.  Subsequent h_do_parse()s with the same parser
# state are considered to belong to the same parse.
#
# A single TopLevelParse instance (top_level_parse) is created at module level
# below and is used by all breakpoint handlers.
class TopLevelParse:
    """Bookkeeping for a whole run: a stack of ParserStacks plus all parsers.

    Attributes:
        parser_stacks: list of ParserStack objects, innermost parse last.
        parser_objs: dict mapping parser address -> Parser.
        unclaimed_mem_use: bytes allocated while no parse was in progress.
    """

    def __init__(self):
        self.parser_stacks = []
        self.parser_objs = {}
        self.unclaimed_mem_use = 0

    # Called from h_packrat_parse()'s handler, where the parse state and arena
    # get initialized
    def enter_h_packrat_parse(self, parser):
        """Open a new ParserStack for a starting parse; always returns 0.

        `parser` is accepted for the handler's benefit but not used; the
        stack's parse state stays None until the first h_do_parse().
        """
        # TODO: add a parser stack or something?
        self.parser_stacks.append(ParserStack(None, None))
        return 0

    # TODO: arena parameter is useless
    def enter_h_do_parse(self, parse_state, arena, parser):
        """Fill in the current stack's parse state if it is not known yet.

        Does nothing when no parse is in progress.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            return
        if parser_stack.parse_state is None and parse_state is not None:
            self.first_h_do_parse_after_packrat_parse(parse_state, arena)

    # Called from h_do_parse()'s handler, at which point we know the addresses
    # of the state and arena
    def first_h_do_parse_after_packrat_parse(self, parse_state, arena):
        """Record `parse_state` on the current ParserStack.

        Assumes a parse is in progress (AttributeError otherwise).
        """
        self.peek_parserstack().set_state(parse_state)

    # Popping the stack of stack of parsers
    def return_from_h_packrat_parse(self):
        """Close the current ParserStack, warning if it was left non-empty."""
        if not self.parser_stacks:
            print("Warning: return from h_packrat_parse without a parser stack")
            return
        old_stack = self.parser_stacks.pop()
        if old_stack.depth() > 0:
            print("Warning: parser stack not empty but parse is successful?")

    # Memoize the parser object for this particular address, then push it on
    # the stack
    def enter_perform_lowlevel_parse(self, parser_addr):
        """Push the Parser for `parser_addr`, creating it on first sight.

        Returns the Parser object (new or already existing).  A newly
        created Parser has no name yet.  Assumes a parse is in progress.
        """
        parser_obj = self.parser_objs.get(parser_addr)
        if parser_obj is None:
            # Create a parser object with no name and the address of the parser
            parser_obj = Parser(None, parser_addr)
            self.parser_objs[parser_addr] = parser_obj

        self.peek_parserstack().push(parser_obj)
        return parser_obj

    def return_from_perform_lowlevel_parse(self):
        """Pop the parser pushed by the matching enter_perform_lowlevel_parse().

        Does nothing if there is no stack or the stack is already empty.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is not None and parser_stack.depth() > 0:
            parser_stack.pop()
            # debug print here

    def enter_h_arena_malloc_raw(self, alloc_size):
        """Attribute an allocation request of `alloc_size` bytes.

        The bytes go to the parser on top of the current stack if there is
        one, else to the current stack's unclaimed counter, else to the
        global unclaimed counter.
        """
        parser_obj = self.peek_parser()
        parser_stack = self.peek_parserstack()
        # This is probably the slowest part of the code, or maybe the overhead
        # adds up over many calls to h_arena_malloc_raw()
        if parser_obj is not None:
            # parser_stack cannot be None here: peek_parser() only returns a
            # parser when a stack exists.
            parser_obj.add_mem_use(parser_stack.parse_state, alloc_size)
        elif parser_stack is not None:
            print("Allocation of " + str(alloc_size) + " bytes without a parser on the stack. (Happens before first call perform_lowlevel_parse to or after return from that call)")
            parser_stack.unclaimed_mem_use += alloc_size
        else:
            print("Allocation of " + str(alloc_size) + " bytes without a parser stack. (This happens before and after parse)")
            self.unclaimed_mem_use += alloc_size

    def parse_virtual(self, parser_name):
        """Name the top parser `parser_name` unless it already has a name.

        Does nothing when there is no parser on the stack.
        """
        parser_obj = self.peek_parser()
        if parser_obj is None:
            return
        if parser_obj.name is None:
            parser_obj.name_parser(parser_name)
        # else:
        #     print("Warning: parser already named! This is a bug. old name: %s, new name: %s" % (parser_obj.name, parser_name))

    def peek_parserstack(self):
        """Return the innermost ParserStack, or None if no parse is active."""
        if not self.parser_stacks:
            return None
        return self.parser_stacks[-1]

    def peek_parser(self):
        """Return the top parser of the innermost stack.

        None is returned both when there is no stack at all and when the
        innermost stack is empty.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            return None
        return parser_stack.peek()


top_level_parse = TopLevelParse()


def read_int_locals(frame, *names):
    """Return {name: int(value)} for the requested symbols of `frame`'s block.

    Symbols that are not present are simply missing from the result.  If the
    frame has no block (frame.block() raises RuntimeError, e.g. without debug
    information) an empty dict is returned.
    """
    wanted = set(names)
    try:
        block = frame.block()
    except RuntimeError:
        return {}
    found = {}
    for sym in block:  # GDB, why did you take away [] for gdb.Block?
        if sym.name in wanted:
            found[sym.name] = int(sym.value(frame))
    return found


# Approach 1: load the application, set breakpoints, execute stack commands on
# breakpoint hit, continue
#
# Every stop() below returns False so that the debugged program keeps running.

class HPackratParseBreakpoint(_BreakpointBase):
    """Entry of h_packrat_parse(): a new parse begins."""

    def stop(self):
        frame = gdb.selected_frame()
        parser = read_int_locals(frame, "parser").get("parser")
        top_level_parse.enter_h_packrat_parse(parser)
        return False


class HPackratParseRetBreakpoint(_BreakpointBase):
    """Return from h_packrat_parse(): the current parse ends."""

    def stop(self):
        top_level_parse.return_from_h_packrat_parse()
        return False


# TODO: frame.older() allows accessing the caller frame. decision logic about
# whether to call first_h_do_parse_after_packrat_parse() could be moved here
class HDoParseBreakpoint(_BreakpointBase):
    """Entry of h_do_parse(): learn the parse state of the current parse."""

    def stop(self):
        frame = gdb.selected_frame()
        values = read_int_locals(frame, "parser", "state")
        top_level_parse.enter_h_do_parse(
            values.get("state"), None, values.get("parser"))
        return False


class PerformLowLevelParseBreakpoint(_BreakpointBase):
    """Entry of perform_lowlevel_parse(): push the parser being run."""

    def stop(self):
        frame = gdb.selected_frame()
        parser = read_int_locals(frame, "parser").get("parser")
        if parser is None:
            print("Warning: no 'parser' symbol found in perform_lowlevel_parse frame")
        else:
            top_level_parse.enter_perform_lowlevel_parse(parser)
        return False


class PerformLowLevelParseRetBreakpoint(_BreakpointBase):
    """Return from perform_lowlevel_parse(): pop the parser that was run."""

    def stop(self):
        top_level_parse.return_from_perform_lowlevel_parse()
        return False


class ParserVirtualBreakpoint(_BreakpointBase):
    """Entry of a parse_* virtual function: name the parser on top of stack."""

    @staticmethod
    def parser_type(function_name):
        """Return what follows the first underscore of `function_name`.

        "parse_attr_bool" -> "attr_bool".  A name without an underscore is
        returned unchanged.
        """
        _, sep, rest = function_name.partition("_")
        return rest if sep else function_name

    def stop(self):
        function = gdb.selected_frame().function()
        if function is None:
            # No symbol for this frame, so there is no name to derive.
            return False
        # function name is parse_*; we keep everything after the prefix
        parser_name = "(Unnamed " + self.parser_type(function.name) + ")"
        top_level_parse.parse_virtual(parser_name)
        return False


class InitParserBreakpoint(_BreakpointBase):
    """Registers every local of the stopped frame as a named Parser."""

    def stop(self):
        frame = gdb.selected_frame()

        # This will also catch locals that aren't parsers, but it's not a
        # problem in practice, since h_parse() will never be called on them
        for sym in frame.block():
            try:
                address = int(sym.value(frame))
            except (RuntimeError, TypeError, ValueError):
                # Not readable as an integer/pointer (gdb.error derives from
                # RuntimeError); skip it instead of dropping the remaining
                # locals.
                continue
            top_level_parse.parser_objs[address] = Parser(sym.name, address)

        return False


class HArenaMallocRawBreakpoint(_BreakpointBase):
    """Entry of h_arena_malloc_raw(): attribute the requested size."""

    def stop(self):
        frame = gdb.selected_frame()
        alloc_size = read_int_locals(frame, "size").get("size")
        if alloc_size is None:
            print("Warning: no 'size' symbol found in h_arena_malloc_raw frame")
        else:
            top_level_parse.enter_h_arena_malloc_raw(alloc_size)

        return False


if gdb is not None:
    # Break on main so that libhammer.so gets to load
    # main = gdb.Breakpoint("main")

    init_parser = InitParserBreakpoint("pdf.c:1223")
    h_do_parse = HDoParseBreakpoint("h_do_parse")
    h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse")
    perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse")
    h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw")
    # todo: investigate GDB frame filters for rendering backtraces

    parse_action = ParserVirtualBreakpoint("parse_action")
    parse_choice = ParserVirtualBreakpoint("parse_choice")
    parse_sequence = ParserVirtualBreakpoint("parse_sequence")
    parse_difference = ParserVirtualBreakpoint("parse_difference")
    parse_many = ParserVirtualBreakpoint("parse_many")
    parse_and = ParserVirtualBreakpoint("parse_and")
    parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool")
    parse_bind = ParserVirtualBreakpoint("parse_bind")
    parse_bits = ParserVirtualBreakpoint("parse_bits")
    parse_butnot = ParserVirtualBreakpoint("parse_butnot")
    parse_charset = ParserVirtualBreakpoint("parse_charset")
    parse_ch = ParserVirtualBreakpoint("parse_ch")
    parse_end = ParserVirtualBreakpoint("parse_end")
    parse_endianness = ParserVirtualBreakpoint("parse_endianness")
    parse_epsilon = ParserVirtualBreakpoint("parse_epsilon")
    parse_ignore = ParserVirtualBreakpoint("parse_ignore")
    parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq")
    parse_indirect = ParserVirtualBreakpoint("parse_indirect")
    parse_int_range = ParserVirtualBreakpoint("parse_int_range")
    parse_not = ParserVirtualBreakpoint("parse_not")
    parse_nothing = ParserVirtualBreakpoint("parse_nothing")
    parse_optional = ParserVirtualBreakpoint("parse_optional")
    parse_permutation = ParserVirtualBreakpoint("parse_permutation")
    parse_skip = ParserVirtualBreakpoint("parse_skip")
    parse_seek = ParserVirtualBreakpoint("parse_seek")
    parse_tell = ParserVirtualBreakpoint("parse_tell")
    parse_token = ParserVirtualBreakpoint("parse_token")
    parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented")
    parse_put = ParserVirtualBreakpoint("parse_put")
    parse_get = ParserVirtualBreakpoint("parse_get")
    parse_whitespace = ParserVirtualBreakpoint("parse_whitespace")
    parse_xor = ParserVirtualBreakpoint("parse_xor")

    # Offsets are the address of the RETQ instruction in the function; adjust
    # for your particular hammer build.
    # NOTE(review): GDB address locations are normally written with a leading
    # '*' (e.g. "*perform_lowlevel_parse+310") -- confirm that these two specs
    # actually resolve on your GDB version.
    perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("perform_lowlevel_parse+310")
    h_packrat_parse_ret = HPackratParseRetBreakpoint("h_packrat_parse+420")

    gdb.execute("run")

    print([(p.name, hex(p.address), p.bytes_used) for p in top_level_parse.parser_objs.values()])

# Approach 2: capture process trace with gdb, load the trace, execute stack
# commands on breakpoint hit, etc