# TODO: handlers for filters
# TODO: postordinate parser fails to get named
# TODO: step-to-parser command
# TODO: "current parser" on GUI is the one on top of the stack, while the argument of h_do_parse is not yet accounted for
# TODO: possible enhancement, caching the vtable type in Parser objects (allows searching by type)
# TODO: The parameter given to h_do_parse is not shown in the backtrace, which is confusing

# List of names of functions with H_RULEs declared, used by BreakpointManager
H_RULE_FUNCTIONS = ["init_runlengthdecode_parser", "init_LZW_parser"]


class Parser:
    """Bookkeeping record for a single parser, identified by its address.

    Attributes:
        name: name of the parser, or None while it has not been named yet.
        address: address of the parser; must be usable with hex().
        bytes_used: dict mapping a parse state to the number of bytes
            attributed to this parser under that state.
    """

    def __init__(self, name, address):
        self.name = name
        self.address = address
        self.bytes_used = {}

    def name_parser(self, name):
        """Set (or overwrite) the name of this parser."""
        self.name = name

    # TODO: remove
    def get_name_or_placeholder(self):
        """Return the name, or a placeholder string if the parser is unnamed."""
        if self.name is None:
            return "Wait for it... (if you're reading this, you found a bug)"
        return self.name

    def add_mem_use(self, state, size):
        """Add `size` bytes to the amount recorded for `state`."""
        if state in self.bytes_used:
            self.bytes_used[state] += size
        else:
            self.bytes_used[state] = size

    def get_mem_use(self, state=None):
        """Return the recorded memory use.

        With no `state`, the whole state -> bytes dict is returned (the live
        dict, not a copy). With a `state`, the byte count for that state is
        returned, 0 if nothing was recorded.
        """
        if state is None:
            return self.bytes_used
        # Plain lookup: querying must not insert a zero entry, since every
        # entry counts as "a parser using that arena" when averaging
        return self.bytes_used.get(state, 0)

    def __str__(self):
        return "(" + str(self.name) + ", " + hex(self.address) + ")"

    # Return the highest per-arena allocation count
    # TODO: disambiguate "parse state" and "arena", possibly tracking both
    def get_arenamax(self):
        """Return the largest byte count recorded for any single state (0 if none)."""
        if not self.bytes_used:
            return 0
        return max(self.bytes_used.values())

    def get_arenasum(self):
        """Return the byte count summed over all states (0 if none)."""
        if not self.bytes_used:
            return 0
        return sum(self.bytes_used.values())


class ParserStack:
    """Stack of Parser objects belonging to one parse.

    Attributes:
        parse_state: the parse state this stack belongs to (None until set).
        arena: the arena passed at construction time.
        p_stack: list of Parser objects, top of the stack last.
        unclaimed_mem_use: bytes allocated while no parser was on the stack.
    """

    def __init__(self, parse_state, arena):
        self.parse_state = parse_state
        self.arena = arena
        self.p_stack = []
        self.unclaimed_mem_use = 0

    def push(self, parser):
        """Push `parser` on top of the stack."""
        self.p_stack.append(parser)

    def pop(self):
        """Remove and return the top parser. Raises IndexError if the stack is empty."""
        return self.p_stack.pop()

    def peek(self):
        """Return the top parser without removing it, or None if the stack is empty."""
        if not self.p_stack:
            return None
        return self.p_stack[-1]

    def set_state(self, state):
        """Record the parse state this stack belongs to."""
        self.parse_state = state

    # In terms of tracing, *most* calls to a parser look something like this with the packrat backend:
    # h_do_parse()
    # parse_foo()
    # perform_lowlevel_parse()
    # perform_lowlevel_parse() is called when the memo table at that position is not filled in yet.
    # it calls the corresponding parse_* virtual function via the vtable, but other than that does not have type information
    # it's probably possible to extract type information, by comparing vtable addresses, but that seems painful
    # parse_foo() is the parser's corresponding virtual function in the frontend, which does not have the equivalent of a "this" pointer
    # So what we do to keep track of parsers is incrementally filling in the details for both
    # h_do_parse() is the backend's "actually run the parser" function, but does not get called for some parsers
    # (apparently mostly it's for higher-order parsers)
    # also contains the decision logic about whether to call perform_lowlevel_parse()
    # possible scenarios:
    # h_do_parse()
    # perform_lowlevel_parse()
    # parse_foo()
    # h_do_parse()
    # perform_lowlevel_parse()
    # h_do_parse()
    def name_top_parser(self, name):
        """Shortcut for setting the name property of the parser on the top of stack.

        Raises IndexError if the stack is empty.
        """
        self.p_stack[-1].name_parser(name)

    def add_mem_use_each(self, size):
        """Attribute `size` bytes to every parser on the stack, under this stack's parse state."""
        # bytes_used is a dict keyed by parse state, so the bookkeeping has to
        # go through Parser.add_mem_use() rather than "+=" on the dict itself
        for p in self.p_stack:
            p.add_mem_use(self.parse_state, size)

    def add_mem_use_top(self, size):
        """Attribute `size` bytes to the top parser only, under this stack's parse state.

        Raises IndexError if the stack is empty.
        """
        self.p_stack[-1].add_mem_use(self.parse_state, size)

    def show_stack(self):
        """Print a short summary of the stack (currently only its depth)."""
        print("stack would be printed here. Depth:", len(self.p_stack))
        #print([(p.get_name_or_placeholder(), hex(p.address)) for p in self.p_stack])

    def depth(self):
        """Return the number of parsers currently on the stack."""
        return len(self.p_stack)


# Class that is responsible for bookkeeping throughout the entire parse
# NB, this is slightly different terminology than the hammer API implicitly uses:
# There, a parse is started by h_parse(), and it is associated with a parse state.
# This corresponds to the ParserStack above. TopLevelParse keeps track of all these.
# Subsequent h_do_parse()s with the same parser state are considered to belong to the same parse
class TopLevelParse:
    """Bookkeeping for all parses: a stack of ParserStacks plus every known Parser.

    Attributes:
        parser_stacks: list of ParserStack objects, innermost parse last.
        parser_objs: dict mapping a parser address to its Parser object.
        unclaimed_mem_use: bytes allocated while no parser stack existed.
        input_chunk / current_input_chunk: input preview text, used by the GUI.
        current_parser_env: textual description of the current parser.
        h_do_parse_parser: Parser passed to the most recent h_do_parse().
        vt_types, parser_decombinator: created by init_parser(), None before.
    """

    def __init__(self):
        self.parser_stacks = []
        self.parser_objs = {}
        self.unclaimed_mem_use = 0
        # Holds 32 characters starting at state->input_stream[index], used by the GUI
        self.current_input_chunk = ''
        # set_input_chunk()/get_input_chunk() and hammer-parser-preview-input
        # read this attribute, so it has to exist before the first set
        self.input_chunk = ''
        self.current_parser_env = ''
        # We save/push in perform_lowlevel_parse, but this is used to display them ahead of time
        self.h_do_parse_parser = None
        self.vt_types = None
        self.parser_decombinator = None

    def init_parser(self):
        """Create the vtable type table and the parser decombinator."""
        self.vt_types = VTTypes()
        self.parser_decombinator = ParserDecombinator(self.vt_types)

    def _memoize_parser(self, key):
        """Return the Parser stored under `key`, creating an unnamed one if needed."""
        try:
            return self.parser_objs[key]
        except KeyError:
            # Create a parser object with no name and the address of the parser
            parser_obj = Parser(None, key)
            self.parser_objs[key] = parser_obj
            return parser_obj

    # Called from h_packrat_parse()'s handler, where the parse state and arena get initialized
    def enter_h_packrat_parse(self, parser):
        """Start a new parse by pushing a fresh ParserStack. Always returns 0."""
        # TODO: add a parser stack or something?
        self.parser_stacks.append(ParserStack(None, None))
        return 0

    # TODO: arena parameter is useless
    def enter_h_do_parse(self, parse_state, arena, parser):
        """Record `parser` as the current h_do_parse() argument.

        The Parser object for the address is created on first sight. If the
        current parser stack has no parse state yet, `parse_state` is stored
        on it.
        """
        parser_stack = self.peek_parserstack()
        self.h_do_parse_parser = self._memoize_parser(parser)
        # TODO: current_parser_env should be set here instead too
        if parser_stack is None:
            # h_do_parse() seen outside of h_packrat_parse(): there is no stack to update
            return
        if parser_stack.parse_state is None and parser_stack.parse_state != parse_state:
            self.first_h_do_parse_after_packrat_parse(parse_state, arena)

    # Called from h_do_parse()'s handler, at which point we know the addresses of the state and arena
    def first_h_do_parse_after_packrat_parse(self, parse_state, arena):
        """Store `parse_state` on the current parser stack."""
        parser_stack = self.peek_parserstack()
        parser_stack.set_state(parse_state)

    # Popping the stack of stack of parsers
    def return_from_h_packrat_parse(self):
        """Finish the current parse by popping its ParserStack."""
        old_stack = self.parser_stacks.pop()
        if old_stack.depth() > 0:
            print("Warning: parser stack not empty but parse is successful?")

    # Memoize the parser object for this particular address, then push it on the stack
    # Returns the parser object we just initalized (or the one already existing)
    def enter_perform_lowlevel_parse(self, parser_addr):
        """Push the Parser for `parser_addr` on the current stack and return it.

        Assumes a parser stack exists, i.e. that this is called inside
        h_packrat_parse().
        """
        parser_obj = self._memoize_parser(parser_addr)
        parser_stack = self.peek_parserstack()
        parser_stack.push(parser_obj)
        if self.parser_decombinator:
            p_env = self.parser_decombinator.decompose_parser(parser_obj, self)
            self.set_parser_env(type(p_env).__name__ + " - " + str(p_env))  # TODO: pass this as data structure to frontend
        return parser_obj

    def return_from_perform_lowlevel_parse(self):
        """Pop the parser pushed by the matching enter_perform_lowlevel_parse()."""
        parser_stack = self.peek_parserstack()
        parser_stack.pop()
        # debug print here

    def enter_h_arena_malloc_raw(self, alloc_size):
        """Attribute an allocation of `alloc_size` bytes.

        The bytes go to the parser on top of the current stack if there is
        one, else to the current stack's unclaimed count, else to the global
        unclaimed count.
        """
        parser_obj = self.peek_parser()
        parser_stack = self.peek_parserstack()
        # This is probably the slowest part of the code, or maybe the overhead adds up over many calls to h_arena_malloc_raw()
        if parser_obj is not None:
            # Caveat: parser_stack is assumed not to be None if we could get a parser_obj
            parser_obj.add_mem_use(parser_stack.parse_state, alloc_size)
        elif parser_stack is not None:
            #print("Allocation of " + str(alloc_size) + " bytes without a parser on the stack. (Happens before first call perform_lowlevel_parse to or after return from that call)")
            parser_stack.unclaimed_mem_use += alloc_size
        else:
            #print("Allocation of " + str(alloc_size) + " bytes without a parser stack. (This happens before and after parse)")
            self.unclaimed_mem_use += alloc_size

    def parse_virtual(self, parser_name):
        """Name the parser on top of the stack, unless it already has a name."""
        parser_obj = self.peek_parser()
        if parser_obj is None:
            # A parse_* function was entered with no parser on the stack: nothing to name
            return
        if parser_obj.name is None:
            parser_obj.name_parser(parser_name)
        #else:
            #print("Warning: parser already named! This is a bug. old name: %s, new name: %s" % (parser_obj.name, parser_name))

    def peek_parserstack(self):
        """Return the current (innermost) ParserStack, or None if there is none."""
        if not self.parser_stacks:
            return None
        return self.parser_stacks[-1]

    def peek_parser(self):
        """Return the parser on top of the current stack.

        Returns None when there is no parser stack, and also when the parser
        stack is empty (while parser stack of stacks isn't).
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            # print("Parser stack of stacks empty!")
            return None
        return parser_stack.peek()

    def parser_by_address(self, parser_addr):
        """Return the Parser stored under int(parser_addr); print a message and return None if unknown."""
        addr = int(parser_addr)
        try:
            return self.parser_objs[addr]
        except KeyError:
            print("Parser with address " + hex(addr) + " not found!")
            return None

    def parsers_by_name(self, parser_name):
        """Return the list of Parsers named `parser_name`, or None if there are none."""
        results = [p for p in self.parser_objs.values() if p.name == parser_name]
        return results if results else None

    def set_input_chunk(self, chunk):
        """Store the input preview text."""
        self.input_chunk = chunk
        # Keep the attribute initialized by __init__ in sync with the one the accessors use
        self.current_input_chunk = chunk

    def get_input_chunk(self):
        """Return the input preview text ('' if none was set yet)."""
        return self.input_chunk

    def set_parser_env(self, parser_env):
        """Store the textual description of the current parser."""
        self.current_parser_env = parser_env

    def get_parser_env(self):
        """Return the textual description of the current parser."""
        return self.current_parser_env

    def add_or_get_parser(self, parser_addr):
        """Return the Parser stored under int(parser_addr), creating an unnamed one if needed."""
        return self._memoize_parser(int(parser_addr))

    def get_parser_top_per_arena_mem(self):
        """Return the Parser with the largest single-arena byte count, or None if no parsers are known."""
        if not self.parser_objs:
            return None
        # max() returns the first maximal element, same as sorted(..., reverse=True)[0]
        return max(self.parser_objs.values(), key=Parser.get_arenamax)

    def get_parser_top_total_arena_mem(self):
        """Return the Parser with the largest byte count summed over arenas, or None if no parsers are known."""
        if not self.parser_objs:
            return None
        return max(self.parser_objs.values(), key=Parser.get_arenasum)

    def get_avg_mem_use_per_arena(self):
        """Return a dict mapping each arena to the mean bytes used per parser that used it."""
        totals = {}
        counts = {}
        # Accumulate byte counts and counts of parsers using that arena
        for p in self.parser_objs.values():
            for arena, mem in p.bytes_used.items():
                if arena in totals:
                    totals[arena] += mem
                    counts[arena] += 1
                else:
                    totals[arena] = mem
                    counts[arena] = 1
        return {arena: mem / counts[arena] for arena, mem in totals.items()}

    # TODO: get_avg_mem_use_all_arenas, get_total_mem_use


top_level_parse = TopLevelParse()

# Approach 1: load the application, set breakpoints, execute stack commands on breakpoint hit, continue


class InitParserBreakpoint(gdb.Breakpoint):
    """Breakpoint that registers every symbol of the current block as a named Parser."""

    def stop(self):
        frame = gdb.selected_frame()
        block = frame.block()
        top_level_parse.init_parser()
        # This will also catch locals that aren't parsers, but it's not a problem in practice,
        # since h_parse() will never be called on them
        # If it becomes a problem after all, gdb.parse_and_eval() might be used to filter them out
        for p in block:
            address = int(p.value(frame))
            top_level_parse.parser_objs[address] = Parser(p.name, address)
        # Bookkeeping only: never stop execution here
        return False

# TODO: refactored to breakpoint-manager.py , remove
#class HRuleBreakpoint(gdb.Breakpoint):
#    def stop(self):
#        frame = gdb.selected_frame()
#        block = frame.block()
#
#        for p in block:
#            top_level_parse.parser_objs[int(p.value(frame))] = Parser(p.name, int(p.value(frame)))


class HArenaMallocRawBreakpoint(gdb.Breakpoint):
    """Breakpoint that records the value of the symbol named 'size' as an allocation."""

    def stop(self):
        frame = gdb.selected_frame()
        block = frame.block()
        alloc_size = None
        for val in block:
            if val.name == 'size':
                alloc_size = int(val.value(frame))
                break
        # Without a 'size' symbol in this block there is nothing to record
        if alloc_size is not None:
            top_level_parse.enter_h_arena_malloc_raw(alloc_size)
        # Bookkeeping only: never stop execution here
        return False


hammer_retq_breakpoints = []

#class BreakpointManager():
#    def __init__(self, h_rule_functions):
#        self.hammer_retq_breakpoints = []
#        self.h_rule_breakpoints = []
#
#        self.h_do_parse = None
#        self.h_packrat_parse = None
#        self.perform_lowlevel_parse = None
#        self.h_arena_malloc_raw = None
#
#        self.parse_action = None
#        self.parse_choice = None
#        self.parse_sequence = None
#        self.parse_difference = None
#        self.parse_many = None
#        self.parse_and = None
#        self.parse_attr_bool = None
#        self.parse_bind = None
#        self.parse_bits = None
#        self.parse_butnot = None
#        self.parse_charset = None
#        self.parse_ch = None
#        self.parse_end = None
#        self.parse_endianness = None
#        self.parse_epsilon = None
#        self.parse_ignore = None
#        self.parse_ignoreseq = None
#        self.parse_indirect = None
#        self.parse_int_range = None
#        self.parse_not = None
#        self.parse_nothing = None
#        self.parse_optional = None
#        self.parse_permutation = None
#        self.parse_skip = None
#        self.parse_seek = None
#        self.parse_tell = None
#        self.parse_token = None
#        self.parse_unimplemented = None
#        self.parse_put = None
#        self.parse_get = None
#        self.parse_whitespace = None
#        self.parse_xor = None
#
#    def set_h_rule_breakpoints(self):
#        for func in H_RULE_FUNCTIONS:
#            func_retq = locate_retq(func[0], func[1])
#            self.h_rule_breakpoints[func] = HRuleBreakpoint("*" + hex(func_retq))

breakpoint_manager = BreakpointManager(H_RULE_FUNCTIONS)


class PDFMainBreakpoint(gdb.Breakpoint):
    """Breakpoint on main: sets the return-address breakpoints, then stops."""

    def stop(self):
        breakpoint_manager.set_hammer_retq_breakpoints()
        #breakpoint_manager.hammer_retq_breakpoints = breakpoints
        return True

    def set_hammer_retq_breakpoints(self):
        """Create and return breakpoints on the first return instruction of
        perform_lowlevel_parse and h_packrat_parse."""
        plp_retq = locate_perform_lowlevel_parse_retq()
        hpp_retq = locate_h_packrat_parse_retq()
        return [
            PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq)),
            HPackratParseRetBreakpoint("*" + hex(hpp_retq)),
        ]


# GDB parameters
# TODO: hammer parameter prefix

class ExtendedParseStepInfo(gdb.Parameter):
    """Controls whether to display parser stack and input preview on stepping the parse."""

    # GDB looks up show_doc/set_doc while gdb.Parameter.__init__ runs, so they
    # have to exist before that call; assigning them afterwards has no effect
    show_doc = "Show parser stack and input preview after hammer-parse-step:"
    #set_doc = "Show parser stack and input preview after hammer-parse-step:"

    def __init__(self):
        super(ExtendedParseStepInfo, self).__init__("hammer-extended-parse-step-info", gdb.COMMAND_OBSCURE, gdb.PARAM_BOOLEAN)
        self.value = True


ExtendedParseStepInfo()


# GDB commands
# TODO: factor commands out into their own file

class HammerParserBacktrace(gdb.Command):
    """Print the current parser and the parser stack, innermost parser first.

    Usage: hammer-parser-backtrace [MAXDEPTH]
    MAXDEPTH limits the number of stack entries shown; it must be a positive integer."""

    def __init__(self):
        super(HammerParserBacktrace, self).__init__("hammer-parser-backtrace", gdb.COMMAND_OBSCURE)

    @staticmethod
    def _format_entry(parser):
        """Return the "[address] name" line for `parser`."""
        # str() because a parser may not have been named yet (name is None)
        return "[" + hex(parser.address) + "] " + str(parser.name)

    def invoke(self, arg, from_tty):
        parser_stack = top_level_parse.peek_parserstack()
        # Outside of a parse there is no parser stack: show an empty backtrace
        parserstack = parser_stack.p_stack if parser_stack is not None else []

        args = gdb.string_to_argv(arg)
        maxsize = len(parserstack)
        if args:
            try:
                requested = int(args[0])
                if requested < 1:
                    raise ValueError
                maxsize = requested
            except ValueError:
                # Fall back to showing the whole stack
                print("Argument must be a positive integer")

        current = top_level_parse.h_do_parse_parser
        if current is not None:
            print(self._format_entry(current) + " [current]")  #TODO: GUI widget should reflect this
            print(" ")

        depth = min(len(parserstack), maxsize)
        if depth > 0:  # if stack not empty
            # The last `depth` entries, top of the stack first
            for p in reversed(parserstack[-depth:]):
                print(self._format_entry(p))
        if depth < len(parserstack):
            print("[...]")


HammerParserBacktrace()


class HammerParserMemUse(gdb.Command):
    """Print the memory use recorded for the parser at a given address.

    Usage: hammer-parser-mem-use ADDRESS
    ADDRESS is a hexadecimal number."""

    def __init__(self):
        super(HammerParserMemUse, self).__init__("hammer-parser-mem-use", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        args = gdb.string_to_argv(arg)
        if len(args) < 1:
            print("Usage: hammer-parser-mem-use <address>")
            return

        try:
            parser_addr_int = int(args[0], 16)
        except ValueError:
            print("Address needs to be a hexadecimal number")
            return

        # parser_by_address() reports unknown addresses itself and returns None
        parser_obj = top_level_parse.parser_by_address(parser_addr_int)
        if parser_obj is not None:
            print(parser_obj.bytes_used)


HammerParserMemUse()


class HammerParserMemUseName(gdb.Command):
    """Print the memory use recorded for every parser with a given name.

    Usage: hammer-parser-mem-use-name NAME"""

    def __init__(self):
        super(HammerParserMemUseName, self).__init__("hammer-parser-mem-use-name", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        args = gdb.string_to_argv(arg)
        if len(args) < 1:
            print("Usage: hammer-parser-mem-use-name <name>")
            return

        # parsers_by_name() returns None when no parser has that name
        parser_objs = top_level_parse.parsers_by_name(args[0])
        if parser_objs is not None:
            for p in parser_objs:
                print((p.name, hex(p.address), p.bytes_used))
HammerParserMemUseName()


def _top_parser_or_none(getter):
    """Call `getter()` and return its result, or None if no parser is recorded."""
    try:
        return getter()
    except IndexError:
        # Raised by a getter that indexes into an empty list of parsers
        return None


class HammerParserTopSingleArenaMem(gdb.Command):
    """Print the parser with the highest memory use in a single arena.

    Usage: hammer-parser-top-single-arena-mem"""

    def __init__(self):
        super(HammerParserTopSingleArenaMem, self).__init__("hammer-parser-top-single-arena-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        p = _top_parser_or_none(top_level_parse.get_parser_top_per_arena_mem)
        if p is None:
            print("No parsers recorded yet")
            return
        print((p.name, hex(p.address), p.bytes_used))


HammerParserTopSingleArenaMem()


class HammerParserTopTotalArenaMem(gdb.Command):
    """Print the parser with the highest memory use summed over all arenas.

    Usage: hammer-parser-top-total-arena-mem"""

    def __init__(self):
        super(HammerParserTopTotalArenaMem, self).__init__("hammer-parser-top-total-arena-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        p = _top_parser_or_none(top_level_parse.get_parser_top_total_arena_mem)
        if p is None:
            print("No parsers recorded yet")
            return
        print((p.name, hex(p.address), p.bytes_used))
        total_mem_use = p.get_arenasum()
        print("Total: " + str(total_mem_use) + " bytes")


HammerParserTopTotalArenaMem()

# TODO: average memory use, per arena and total


class HammerParserPreviewInput(gdb.Command):
    """Print the stored preview of the parser input.

    Usage: hammer-parser-preview-input"""

    def __init__(self):
        super(HammerParserPreviewInput, self).__init__("hammer-parser-preview-input", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        # The attribute may not exist before the first input chunk has been stored
        print(getattr(top_level_parse, "input_chunk", ""))


HammerParserPreviewInput()


class HammerParserAverageMem(gdb.Command):
    """Print the average number of bytes used per parser in each arena.

    Usage: hammer-parser-average-mem"""

    def __init__(self):
        super(HammerParserAverageMem, self).__init__("hammer-parser-average-mem", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        mem = top_level_parse.get_avg_mem_use_per_arena()
        print("Bytes used on average in each arena:")
        print(mem)


HammerParserAverageMem()


class HammerParserCurrentEnv(gdb.Command):
    """Print the decomposed description of the parser last passed to h_do_parse().

    Usage: hammer-parser-current-env"""

    def __init__(self):
        super(HammerParserCurrentEnv, self).__init__("hammer-parser-current-env", gdb.COMMAND_OBSCURE)

    def invoke(self, arg, from_tty):
        p = top_level_parse.h_do_parse_parser
        decombinator = top_level_parse.parser_decombinator
        if p is None or decombinator is None:
            # Either h_do_parse() has not been entered yet, or init_parser() has not run
            print("No current parser to describe")
            return
        p_env = decombinator.decompose_parser(p, top_level_parse)  #TODO: parser -> env mapping function in top_level_parse
        print(type(p_env).__name__ + " - " + str(p_env))
        # TODO: consistency with GUI
HammerParserCurrentEnv()

#TODO: move all this to BreakpointManager?

# Instruction prefixes that may precede a return instruction in the disassembly
_RETURN_PREFIXES = ("rep", "repz", "bnd")


def _is_return_instruction(asm):
    """Return True if the disassembly text `asm` is a return instruction.

    Matches mnemonics starting with "ret" (ret, retq, ...), optionally
    preceded by one of the prefixes in _RETURN_PREFIXES (e.g. "repz retq").
    """
    tokens = asm.split()
    while tokens and tokens[0] in _RETURN_PREFIXES:
        tokens = tokens[1:]
    return bool(tokens) and tokens[0].startswith("ret")


def locate_retqs(symbol, length):
    """Return the addresses of all return instructions of function `symbol`.

    Only the first `length` bytes from the start of the function are
    disassembled. Needs a selected frame, so call it while execution is
    stopped (e.g. at the breakpoint in main).

    Raises LookupError if `symbol` cannot be found.
    """
    arch = gdb.selected_frame().architecture()
    sym = gdb.lookup_symbol(symbol)[0]
    if sym is None:
        raise LookupError("Symbol " + symbol + " not found")
    sym_address = int(sym.value().address)
    instructions = arch.disassemble(sym_address, sym_address + length)
    return [ins["addr"] for ins in instructions if _is_return_instruction(ins["asm"])]


def locate_retq(symbol, length):
    """Return the address of the first return instruction of function `symbol`.

    Raises IndexError if none is found within `length` bytes.
    """
    results = locate_retqs(symbol, length)
    if not results:
        raise IndexError("No return instruction found within " + str(length) + " bytes of " + symbol)
    return results[0]


# Call when execution stopped at breakpoint in main
def locate_perform_lowlevel_parse_retq():
    """Return the address of the first return instruction of perform_lowlevel_parse."""
    # The choice of disassembling only 400 bytes from the start is arbitrary.
    # (This function is 310 bytes long on this particular machine.)
    # There is probably a way to find out where a function ends.
    return locate_retq("perform_lowlevel_parse", 400)


def locate_h_packrat_parse_retq():
    """Return the address of the first return instruction of h_packrat_parse."""
    # Same as with perform_lowlevel_parse, +450 is arbitrary
    return locate_retq("h_packrat_parse", 450)


def locate_init_parser_retq():
    """Return the address of the first return instruction of init_parser."""
    # Same as with perform_lowlevel_parse, +16000 is arbitrary
    return locate_retq("init_parser", 16000)


def del_hammer_retq_breakpoints(breakpoints):
    """Delete every breakpoint in `breakpoints`."""
    for bp in breakpoints:
        bp.delete()


# Clean up by-address breakpoints in hammer when inferior exits.
# Caveat: Assumes there's a single inferior, the debugged parser, so no checking is done
# TODO: where to store breakpoints? TopLevelParse? A BreakpointManager class?
def exit_handler(event):
    """Delete the by-address breakpoints held by breakpoint_manager."""
    #breakpoints = [ perform_lowlevel_parse_ret, h_packrat_parse_ret ]
    #del_hammer_retq_breakpoints(breakpoints)
    del_hammer_retq_breakpoints(breakpoint_manager.hammer_retq_breakpoints)


gdb.events.exited.connect(exit_handler)

# Break on main so that libhammer.so gets to load
main = PDFMainBreakpoint("main")
#init_parser = InitParserBreakpoint("pdf.c:1223")
h_do_parse = HDoParseBreakpoint("h_do_parse")
h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse")
perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse")
h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw")
# todo: investigate GDB frame filters for rendering backtraces

parse_action = ParserVirtualBreakpoint("parse_action")
parse_choice = ParserVirtualBreakpoint("parse_choice")
parse_sequence = ParserVirtualBreakpoint("parse_sequence")
parse_difference = ParserVirtualBreakpoint("parse_difference")
parse_many = ParserVirtualBreakpoint("parse_many")
parse_and = ParserVirtualBreakpoint("parse_and")
parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool")
parse_bind = ParserVirtualBreakpoint("parse_bind")
parse_bits = ParserVirtualBreakpoint("parse_bits")
parse_butnot = ParserVirtualBreakpoint("parse_butnot")
parse_charset = ParserVirtualBreakpoint("parse_charset")
parse_ch = ParserVirtualBreakpoint("parse_ch")
parse_end = ParserVirtualBreakpoint("parse_end")
parse_endianness = ParserVirtualBreakpoint("parse_endianness")
parse_epsilon = ParserVirtualBreakpoint("parse_epsilon")
parse_ignore = ParserVirtualBreakpoint("parse_ignore")
parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq")
parse_indirect = ParserVirtualBreakpoint("parse_indirect")
parse_int_range = ParserVirtualBreakpoint("parse_int_range")
parse_not = ParserVirtualBreakpoint("parse_not")
parse_nothing = ParserVirtualBreakpoint("parse_nothing")
parse_optional = ParserVirtualBreakpoint("parse_optional")
parse_permutation = ParserVirtualBreakpoint("parse_permutation")
parse_skip = ParserVirtualBreakpoint("parse_skip")
parse_seek = ParserVirtualBreakpoint("parse_seek")
parse_tell = ParserVirtualBreakpoint("parse_tell")
parse_token = ParserVirtualBreakpoint("parse_token")
parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented")
parse_put = ParserVirtualBreakpoint("parse_put")
parse_get = ParserVirtualBreakpoint("parse_get")
parse_whitespace = ParserVirtualBreakpoint("parse_whitespace")
parse_xor = ParserVirtualBreakpoint("parse_xor")

# Commandline:
# $ gdb -ex "source /path/to/parser-name-instrumentation-gdb.py" --args /path/to/pdf /path/to/input.pdf

# run until main
gdb.execute("run")

#plp_retq = locate_perform_lowlevel_parse_retq()
#perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq))
#hpp_retq = locate_h_packrat_parse_retq()
#h_packrat_parse_ret = HPackratParseRetBreakpoint("*" + hex(hpp_retq))
i_p_retq = locate_init_parser_retq()
init_parser = InitParserBreakpoint("*" + hex(i_p_retq))
#hammer_retq_breakpoints = [perform_lowlevel_parse_ret, h_packrat_parse_ret]

#h_rule_breakpoints = {}
#for func in H_RULE_FUNCTIONS:
#    func_retq = locate_retq(func[0], func[1])
#    h_rule_breakpoints[func] = HRuleBreakpoint("*" + hex(func_retq))
breakpoint_manager.set_h_rule_breakpoints()
# TODO: the RET breakpoints in hammer break when "run" is executed again. figure out a way to automatically replace these

# Run until stop position, if set. Finish parsing otherwise
gdb.execute("continue")

print([(p.name, hex(p.address), p.bytes_used) for p in top_level_parse.parser_objs.values()])

# Approach 2: capture process trace with gdb, load the trace, execute stack commands on breakpoint hit, etc