diff --git a/gdb-port/breakpoint-manager.py b/gdb-port/breakpoint-manager.py new file mode 100644 index 0000000000000000000000000000000000000000..bc4314f164f7431880615ddc0afc59799857e7d8 --- /dev/null +++ b/gdb-port/breakpoint-manager.py @@ -0,0 +1,165 @@ +class BreakpointManager: + def __init__(self, h_rule_functions): + self.hammer_retq_breakpoints = [] + self.h_rule_breakpoints = [] + self.H_RULE_FUNCTIONS = h_rule_functions + + self.h_do_parse = None + self.h_packrat_parse = None + self.perform_lowlevel_parse = None + self.h_arena_malloc_raw = None + + self.init_parser_retq = None + + self.parse_action = None + self.parse_choice = None + self.parse_sequence = None + self.parse_difference = None + self.parse_many = None + self.parse_and = None + self.parse_attr_bool = None + self.parse_bind = None + self.parse_bits = None + self.parse_butnot = None + self.parse_charset = None + self.parse_ch = None + self.parse_end = None + self.parse_endianness = None + self.parse_epsilon = None + self.parse_ignore = None + self.parse_ignoreseq = None + self.parse_indirect = None + self.parse_int_range = None + self.parse_not = None + self.parse_nothing = None + self.parse_optional = None + self.parse_permutation = None + self.parse_skip = None + self.parse_seek = None + self.parse_tell = None + self.parse_token = None + self.parse_unimplemented = None + self.parse_put = None + self.parse_get = None + self.parse_whitespace = None + self.parse_xor = None + + def set_h_rule_breakpoints(self): + for func in self.H_RULE_FUNCTIONS: + func_retq = self.locate_retq(func) + self.h_rule_breakpoints.append(HRuleBreakpoint("*" + hex(func_retq))) + + def set_hammer_retq_breakpoints(self): + plp_retq = self.locate_retq("perform_lowlevel_parse") + perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq)) + self.hammer_retq_breakpoints.append(perform_lowlevel_parse_ret) + hpp_retq = self.locate_retq("h_packrat_parse") + h_packrat_parse_ret = 
HPackratParseRetBreakpoint("*" + hex(hpp_retq)) + self.hammer_retq_breakpoints.append(h_packrat_parse_ret) + return self.hammer_retq_breakpoints + + def del_hammer_retq_breakpoints(self): + for bp in self.hammer_retq_breakpoints: + bp.delete() + + # Helper functions for finding the return instructions of particular functions of interest + #TODO: rename "symbol" to "fn_name" + def locate_retqs(self, symbol): + arch = gdb.selected_frame().architecture() + sym = gdb.lookup_symbol(symbol)[0] + sym_address = int(sym.value().address) + sym_block = sym.symtab.objfile.progspace.block_for_pc(sym_address) + instructions = arch.disassemble(sym_address, sym_block.end) + results = [ ins["addr"] for ins in instructions if ins["asm"].startswith("ret") ] + return results + + def locate_retq(self, symbol): + results = self.locate_retqs(symbol) + return results[0] + + def set_parser_virtual_breakpoints(self): + self.parse_action = ParserVirtualBreakpoint("parse_action") + self.parse_choice = ParserVirtualBreakpoint("parse_choice") + self.parse_sequence = ParserVirtualBreakpoint("parse_sequence") + self.parse_difference = ParserVirtualBreakpoint("parse_difference") + self.parse_many = ParserVirtualBreakpoint("parse_many") + self.parse_and = ParserVirtualBreakpoint("parse_and") + self.parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool") + self.parse_bind = ParserVirtualBreakpoint("parse_bind") + self.parse_bits = ParserVirtualBreakpoint("parse_bits") + self.parse_butnot = ParserVirtualBreakpoint("parse_butnot") + self.parse_charset = ParserVirtualBreakpoint("parse_charset") + self.parse_ch = ParserVirtualBreakpoint("parse_ch") + self.parse_end = ParserVirtualBreakpoint("parse_end") + self.parse_endianness = ParserVirtualBreakpoint("parse_endianness") + self.parse_epsilon = ParserVirtualBreakpoint("parse_epsilon") + self.parse_ignore = ParserVirtualBreakpoint("parse_ignore") + self.parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq") + self.parse_indirect = 
ParserVirtualBreakpoint("parse_indirect") + self.parse_int_range = ParserVirtualBreakpoint("parse_int_range") + self.parse_not = ParserVirtualBreakpoint("parse_not") + self.parse_nothing = ParserVirtualBreakpoint("parse_nothing") + self.parse_optional = ParserVirtualBreakpoint("parse_optional") + self.parse_permutation = ParserVirtualBreakpoint("parse_permutation") + self.parse_skip = ParserVirtualBreakpoint("parse_skip") + self.parse_seek = ParserVirtualBreakpoint("parse_seek") + self.parse_tell = ParserVirtualBreakpoint("parse_tell") + self.parse_token = ParserVirtualBreakpoint("parse_token") + self.parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented") + self.parse_put = ParserVirtualBreakpoint("parse_put") + self.parse_get = ParserVirtualBreakpoint("parse_get") + self.parse_whitespace = ParserVirtualBreakpoint("parse_whitespace") + self.parse_xor = ParserVirtualBreakpoint("parse_xor") + + def del_parser_virtual_breakpoints(self): + self.parse_action.delete() + self.parse_choice.delete() + self.parse_sequence.delete() + self.parse_difference.delete() + self.parse_many.delete() + self.parse_and.delete() + self.parse_attr_bool.delete() + self.parse_bind.delete() + self.parse_bits.delete() + self.parse_butnot.delete() + self.parse_charset.delete() + self.parse_ch.delete() + self.parse_end.delete() + self.parse_endianness.delete() + self.parse_epsilon.delete() + self.parse_ignore.delete() + self.parse_ignoreseq.delete() + self.parse_indirect.delete() + self.parse_int_range.delete() + self.parse_not.delete() + self.parse_nothing.delete() + self.parse_optional.delete() + self.parse_permutation.delete() + self.parse_skip.delete() + self.parse_seek.delete() + self.parse_tell.delete() + self.parse_token.delete() + self.parse_unimplemented.delete() + self.parse_put.delete() + self.parse_get.delete() + self.parse_whitespace.delete() + self.parse_xor.delete() + + def set_init_parser_breakpoint(self): + i_p_retq = self.locate_retq("init_parser") + 
self.init_parser_retq = InitParserBreakpoint("*"+hex(i_p_retq)) + + def del_init_parser_breakpoint(self): + self.init_parser_retq.delete() + + def set_hammer_breakpoints(self): + self.h_do_parse = HDoParseBreakpoint("h_do_parse") + self.h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse") + self.perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse") + self.h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw") + + def del_hammer_breakpoints(self): + self.h_do_parse.delete() + self.h_packrat_parse.delete() + self.perform_lowlevel_parse.delete() + self.h_arena_malloc_raw.delete() diff --git a/gdb-port/commands.py b/gdb-port/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..c78fd57be880fd58532a8d58f7ef31f877c761a1 --- /dev/null +++ b/gdb-port/commands.py @@ -0,0 +1,133 @@ +class HammerParserBacktrace(gdb.Command): + def __init__(self): + super(HammerParserBacktrace, self).__init__ ("hammer-parser-backtrace", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + parserstack = top_level_parse.peek_parserstack().p_stack + args = gdb.string_to_argv(arg) + if len(args) < 1: + maxsize = len(parserstack) + else: + try: + maxsize = int(args[0]) + if maxsize < 1: + raise ValueError + except ValueError: + maxsize = len(parserstack) + print("Argument must be a positive integer") + + print("[" + str(hex(top_level_parse.h_do_parse_parser.address)) + "] " + top_level_parse.h_do_parse_parser.name + " [current]") #TODO: GUI widget should reflect this + print(" ") + depth = min(len(parserstack), maxsize) + if depth > 0: # if stack not empty + # unsure what the idiomatic python is for handling negative indices starting with -1, + # but this addition is to avoid off-by-one errors + index = -(depth+1) + for p in parserstack[-1:index:-1]: + print("[" + str(hex(p.address)) + "] " + p.name) # TODO: errors in perform_lowlevel_parse, if p.name is None + if depth < len(parserstack): + print("[...]") + 
+HammerParserBacktrace() + +class HammerParserMemUse(gdb.Command): + def __init__(self): + super(HammerParserMemUse, self).__init__("hammer-parser-mem-use", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + if len(args) < 1: + print("Usage: hammer-parser-mem-use <address>") + return + + parser_addr = args[0] + try: + parser_addr_int = int(parser_addr, 16) + parser_obj = top_level_parse.parser_by_address(parser_addr_int) + if parser_obj is not None: + print(parser_obj.bytes_used) + except ValueError: + print("Address needs to be a hexadecimal number") + +HammerParserMemUse() + +class HammerParserMemUseName(gdb.Command): + def __init__(self): + super(HammerParserMemUseName, self).__init__("hammer-parser-mem-use-name", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + if len(args) < 1: + print("Usage: hammer-parser-mem-use-name <name>") + return + + parser_name = args[0] + parser_objs = top_level_parse.parsers_by_name(parser_name) + if parser_objs is not None: + for p in parser_objs: + print((p.name, hex(p.address), p.bytes_used)) + +HammerParserMemUseName() + +class HammerParserTopSingleArenaMem(gdb.Command): + def __init__(self): + super(HammerParserTopSingleArenaMem, self).__init__("hammer-parser-top-single-arena-mem", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + + p = top_level_parse.get_parser_top_per_arena_mem() + print((p.name, hex(p.address), p.bytes_used)) + +HammerParserTopSingleArenaMem() + +class HammerParserTopTotalArenaMem(gdb.Command): + def __init__(self): + super(HammerParserTopTotalArenaMem, self).__init__("hammer-parser-top-total-arena-mem", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + + p = top_level_parse.get_parser_top_total_arena_mem() + print((p.name, hex(p.address), p.bytes_used)) + total_mem_use = p.get_arenasum() + print("Total: " + str(total_mem_use) + " bytes") 
+ +HammerParserTopTotalArenaMem() + +# TODO: average memory use, per arena and total + +class HammerParserPreviewInput(gdb.Command): + def __init__(self): + super(HammerParserPreviewInput, self).__init__("hammer-parser-preview-input", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + + print(top_level_parse.input_chunk) + +HammerParserPreviewInput() + +class HammerParserAverageMem(gdb.Command): + def __init__(self): + super(HammerParserAverageMem, self).__init__("hammer-parser-average-mem", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + args = gdb.string_to_argv(arg) + + mem = top_level_parse.get_avg_mem_use_per_arena() + print("Bytes used on average in each arena:") + print(mem) + +HammerParserAverageMem() + +class HammerParserCurrentEnv(gdb.Command): + def __init__(self): + super(HammerParserCurrentEnv, self).__init__("hammer-parser-current-env", gdb.COMMAND_OBSCURE) + + def invoke(self, arg, from_tty): + p = top_level_parse.h_do_parse_parser + p_env = top_level_parse.parser_decombinator.decompose_parser(p, top_level_parse) #TODO: parser -> env mapping function in top_level_parse + print(type(p_env).__name__ + " - " + str(p_env)) # TODO: consistency with GUI + +HammerParserCurrentEnv() diff --git a/gdb-port/hammer-breakpoints.py b/gdb-port/hammer-breakpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..e2990bb4da70a2d7d95841a9e76a077a332b4eaf --- /dev/null +++ b/gdb-port/hammer-breakpoints.py @@ -0,0 +1,117 @@ +class HPackratParseBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + for val in block: # GDB, why did you take away [] for gdb.Block? 
+ if val.name == 'parser': + parser = int(val.value(frame)) + top_level_parse.enter_h_packrat_parse(parser) + return False + +class HPackratParseRetBreakpoint(gdb.Breakpoint): + def stop(self): + top_level_parse.return_from_h_packrat_parse() + return False + +# TODO: frame.older() allows accessing the caller frame. decision logic about whether to call first_h_do_parse_after_packrat_parse() +# could be moved here +class HDoParseBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + retval = False + for val in block: + if val.name == 'parser': + parser = int(val.value(frame)) + if val.name == 'state': + #TODO: rename these variables to make it clear they're pointers + state = int(val.value(frame)) + state_obj = val.value(frame) + index = val.value(frame).dereference()['input_stream']['index'] + input_ptr = val.value(frame).dereference()['input_stream']['input'] + # If you want to printf debug the parse state + #print(val.value(frame).dereference()) + if val.name == 'arena': + arena = int(val.value(frame)) + top_level_parse.enter_h_do_parse(state, None, parser) + + input_chunk = input_ptr + index + #print(input_chunk.string('ascii','backslashreplace',10)) + if int(input_ptr) != 0: # XXX: checking gdb values for 0 + top_level_parse.set_input_chunk(input_chunk.string('UTF-8','replace',32)) # XXX: breaks after a filter fails to execute + + # Check if we need to stop after a number of steps + step_counter = gdb.convenience_variable("hammer_step_counter") + if step_counter is not None and step_counter > 0: + step_counter -= 1 + if step_counter == 0: + gdb.set_convenience_variable("hammer_step_counter", None) # unset step counter + retval = True + else: + gdb.set_convenience_variable("hammer_step_counter", step_counter) + #else: + # retval remains False + + # Check if we need to stop at a position + stop_pos = gdb.convenience_variable("hammer_stop_pos") + if stop_pos is not None and stop_pos <= index: + retval = True + 
gdb.set_convenience_variable("hammer_stop_pos", None) + print("Requested stop position: " + str(stop_pos) + " Stopped at: " + str(index)) + #return False + return retval + +class PerformLowLevelParseBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + #retval = False + for val in block: + if val.name == 'parser': + parser = int(val.value(frame)) + top_level_parse.enter_perform_lowlevel_parse(parser) + + #return retval + return False + +class PerformLowLevelParseRetBreakpoint(gdb.Breakpoint): + def stop(self): + top_level_parse.return_from_perform_lowlevel_parse() + return False + +class ParserVirtualBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + # function name is parse_* + # we extract the second part + # This is pretty much the same as naming it based on which parse_* function is called, so long as foo_vt contains parse_foo , and not e.g. parse_bar + parser_type = frame.function().name.split("_")[1] + parser_name = "(Unnamed " + parser_type + ")" + top_level_parse.parse_virtual(parser_name) + return False + +class HArenaMallocRawBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + + for val in block: + if val.name == 'size': + alloc_size = int(val.value(frame)) + + top_level_parse.enter_h_arena_malloc_raw(alloc_size) + + return False + +class HRuleBreakpoint(gdb.Breakpoint): + def stop(self): + frame = gdb.selected_frame() + block = frame.block() + parser_type = gdb.lookup_type("HParser").pointer() + + for p in block: + # GDB (Debian 10.1-2) with python 3.9 crashes when trying to compare these two, but doing this would filter out locals that aren't parsers + #if p.type == parser_type: + top_level_parse.parser_objs[int(p.value(frame))] = Parser(p.name, int(p.value(frame))) + diff --git a/gdb-port/parser-name-instrumentation-gdb.py b/gdb-port/parser-name-instrumentation-gdb.py index 
fcfe6902db13b9447d88012d32cdbe35ef1f38fa..0eba78aefa289febd8babd3cf0a3f82dd83394ad 100644 --- a/gdb-port/parser-name-instrumentation-gdb.py +++ b/gdb-port/parser-name-instrumentation-gdb.py @@ -6,8 +6,8 @@ # TODO: The parameter given to h_do_parse is not shown in the backtrace, which is confusing -# Tuples of symbol name, length in bytes -H_RULE_FUNCTIONS = [ ("init_runlengthdecode_parser", 314), ("init_LZW_parser", 1221) ] +# List of names of functions with H_RULEs declared, used by BreakpointManager +H_RULE_FUNCTIONS = [ "init_runlengthdecode_parser", "init_LZW_parser" ] class Parser: def __init__(self, name, address): @@ -33,14 +33,15 @@ class Parser: def get_mem_use(self, state=None): if state is None: - return bytes_used + return self.bytes_used else: - return bytes_used.setdefault(state, 0) + return self.bytes_used.setdefault(state, 0) def __str__(self): return "(" + str(self.name) + ", " + hex(self.address) + ")" # Return the highest per-arena allocation count + # TODO: disambiguate "parse state" and "arena", possibly tracking both def get_arenamax(self): res = 0 if self.bytes_used: @@ -299,99 +300,6 @@ class TopLevelParse: top_level_parse = TopLevelParse() # Approach 1: load the application, set breakpoints, execute stack commands on breakpoint hit, continue -class HPackratParseBreakpoint(gdb.Breakpoint): - def stop(self): - frame = gdb.selected_frame() - block = frame.block() - for val in block: # GDB, why did you take away [] for gdb.Block? - if val.name == 'parser': - parser = int(val.value(frame)) - top_level_parse.enter_h_packrat_parse(parser) - return False - -class HPackratParseRetBreakpoint(gdb.Breakpoint): - def stop(self): - top_level_parse.return_from_h_packrat_parse() - return False - -# TODO: frame.older() allows accessing the caller frame. 
decision logic about whether to call first_h_do_parse_after_packrat_parse() -# could be moved here -class HDoParseBreakpoint(gdb.Breakpoint): - def stop(self): - frame = gdb.selected_frame() - block = frame.block() - retval = False - for val in block: - if val.name == 'parser': - parser = int(val.value(frame)) - if val.name == 'state': - #TODO: rename these variables to make it clear they're pointers - state = int(val.value(frame)) - state_obj = val.value(frame) - index = val.value(frame).dereference()['input_stream']['index'] - input_ptr = val.value(frame).dereference()['input_stream']['input'] - # If you want to printf debug the parse state - #print(val.value(frame).dereference()) - if val.name == 'arena': - arena = int(val.value(frame)) - top_level_parse.enter_h_do_parse(state, None, parser) - - input_chunk = input_ptr + index - #print(input_chunk.string('ascii','backslashreplace',10)) - if int(input_ptr) != 0: # XXX: checking gdb values for 0 - top_level_parse.set_input_chunk(input_chunk.string('UTF-8','replace',32)) # XXX: breaks after a filter fails to execute - - # Check if we need to stop after a number of steps - step_counter = gdb.convenience_variable("hammer_step_counter") - if step_counter is not None and step_counter > 0: - step_counter -= 1 - if step_counter == 0: - gdb.set_convenience_variable("hammer_step_counter", None) # unset step counter - retval = True - else: - gdb.set_convenience_variable("hammer_step_counter", step_counter) - #else: - # retval remains False - - # Check if we need to stop at a position - stop_pos = gdb.convenience_variable("hammer_stop_pos") - if stop_pos is not None and stop_pos <= index: - retval = True - gdb.set_convenience_variable("hammer_stop_pos", None) - print("Requested stop position: " + str(stop_pos) + " Stopped at: " + str(index)) - #return False - return retval - -class PerformLowLevelParseBreakpoint(gdb.Breakpoint): - def stop(self): - frame = gdb.selected_frame() - block = frame.block() - #retval = False - for 
val in block: - if val.name == 'parser': - parser = int(val.value(frame)) - top_level_parse.enter_perform_lowlevel_parse(parser) - - #return retval - return False - -class PerformLowLevelParseRetBreakpoint(gdb.Breakpoint): - def stop(self): - top_level_parse.return_from_perform_lowlevel_parse() - return False - -class ParserVirtualBreakpoint(gdb.Breakpoint): - def stop(self): - frame = gdb.selected_frame() - block = frame.block() - # function name is parse_* - # we extract the second part - # This is pretty much the same as naming it based on which parse_* function is called, so long as foo_vt contains parse_foo , and not e.g. parse_bar - parser_type = frame.function().name.split("_")[1] - parser_name = "(Unnamed " + parser_type + ")" - top_level_parse.parse_virtual(parser_name) - return False - class InitParserBreakpoint(gdb.Breakpoint): def stop(self): frame = gdb.selected_frame() @@ -406,13 +314,14 @@ class InitParserBreakpoint(gdb.Breakpoint): return False -class HRuleBreakpoint(gdb.Breakpoint): - def stop(self): - frame = gdb.selected_frame() - block = frame.block() - - for p in block: - top_level_parse.parser_objs[int(p.value(frame))] = Parser(p.name, int(p.value(frame))) +# TODO: refactored to breakpoint-manager.py , remove +#class HRuleBreakpoint(gdb.Breakpoint): +# def stop(self): +# frame = gdb.selected_frame() +# block = frame.block() +# +# for p in block: +# top_level_parse.parser_objs[int(p.value(frame))] = Parser(p.name, int(p.value(frame))) class HArenaMallocRawBreakpoint(gdb.Breakpoint): def stop(self): @@ -429,72 +338,63 @@ class HArenaMallocRawBreakpoint(gdb.Breakpoint): hammer_retq_breakpoints = [] -class BreakpointManager(): - def __init__(self): - self.hammer_retq_breakpoints = [] - self.h_rule_breakpoints = [] - - self.h_do_parse = None - self.h_packrat_parse = None - self.perform_lowlevel_parse = None - self.h_arena_malloc_raw = None - - self.parse_action = None - self.parse_choice = None - self.parse_sequence = None - self.parse_difference = 
None - self.parse_many = None - self.parse_and = None - self.parse_attr_bool = None - self.parse_bind = None - self.parse_bits = None - self.parse_butnot = None - self.parse_charset = None - self.parse_ch = None - self.parse_end = None - self.parse_endianness = None - self.parse_epsilon = None - self.parse_ignore = None - self.parse_ignoreseq = None - self.parse_indirect = None - self.parse_int_range = None - self.parse_not = None - self.parse_nothing = None - self.parse_optional = None - self.parse_permutation = None - self.parse_skip = None - self.parse_seek = None - self.parse_tell = None - self.parse_token = None - self.parse_unimplemented = None - self.parse_put = None - self.parse_get = None - self.parse_whitespace = None - self.parse_xor = None - - def set_h_rule_breakpoints(self): - for func in H_RULE_FUNCTIONS: - func_retq = locate_retq(func[0], func[1]) - self.h_rule_breakpoints[func] = HRuleBreakpoint("*" + hex(func_retq)) - -breakpoint_manager = BreakpointManager() +#class BreakpointManager(): +# def __init__(self, h_rule_functions): +# self.hammer_retq_breakpoints = [] +# self.h_rule_breakpoints = [] +# +# self.h_do_parse = None +# self.h_packrat_parse = None +# self.perform_lowlevel_parse = None +# self.h_arena_malloc_raw = None +# +# self.parse_action = None +# self.parse_choice = None +# self.parse_sequence = None +# self.parse_difference = None +# self.parse_many = None +# self.parse_and = None +# self.parse_attr_bool = None +# self.parse_bind = None +# self.parse_bits = None +# self.parse_butnot = None +# self.parse_charset = None +# self.parse_ch = None +# self.parse_end = None +# self.parse_endianness = None +# self.parse_epsilon = None +# self.parse_ignore = None +# self.parse_ignoreseq = None +# self.parse_indirect = None +# self.parse_int_range = None +# self.parse_not = None +# self.parse_nothing = None +# self.parse_optional = None +# self.parse_permutation = None +# self.parse_skip = None +# self.parse_seek = None +# self.parse_tell = None 
+# self.parse_token = None +# self.parse_unimplemented = None +# self.parse_put = None +# self.parse_get = None +# self.parse_whitespace = None +# self.parse_xor = None +# +# def set_h_rule_breakpoints(self): +# for func in H_RULE_FUNCTIONS: +# func_retq = locate_retq(func[0], func[1]) +# self.h_rule_breakpoints[func] = HRuleBreakpoint("*" + hex(func_retq)) + +breakpoint_manager = BreakpointManager(H_RULE_FUNCTIONS) class PDFMainBreakpoint(gdb.Breakpoint): def stop(self): - breakpoints = self.set_hammer_retq_breakpoints() - breakpoint_manager.hammer_retq_breakpoints = breakpoints + breakpoints = breakpoint_manager.set_hammer_retq_breakpoints() + #breakpoint_manager.hammer_retq_breakpoints = breakpoints return True - def set_hammer_retq_breakpoints(self): - breakpoints = [] - plp_retq = locate_perform_lowlevel_parse_retq() - perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq)) - breakpoints.append(perform_lowlevel_parse_ret) - hpp_retq = locate_h_packrat_parse_retq() - h_packrat_parse_ret = HPackratParseRetBreakpoint("*" + hex(hpp_retq)) - breakpoints.append(h_packrat_parse_ret) - return breakpoints # GDB parameters # TODO: hammer parameter prefix @@ -647,63 +547,16 @@ class HammerParserCurrentEnv(gdb.Command): HammerParserCurrentEnv() -#TODO: move all this to BreakpointManager? -# Call when execution stopped at breakpoint in main -def locate_perform_lowlevel_parse_retq(): - arch = gdb.selected_frame().architecture() - p_l_p_sym = gdb.lookup_symbol("perform_lowlevel_parse")[0] - p_l_p_address = int(p_l_p_sym.value().address) - # The choice of disassembling only 800 instructions from the start is arbitrary. (This function is 310 bytes long on this particular machine.) There is probably a way to find out where a function ends. 
- instructions = arch.disassemble(p_l_p_address, p_l_p_address+800) - results = [ ins["addr"] for ins in instructions if ins["asm"].startswith("ret") ] - return results[0] - -def locate_h_packrat_parse_retq(): - arch = gdb.selected_frame().architecture() - h_p_p_sym = gdb.lookup_symbol("h_packrat_parse")[0] - h_p_p_address = int(h_p_p_sym.value().address) - # Same as with perform_lowlevel_parse, +900 is arbitrary - instructions = arch.disassemble(h_p_p_address, h_p_p_address+900) - results = [ ins["addr"] for ins in instructions if ins["asm"].startswith("ret") ] - return results[0] - -def locate_init_parser_retq(): - arch = gdb.selected_frame().architecture() - i_p_sym = gdb.lookup_symbol("init_parser")[0] - i_p_address = int(i_p_sym.value().address) - # Same as with perform_lowlevel_parse, +16000 is arbitrary - instructions = arch.disassemble(i_p_address, i_p_address+16000) - results = [ ins["addr"] for ins in instructions if ins["asm"].startswith("ret") ] - return results[0] - -def locate_retqs(symbol, length): - arch = gdb.selected_frame().architecture() - sym = gdb.lookup_symbol(symbol)[0] - sym_address = int(sym.value().address) - instructions = arch.disassemble(sym_address, sym_address+length) - results = [ ins["addr"] for ins in instructions if ins["asm"].startswith("ret") ] - return results - -def locate_retq(symbol, length): - results = locate_retqs(symbol, length) - return results[0] - -def del_hammer_retq_breakpoints(breakpoints): - for bp in breakpoints: - bp.delete() - # Clean up by-address breakpoints in hammer when inferior exits. # Caveat: Assumes there's a single inferior, the debugged parser, so no checking is done # TODO: where to store breakpoints? TopLevelParse? 
A BreakpointManager class?i def exit_handler(event): #breakpoints = [ perform_lowlevel_parse_ret, h_packrat_parse_ret ] #del_hammer_retq_breakpoints(breakpoints) - del_hammer_retq_breakpoints(breakpoint_manager.hammer_retq_breakpoints) + breakpoint_manager.del_hammer_retq_breakpoints() gdb.events.exited.connect(exit_handler) -#TODO: regex match retq, ret, etc - # Break on main so that libhammer.so gets to load main = PDFMainBreakpoint("main") @@ -711,44 +564,47 @@ main = PDFMainBreakpoint("main") #init_parser = InitParserBreakpoint("pdf.c:1223") -h_do_parse = HDoParseBreakpoint("h_do_parse") -h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse") -perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse") -h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw") -# todo: investigate GDB frame filters for rendering backtraces - -parse_action = ParserVirtualBreakpoint("parse_action") -parse_choice = ParserVirtualBreakpoint("parse_choice") -parse_sequence = ParserVirtualBreakpoint("parse_sequence") -parse_difference = ParserVirtualBreakpoint("parse_difference") -parse_many = ParserVirtualBreakpoint("parse_many") -parse_and = ParserVirtualBreakpoint("parse_and") -parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool") -parse_bind = ParserVirtualBreakpoint("parse_bind") -parse_bits = ParserVirtualBreakpoint("parse_bits") -parse_butnot = ParserVirtualBreakpoint("parse_butnot") -parse_charset = ParserVirtualBreakpoint("parse_charset") -parse_ch = ParserVirtualBreakpoint("parse_ch") -parse_end = ParserVirtualBreakpoint("parse_end") -parse_endianness = ParserVirtualBreakpoint("parse_endianness") -parse_epsilon = ParserVirtualBreakpoint("parse_epsilon") -parse_ignore = ParserVirtualBreakpoint("parse_ignore") -parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq") -parse_indirect = ParserVirtualBreakpoint("parse_indirect") -parse_int_range = ParserVirtualBreakpoint("parse_int_range") -parse_not = 
ParserVirtualBreakpoint("parse_not") -parse_nothing = ParserVirtualBreakpoint("parse_nothing") -parse_optional = ParserVirtualBreakpoint("parse_optional") -parse_permutation = ParserVirtualBreakpoint("parse_permutation") -parse_skip = ParserVirtualBreakpoint("parse_skip") -parse_seek = ParserVirtualBreakpoint("parse_seek") -parse_tell = ParserVirtualBreakpoint("parse_tell") -parse_token = ParserVirtualBreakpoint("parse_token") -parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented") -parse_put = ParserVirtualBreakpoint("parse_put") -parse_get = ParserVirtualBreakpoint("parse_get") -parse_whitespace = ParserVirtualBreakpoint("parse_whitespace") -parse_xor = ParserVirtualBreakpoint("parse_xor") +#h_do_parse = HDoParseBreakpoint("h_do_parse") +#h_packrat_parse = HPackratParseBreakpoint("h_packrat_parse") +#perform_lowlevel_parse = PerformLowLevelParseBreakpoint("perform_lowlevel_parse") +#h_arena_malloc_raw = HArenaMallocRawBreakpoint("h_arena_malloc_raw") +breakpoint_manager.set_hammer_breakpoints() + +# TODO: investigate GDB frame filters for rendering backtraces + +#parse_action = ParserVirtualBreakpoint("parse_action") +#parse_choice = ParserVirtualBreakpoint("parse_choice") +#parse_sequence = ParserVirtualBreakpoint("parse_sequence") +#parse_difference = ParserVirtualBreakpoint("parse_difference") +#parse_many = ParserVirtualBreakpoint("parse_many") +#parse_and = ParserVirtualBreakpoint("parse_and") +#parse_attr_bool = ParserVirtualBreakpoint("parse_attr_bool") +#parse_bind = ParserVirtualBreakpoint("parse_bind") +#parse_bits = ParserVirtualBreakpoint("parse_bits") +#parse_butnot = ParserVirtualBreakpoint("parse_butnot") +#parse_charset = ParserVirtualBreakpoint("parse_charset") +#parse_ch = ParserVirtualBreakpoint("parse_ch") +#parse_end = ParserVirtualBreakpoint("parse_end") +#parse_endianness = ParserVirtualBreakpoint("parse_endianness") +#parse_epsilon = ParserVirtualBreakpoint("parse_epsilon") +#parse_ignore = 
ParserVirtualBreakpoint("parse_ignore") +#parse_ignoreseq = ParserVirtualBreakpoint("parse_ignoreseq") +#parse_indirect = ParserVirtualBreakpoint("parse_indirect") +#parse_int_range = ParserVirtualBreakpoint("parse_int_range") +#parse_not = ParserVirtualBreakpoint("parse_not") +#parse_nothing = ParserVirtualBreakpoint("parse_nothing") +#parse_optional = ParserVirtualBreakpoint("parse_optional") +#parse_permutation = ParserVirtualBreakpoint("parse_permutation") +#parse_skip = ParserVirtualBreakpoint("parse_skip") +#parse_seek = ParserVirtualBreakpoint("parse_seek") +#parse_tell = ParserVirtualBreakpoint("parse_tell") +#parse_token = ParserVirtualBreakpoint("parse_token") +#parse_unimplemented = ParserVirtualBreakpoint("parse_unimplemented") +#parse_put = ParserVirtualBreakpoint("parse_put") +#parse_get = ParserVirtualBreakpoint("parse_get") +#parse_whitespace = ParserVirtualBreakpoint("parse_whitespace") +#parse_xor = ParserVirtualBreakpoint("parse_xor") +breakpoint_manager.set_parser_virtual_breakpoints() # Commandline: # $ gdb -ex "source /path/to/parser-name-instrumentation-gdb.py" --args /path/to/pdf /path/to/input.pdf @@ -760,15 +616,20 @@ gdb.execute("run") #perform_lowlevel_parse_ret = PerformLowLevelParseRetBreakpoint("*" + hex(plp_retq)) #hpp_retq = locate_h_packrat_parse_retq() #h_packrat_parse_ret = HPackratParseRetBreakpoint("*" + hex(hpp_retq)) -i_p_retq = locate_init_parser_retq() -init_parser = InitParserBreakpoint("*" + hex(i_p_retq)) + +#i_p_retq = breakpoint_manager.locate_retq("init_parser") +#init_parser = InitParserBreakpoint("*" + hex(i_p_retq)) +breakpoint_manager.set_init_parser_breakpoint() + #hammer_retq_breakpoints = [perform_lowlevel_parse_ret, h_packrat_parse_ret] -h_rule_breakpoints = {} +#h_rule_breakpoints = {} + +#for func in H_RULE_FUNCTIONS: +# func_retq = locate_retq(func[0], func[1]) +# h_rule_breakpoints[func] = HRuleBreakpoint("*" + hex(func_retq)) +breakpoint_manager.set_h_rule_breakpoints() -for func in H_RULE_FUNCTIONS: - 
class Parser:
    """A traced parser: its (optional) name, its address, and its memory use.

    ``bytes_used`` maps a state key (the callers in this project pass the
    parse state of the enclosing parser stack) to the number of bytes
    attributed to this parser under that key.
    """

    def __init__(self, name, address):
        self.name = name
        self.address = address
        # {state: accumulated allocation size in bytes}
        self.bytes_used = {}

    def name_parser(self, name):
        """Set (or overwrite) the parser's name."""
        self.name = name

    # TODO: remove
    def get_name_or_placeholder(self):
        """Return the name, or a placeholder string while the name is unknown."""
        if self.name is None:
            return "Wait for it... (if you're reading this, you found a bug)"
        return self.name

    def add_mem_use(self, state, size):
        """Add ``size`` bytes to the running total recorded for ``state``."""
        current = self.bytes_used.get(state)
        self.bytes_used[state] = size if current is None else current + size

    def get_mem_use(self, state=None):
        """Return the bytes recorded for ``state``, or the whole mapping.

        With ``state=None`` the live ``bytes_used`` dict is returned. For an
        unknown state the result is 0; the lookup deliberately does not insert
        an entry, so merely asking never makes the parser look like a user of
        that state in later per-state statistics.
        """
        if state is None:
            return self.bytes_used
        return self.bytes_used.get(state, 0)

    def __str__(self):
        # Parsers may be created without an address; hex(None) would raise.
        address = "None" if self.address is None else hex(self.address)
        return "(" + str(self.name) + ", " + address + ")"

    # Return the highest per-arena allocation count
    # TODO: disambiguate "parse state" and "arena", possibly tracking both
    def get_arenamax(self):
        """Return the largest single per-state total, or 0 if nothing is recorded."""
        return max(self.bytes_used.values(), default=0)

    def get_arenasum(self):
        """Return the sum of all per-state totals, or 0 if nothing is recorded."""
        return sum(self.bytes_used.values())
class ParserStack:
    """Stack of the parsers currently executing within one parse.

    Stores the parse state and arena the stack was created for, the stack of
    parser objects itself, and a counter for memory that could not be
    attributed to any parser on the stack.
    """

    def __init__(self, parse_state, arena):
        self.parse_state = parse_state
        self.arena = arena
        self.p_stack = []
        self.unclaimed_mem_use = 0

    def push(self, parser):
        """Push ``parser`` onto the top of the stack."""
        self.p_stack.append(parser)

    def pop(self):
        """Remove and return the top parser; raises IndexError when empty."""
        return self.p_stack.pop()

    def peek(self):
        """Return the top parser without removing it, or None when empty."""
        return self.p_stack[-1] if self.p_stack else None

    def set_state(self, state):
        """Record the parse state this stack belongs to."""
        self.parse_state = state

    # Shortcut for setting the name property of the parser on the top of stack
    # In terms of tracing, *most* calls to a parser look something like this with the packrat backend:
    # h_do_parse()
    # parse_foo()
    # perform_lowlevel_parse()

    # perform_lowlevel_parse() is called when the memo table at that position is not filled in yet.
    # it calls the corresponding parse_* virtual function via the vtable, but other than that does not have type information
    # it's probably possible to extract type information, by comparing vtable addresses, but that seems painful

    # parse_foo() is the parser's corresponding virtual function in the frontend, which does not have the equivalent of a "this" pointer

    # So what we do to keep track of parsers is incrementally filling in the details for both

    # h_do_parse() is the backend's "actually run the parser" function, but does not get called for some parsers
    # (apparently mostly it's for higher-order parsers)
    # also contains the decision logic about whether to call perform_lowlevel_parse()

    # possible scenarios:
    # h_do_parse()
    # perform_lowlevel_parse()
    # parse_foo()

    # h_do_parse()
    # perform_lowlevel_parse()

    # h_do_parse()
    def name_top_parser(self, name):
        """Name the parser on top of the stack; raises IndexError when empty."""
        self.p_stack[-1].name_parser(name)

    def add_mem_use_each(self, size):
        """Attribute ``size`` bytes to every parser on the stack.

        ``Parser.bytes_used`` is a dict keyed by state, so the amount is
        recorded through ``add_mem_use`` under this stack's parse state
        rather than added to the dict itself.
        """
        for p in self.p_stack:
            p.add_mem_use(self.parse_state, size)

    def add_mem_use_top(self, size):
        """Attribute ``size`` bytes to the top parser; raises IndexError when empty."""
        self.p_stack[-1].add_mem_use(self.parse_state, size)

    def show_stack(self):
        """Print a placeholder line with the current stack depth."""
        print("stack would be printed here. Depth:", len(self.p_stack))
        #print([(p.get_name_or_placeholder(), hex(p.address)) for p in self.p_stack])

    def depth(self):
        """Return the number of parsers on the stack."""
        return len(self.p_stack)
class PDFMainBreakpoint(gdb.Breakpoint):
    """Breakpoint that installs the hammer return-instruction breakpoints when hit."""

    def stop(self):
        """Install the hammer RET breakpoints, then tell GDB to stop here."""
        # The manager keeps track of the breakpoints it creates, so the
        # returned list does not need to be stored by this class.
        breakpoint_manager.set_hammer_retq_breakpoints()

        return True

    def set_hammer_retq_breakpoints(self):
        """Install the hammer RET breakpoints via the breakpoint manager.

        Kept for backward compatibility. The previous body duplicated the
        manager's logic using locate_perform_lowlevel_parse_retq() and
        locate_h_packrat_parse_retq().
        NOTE(review): those helpers appear only in commented-out code in this
        change, and breakpoints created here were not registered with the
        manager, so del_hammer_retq_breakpoints() could not remove them.
        Confirm no caller relies on receiving a fresh, unregistered list.

        Returns the manager's list of hammer RET breakpoints.
        """
        return breakpoint_manager.set_hammer_retq_breakpoints()
class BreakpointManagerCreated(unittest.TestCase):
    """Checks that the manager's locate helpers return addresses of "ret" instructions."""

    def setUp(self):
        rule_functions = ["init_runlengthdecode_parser", "init_LZW_parser"]
        self.bpm = BreakpointManager(rule_functions)
        self.arch = gdb.selected_frame().architecture()

    def _assert_ret_at(self, address):
        # Disassemble one instruction at the address and check its mnemonic
        first_instruction = self.arch.disassemble(address, address + 8, 1)[0]
        self.assertTrue(first_instruction['asm'].startswith("ret"))

    def test_locate_retqs(self):
        all_rets = self.bpm.locate_retqs("init_runlengthdecode_parser")
        self._assert_ret_at(all_rets[0])

    def test_locate_retq(self):
        self._assert_ret_at(self.bpm.locate_retq("init_runlengthdecode_parser"))
class BreakpointManagerSettingBreakpoints(unittest.TestCase):
    """Checks which breakpoint objects BreakpointManager constructs and deletes.

    The breakpoint classes are replaced by mocks, so no real GDB breakpoints
    are created by these tests.
    """

    def setUp(self):
        #TODO: make these tests independent of the pdf parser
        test_breakpoints = ["init_runlengthdecode_parser", "init_LZW_parser"]
        # "constants" like rld_retq below could be moved to setUpClass (possibly better performance)
        self.bpm = BreakpointManager(test_breakpoints)
        self.arch = gdb.selected_frame().architecture()
        self.rld_retq = self.bpm.locate_retq(test_breakpoints[0])
        self.lzw_retq = self.bpm.locate_retq(test_breakpoints[1])

        self.hrbp_mock_object = self._start_patch('__main__.HRuleBreakpoint')
        self.plprbp_mock_object = self._start_patch('__main__.PerformLowLevelParseRetBreakpoint')
        self.hpprbp_mock_object = self._start_patch('__main__.HPackratParseRetBreakpoint')

    def _start_patch(self, target):
        """Start a patch on ``target`` and return the mock that replaces it.

        The patch is undone through addCleanup, which runs even when a later
        step of setUp raises; tearDown would be skipped in that case.
        """
        patcher = unittest.mock.patch(target)
        mock_object = patcher.start()
        self.addCleanup(patcher.stop)
        return mock_object

    def test_set_h_rule_breakpoints(self):
        self.bpm.set_h_rule_breakpoints()
        # One breakpoint per rule function, each placed on that function's ret address
        expected_calls = [
            unittest.mock.call("*" + hex(self.rld_retq)),
            unittest.mock.call("*" + hex(self.lzw_retq)),
        ]
        self.assertEqual(self.hrbp_mock_object.call_args_list, expected_calls)

    def test_set_hammer_retq_breakpoints(self):
        self.bpm.set_hammer_retq_breakpoints()
        self.assertTrue(self.plprbp_mock_object.called)
        self.assertTrue(self.hpprbp_mock_object.called)

    def test_del_hammer_retq_breakpoints(self):
        self.bpm.set_hammer_retq_breakpoints()
        # Copy the list so the check does not depend on whether deletion empties it
        bps = list(self.bpm.hammer_retq_breakpoints)
        self.assertEqual(len(bps), 2)
        self.bpm.del_hammer_retq_breakpoints()
        for bp in bps:
            self.assertTrue(bp.delete.called)

#TODO: tests for PDFMainBreakpoint
class PerformLowLevelParseRetBreakpointStop(unittest.TestCase):
    """Checks that stop() reports the return to top_level_parse."""

    def setUp(self):
        self.patcher = unittest.mock.patch('__main__.top_level_parse')
        self.mock_class = self.patcher.start()
        # Registered before the breakpoint is created: cleanups run even when
        # the rest of setUp raises, whereas tearDown would be skipped.
        self.addCleanup(self.patcher.stop)
        self.plbp = PerformLowLevelParseRetBreakpoint("main")
        # This constructs a real breakpoint; remove it so breakpoints do not
        # pile up in the GDB session across tests.
        self.addCleanup(self.plbp.delete)

    def test_stop(self):
        self.plbp.stop()
        self.assertTrue(self.mock_class.return_from_perform_lowlevel_parse.called)
class HPackratParseBreakpointStop(unittest.TestCase):
    """Checks that stop() reports entry into h_packrat_parse to top_level_parse."""

    def setUp(self):
        self.tlp_patcher = unittest.mock.patch('__main__.top_level_parse')
        self.tlp_mock_class = self.tlp_patcher.start()
        # Cleanups run even if a later setUp step raises; tearDown would not.
        self.addCleanup(self.tlp_patcher.stop)
        # Replace __init__ so constructing the object does not run the real
        # breakpoint constructor.
        self.bp_patcher = unittest.mock.patch.object(HPackratParseBreakpoint, '__init__', return_value=None)
        self.bp_mock_class = self.bp_patcher.start()
        self.addCleanup(self.bp_patcher.stop)
        self.hppbp = HPackratParseBreakpoint('main')

    def test_stop(self):
        self.hppbp.stop()
        self.assertTrue(self.tlp_mock_class.enter_h_packrat_parse.called)
class ParserStackPushPop(unittest.TestCase):
    """Push and pop behaviour of ParserStack, observed through p_stack."""

    def setUp(self):
        self.parser1 = Parser("foo", 1234)
        self.parser2 = Parser("bar", 1242)
        # Made-up addresses
        self.parser_stack = ParserStack(16, 32)

    def test_push(self):
        stack = self.parser_stack
        stack.push(self.parser1)
        self.assertEqual(stack.p_stack, [self.parser1])

    def test_pop(self):
        stack = self.parser_stack
        for parser in (self.parser1, self.parser2):
            stack.push(parser)
        stack.pop()
        # Only the first parser is left once the top one is removed
        self.assertEqual(stack.p_stack, [self.parser1])
class ParserStackNaming(unittest.TestCase):
    """Naming of the parser on top of a ParserStack."""

    def setUp(self):
        self.parser1 = Parser("foo", 1234)
        self.parser2 = Parser("bar", 1242)
        self.parser_stack = ParserStack(16, 32)
        self.parser_stack.push(self.parser1)
        self.parser_stack.push(self.parser2)

    def test_name_top_parser(self):
        parser3 = Parser(None, 1250)
        self.parser_stack.push(parser3)
        self.parser_stack.name_top_parser("foo2")
        self.assertIs(self.parser_stack.peek(), parser3)
        self.assertEqual(parser3.name, "foo2")

    # TODO: should parsers be allowed to name only once?
    # TODO: parser unit test
    # name_top_parser() renames unconditionally, so this test passes. Marking
    # it expectedFailure made the runner report an "unexpected success",
    # which counts as an unsuccessful run.
    def test_name_top_parser_twice(self):
        self.parser_stack.name_top_parser("something else")
        self.assertEqual(self.parser2.name, "something else")
class ParserMemUse(unittest.TestCase):
    """Memory accounting on a single Parser."""

    def setUp(self):
        self.parser = Parser("foo", 12345)

    def _record(self, *entries):
        # Feed (state, size) pairs to the parser in the given order
        for state, size in entries:
            self.parser.add_mem_use(state, size)

    def test_add_mem_use(self):
        self._record((1, 50), (1, 60))
        self.assertEqual(self.parser.get_mem_use(1), 110)

    def test_get_mem_use(self):
        self._record((1, 50), (2, 60))
        self.assertEqual(self.parser.get_mem_use(1), 50)
        self.assertEqual(self.parser.get_mem_use(2), 60)
        self.assertEqual(self.parser.get_mem_use(), {1: 50, 2: 60})

    def test_arenamax(self):
        self._record((1, 50), (2, 60))
        self.assertEqual(self.parser.get_arenamax(), 60)

    def test_arenasum(self):
        self._record((1, 50), (2, 60))
        self.assertEqual(self.parser.get_arenasum(), 110)
32) + + def test_enter_h_packrat_parse(self): + self.top_level_parse.enter_h_packrat_parse(32) + self.assertIsInstance(self.top_level_parse.parser_stacks[0], ParserStack) + + def test_enter_h_do_parse_new_parser(self): + ps1 = ParserStack(16, 40) + self.top_level_parse.enter_h_packrat_parse(32) + #simulating effects of enter_h_packrat_parse + self.top_level_parse.enter_h_do_parse(16, 40, 32) + parser1 = self.top_level_parse.parser_objs[32] + self.assertIs(self.top_level_parse.h_do_parse_parser, self.top_level_parse.parser_objs[32]) + self.assertIsNone(parser1.name) + self.assertEqual(parser1.address, 32) + + def test_enter_h_do_parse_known_parser(self): + self.top_level_parse.enter_h_packrat_parse(32) + parser1 = Parser("foo", 32) + self.top_level_parse.parser_objs[32] = parser1 # Add the parser into the db by hand + self.top_level_parse.enter_h_do_parse(16, 40, 32) + self.assertIs(self.top_level_parse.h_do_parse_parser, parser1) + self.assertIs(self.top_level_parse.h_do_parse_parser, self.top_level_parse.parser_objs[32]) + self.assertEqual(self.top_level_parse.h_do_parse_parser.name, "foo") + self.assertEqual(self.top_level_parse.h_do_parse_parser.address, 32) + + def test_ehd_calls_first_h_do_parse_after_packrat_parse(self): + parser1 = Parser("foo", 32) + self.top_level_parse.enter_h_packrat_parse(parser1.address) + self.top_level_parse.enter_h_do_parse(16, 40, parser1.address) + ps1 = self.top_level_parse.peek_parserstack() + self.assertEqual(ps1.parse_state, 16) + + def test_first_h_do_parse_after_packrat_parse(self): + ps1 = ParserStack(None, 40) + self.top_level_parse.parser_stacks.append(ps1) + self.top_level_parse.first_h_do_parse_after_packrat_parse(16, 40) + self.assertEqual(ps1.parse_state, 16) + + def test_return_from_h_packrat_parse(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.return_from_h_packrat_parse() + self.assertEqual(self.top_level_parse.parser_stacks, []) + + def 
test_enter_perform_lowlevel_parse_known_parser(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.parser_objs[32] = self.parser1 + self.top_level_parse.enter_perform_lowlevel_parse(32) + self.assertIs(self.top_level_parse.parser_stacks[-1].p_stack[-1], self.parser1) + def test_enter_perform_lowlevel_parse_new_parser(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.enter_perform_lowlevel_parse(32) + parser1 = self.top_level_parse.parser_stacks[-1].p_stack[-1] + self.assertIsNone(parser1.name) + self.assertEqual(parser1.address, 32) + + def test_return_from_perform_lowlevel_parse(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.parser_objs[32] = self.parser1 + self.top_level_parse.parser_stacks[-1].p_stack.append(self.parser1) + self.top_level_parse.return_from_perform_lowlevel_parse() + self.assertEqual(self.top_level_parse.parser_stacks[-1].p_stack, []) + + def test_enter_h_arena_malloc_raw(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.parser_objs[32] = self.parser1 + self.ps1.push(self.parser1) + self.top_level_parse.enter_h_arena_malloc_raw(256) + self.assertEqual(self.top_level_parse.parser_stacks[-1].p_stack[-1].bytes_used[16], 256) + + def test_enter_h_arena_malloc_raw_no_parser(self): + self.top_level_parse.parser_stacks.append(self.ps1) + self.top_level_parse.enter_h_arena_malloc_raw(256) + self.assertEqual(self.top_level_parse.parser_stacks[-1].unclaimed_mem_use, 256) + + def test_enter_h_arena_malloc_raw_no_stack(self): + self.top_level_parse.enter_h_arena_malloc_raw(256) + self.assertEqual(self.top_level_parse.unclaimed_mem_use, 256) + + def test_parse_virtual(self): + self.top_level_parse.parser_stacks.append(self.ps1) + parser2 = Parser(None, 32) + self.top_level_parse.parser_stacks[-1].push(parser2) + self.top_level_parse.parse_virtual("foo") + self.assertEqual(parser2.name, "foo") + + def 
def test_parser_by_address(self):
    """parser_by_address() returns the object registered under that address."""
    self.top_level_parse.parser_stacks.append(self.ps1)
    self.top_level_parse.parser_objs[32] = self.parser1
    self.ps1.push(self.parser1)
    parser1 = self.top_level_parse.parser_by_address(32)
    # Without an assertion this test could never fail on a wrong lookup
    self.assertIs(parser1, self.parser1)
class TopLevelParse:
    """Bookkeeping for a traced parse.

    Holds a stack of parser stacks (one pushed per h_packrat_parse() entry),
    a registry of parser objects keyed by address, and memory that could not
    be attributed to any parser.
    """

    def __init__(self):
        self.parser_stacks = []
        self.parser_objs = {}
        self.unclaimed_mem_use = 0
        # Holds 32 characters starting at state->input_stream[index], used by the GUI
        self.current_input_chunk = ''
        self.current_parser_env = ''
        # We save/push in perform_lowlevel_parse, but this is used to display them ahead of time
        self.h_do_parse_parser = None
        self.vt_types = None
        self.parser_decombinator = None

    # The accessors used to store the chunk under "input_chunk" while
    # __init__ defines "current_input_chunk". Both names now refer to the
    # same value, so readers of either attribute keep working.
    @property
    def input_chunk(self):
        return self.current_input_chunk

    @input_chunk.setter
    def input_chunk(self, chunk):
        self.current_input_chunk = chunk

    def init_parser(self):
        """Create the VTTypes and ParserDecombinator helpers."""
        self.vt_types = VTTypes()
        self.parser_decombinator = ParserDecombinator(self.vt_types)

    def _memoize_parser(self, parser_addr):
        """Return the parser registered under ``parser_addr``, creating an unnamed one if needed."""
        try:
            return self.parser_objs[parser_addr]
        except KeyError:
            # Create a parser object with no name and the address of the parser
            parser_obj = Parser(None, parser_addr)
            self.parser_objs[parser_addr] = parser_obj
            return parser_obj

    # Called from h_packrat_parse()'s handler, where the parse state and arena get initialized
    def enter_h_packrat_parse(self, parser):
        """Push a new parser stack whose state and arena are not yet known; returns 0."""
        # TODO: add a parser stack or something?
        parser_stack = ParserStack(None, None)
        self.parser_stacks.append(parser_stack)
        return 0

    # TODO: arena parameter is useless
    def enter_h_do_parse(self, parse_state, arena, parser):
        """Register ``parser`` and, on the first call for a stack, record its parse state.

        Does not touch any stack when no parser stack exists.
        """
        self.h_do_parse_parser = self._memoize_parser(parser) # TODO: current_parser_env should be set here instead too
        parser_stack = self.peek_parserstack()
        if parser_stack is not None and parser_stack.parse_state is None:
            self.first_h_do_parse_after_packrat_parse(parse_state, arena)

    # Called from h_do_parse()'s handler, at which point we know the addresses of the state and arena
    def first_h_do_parse_after_packrat_parse(self, parse_state, arena):
        """Store ``parse_state`` on the current parser stack."""
        parser_stack = self.peek_parserstack()
        parser_stack.set_state(parse_state)

    # Popping the stack of stack of parsers
    def return_from_h_packrat_parse(self):
        """Pop the current parser stack, warning if it still holds parsers."""
        old_stack = self.parser_stacks.pop()
        if old_stack.depth() > 0:
            print("Warning: parser stack not empty but parse is successful?")

    # Memoize the parser object for this particular address, then push it on the stack
    # Returns the parser object we just initalized (or the one already existing)
    def enter_perform_lowlevel_parse(self, parser_addr):
        """Push the parser for ``parser_addr`` onto the current stack and return it."""
        parser_obj = self._memoize_parser(parser_addr)

        parser_stack = self.peek_parserstack()
        parser_stack.push(parser_obj)
        if self.parser_decombinator:
            p_env = self.parser_decombinator.decompose_parser(parser_obj, self)
            self.set_parser_env(type(p_env).__name__ + " - " + str(p_env)) # TODO: pass this as data structure to frontend
        return parser_obj

    def return_from_perform_lowlevel_parse(self):
        """Pop the top parser off the current parser stack."""
        self.peek_parserstack().pop()
        # debug print here

    def enter_h_arena_malloc_raw(self, alloc_size):
        """Attribute ``alloc_size`` bytes to the most specific owner available.

        That is the parser on top of the current stack if there is one,
        otherwise the current stack's unclaimed counter, otherwise this
        object's unclaimed counter.
        """
        parser_obj = self.peek_parser()
        parser_stack = self.peek_parserstack()
        # This is probably the slowest part of the code, or maybe the overhead adds up over many calls to h_arena_malloc_raw()
        if parser_obj is not None:
            # Caveat: parser_stack is assumed not to be None if we could get a parser_obj
            parser_obj.add_mem_use(parser_stack.parse_state, alloc_size)
        elif parser_stack is not None:
            # Allocation without a parser on the stack
            parser_stack.unclaimed_mem_use += alloc_size
        else:
            # Allocation without a parser stack
            self.unclaimed_mem_use += alloc_size

    def parse_virtual(self, parser_name):
        """Name the parser on top of the stack, unless it already has a name.

        Does nothing when there is no parser to name.
        """
        parser_obj = self.peek_parser()
        if parser_obj is not None and parser_obj.name is None:
            parser_obj.name_parser(parser_name)

    def peek_parserstack(self):
        """Return the current parser stack, or None when there is none."""
        return self.parser_stacks[-1] if self.parser_stacks else None

    def peek_parser(self):
        """Return the parser on top of the current stack.

        None is returned both when there is no parser stack and when the
        current stack is empty.
        """
        parser_stack = self.peek_parserstack()
        if parser_stack is None:
            return None
        return parser_stack.peek()

    def parser_by_address(self, parser_addr):
        """Return the parser registered under ``int(parser_addr)``.

        Prints a message and returns None when no such parser is known.
        """
        address = int(parser_addr)
        try:
            return self.parser_objs[address]
        except KeyError:
            print("Parser with address " + hex(address) + " not found!")
            return None

    def parsers_by_name(self, parser_name):
        """Return the list of parsers named ``parser_name``, or None if there are none."""
        results = [p for p in self.parser_objs.values() if p.name == parser_name]
        return results or None

    def set_input_chunk(self, chunk):
        self.current_input_chunk = chunk

    def get_input_chunk(self):
        return self.current_input_chunk

    def set_parser_env(self, parser_env):
        self.current_parser_env = parser_env

    def get_parser_env(self):
        return self.current_parser_env

    def add_or_get_parser(self, parser_addr):
        """Return the parser registered under ``int(parser_addr)``, creating it if needed."""
        return self._memoize_parser(int(parser_addr))

    def get_parser_top_per_arena_mem(self):
        """Return the parser with the highest get_arenamax(), or None if none are registered."""
        return max(self.parser_objs.values(), key=lambda p: p.get_arenamax(), default=None)

    def get_parser_top_total_arena_mem(self):
        """Return the parser with the highest get_arenasum(), or None if none are registered."""
        return max(self.parser_objs.values(), key=lambda p: p.get_arenasum(), default=None)

    def get_avg_mem_use_per_arena(self):
        """Return {key: average bytes per parser} over the keys of bytes_used.

        The average for a key is taken over the parsers that have an entry
        for that key, not over all registered parsers.
        """
        totals = {}
        counts = {}
        # Accumulate byte counts and counts of parsers using that arena
        for p in self.parser_objs.values():
            for arena, mem in p.bytes_used.items():
                totals[arena] = totals.get(arena, 0) + mem
                counts[arena] = counts.get(arena, 0) + 1

        return {arena: total / counts[arena] for arena, total in totals.items()}

    # TODO: get_avg_mem_use_all_arenas, get_total_mem_use

top_level_parse = TopLevelParse()