From 0600440b7c67d7c8b21f1efbf3b872fb73b612f6 Mon Sep 17 00:00:00 2001 From: Dan Hirsch <thequux@upstandinghackers.com> Date: Thu, 23 May 2013 23:26:22 +0200 Subject: [PATCH] Got a lot of regex test cases working --- Makefile | 5 ++- common.mk | 2 +- src/Makefile | 2 + src/backends/regex.c | 19 ++++++--- src/backends/regex_debug.c | 83 ++++++++++++++++++++++++++++++++++++++ src/parsers/bits.c | 4 +- src/parsers/ch.c | 18 ++++++++- src/parsers/charset.c | 22 +++++++++- src/parsers/token.c | 2 +- src/parsers/whitespace.c | 2 +- src/system_allocator.c | 3 +- 11 files changed, 148 insertions(+), 14 deletions(-) create mode 100644 src/backends/regex_debug.c diff --git a/Makefile b/Makefile index ef5be76a..dfca1177 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,10 @@ CONFIG_VARS= INCLUDE_TESTS .DEFAULT_GOAL := all -%: +nojni: all +nojni: SUBDIRS:=$(filter-out jni,$(SUBDIRS)) + +all clean: +for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done test: src/test_suite diff --git a/common.mk b/common.mk index 66e48f05..26734952 100644 --- a/common.mk +++ b/common.mk @@ -6,7 +6,7 @@ endif include $(TOPLEVEL)/config.mk TEST_CFLAGS = $(shell pkg-config --cflags glib-2.0) -DINCLUDE_TESTS -TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt +TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt -ldl CFLAGS := -std=gnu99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -g LDFLAGS := diff --git a/src/Makefile b/src/Makefile index cafafa85..7fac881a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -83,3 +83,5 @@ test: test_suite test_suite: $(TESTS) libhammer.a $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS) + +backends/regex.o: backends/regex_debug.c diff --git a/src/backends/regex.c b/src/backends/regex.c index 3cbbb2d4..e0f3f7ee 100644 --- a/src/backends/regex.c +++ b/src/backends/regex.c @@ -1,3 +1,4 @@ +#define _GNU_SOURCE #include <string.h> #include <assert.h> #include "../internal.h" @@ -13,6 +14,7 @@ typedef enum HSVMOp_ { SVM_ACTION, // Same meaning as RVM_ACTION SVM_CAPTURE, // Same meaning as RVM_CAPTURE SVM_ACCEPT, + SVM_OPCOUNT } HSVMOp; typedef struct HRVMTrace_ { @@ -42,8 +44,8 @@ HRVMTrace *invert_trace(HRVMTrace *trace) { trace->next = last; last = trace; trace = next; - } while (trace->next); - return trace; + } while (trace); + return last; } void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) { @@ -151,7 +153,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ case RVM_STEP: // save thread live_threads++; - heads_n[THREAD.ip++] = THREAD.trace; + heads_n[++THREAD.ip] = THREAD.trace; ipq_top--; goto next_insn; } @@ -221,15 +223,15 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, case SVM_CAPTURE: // Top of stack must be a mark // This replaces said mark in-place with a TT_BYTES. - assert(ctx.stack[ctx.stack_count]->token_type == TT_MARK); + assert(ctx.stack[ctx.stack_count-1]->token_type == TT_MARK); - tmp_res = ctx.stack[ctx.stack_count]; + tmp_res = ctx.stack[ctx.stack_count-1]; tmp_res->token_type = TT_BYTES; // TODO: Will need to copy if bit_offset is nonzero assert(tmp_res->bit_offset == 0); tmp_res->bytes.token = input + tmp_res->index; - tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive + tmp_res->bytes.len = cur->input_pos - tmp_res->index; break; case SVM_ACCEPT: assert(ctx.stack_count == 1); @@ -351,6 +353,7 @@ static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params h_free(prog); return 2; } + h_rvm_insert_insn(prog, RVM_ACCEPT, 0); parser->backend_data = prog; return 0; } @@ -364,3 +367,7 @@ HParserBackendVTable h__regex_backend_vtable = { .parse = h_regex_parse, .free = h_regex_free }; + +#ifndef NDEBUG +#include "regex_debug.c" +#endif diff --git a/src/backends/regex_debug.c b/src/backends/regex_debug.c new file mode 100644 index 00000000..520c5a9b --- /dev/null +++ b/src/backends/regex_debug.c @@ -0,0 +1,83 @@ +// Intended to be included from regex_debug.c +#define _GNU_SOURCE +#include <stdio.h> +#include <malloc.h> + + + +// This is some spectacularly non-portable code... but whee! +#include <dlfcn.h> +char* getsym(void* addr) { + Dl_info dli; + char* retstr; + if (dladdr(addr, &dli) != 0 && dli.dli_sname != NULL) { + if (dli.dli_saddr == addr) + return strdup(dli.dli_sname); + else + asprintf(&retstr, "%s+0x%lx", dli.dli_sname, addr - dli.dli_saddr); + } else + asprintf(&retstr, "%p", addr); + + return retstr; +} + +const char* rvm_op_names[RVM_OPCOUNT] = { + "ACCEPT", + "GOTO", + "FORK", + "PUSH", + "ACTION", + "CAPTURE", + "EOF", + "MATCH", + "STEP" +}; + +const char* svm_op_names[SVM_OPCOUNT] = { + "PUSH", + "NOP", + "ACTION", + "CAPTURE", + "ACCEPT" +}; + +void dump_rvm_prog(HRVMProg *prog) { + char* symref; + for (unsigned int i = 0; i < prog->length; i++) { + HRVMInsn *insn = &prog->insns[i]; + printf("%4d %-10s", i, rvm_op_names[insn->op]); + switch (insn->op) { + case RVM_GOTO: + case RVM_FORK: + printf("%hd\n", insn->arg); + break; + case RVM_ACTION: + symref = getsym(prog->actions[insn->arg].action); + // TODO: somehow format the argument to action + printf("%s\n", symref); + free(symref); + break; + case RVM_MATCH: { + uint8_t low, high; + low = insn->arg & 0xff; + high = (insn->arg >> 8) & 0xff; + if (high > low) + printf("NONE\n"); + else { + if (low >= 0x32 && low <= 0x7e) + printf("%02hhx ('%c')", low, low); + else + printf("%02hhx", low); + + if (high >= 0x32 && high <= 0x7e) + printf(" - %02hhx ('%c')\n", high, high); + else + printf(" - %02hhx\n", high); + } + break; + } + default: + printf("\n"); + } + } +} diff --git a/src/parsers/bits.c b/src/parsers/bits.c index 408ea29b..0da8bc1e 100644 --- a/src/parsers/bits.c +++ b/src/parsers/bits.c @@ -97,6 +97,8 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) { uint64_t res = 0; for (size_t i = 0; i < top->bytes.len; i++) res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + uint64_t msb = (env_->signedp ? 1LL:0) << (top->bytes.len * 8 - 1); + res = (res ^ msb) - msb; top->uint = res; // possibly cast to signed through union top->token_type = (env_->signedp ? TT_SINT : TT_UINT); return true; @@ -105,7 +107,7 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) { static bool bits_ctrvm(HRVMProg *prog, void* env) { struct bits_env *env_ = (struct bits_env*)env; h_rvm_insert_insn(prog, RVM_PUSH, 0); - for (size_t i=0; (i < env_->length)/8; ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different + for (size_t i=0; i < (env_->length/8); ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different h_rvm_insert_insn(prog, RVM_MATCH, 0xFF00); h_rvm_insert_insn(prog, RVM_STEP, 0); } diff --git a/src/parsers/ch.c b/src/parsers/ch.c index 9621869c..0de61e49 100644 --- a/src/parsers/ch.c +++ b/src/parsers/ch.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" static HParseResult* parse_ch(void* env, HParseState *state) { @@ -20,11 +21,26 @@ static HCFChoice* desugar_ch(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) { + // BUG: relies un undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit + HParsedToken *top = ctx->stack[ctx->stack_count-1]; + assert(top->token_type == TT_BYTES); + uint64_t res = 0; + for (size_t i = 0; i < top->bytes.len; i++) + res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + top->uint = res; // possibly cast to signed through union + top->token_type = TT_UINT; + return true; +} + static bool ch_ctrvm(HRVMProg *prog, void* env) { uint8_t c = (uint8_t)(unsigned long)(env); // TODO: Does this capture anything? - h_rvm_insert_insn(prog, RVM_MATCH, c & c << 8); + h_rvm_insert_insn(prog, RVM_PUSH, 0); + h_rvm_insert_insn(prog, RVM_MATCH, c | c << 8); h_rvm_insert_insn(prog, RVM_STEP, 0); + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env)); return true; } diff --git a/src/parsers/charset.c b/src/parsers/charset.c index 1b06cce6..8984b576 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -1,3 +1,4 @@ +#include <assert.h> #include <string.h> #include "../internal.h" #include "parser_internal.h" @@ -22,23 +23,42 @@ static HCFChoice* desugar_charset(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) { + // BUG: relies un undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit + HParsedToken *top = ctx->stack[ctx->stack_count-1]; + assert(top->token_type == TT_BYTES); + uint64_t res = 0; + for (size_t i = 0; i < top->bytes.len; i++) + res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + top->uint = res; // possibly cast to signed through union + top->token_type = TT_UINT; + return true; +} + // FUTURE: this is horribly inefficient static bool cs_ctrvm(HRVMProg *prog, void *env) { HCharset cs = (HCharset)env; + h_rvm_insert_insn(prog, RVM_PUSH, 0); + uint16_t start = h_rvm_get_ip(prog); for (size_t i=0; i<256; ++i) { + // TODO: merge ranges. if (charset_isset(cs, i)) { uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); - h_rvm_insert_insn(prog, RVM_MATCH, i & i << 8); + h_rvm_insert_insn(prog, RVM_MATCH, i | i << 8); h_rvm_insert_insn(prog, RVM_GOTO, 0); h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); } } + h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF); uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); for (size_t i=start; i<jump; ++i) { if (RVM_GOTO == prog->insns[i].op) h_rvm_patch_arg(prog, i, jump); } + + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env)); return true; } diff --git a/src/parsers/token.c b/src/parsers/token.c index defcdf75..9b190426 100644 --- a/src/parsers/token.c +++ b/src/parsers/token.c @@ -69,7 +69,7 @@ static bool token_ctrvm(HRVMProg *prog, void *env) { HToken *t = (HToken*)env; h_rvm_insert_insn(prog, RVM_PUSH, 0); for (int i=0; i<t->len; ++i) { - h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] & t->str[i] << 8); + h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] | t->str[i] << 8); h_rvm_insert_insn(prog, RVM_STEP, 0); } h_rvm_insert_insn(prog, RVM_CAPTURE, 0); diff --git a/src/parsers/whitespace.c b/src/parsers/whitespace.c index 89ce23b0..eb89446f 100644 --- a/src/parsers/whitespace.c +++ b/src/parsers/whitespace.c @@ -73,7 +73,7 @@ static bool ws_ctrvm(HRVMProg *prog, void *env) { h_rvm_insert_insn(prog, RVM_GOTO, start); h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog)); } - return h_compile_regex(prog, p->env); + return h_compile_regex(prog, p); } static const HParserVtable whitespace_vt = { diff --git a/src/system_allocator.c b/src/system_allocator.c index 80d7acf2..5f3e4844 100644 --- a/src/system_allocator.c +++ b/src/system_allocator.c @@ -21,7 +21,8 @@ static void* system_realloc(HAllocator *allocator, void* ptr, size_t size) { } static void system_free(HAllocator *allocator, void* ptr) { - free(ptr - sizeof(size_t)); + if (ptr != NULL) + free(ptr - sizeof(size_t)); } HAllocator system_allocator = { -- GitLab