diff --git a/Makefile b/Makefile index ef5be76a59520fdc44348e426bca421708a02262..dfca1177fa95ac6896934ed6a83dd84ca4176063 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,10 @@ CONFIG_VARS= INCLUDE_TESTS .DEFAULT_GOAL := all -%: +nojni: all +nojni: SUBDIRS:=$(filter-out jni,$(SUBDIRS)) + +all clean: +for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done test: src/test_suite diff --git a/common.mk b/common.mk index 66e48f05a1a5e3068962715aef0d1b5099e81f47..26734952f3cde74e9b9b42c9c1330653ad4a3ef1 100644 --- a/common.mk +++ b/common.mk @@ -6,7 +6,7 @@ endif include $(TOPLEVEL)/config.mk TEST_CFLAGS = $(shell pkg-config --cflags glib-2.0) -DINCLUDE_TESTS -TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt +TEST_LDFLAGS = $(shell pkg-config --libs glib-2.0) -lrt -ldl CFLAGS := -std=gnu99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-attributes -g LDFLAGS := diff --git a/examples/base64.c b/examples/base64.c index ee142e3cbfee5fa7bf2032d4f07882581916d4f7..a02297397a4942d1e2e1e003233465aa1e19a6c8 100644 --- a/examples/base64.c +++ b/examples/base64.c @@ -15,21 +15,21 @@ const HParser* document = NULL; void init_parser(void) { // CORE - const HParser *digit = h_ch_range(0x30, 0x39); - const HParser *alpha = h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL); + HParser *digit = h_ch_range(0x30, 0x39); + HParser *alpha = h_choice(h_ch_range(0x41, 0x5a), h_ch_range(0x61, 0x7a), NULL); // AUX. 
- const HParser *plus = h_ch('+'); - const HParser *slash = h_ch('/'); - const HParser *equals = h_ch('='); - - const HParser *bsfdig = h_choice(alpha, digit, plus, slash, NULL); - const HParser *bsfdig_4bit = h_in((uint8_t *)"AEIMQUYcgkosw048", 16); - const HParser *bsfdig_2bit = h_in((uint8_t *)"AQgw", 4); - const HParser *base64_3 = h_repeat_n(bsfdig, 4); - const HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL); - const HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL); - const HParser *base64 = h_sequence(h_many(base64_3), + HParser *plus = h_ch('+'); + HParser *slash = h_ch('/'); + HParser *equals = h_ch('='); + + HParser *bsfdig = h_choice(alpha, digit, plus, slash, NULL); + HParser *bsfdig_4bit = h_in((uint8_t *)"AEIMQUYcgkosw048", 16); + HParser *bsfdig_2bit = h_in((uint8_t *)"AQgw", 4); + HParser *base64_3 = h_repeat_n(bsfdig, 4); + HParser *base64_2 = h_sequence(bsfdig, bsfdig, bsfdig_4bit, equals, NULL); + HParser *base64_1 = h_sequence(bsfdig, bsfdig_2bit, equals, equals, NULL); + HParser *base64 = h_sequence(h_many(base64_3), h_optional(h_choice(base64_2, base64_1, NULL)), NULL); diff --git a/examples/base64_sem1.c b/examples/base64_sem1.c index f2a3e82b3ef2ce60befd3ccc8d5570937ecd7166..0a08e50082fd637b9c81bc751bb4a6620f21949b 100644 --- a/examples/base64_sem1.c +++ b/examples/base64_sem1.c @@ -22,7 +22,7 @@ // They must be named act_<rulename>. /// -const HParsedToken *act_bsfdig(const HParseResult *p) +HParsedToken *act_bsfdig(const HParseResult *p) { HParsedToken *res = H_MAKE_UINT(0); @@ -53,7 +53,7 @@ H_ACT_APPLY(act_index0, h_act_index, 0); #define act_document act_index0 // General-form action to turn a block of base64 digits into bytes. 
-const HParsedToken *act_base64_n(int n, const HParseResult *p) +HParsedToken *act_base64_n(int n, const HParseResult *p) { HParsedToken *res = H_MAKE_SEQN(n); @@ -82,7 +82,7 @@ H_ACT_APPLY(act_base64_3, act_base64_n, 3); H_ACT_APPLY(act_base64_2, act_base64_n, 2); H_ACT_APPLY(act_base64_1, act_base64_n, 1); -const HParsedToken *act_base64(const HParseResult *p) +HParsedToken *act_base64(const HParseResult *p) { assert(p->ast->token_type == TT_SEQUENCE); assert(p->ast->seq->used == 2); @@ -96,7 +96,7 @@ const HParsedToken *act_base64(const HParseResult *p) h_seq_append(res, seq->elements[i]); // append one trailing base64_2 or _1 block - const HParsedToken *tok = h_seq_index(p->ast, 1); + HParsedToken *tok = h_seq_index(p->ast, 1); if(tok->token_type == TT_SEQUENCE) h_seq_append(res, tok); @@ -108,7 +108,7 @@ const HParsedToken *act_base64(const HParseResult *p) // Set up the parser with the grammar to be recognized. /// -const HParser *init_parser(void) +HParser *init_parser(void) { // CORE H_RULE (digit, h_ch_range(0x30, 0x39)); diff --git a/examples/base64_sem2.c b/examples/base64_sem2.c index 32afe5bbc1ab74077f08311c1c9d47405060e3f7..c1549cf5371affd95aaabec667cc7d5018e91f1e 100644 --- a/examples/base64_sem2.c +++ b/examples/base64_sem2.c @@ -48,7 +48,7 @@ uint8_t bsfdig_value(const HParsedToken *p) // helper: append a byte value to a sequence #define seq_append_byte(res, b) h_seq_snoc(res, H_MAKE_UINT(b)) -const HParsedToken *act_base64(const HParseResult *p) +HParsedToken *act_base64(const HParseResult *p) { assert(p->ast->token_type == TT_SEQUENCE); assert(p->ast->seq->used == 2); diff --git a/examples/dns.c b/examples/dns.c index 7887ba6a7881d41c21836effa5f3ebaffc986d56..3f730b970cd6d59677b7d8a38dcec8ea7b10ef48 100644 --- a/examples/dns.c +++ b/examples/dns.c @@ -86,7 +86,7 @@ void set_rdata(struct dns_rr *rr, HCountedArray *rdata) { } } -const HParsedToken* act_header(const HParseResult *p) { +HParsedToken* act_header(const HParseResult *p) { HParsedToken 
**fields = h_seq_elements(p->ast); dns_header_t header_ = { .id = H_CAST_UINT(fields[0]), @@ -109,7 +109,7 @@ const HParsedToken* act_header(const HParseResult *p) { return H_MAKE(dns_header_t, header); } -const HParsedToken* act_label(const HParseResult *p) { +HParsedToken* act_label(const HParseResult *p) { dns_label_t *r = H_ALLOC(dns_label_t); r->len = h_seq_len(p->ast); @@ -121,7 +121,7 @@ const HParsedToken* act_label(const HParseResult *p) { return H_MAKE(dns_label_t, r); } -const HParsedToken* act_rr(const HParseResult *p) { +HParsedToken* act_rr(const HParseResult *p) { dns_rr_t *rr = H_ALLOC(dns_rr_t); rr->name = *H_FIELD(dns_domain_t, 0); @@ -136,7 +136,7 @@ const HParsedToken* act_rr(const HParseResult *p) { return H_MAKE(dns_rr_t, rr); } -const HParsedToken* act_question(const HParseResult *p) { +HParsedToken* act_question(const HParseResult *p) { dns_question_t *q = H_ALLOC(dns_question_t); HParsedToken **fields = h_seq_elements(p->ast); @@ -153,7 +153,7 @@ const HParsedToken* act_question(const HParseResult *p) { return H_MAKE(dns_question_t, q); } -const HParsedToken* act_message(const HParseResult *p) { +HParsedToken* act_message(const HParseResult *p) { h_pprint(stdout, p->ast, 0, 2); dns_message_t *msg = H_ALLOC(dns_message_t); diff --git a/examples/dns_common.c b/examples/dns_common.c index 76915b66e8030b26cbf23462ccaad2d944949d05..01dd8f0fb4630680174f8fff657c70f845775624 100644 --- a/examples/dns_common.c +++ b/examples/dns_common.c @@ -18,8 +18,8 @@ bool validate_label(HParseResult *p) { #define act_label h_act_flatten -const HParsedToken* act_domain(const HParseResult *p) { - const HParsedToken *ret = NULL; +HParsedToken* act_domain(const HParseResult *p) { + HParsedToken *ret = NULL; char *arr = NULL; switch(p->ast->token_type) { @@ -56,8 +56,8 @@ const HParsedToken* act_domain(const HParseResult *p) { return ret; } -const HParser* init_domain() { - static const HParser *ret = NULL; +HParser* init_domain() { + static HParser *ret = NULL; if 
(ret) return ret; @@ -76,8 +76,8 @@ const HParser* init_domain() { return ret; } -const HParser* init_character_string() { - static const HParser *cstr = NULL; +HParser* init_character_string() { + static HParser *cstr = NULL; if (cstr) return cstr; diff --git a/examples/dns_common.h b/examples/dns_common.h index c1d8d7e9d66f98c666f08c95ff5d34fa93f874c3..8af014b2f22c28da36e3312b56e355e1d5500e73 100644 --- a/examples/dns_common.h +++ b/examples/dns_common.h @@ -4,9 +4,9 @@ #include "../src/hammer.h" #include "../src/glue.h" -const HParser* init_domain(); -const HParser* init_character_string(); +HParser* init_domain(); +HParser* init_character_string(); -const HParsedToken* act_index0(const HParseResult *p); +HParsedToken* act_index0(const HParseResult *p); #endif diff --git a/examples/rr.c b/examples/rr.c index 8c14e0aec8e678f86dfdbc54dc0499dd3a828d8e..2ba85341d0f444924f9801656eeb8fa94728ac3e 100644 --- a/examples/rr.c +++ b/examples/rr.c @@ -17,7 +17,7 @@ bool validate_null(HParseResult *p) { return (65536 > p->ast->seq->used); } -const HParsedToken *act_null(const HParseResult *p) { +HParsedToken *act_null(const HParseResult *p) { dns_rr_null_t *null = H_ALLOC(dns_rr_null_t); size_t len = h_seq_len(p->ast); @@ -28,7 +28,7 @@ const HParsedToken *act_null(const HParseResult *p) { return H_MAKE(dns_rr_null_t, null); } -const HParsedToken *act_txt(const HParseResult *p) { +HParsedToken *act_txt(const HParseResult *p) { dns_rr_txt_t *txt = H_ALLOC(dns_rr_txt_t); const HCountedArray *arr = H_CAST_SEQ(p->ast); @@ -47,7 +47,7 @@ const HParsedToken *act_txt(const HParseResult *p) { return H_MAKE(dns_rr_txt_t, txt); } -const HParsedToken* act_cstr(const HParseResult *p) { +HParsedToken* act_cstr(const HParseResult *p) { dns_cstr_t *cs = H_ALLOC(dns_cstr_t); const HCountedArray *arr = H_CAST_SEQ(p->ast); @@ -60,7 +60,7 @@ const HParsedToken* act_cstr(const HParseResult *p) { return H_MAKE(dns_cstr_t, cs); } -const HParsedToken* act_soa(const HParseResult *p) { 
+HParsedToken* act_soa(const HParseResult *p) { dns_rr_soa_t *soa = H_ALLOC(dns_rr_soa_t); soa->mname = *H_FIELD(dns_domain_t, 0); @@ -74,7 +74,7 @@ const HParsedToken* act_soa(const HParseResult *p) { return H_MAKE(dns_rr_soa_t, soa); } -const HParsedToken* act_wks(const HParseResult *p) { +HParsedToken* act_wks(const HParseResult *p) { dns_rr_wks_t *wks = H_ALLOC(dns_rr_wks_t); wks->address = H_FIELD_UINT(0); @@ -87,7 +87,7 @@ const HParsedToken* act_wks(const HParseResult *p) { return H_MAKE(dns_rr_wks_t, wks); } -const HParsedToken* act_hinfo(const HParseResult *p) { +HParsedToken* act_hinfo(const HParseResult *p) { dns_rr_hinfo_t *hinfo = H_ALLOC(dns_rr_hinfo_t); hinfo->cpu = *H_FIELD(dns_cstr_t, 0); @@ -96,7 +96,7 @@ const HParsedToken* act_hinfo(const HParseResult *p) { return H_MAKE(dns_rr_hinfo_t, hinfo); } -const HParsedToken* act_minfo(const HParseResult *p) { +HParsedToken* act_minfo(const HParseResult *p) { dns_rr_minfo_t *minfo = H_ALLOC(dns_rr_minfo_t); minfo->rmailbx = *H_FIELD(dns_domain_t, 0); @@ -105,7 +105,7 @@ const HParsedToken* act_minfo(const HParseResult *p) { return H_MAKE(dns_rr_minfo_t, minfo); } -const HParsedToken* act_mx(const HParseResult *p) { +HParsedToken* act_mx(const HParseResult *p) { dns_rr_mx_t *mx = H_ALLOC(dns_rr_mx_t); mx->preference = H_FIELD_UINT(0); @@ -120,8 +120,8 @@ const HParsedToken* act_mx(const HParseResult *p) { /// #define RDATA_TYPE_MAX 16 -const HParser* init_rdata(uint16_t type) { - static const HParser *parsers[RDATA_TYPE_MAX+1]; +HParser* init_rdata(uint16_t type) { + static HParser *parsers[RDATA_TYPE_MAX+1]; static int inited = 0; if (type >= sizeof(parsers)) diff --git a/examples/rr.h b/examples/rr.h index fce457817c7802fbd0cb77b688c99f9244bda86a..bbc1d0331fdcf6f9bcba3d5f534daca9e22ab5e0 100644 --- a/examples/rr.h +++ b/examples/rr.h @@ -3,6 +3,6 @@ #include "../src/hammer.h" -const HParser* init_rdata(uint16_t type); +HParser* init_rdata(uint16_t type); #endif diff --git a/src/Makefile b/src/Makefile 
index cafafa85043956606d4a1083caa161aaeccb7d11..7fac881ae0882fb0d436005d65dd63125914af4c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -83,3 +83,5 @@ test: test_suite test_suite: $(TESTS) libhammer.a $(call hush, "Linking $@") $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS) + +backends/regex.o: backends/regex_debug.c diff --git a/src/backends/regex.c b/src/backends/regex.c index 3cbbb2d4721afafca68ba56aa2bb98bcebbffc9f..4389bc9172d99a175621fc6488326d3dfc38d07e 100644 --- a/src/backends/regex.c +++ b/src/backends/regex.c @@ -1,3 +1,4 @@ +#define _GNU_SOURCE #include <string.h> #include <assert.h> #include "../internal.h" @@ -13,6 +14,7 @@ typedef enum HSVMOp_ { SVM_ACTION, // Same meaning as RVM_ACTION SVM_CAPTURE, // Same meaning as RVM_CAPTURE SVM_ACCEPT, + SVM_OPCOUNT } HSVMOp; typedef struct HRVMTrace_ { @@ -42,8 +44,8 @@ HRVMTrace *invert_trace(HRVMTrace *trace) { trace->next = last; last = trace; trace = next; - } while (trace->next); - return trace; + } while (trace); + return last; } void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_t len) { @@ -51,7 +53,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ HRVMTrace **heads_p = a_new(HRVMTrace*, prog->length), **heads_n = a_new(HRVMTrace*, prog->length); - HRVMTrace *ret_trace; + HRVMTrace *ret_trace = NULL; uint8_t *insn_seen = a_new(uint8_t, prog->length); // 0 -> not seen, 1->processed, 2->queued HRVMThread *ip_queue = a_new(HRVMThread, prog->length); @@ -59,6 +61,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ + #define THREAD ip_queue[ipq_top-1] #define PUSH_SVM(op_, arg_) do { \ @@ -96,22 +99,23 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ if (!heads_p[ip_s]) continue; THREAD.ip = ip_s; - + THREAD.trace = heads_p[ip_s]; uint8_t hi, lo; uint16_t arg; while(ipq_top > 0) { - if (insn_seen[THREAD.ip] == 1) + if (insn_seen[THREAD.ip] == 1) { + ipq_top--; // Kill thread. 
continue; + } insn_seen[THREAD.ip] = 1; arg = prog->insns[THREAD.ip].arg; switch(prog->insns[THREAD.ip].op) { case RVM_ACCEPT: PUSH_SVM(SVM_ACCEPT, 0); ret_trace = THREAD.trace; - goto run_trace; + ipq_top--; + goto next_insn; case RVM_MATCH: - // Doesn't actually validate the "must be followed by MATCH - // or STEP. It should. Preproc perhaps? hi = (arg >> 8) & 0xff; lo = arg & 0xff; THREAD.ip++; @@ -151,7 +155,7 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ case RVM_STEP: // save thread live_threads++; - heads_n[THREAD.ip++] = THREAD.trace; + heads_n[++THREAD.ip] = THREAD.trace; ipq_top--; goto next_insn; } @@ -163,13 +167,14 @@ void* h_rvm_run__m(HAllocator *mm__, HRVMProg *prog, const uint8_t* input, size_ } // No accept was reached. match_fail: - h_delete_arena(arena); - return NULL; + if (ret_trace == NULL) { + // No match found; definite failure. + h_delete_arena(arena); + return NULL; + } - run_trace: // Invert the direction of the trace linked list. - ret_trace = invert_trace(ret_trace); HParseResult *ret = run_trace(mm__, prog, ret_trace, input, len); // ret is in its own arena @@ -214,33 +219,38 @@ HParseResult *run_trace(HAllocator *mm__, HRVMProg *orig_prog, HRVMTrace *trace, case SVM_ACTION: // Action should modify stack appropriately if (!orig_prog->actions[cur->arg].action(arena, &ctx, orig_prog->actions[cur->arg].env)) { + // action failed... abort somehow - // TODO: Actually abort + goto fail; } break; case SVM_CAPTURE: // Top of stack must be a mark // This replaces said mark in-place with a TT_BYTES. 
- assert(ctx.stack[ctx.stack_count]->token_type == TT_MARK); + assert(ctx.stack[ctx.stack_count-1]->token_type == TT_MARK); - tmp_res = ctx.stack[ctx.stack_count]; + tmp_res = ctx.stack[ctx.stack_count-1]; tmp_res->token_type = TT_BYTES; // TODO: Will need to copy if bit_offset is nonzero assert(tmp_res->bit_offset == 0); tmp_res->bytes.token = input + tmp_res->index; - tmp_res->bytes.len = cur->input_pos - tmp_res->index + 1; // inclusive + tmp_res->bytes.len = cur->input_pos - tmp_res->index; break; case SVM_ACCEPT: - assert(ctx.stack_count == 1); - HParseResult *res = a_new(HParseResult, 1); - res->ast = ctx.stack[0]; + assert(ctx.stack_count <= 1); + HParseResult *res = a_new(HParseResult, 1); + if (ctx.stack_count == 1) { + res->ast = ctx.stack[0]; + } else { + res->ast = NULL; + } res->bit_length = cur->input_pos * 8; res->arena = arena; return res; } } - + fail: h_delete_arena(arena); return NULL; } @@ -291,7 +301,7 @@ void h_rvm_patch_arg(HRVMProg *prog, uint16_t ip, uint16_t new_val) { size_t h_svm_count_to_mark(HSVMContext *ctx) { size_t ctm; - for (ctm = 0; ctm < ctx->stack_count-1; ctm++) { + for (ctm = 0; ctm < ctx->stack_count; ctm++) { if (ctx->stack[ctx->stack_count - 1 - ctm]->token_type == TT_MARK) return ctm; } @@ -305,20 +315,20 @@ bool h_svm_action_make_sequence(HArena *arena, HSVMContext *ctx, void* env) { HParsedToken *res = ctx->stack[ctx->stack_count - 1 - n_items]; assert (res->token_type == TT_MARK); res->token_type = TT_SEQUENCE; - + HCountedArray *ret_carray = h_carray_new_sized(arena, n_items); res->seq = ret_carray; // res index and bit offset are the same as the mark. 
for (size_t i = 0; i < n_items; i++) { ret_carray->elements[i] = ctx->stack[ctx->stack_count - n_items + i]; } + ret_carray->used = n_items; ctx->stack_count -= n_items; return true; } bool h_svm_action_clear_to_mark(HArena *arena, HSVMContext *ctx, void* env) { - while (ctx->stack_count > 0) { - if (ctx->stack[--ctx->stack_count]->token_type == TT_MARK) + while (ctx->stack_count > 0) { if (ctx->stack[--ctx->stack_count]->token_type == TT_MARK) return true; } return false; // no mark found. @@ -351,6 +361,7 @@ static int h_regex_compile(HAllocator *mm__, HParser* parser, const void* params h_free(prog); return 2; } + h_rvm_insert_insn(prog, RVM_ACCEPT, 0); parser->backend_data = prog; return 0; } @@ -364,3 +375,7 @@ HParserBackendVTable h__regex_backend_vtable = { .parse = h_regex_parse, .free = h_regex_free }; + +#ifndef NDEBUG +#include "regex_debug.c" +#endif diff --git a/src/backends/regex_debug.c b/src/backends/regex_debug.c new file mode 100644 index 0000000000000000000000000000000000000000..8b2ea31163504e59b89c365a26420c1ff48e4a7a --- /dev/null +++ b/src/backends/regex_debug.c @@ -0,0 +1,103 @@ +// Intended to be included from regex.c +#define _GNU_SOURCE +#include <stdio.h> +#include <malloc.h> + + + +// This is some spectacularly non-portable code... but whee! +#include <dlfcn.h> +char* getsym(void* addr) { + char* retstr; +#if 0 + // This will be fixed later. 
+ Dl_info dli; + if (dladdr(addr, &dli) != 0 && dli.dli_sname != NULL) { + if (dli.dli_saddr == addr) + return strdup(dli.dli_sname); + else + asprintf(&retstr, "%s+0x%lx", dli.dli_sname, addr - dli.dli_saddr); + } else +#endif + asprintf(&retstr, "%p", addr); + + return retstr; +} + +const char* rvm_op_names[RVM_OPCOUNT] = { + "ACCEPT", + "GOTO", + "FORK", + "PUSH", + "ACTION", + "CAPTURE", + "EOF", + "MATCH", + "STEP" +}; + +const char* svm_op_names[SVM_OPCOUNT] = { + "PUSH", + "NOP", + "ACTION", + "CAPTURE", + "ACCEPT" +}; + +void dump_rvm_prog(HRVMProg *prog) { + char* symref; + for (unsigned int i = 0; i < prog->length; i++) { + HRVMInsn *insn = &prog->insns[i]; + printf("%4d %-10s", i, rvm_op_names[insn->op]); + switch (insn->op) { + case RVM_GOTO: + case RVM_FORK: + printf("%hd\n", insn->arg); + break; + case RVM_ACTION: + symref = getsym(prog->actions[insn->arg].action); + // TODO: somehow format the argument to action + printf("%s\n", symref); + free(symref); + break; + case RVM_MATCH: { + uint8_t low, high; + low = insn->arg & 0xff; + high = (insn->arg >> 8) & 0xff; + if (high < low) + printf("NONE\n"); + else { + if (low >= 0x20 && low <= 0x7e) + printf("%02hhx ('%c')", low, low); + else + printf("%02hhx", low); + + if (high >= 0x20 && high <= 0x7e) + printf(" - %02hhx ('%c')\n", high, high); + else + printf(" - %02hhx\n", high); + } + break; + } + default: + printf("\n"); + } + } +} + +void dump_svm_prog(HRVMProg *prog, HRVMTrace *trace) { + char* symref; + for (; trace != NULL; trace = trace->next) { + printf("@%04zd %-10s", trace->input_pos, svm_op_names[trace->opcode]); + switch (trace->opcode) { + case SVM_ACTION: + symref = getsym(prog->actions[trace->arg].action); + // TODO: somehow format the argument to action + printf("%s\n", symref); + free(symref); + break; + default: + printf("\n"); + } + } +} diff --git a/src/datastructures.c b/src/datastructures.c index 0581591f10d47a8dea16557706652f064f3f9b4d..a12707ef9758db93ad79dc052533b40cdd4edcbb 
100644 --- a/src/datastructures.c +++ b/src/datastructures.c @@ -9,7 +9,8 @@ HCountedArray *h_carray_new_sized(HArena * arena, size_t size) { HCountedArray *ret = h_arena_malloc(arena, sizeof(HCountedArray)); - assert(size > 0); + if (size == 0) + size = 1; ret->used = 0; ret->capacity = size; ret->arena = arena; diff --git a/src/glue.c b/src/glue.c index 2cbfde6c2cf5b435c4f6b6a7f3639141b349e988..48bd222e59aa51dd19174b7153627e4d4ccc31f1 100644 --- a/src/glue.c +++ b/src/glue.c @@ -5,7 +5,7 @@ #include "parsers/parser_internal.h" // Helper to build HAction's that pick one index out of a sequence. -const HParsedToken *h_act_index(int i, const HParseResult *p) +HParsedToken *h_act_index(int i, const HParseResult *p) { if(!p) return NULL; @@ -23,7 +23,7 @@ const HParsedToken *h_act_index(int i, const HParseResult *p) return tok->seq->elements[i]; } -const HParsedToken *h_act_first(const HParseResult *p) { +HParsedToken *h_act_first(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); assert(p->ast->seq->used > 0); @@ -31,7 +31,7 @@ const HParsedToken *h_act_first(const HParseResult *p) { return p->ast->seq->elements[0]; } -const HParsedToken *h_act_second(const HParseResult *p) { +HParsedToken *h_act_second(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); assert(p->ast->seq->used > 0); @@ -39,7 +39,7 @@ const HParsedToken *h_act_second(const HParseResult *p) { return p->ast->seq->elements[1]; } -const HParsedToken *h_act_last(const HParseResult *p) { +HParsedToken *h_act_last(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); assert(p->ast->seq->used > 0); @@ -59,7 +59,7 @@ static void act_flatten_(HCountedArray *seq, const HParsedToken *tok) { } } -const HParsedToken *h_act_flatten(const HParseResult *p) { +HParsedToken *h_act_flatten(const HParseResult *p) { HCountedArray *seq = h_carray_new(p->arena); act_flatten_(seq, p->ast); @@ -72,7 +72,7 @@ const HParsedToken 
*h_act_flatten(const HParseResult *p) { return res; } -const HParsedToken *h_act_ignore(const HParseResult *p) { +HParsedToken *h_act_ignore(const HParseResult *p) { return NULL; } diff --git a/src/glue.h b/src/glue.h index ece7e9ea053f95a2d5255e966f97ac60fdf8c2ba..1880988910e926c1c216dcd24cbb99022ddf7866 100644 --- a/src/glue.h +++ b/src/glue.h @@ -55,13 +55,13 @@ // -#define H_RULE(rule, def) const HParser *rule = def -#define H_ARULE(rule, def) const HParser *rule = h_action(def, act_ ## rule) -#define H_VRULE(rule, def) const HParser *rule = \ +#define H_RULE(rule, def) HParser *rule = def +#define H_ARULE(rule, def) HParser *rule = h_action(def, act_ ## rule) +#define H_VRULE(rule, def) HParser *rule = \ h_attr_bool(def, validate_ ## rule) -#define H_VARULE(rule, def) const HParser *rule = \ +#define H_VARULE(rule, def) HParser *rule = \ h_attr_bool(h_action(def, act_ ## rule), validate_ ## rule) -#define H_AVRULE(rule, def) const HParser *rule = \ +#define H_AVRULE(rule, def) HParser *rule = \ h_action(h_attr_bool(def, validate_ ## rule), act_ ## rule) @@ -88,17 +88,17 @@ // action such as h_act_index. // -const HParsedToken *h_act_index(int i, const HParseResult *p); -const HParsedToken *h_act_first(const HParseResult *p); -const HParsedToken *h_act_second(const HParseResult *p); -const HParsedToken *h_act_last(const HParseResult *p); -const HParsedToken *h_act_flatten(const HParseResult *p); -const HParsedToken *h_act_ignore(const HParseResult *p); +HParsedToken *h_act_index(int i, const HParseResult *p); +HParsedToken *h_act_first(const HParseResult *p); +HParsedToken *h_act_second(const HParseResult *p); +HParsedToken *h_act_last(const HParseResult *p); +HParsedToken *h_act_flatten(const HParseResult *p); +HParsedToken *h_act_ignore(const HParseResult *p); // Define 'myaction' as a specialization of 'paction' by supplying the leading // parameters. #define H_ACT_APPLY(myaction, paction, ...) 
\ - const HParsedToken *myaction(const HParseResult *p) { \ + HParsedToken *myaction(const HParseResult *p) { \ return paction(__VA_ARGS__, p); \ } diff --git a/src/hammer.h b/src/hammer.h index ebbec52a8fcc4b439d435fb7b79599ccddfc0025..455684cc92edbfbf9b9352625e373ca408f61261 100644 --- a/src/hammer.h +++ b/src/hammer.h @@ -111,7 +111,7 @@ typedef struct HBitWriter_ HBitWriter; * say, structs) and stuff values for them into the void* in the * tagged union in HParsedToken. */ -typedef const HParsedToken* (*HAction)(const HParseResult *p); +typedef HParsedToken* (*HAction)(const HParseResult *p); /** * Type of a boolean attribute-checking function, used in the @@ -370,7 +370,7 @@ HAMMER_FN_DECL_NOARG(HParser*, h_nothing_p); * * Result token type: TT_SEQUENCE */ -HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_sequence, const HParser* p); +HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_sequence, HParser* p); /** * Given an array of parsers, p_array, apply each parser in order. The @@ -379,7 +379,7 @@ HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_sequence, con * * Result token type: The type of the first successful parser's result. 
*/ -HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_choice, const HParser* p); +HAMMER_FN_DECL_VARARGS_ATTR(__attribute__((sentinel)), HParser*, h_choice, HParser* p); /** * Given two parsers, p1 and p2, this parser succeeds in the following @@ -605,11 +605,11 @@ void h_bit_writer_free(HBitWriter* w); // General-purpose actions for use with h_action // XXX to be consolidated with glue.h when merged upstream -const HParsedToken *h_act_first(const HParseResult *p); -const HParsedToken *h_act_second(const HParseResult *p); -const HParsedToken *h_act_last(const HParseResult *p); -const HParsedToken *h_act_flatten(const HParseResult *p); -const HParsedToken *h_act_ignore(const HParseResult *p); +HParsedToken *h_act_first(const HParseResult *p); +HParsedToken *h_act_second(const HParseResult *p); +HParsedToken *h_act_last(const HParseResult *p); +HParsedToken *h_act_flatten(const HParseResult *p); +HParsedToken *h_act_ignore(const HParseResult *p); // {{{ Benchmark functions HAMMER_FN_DECL(HBenchmarkResults *, h_benchmark, HParser* parser, HParserTestcase* testcases); diff --git a/src/parsers/action.c b/src/parsers/action.c index 12ec036f144874ea944e647444a976e76a81b764..b00426a73646bf9ecb637777c62adafd49e231de 100644 --- a/src/parsers/action.c +++ b/src/parsers/action.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" typedef struct { @@ -45,9 +46,35 @@ static bool action_isValidCF(void *env) { return a->p->vtable->isValidCF(a->p->env); } +static bool h_svm_action_action(HArena *arena, HSVMContext *ctx, void* arg) { + HParseResult res; + HAction action = arg; + assert(ctx->stack_count >= 1); + if (ctx->stack[ctx->stack_count-1]->token_type != TT_MARK) { + assert(ctx->stack_count >= 2 && ctx->stack[ctx->stack_count-2]->token_type == TT_MARK); + res.ast = ctx->stack[ctx->stack_count-2] = ctx->stack[ctx->stack_count-1]; + ctx->stack_count--; + // mark replaced. 
+ } else { + res.ast = NULL; + } + res.arena = arena; + + HParsedToken *tok = action(&res); + if (tok != NULL) + ctx->stack[ctx->stack_count-1] = tok; + else + ctx->stack_count--; + return true; // action can't fail +} + static bool action_ctrvm(HRVMProg *prog, void* env) { HParseAction *a = (HParseAction*)env; - return a->p->vtable->compile_to_rvm(prog, a->p->env); + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, a->p)) + return false; + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_action, a->action)); + return true; } static const HParserVtable action_vt = { diff --git a/src/parsers/attr_bool.c b/src/parsers/attr_bool.c index 635806624c25fcff68bf2f827c5908060f7a3a1e..a05caa1f2471aab259e895df961c8402e78b4066 100644 --- a/src/parsers/attr_bool.c +++ b/src/parsers/attr_bool.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" typedef struct { @@ -48,9 +49,30 @@ static HCFChoice* desugar_ab(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_attr_bool(HArena *arena, HSVMContext *ctx, void* arg) { + HParseResult res; + HPredicate pred = arg; + assert(ctx->stack_count >= 1); + if (ctx->stack[ctx->stack_count-1]->token_type != TT_MARK) { + assert(ctx->stack_count >= 2 && ctx->stack[ctx->stack_count-2]->token_type == TT_MARK); + ctx->stack_count--; + res.ast = ctx->stack[ctx->stack_count-1] = ctx->stack[ctx->stack_count]; + // mark replaced. 
+ } else { + ctx->stack_count--; + res.ast = NULL; + } + res.arena = arena; + return pred(&res); +} + static bool ab_ctrvm(HRVMProg *prog, void *env) { HAttrBool *ab = (HAttrBool*)env; - return h_compile_regex(prog, ab->p); + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, ab->p)) + return false; + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_attr_bool, ab->pred)); + return true; } static const HParserVtable attr_bool_vt = { diff --git a/src/parsers/bits.c b/src/parsers/bits.c index 408ea29b7eecf38bc75ff5d041d4725b6dc8c1c4..e153e3a86f2aa3716583fe37bf5cf44124ec5d9e 100644 --- a/src/parsers/bits.c +++ b/src/parsers/bits.c @@ -45,10 +45,10 @@ static HParsedToken *reshape_bits(const HParseResult *p, bool signedp) { return ret; } -static const HParsedToken *reshape_bits_unsigned(const HParseResult *p) { +static HParsedToken *reshape_bits_unsigned(const HParseResult *p) { return reshape_bits(p, false); } -static const HParsedToken *reshape_bits_signed(const HParseResult *p) { +static HParsedToken *reshape_bits_signed(const HParseResult *p) { return reshape_bits(p, true); } @@ -97,6 +97,8 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) { uint64_t res = 0; for (size_t i = 0; i < top->bytes.len; i++) res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + uint64_t msb = (env_->signedp ? 1LL:0) << (top->bytes.len * 8 - 1); + res = (res ^ msb) - msb; top->uint = res; // possibly cast to signed through union top->token_type = (env_->signedp ? 
TT_SINT : TT_UINT); return true; @@ -105,7 +107,7 @@ static bool h_svm_action_bits(HArena *arena, HSVMContext *ctx, void* env) { static bool bits_ctrvm(HRVMProg *prog, void* env) { struct bits_env *env_ = (struct bits_env*)env; h_rvm_insert_insn(prog, RVM_PUSH, 0); - for (size_t i=0; (i < env_->length)/8; ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different + for (size_t i=0; i < (env_->length/8); ++i) { // FUTURE: when we can handle non-byte-aligned, the env_->length/8 part will be different h_rvm_insert_insn(prog, RVM_MATCH, 0xFF00); h_rvm_insert_insn(prog, RVM_STEP, 0); } diff --git a/src/parsers/ch.c b/src/parsers/ch.c index 9621869c01286ea7e7664e3d1f6549e94d0e1937..0de61e49cfa158328196217b745524b06b880228 100644 --- a/src/parsers/ch.c +++ b/src/parsers/ch.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" static HParseResult* parse_ch(void* env, HParseState *state) { @@ -20,11 +21,26 @@ static HCFChoice* desugar_ch(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) { + // BUG: relies on undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit + HParsedToken *top = ctx->stack[ctx->stack_count-1]; + assert(top->token_type == TT_BYTES); + uint64_t res = 0; + for (size_t i = 0; i < top->bytes.len; i++) + res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + top->uint = res; // possibly cast to signed through union + top->token_type = TT_UINT; + return true; +} + static bool ch_ctrvm(HRVMProg *prog, void* env) { uint8_t c = (uint8_t)(unsigned long)(env); // TODO: Does this capture anything? 
- h_rvm_insert_insn(prog, RVM_MATCH, c & c << 8); + h_rvm_insert_insn(prog, RVM_PUSH, 0); + h_rvm_insert_insn(prog, RVM_MATCH, c | c << 8); h_rvm_insert_insn(prog, RVM_STEP, 0); + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env)); return true; } diff --git a/src/parsers/charset.c b/src/parsers/charset.c index 1b06cce6f044f6045c5c8920f14ddf6039bc0674..db4c2e777216ce798179bd930d28ea4ad4685cd1 100644 --- a/src/parsers/charset.c +++ b/src/parsers/charset.c @@ -1,3 +1,4 @@ +#include <assert.h> #include <string.h> #include "../internal.h" #include "parser_internal.h" @@ -22,23 +23,54 @@ static HCFChoice* desugar_charset(HAllocator *mm__, void *env) { return ret; } +static bool h_svm_action_ch(HArena *arena, HSVMContext *ctx, void* env) { + // BUG: relies on undefined behaviour: int64_t is a signed uint64_t; not necessarily true on 32-bit + HParsedToken *top = ctx->stack[ctx->stack_count-1]; + assert(top->token_type == TT_BYTES); + uint64_t res = 0; + for (size_t i = 0; i < top->bytes.len; i++) + res = (res << 8) | top->bytes.token[i]; // TODO: Handle other endiannesses. + top->uint = res; // possibly cast to signed through union + top->token_type = TT_UINT; + return true; +} + // FUTURE: this is horribly inefficient static bool cs_ctrvm(HRVMProg *prog, void *env) { HCharset cs = (HCharset)env; + h_rvm_insert_insn(prog, RVM_PUSH, 0); + uint16_t start = h_rvm_get_ip(prog); - for (size_t i=0; i<256; ++i) { - if (charset_isset(cs, i)) { - uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); - h_rvm_insert_insn(prog, RVM_MATCH, i & i << 8); - h_rvm_insert_insn(prog, RVM_GOTO, 0); - h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + + uint8_t range_start = 0; + bool collecting = false; + for (size_t i=0; i<257; ++i) { + // Position 256 is only there so that every included character has + // a non-included character after it. 
+ if (i < 256 && charset_isset(cs, i)) { + if (!collecting) { + collecting = true; + range_start = i; + } + } else { + if (collecting) { + collecting = false; + uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); + h_rvm_insert_insn(prog, RVM_MATCH, range_start | (i-1) << 8); + h_rvm_insert_insn(prog, RVM_GOTO, 0); + h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); + } } } + h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF); uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); for (size_t i=start; i<jump; ++i) { if (RVM_GOTO == prog->insns[i].op) h_rvm_patch_arg(prog, i, jump); } + + h_rvm_insert_insn(prog, RVM_CAPTURE, 0); + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ch, env)); return true; } diff --git a/src/parsers/choice.c b/src/parsers/choice.c index 5485f2f35cdbee4a0513dab2200bf306255d6550..67b3742059e869c357d473575968e9ec610f931e 100644 --- a/src/parsers/choice.c +++ b/src/parsers/choice.c @@ -3,7 +3,7 @@ typedef struct { size_t len; - const HParser **p_array; + HParser **p_array; } HSequence; @@ -58,16 +58,16 @@ static HCFChoice* desugar_choice(HAllocator *mm__, void *env) { static bool choice_ctrvm(HRVMProg *prog, void* env) { HSequence *s = (HSequence*)env; uint16_t gotos[s->len]; - uint16_t start = h_rvm_get_ip(prog); for (size_t i=0; i<s->len; ++i) { uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); - if (!h_compile_regex(prog, s->p_array[i]->env)) + if (!h_compile_regex(prog, s->p_array[i])) return false; - gotos[i] = h_rvm_insert_insn(prog, RVM_GOTO, 0); + gotos[i] = h_rvm_insert_insn(prog, RVM_GOTO, 65535); h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); } - uint16_t jump = h_rvm_insert_insn(prog, RVM_STEP, 0); - for (size_t i=start; i<s->len; ++i) { + h_rvm_insert_insn(prog, RVM_MATCH, 0x00FF); // fail. 
+ uint16_t jump = h_rvm_get_ip(prog); + for (size_t i=0; i<s->len; ++i) { h_rvm_patch_arg(prog, gotos[i], jump); } return true; @@ -81,7 +81,7 @@ static const HParserVtable choice_vt = { .compile_to_rvm = choice_ctrvm, }; -HParser* h_choice(const HParser* p, ...) { +HParser* h_choice(HParser* p, ...) { va_list ap; va_start(ap, p); HParser* ret = h_choice__mv(&system_allocator, p, ap); @@ -89,7 +89,7 @@ HParser* h_choice(const HParser* p, ...) { return ret; } -HParser* h_choice__m(HAllocator* mm__, const HParser* p, ...) { +HParser* h_choice__m(HAllocator* mm__, HParser* p, ...) { va_list ap; va_start(ap, p); HParser* ret = h_choice__mv(mm__, p, ap); @@ -97,28 +97,28 @@ HParser* h_choice__m(HAllocator* mm__, const HParser* p, ...) { return ret; } -HParser* h_choice__v(const HParser* p, va_list ap) { +HParser* h_choice__v(HParser* p, va_list ap) { return h_choice__mv(&system_allocator, p, ap); } -HParser* h_choice__mv(HAllocator* mm__, const HParser* p, va_list ap_) { +HParser* h_choice__mv(HAllocator* mm__, HParser* p, va_list ap_) { va_list ap; size_t len = 0; HSequence *s = h_new(HSequence, 1); - const HParser *arg; + HParser *arg; va_copy(ap, ap_); do { len++; - arg = va_arg(ap, const HParser *); + arg = va_arg(ap, HParser *); } while (arg); va_end(ap); - s->p_array = h_new(const HParser *, len); + s->p_array = h_new(HParser *, len); va_copy(ap, ap_); s->p_array[0] = p; for (size_t i = 1; i < len; i++) { - s->p_array[i] = va_arg(ap, const HParser *); + s->p_array[i] = va_arg(ap, HParser *); } while (arg); va_end(ap); @@ -139,7 +139,7 @@ HParser* h_choice__ma(HAllocator* mm__, void *args[]) { } while(arg); HSequence *s = h_new(HSequence, 1); - s->p_array = h_new(const HParser *, len); + s->p_array = h_new(HParser *, len); for (size_t i = 0; i < len; i++) { s->p_array[i] = ((HParser **)args)[i]; diff --git a/src/parsers/ignore.c b/src/parsers/ignore.c index 62f45edf4ee7e703d54136ed293a494cf5b7930a..178d97076cf2c677e88c49e62d6dc0456e94f2ae 100644 --- 
a/src/parsers/ignore.c +++ b/src/parsers/ignore.c @@ -47,7 +47,7 @@ static bool h_svm_action_pop(HArena *arena, HSVMContext *ctx, void* arg) { static bool ignore_ctrvm(HRVMProg *prog, void *env) { HParser *p = (HParser*)env; - h_compile_regex(prog, p->env); + h_compile_regex(prog, p); h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_pop, NULL)); return true; } diff --git a/src/parsers/ignoreseq.c b/src/parsers/ignoreseq.c index 6a61f3ead5436fb675412df5f459c2e1406f351f..8fcc143b8a4b9a6b2248d9a4de5596560593d58d 100644 --- a/src/parsers/ignoreseq.c +++ b/src/parsers/ignoreseq.c @@ -83,7 +83,7 @@ static bool h_svm_action_ignoreseq(HArena *arena, HSVMContext *ctx, void* env) { // stack. assert(seq->len >= 1); for (int i = seq->len - 1; i>=0; i--) { - if (i == (int)seq->which && ctx->stack[ctx->stack_count]->token_type != TT_MARK) + if (i == (int)seq->which && ctx->stack[ctx->stack_count-1]->token_type != TT_MARK) save = ctx->stack[ctx->stack_count-1]; // skip over everything up to and including the mark. while (ctx->stack[--ctx->stack_count]->token_type != TT_MARK) @@ -97,7 +97,7 @@ static bool is_ctrvm(HRVMProg *prog, void* env) { HIgnoreSeq *seq = (HIgnoreSeq*)env; for (size_t i=0; i<seq->len; ++i) { h_rvm_insert_insn(prog, RVM_PUSH, 0); - if (!h_compile_regex(prog, seq->parsers[i]->env)) + if (!h_compile_regex(prog, seq->parsers[i])) return false; } h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_ignoreseq, env)); diff --git a/src/parsers/many.c b/src/parsers/many.c index 6f6e8591a2903abdb1a6c2d59783add9aa0823a1..8185203c907b9350e2dc577db515373ab9603e65 100644 --- a/src/parsers/many.c +++ b/src/parsers/many.c @@ -1,3 +1,4 @@ +#include <assert.h> #include "parser_internal.h" // TODO: split this up. 
@@ -14,7 +15,7 @@ static HParseResult *parse_many(void* env, HParseState *state) { HInputStream bak; while (env_->min_p || env_->count > count) { bak = state->input_stream; - if (count > 0) { + if (count > 0 && env_->sep != NULL) { HParseResult *sep = h_do_parse(env_->sep, state); if (!sep) goto err0; @@ -47,13 +48,15 @@ static HParseResult *parse_many(void* env, HParseState *state) { static bool many_isValidRegular(void *env) { HRepeat *repeat = (HRepeat*)env; return (repeat->p->vtable->isValidRegular(repeat->p->env) && - repeat->sep->vtable->isValidRegular(repeat->sep->env)); + (repeat->sep == NULL || + repeat->sep->vtable->isValidRegular(repeat->sep->env))); } static bool many_isValidCF(void *env) { HRepeat *repeat = (HRepeat*)env; return (repeat->p->vtable->isValidCF(repeat->p->env) && - repeat->sep->vtable->isValidCF(repeat->sep->env)); + (repeat->sep == NULL || + repeat->sep->vtable->isValidCF(repeat->sep->env))); } static HCFChoice* desugar_many(HAllocator *mm__, void *env) { @@ -70,7 +73,9 @@ static HCFChoice* desugar_many(HAllocator *mm__, void *env) { -> \epsilon */ - HCFChoice *sep = h_desugar(mm__, repeat->sep); + HParser *epsilon = h_epsilon_p__m(mm__); + + HCFChoice *sep = h_desugar(mm__, (repeat->sep != NULL) ? repeat->sep : epsilon); HCFChoice *a = h_desugar(mm__, repeat->p); HCFChoice *ma = h_new(HCFChoice, 1); HCFChoice *mar = h_new(HCFChoice, 1); @@ -119,24 +124,56 @@ static HCFChoice* desugar_many(HAllocator *mm__, void *env) { static bool many_ctrvm(HRVMProg *prog, void *env) { HRepeat *repeat = (HRepeat*)env; - // FIXME: Implement clear_to_mark uint16_t clear_to_mark = h_rvm_create_action(prog, h_svm_action_clear_to_mark, NULL); + // TODO: implement min & max properly. 
Right now, it's always + // max==inf, min={0,1} + + // Structure: + // Min == 0: + // FORK end // if Min == 0 + // GOTO mid + // nxt: <SEP> + // mid: <ELEM> + // FORK nxt + // end: + + if (repeat->min_p) { h_rvm_insert_insn(prog, RVM_PUSH, 0); - // TODO: implement min and max properly. Right now, it's always min==0, max==inf - uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); - if (!h_compile_regex(prog, repeat->p)) - return false; - if (repeat->sep != NULL) { - h_rvm_insert_insn(prog, RVM_PUSH, 0); - if (!h_compile_regex(prog, repeat->sep)) + assert(repeat->count < 2); // TODO: The other cases should be supported later. + uint16_t end_fork; + if (repeat->count == 0) + end_fork = h_rvm_insert_insn(prog, RVM_FORK, 0xFFFF); + uint16_t goto_mid = h_rvm_insert_insn(prog, RVM_GOTO, 0xFFFF); + uint16_t nxt = h_rvm_get_ip(prog); + if (repeat->sep != NULL) { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, repeat->sep)) + return false; + h_rvm_insert_insn(prog, RVM_ACTION, clear_to_mark); + } + h_rvm_patch_arg(prog, goto_mid, h_rvm_get_ip(prog)); + if (!h_compile_regex(prog, repeat->p)) return false; - h_rvm_insert_insn(prog, RVM_ACTION, clear_to_mark); + h_rvm_insert_insn(prog, RVM_FORK, nxt); + h_rvm_patch_arg(prog, end_fork, h_rvm_get_ip(prog)); + + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); + return true; + } else { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + for (size_t i = 0; i < repeat->count; i++) { + if (repeat->sep != NULL && i != 0) { + h_rvm_insert_insn(prog, RVM_PUSH, 0); + if (!h_compile_regex(prog, repeat->sep)) + return false; + h_rvm_insert_insn(prog, RVM_ACTION, clear_to_mark); + } + if (!h_compile_regex(prog, repeat->p)) + return false; + } + h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); + return true; } - h_rvm_insert_insn(prog, RVM_GOTO, insn); - h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); - - h_rvm_insert_insn(prog, 
RVM_ACTION, h_rvm_create_action(prog, h_svm_action_make_sequence, NULL)); - return true; } static const HParserVtable many_vt = { @@ -153,7 +190,7 @@ HParser* h_many(const HParser* p) { HParser* h_many__m(HAllocator* mm__, const HParser* p) { HRepeat *env = h_new(HRepeat, 1); env->p = p; - env->sep = h_epsilon_p__m(mm__); + env->sep = NULL; env->count = 0; env->min_p = true; return h_new_parser(mm__, &many_vt, env); @@ -165,7 +202,7 @@ HParser* h_many1(const HParser* p) { HParser* h_many1__m(HAllocator* mm__, const HParser* p) { HRepeat *env = h_new(HRepeat, 1); env->p = p; - env->sep = h_epsilon_p__m(mm__); + env->sep = NULL; env->count = 1; env->min_p = true; return h_new_parser(mm__, &many_vt, env); @@ -177,7 +214,7 @@ HParser* h_repeat_n(const HParser* p, const size_t n) { HParser* h_repeat_n__m(HAllocator* mm__, const HParser* p, const size_t n) { HRepeat *env = h_new(HRepeat, 1); env->p = p; - env->sep = h_epsilon_p__m(mm__); + env->sep = NULL; env->count = n; env->min_p = false; return h_new_parser(mm__, &many_vt, env); @@ -222,7 +259,7 @@ static HParseResult* parse_length_value(void *env, HParseState *state) { // TODO: allocate this using public functions HRepeat repeat = { .p = lv->value, - .sep = h_epsilon_p(), + .sep = NULL, .count = len->ast->uint, .min_p = false }; diff --git a/src/parsers/optional.c b/src/parsers/optional.c index c60600d37af5f5e660092b0dea3185f816668d27..87ba541b91310bae7bd20b30fe2b9387501c0045 100644 --- a/src/parsers/optional.c +++ b/src/parsers/optional.c @@ -22,7 +22,7 @@ static bool opt_isValidCF(void *env) { return p->vtable->isValidCF(p->env); } -static const HParsedToken* reshape_optional(const HParseResult *p) { +static HParsedToken* reshape_optional(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); @@ -82,7 +82,7 @@ static bool opt_ctrvm(HRVMProg *prog, void* env) { h_rvm_insert_insn(prog, RVM_PUSH, 0); uint16_t insn = h_rvm_insert_insn(prog, RVM_FORK, 0); HParser *p = (HParser*) env; - if 
(!h_compile_regex(prog, p->env)) + if (!h_compile_regex(prog, p)) return false; h_rvm_patch_arg(prog, insn, h_rvm_get_ip(prog)); h_rvm_insert_insn(prog, RVM_ACTION, h_rvm_create_action(prog, h_svm_action_optional, NULL)); diff --git a/src/parsers/sequence.c b/src/parsers/sequence.c index 4de218339bde91d72918bc1d7b1dedfee681197c..aa600231d06ccbf461a855c51658909f0c66e866 100644 --- a/src/parsers/sequence.c +++ b/src/parsers/sequence.c @@ -4,7 +4,7 @@ typedef struct { size_t len; - const HParser **p_array; + HParser **p_array; } HSequence; static HParseResult* parse_sequence(void *env, HParseState *state) { @@ -43,7 +43,7 @@ static bool sequence_isValidCF(void *env) { return true; } -static const HParsedToken *reshape_sequence(const HParseResult *p) { +static HParsedToken *reshape_sequence(const HParseResult *p) { assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); @@ -101,7 +101,7 @@ static const HParserVtable sequence_vt = { .compile_to_rvm = sequence_ctrvm, }; -HParser* h_sequence(const HParser* p, ...) { +HParser* h_sequence(HParser* p, ...) { va_list ap; va_start(ap, p); HParser* ret = h_sequence__mv(&system_allocator, p, ap); @@ -109,7 +109,7 @@ HParser* h_sequence(const HParser* p, ...) { return ret; } -HParser* h_sequence__m(HAllocator* mm__, const HParser* p, ...) { +HParser* h_sequence__m(HAllocator* mm__, HParser* p, ...) { va_list ap; va_start(ap, p); HParser* ret = h_sequence__mv(mm__, p, ap); @@ -117,27 +117,27 @@ HParser* h_sequence__m(HAllocator* mm__, const HParser* p, ...) 
{ return ret; } -HParser* h_sequence__v(const HParser* p, va_list ap) { +HParser* h_sequence__v(HParser* p, va_list ap) { return h_sequence__mv(&system_allocator, p, ap); } -HParser* h_sequence__mv(HAllocator* mm__, const HParser *p, va_list ap_) { +HParser* h_sequence__mv(HAllocator* mm__, HParser *p, va_list ap_) { va_list ap; size_t len = 0; const HParser *arg; va_copy(ap, ap_); do { len++; - arg = va_arg(ap, const HParser *); + arg = va_arg(ap, HParser *); } while (arg); va_end(ap); HSequence *s = h_new(HSequence, 1); - s->p_array = h_new(const HParser *, len); + s->p_array = h_new(HParser *, len); va_copy(ap, ap_); s->p_array[0] = p; for (size_t i = 1; i < len; i++) { - s->p_array[i] = va_arg(ap, const HParser *); + s->p_array[i] = va_arg(ap, HParser *); } while (arg); va_end(ap); @@ -158,7 +158,7 @@ HParser* h_sequence__ma(HAllocator* mm__, void *args[]) { } while(arg); HSequence *s = h_new(HSequence, 1); - s->p_array = h_new(const HParser *, len); + s->p_array = h_new(HParser *, len); for (size_t i = 0; i < len; i++) { s->p_array[i] = ((HParser **)args)[i]; diff --git a/src/parsers/token.c b/src/parsers/token.c index defcdf75eebbe8161e05c0a6fb3c70a5a7c95be7..2346a45e0f36d64ccf6a87df07c9ea89f067989d 100644 --- a/src/parsers/token.c +++ b/src/parsers/token.c @@ -20,7 +20,7 @@ static HParseResult* parse_token(void *env, HParseState *state) { } -static const HParsedToken *reshape_token(const HParseResult *p) { +static HParsedToken *reshape_token(const HParseResult *p) { // fetch sequence of uints from p assert(p->ast); assert(p->ast->token_type == TT_SEQUENCE); @@ -69,7 +69,7 @@ static bool token_ctrvm(HRVMProg *prog, void *env) { HToken *t = (HToken*)env; h_rvm_insert_insn(prog, RVM_PUSH, 0); for (int i=0; i<t->len; ++i) { - h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] & t->str[i] << 8); + h_rvm_insert_insn(prog, RVM_MATCH, t->str[i] | t->str[i] << 8); h_rvm_insert_insn(prog, RVM_STEP, 0); } h_rvm_insert_insn(prog, RVM_CAPTURE, 0); diff --git 
a/src/parsers/whitespace.c b/src/parsers/whitespace.c index 89ce23b0d992692fccec210484198d276935f7cd..454e04ed3633d79e9f58bcd64b911a3e8ab26bdb 100644 --- a/src/parsers/whitespace.c +++ b/src/parsers/whitespace.c @@ -67,13 +67,19 @@ static bool ws_ctrvm(HRVMProg *prog, void *env) { uint16_t start = h_rvm_get_ip(prog); uint16_t next; - for (int i = 0; i < 6; i++) { + uint16_t ranges[2] = { + 0x0d09, + 0x2020, + }; + + for (int i = 0; i < 2; i++) { next = h_rvm_insert_insn(prog, RVM_FORK, 0); - h_rvm_insert_insn(prog, RVM_MATCH, (SPACE_CHRS[i] << 8) | (SPACE_CHRS[i])); + h_rvm_insert_insn(prog, RVM_MATCH, ranges[i]); + h_rvm_insert_insn(prog, RVM_STEP, 0); h_rvm_insert_insn(prog, RVM_GOTO, start); h_rvm_patch_arg(prog, next, h_rvm_get_ip(prog)); } - return h_compile_regex(prog, p->env); + return h_compile_regex(prog, p); } static const HParserVtable whitespace_vt = { diff --git a/src/system_allocator.c b/src/system_allocator.c index 80d7acf2be1d7dacd250d60fbd3a4338ed99c1af..5f3e48440996892d1335fbc50f6dc282099e39c4 100644 --- a/src/system_allocator.c +++ b/src/system_allocator.c @@ -21,7 +21,8 @@ static void* system_realloc(HAllocator *allocator, void* ptr, size_t size) { } static void system_free(HAllocator *allocator, void* ptr) { - free(ptr - sizeof(size_t)); + if (ptr != NULL) + free(ptr - sizeof(size_t)); } HAllocator system_allocator = { diff --git a/src/t_grammar.c b/src/t_grammar.c index 8003bcf64656d20a06106a2635fa47538b6db41e..0287b2fe6eda00a1d6575e619161d18ca9f20639 100644 --- a/src/t_grammar.c +++ b/src/t_grammar.c @@ -15,9 +15,9 @@ static void test_end(void) { } static void test_example_1(void) { - const HParser *c = h_many(h_ch('x')); - const HParser *q = h_sequence(c, h_ch('y'), NULL); - const HParser *p = h_choice(q, h_end_p(), NULL); + HParser *c = h_many(h_ch('x')); + HParser *q = h_sequence(c, h_ch('y'), NULL); + HParser *p = h_choice(q, h_end_p(), NULL); HCFGrammar *g = h_cfgrammar(&system_allocator, p); g_check_nonterminal(g, c); diff --git 
a/src/t_parser.c b/src/t_parser.c index 961aa4efa8999b6147eb401362ac6201d44860bf..8aab7bb38e4b950e60da93e1c362b4a09ef0bbb0 100644 --- a/src/t_parser.c +++ b/src/t_parser.c @@ -162,7 +162,7 @@ static void test_middle(gconstpointer backend) { #include <ctype.h> -const HParsedToken* upcase(const HParseResult *p) { +HParsedToken* upcase(const HParseResult *p) { switch(p->ast->token_type) { case TT_SEQUENCE: { @@ -180,17 +180,17 @@ const HParsedToken* upcase(const HParseResult *p) { } } ret->seq = seq; - return (const HParsedToken*)ret; + return ret; } case TT_UINT: { HParsedToken *ret = a_new_(p->arena, HParsedToken, 1); ret->token_type = TT_UINT; ret->uint = toupper(p->ast->uint); - return (const HParsedToken*)ret; + return ret; } default: - return p->ast; + return (HParsedToken*)p->ast; } } @@ -526,6 +526,7 @@ void register_parser_tests(void) { g_test_add_data_func("/core/parser/regex/choice", GINT_TO_POINTER(PB_REGULAR), test_choice); g_test_add_data_func("/core/parser/regex/many", GINT_TO_POINTER(PB_REGULAR), test_many); g_test_add_data_func("/core/parser/regex/many1", GINT_TO_POINTER(PB_REGULAR), test_many1); + g_test_add_data_func("/core/parser/regex/repeat_n", GINT_TO_POINTER(PB_REGULAR), test_repeat_n); g_test_add_data_func("/core/parser/regex/optional", GINT_TO_POINTER(PB_REGULAR), test_optional); g_test_add_data_func("/core/parser/regex/sepBy", GINT_TO_POINTER(PB_REGULAR), test_sepBy); g_test_add_data_func("/core/parser/regex/sepBy1", GINT_TO_POINTER(PB_REGULAR), test_sepBy1);