From 2af69dd8f95689f81e5fc10780b54461288130eb Mon Sep 17 00:00:00 2001 From: Dan Hirsch <thequux@thequux.com> Date: Fri, 4 May 2012 21:23:56 +0100 Subject: [PATCH] Sped up charset parsing; fixed choice operator --- Makefile | 5 +++- NOTES | 16 ++++++++++- src/bitreader.c | 4 +++ src/hammer.c | 76 +++++++++++++++++++++++++------------------------ src/internal.h | 19 +++++++++++++ 5 files changed, 81 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d6205afb..fbb2b075 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,9 @@ SUBDIRS = src \ %: +for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done +test: src/test_suite + $< + define SUBDIR_TEMPLATE $(1)/%: $$(MAKE) -C $(1) $$* @@ -19,4 +22,4 @@ endef $(foreach dir,$(SUBDIRS),$(eval $(call SUBDIR_TEMPLATE,$(dir)))) #.DEFAULT: -# $(if $(findstring ./,$(dir $@)),$(error No rule to make target `$@'),$(MAKE) -C $(dir $@) $(notdir $@)) \ No newline at end of file +# $(if $(findstring ./,$(dir $@)),$(error No rule to make target `$@'),$(MAKE) -C $(dir $@) $(notdir $@)) diff --git a/NOTES b/NOTES index 4d89c709..edee9d05 100644 --- a/NOTES +++ b/NOTES @@ -3,4 +3,18 @@ NOTES Regarding parse_result_t: If a parse fails, the parse_result_t will be NULL. -If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL. \ No newline at end of file +If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL. + +Regarding input location: +If parse is successful, input is left at beginning of next thing to be read. +If parse fails, location is UNPREDICTABLE. + + +If CONSISTENCY_CHECK is defined, enable a bunch of additional internal +consistency checks. + +TODO: Add consistency check to the bitreader + +We should support the use of parse-table-based parse methods; add a +parse_compile method that must be called before the newly-created +parser is used. 
diff --git a/src/bitreader.c b/src/bitreader.c index 6d5f784d..0b406e01 100644 --- a/src/bitreader.c +++ b/src/bitreader.c @@ -8,10 +8,14 @@ #define MSB(range) (1:range) #define LDB(range,i) (((i)>>LSB(range))&((1<<(MSB(range)-LSB(range)+1))-1)) + long long read_bits(input_stream_t* state, int count, char signed_p) { + // BUG: Does not long long out = 0; int offset = 0; long long msb = (!!signed_p) << (count - 1); // 0 if unsigned, else 1 << (nbits - 1) + // BUG: does not stop early in case of + if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) { // fast path if (state->endianness & BYTE_BIG_ENDIAN) { diff --git a/src/hammer.c b/src/hammer.c index 867e36b3..95d8a426 100644 --- a/src/hammer.c +++ b/src/hammer.c @@ -19,14 +19,14 @@ #include "internal.h" #include <assert.h> #include <string.h> -/* TODO(thequux): rewrite to follow new parse_state_t layout + parse_state_t* from(parse_state_t *ps, const size_t index) { - parse_state_t p = { ps->input, ps->index + index, ps->length - index, ps->cache }; parse_state_t *ret = g_new(parse_state_t, 1); - *ret = p; + *ret = *ps; + ret->input_stream.index += index; return ret; } -*/ + const uint8_t* substring(const parse_state_t *ps, const size_t start, const size_t end) { if (end > start && (ps->input_stream.index + end) < ps->input_stream.length) { gpointer ret = g_malloc(end - start); @@ -48,8 +48,7 @@ const gchar* to_string(parse_state_t *ps) { return g_strescape((const gchar*)(ps->input_stream.input), NULL); } -guint djbhash(const -uint8_t *buf, size_t len) { +guint djbhash(const uint8_t *buf, size_t len) { guint hash = 5381; while (len--) { hash = hash * 33 + *buf++; @@ -75,6 +74,12 @@ parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) { res = parser->fn(parser->env, state); // update the cache g_hash_table_replace(state->cache, &key, res); +#ifdef CONSISTENCY_CHECK + if (!res) { + state->input_stream = INVALID; + state->input_stream.input = key.input_pos.input; + } +#endif return res; } } @@ 
-135,47 +140,41 @@ typedef struct { uint8_t upper; } range_t; -static parse_result_t* parse_range(void* env, parse_state_t *state) { - range_t *range = (range_t*)env; - uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false); - if (range->lower <= r && range->upper >= r) { - parsed_token_t *tok = g_new(parsed_token_t, 1); - tok->token_type = TT_UINT; tok->uint = r; - return make_result(tok); - } else { - return NULL; - } -} - -const parser_t* range(const uint8_t lower, const uint8_t upper) { - range_t *r = g_new(range_t, 1); - r->lower = lower; r->upper = upper; - parser_t *ret = g_new(parser_t, 1); - ret->fn = parse_range; ret->env = (void*)r; - return (const parser_t*)ret; -} const parser_t* whitespace(const parser_t* p) { return NULL; } //const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; } const parser_t* left_factor_action(const parser_t* p) { return NULL; } -static parse_result_t* parse_negate(void *env, parse_state_t *state) { - parser_t *p = (parser_t*)env; - parse_result_t *result = do_parse(p, state); - if (NULL == result) { - uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false); - parsed_token_t *tok = g_new(parsed_token_t, 1); - tok->token_type = TT_UINT; tok->uint = r; +static parse_result_t* parse_charset(void *env, parse_state_t *state) { + uint8_t in = read_bits(&state->input_stream, 8, false); + charset cs = (charset)env; + + if (charset_isset(cs, in)) { + parsed_token_t *tok = g_new(parsed_token_t, 1); + tok->token_type = TT_UINT; tok->uint = in; return make_result(tok); - } else { + } else return NULL; - } } -const parser_t* negate(const parser_t* p) { - assert(parse_ch == p->fn || parse_range == p->fn); +const parser_t* range(const uint8_t lower, const uint8_t upper) { + parser_t *ret = g_new(parser_t, 1); + charset cs = new_charset(); + for (int i = 0; i < 256; i++) + charset_set(cs, i, (lower <= i) && (i <= upper)); + ret->fn = parse_charset; ret->env = (void*)cs; + return (const 
parser_t*)ret; +} + +const parser_t* notin(const uint8_t *options, int count) { parser_t *ret = g_new(parser_t, 1); - ret->fn = parse_negate; ret->env = (void*)p; + charset cs = new_charset(); + for (int i = 0; i < 256; i++) + charset_set(cs, i, 1); + for (int i = 0; i < count; i++) + charset_set(cs, options[i], 0); /* exclude each listed byte value, not the loop index */ + + ret->fn = parse_charset; ret->env = (void*)cs; return (const parser_t*)ret; } @@ -232,7 +231,10 @@ const parser_t* sequence(const parser_t* p_array[]) { static parse_result_t* parse_choice(void *env, parse_state_t *state) { sequence_t *s = (sequence_t*)env; + input_stream_t backup = state->input_stream; for (size_t i=0; i<s->len; ++i) { + if (i != 0) + state->input_stream = backup; parse_result_t *tmp = do_parse(s->p_array[i], state); if (NULL != tmp) return tmp; diff --git a/src/internal.h b/src/internal.h index 29eaeeb6..aa2d4a13 100644 --- a/src/internal.h +++ b/src/internal.h @@ -1,5 +1,6 @@ #ifndef HAMMER_INTERNAL__H #define HAMMER_INTERNAL__H +#include <glib.h> #include "hammer.h" #define false 0 @@ -10,6 +11,24 @@ typedef struct parser_cache_key { const parser_t *parser; } parser_cache_key_t; +typedef unsigned int *charset; + +static inline charset new_charset() { + charset cs = g_new0(unsigned int, 256 / sizeof(unsigned int)); + return cs; +} + +static inline int charset_isset(charset cs, uint8_t pos) { + return !!(cs[pos / sizeof(*cs)] & (1 << (pos % sizeof(*cs)))); +} + +static inline void charset_set(charset cs, uint8_t pos, int val) { + cs[pos / sizeof(*cs)] = + val + ? cs[pos / sizeof(*cs)] | (1 << (pos % sizeof(*cs))) + : cs[pos / sizeof(*cs)] & ~(1 << (pos % sizeof(*cs))); +} + // TODO(thequux): Set symbol visibility for these functions so that they aren't exported. long long read_bits(input_stream_t* state, int count, char signed_p); -- GitLab