From 2af69dd8f95689f81e5fc10780b54461288130eb Mon Sep 17 00:00:00 2001
From: Dan Hirsch <thequux@thequux.com>
Date: Fri, 4 May 2012 21:23:56 +0100
Subject: [PATCH] Sped up charset parsing; fixed choice operator

---
 Makefile        |  5 +++-
 NOTES           | 16 ++++++++++-
 src/bitreader.c |  4 +++
 src/hammer.c    | 76 +++++++++++++++++++++++++------------------------
 src/internal.h  | 19 +++++++++++++
 5 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile
index d6205afb..fbb2b075 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,9 @@ SUBDIRS = src \
 %:
 	+for dir in $(SUBDIRS); do $(MAKE) -C $${dir} $@; done
 
+test: src/test_suite  # the recipe runs the prerequisite binary src/test_suite
+	$<
+
 define SUBDIR_TEMPLATE
 $(1)/%:
 	$$(MAKE) -C $(1) $$*
@@ -19,4 +22,4 @@ endef
 $(foreach dir,$(SUBDIRS),$(eval $(call SUBDIR_TEMPLATE,$(dir))))
 
 #.DEFAULT:
-#	$(if $(findstring ./,$(dir $@)),$(error No rule to make target `$@'),$(MAKE) -C $(dir $@) $(notdir $@))
\ No newline at end of file
+#	$(if $(findstring ./,$(dir $@)),$(error No rule to make target `$@'),$(MAKE) -C $(dir $@) $(notdir $@))
diff --git a/NOTES b/NOTES
index 4d89c709..edee9d05 100644
--- a/NOTES
+++ b/NOTES
@@ -3,4 +3,18 @@ NOTES
 
 Regarding parse_result_t:
 If a parse fails, the parse_result_t will be NULL.
-If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL.
\ No newline at end of file
+If a parse is successful but there's nothing there (i.e., if end_p succeeds), then there's a parse_result_t but its ast is NULL.
+
+Regarding input location:
+If a parse is successful, the input is left at the beginning of the next thing to be read.
+If a parse fails, the input location is UNPREDICTABLE.
+
+
+If CONSISTENCY_CHECK is defined, a number of additional internal
+consistency checks are enabled.
+
+TODO: Add consistency check to the bitreader
+
+We should support the use of parse-table-based parse methods; add a
+parse_compile method that must be called before the newly-created
+parser is used.
diff --git a/src/bitreader.c b/src/bitreader.c
index 6d5f784d..0b406e01 100644
--- a/src/bitreader.c
+++ b/src/bitreader.c
@@ -8,10 +8,14 @@
 #define MSB(range) (1:range)
 #define LDB(range,i) (((i)>>LSB(range))&((1<<(MSB(range)-LSB(range)+1))-1))
 
+
 long long read_bits(input_stream_t* state, int count, char signed_p) {
+  // BUG: Does not ... (note left unfinished by the author; TODO: state what is missing)
   long long out = 0;
   int offset = 0;
   long long msb = (!!signed_p) << (count - 1); // 0 if unsigned, else 1 << (nbits - 1)
+  // BUG: does not stop early in case of ... (note left unfinished; presumably running out of input -- TODO confirm)
+  
   if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) {
     // fast path
     if (state->endianness & BYTE_BIG_ENDIAN) {
diff --git a/src/hammer.c b/src/hammer.c
index 867e36b3..95d8a426 100644
--- a/src/hammer.c
+++ b/src/hammer.c
@@ -19,14 +19,14 @@
 #include "internal.h"
 #include <assert.h>
 #include <string.h>
-/* TODO(thequux): rewrite to follow new parse_state_t layout
+
 parse_state_t* from(parse_state_t *ps, const size_t index) {
-  parse_state_t p = { ps->input, ps->index + index, ps->length - index, ps->cache };
   parse_state_t *ret = g_new(parse_state_t, 1);
-  *ret = p;
+  *ret = *ps;                        // start from a field-for-field copy of the source state
+  ret->input_stream.index += index;  // advance only the copy's read index; *ps itself is not modified
   return ret;
 }
-*/
+
 const uint8_t* substring(const parse_state_t *ps, const size_t start, const size_t end) {
   if (end > start && (ps->input_stream.index + end) < ps->input_stream.length) {
     gpointer ret = g_malloc(end - start);
@@ -48,8 +48,7 @@ const gchar* to_string(parse_state_t *ps) {
   return g_strescape((const gchar*)(ps->input_stream.input), NULL);
 }
 
-guint djbhash(const 
-uint8_t *buf, size_t len) {
+guint djbhash(const uint8_t *buf, size_t len) {
   guint hash = 5381;
   while (len--) {
     hash = hash * 33 + *buf++;
@@ -75,6 +74,12 @@ parse_result_t* do_parse(const parser_t* parser, parse_state_t *state) {
     res = parser->fn(parser->env, state);
     // update the cache
     g_hash_table_replace(state->cache, &key, res);
+#ifdef CONSISTENCY_CHECK
+    if (!res) {
+      state->input_stream = INVALID;
+      state->input_stream.input = key.input_pos.input;
+    }
+#endif
     return res;
   }
 }
@@ -135,47 +140,41 @@ typedef struct {
   uint8_t upper;
 } range_t;
 
-static parse_result_t* parse_range(void* env, parse_state_t *state) {
-  range_t *range = (range_t*)env;
-  uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
-  if (range->lower <= r && range->upper >= r) {
-    parsed_token_t *tok = g_new(parsed_token_t, 1);
-    tok->token_type = TT_UINT; tok->uint = r;
-    return make_result(tok);
-  } else {
-    return NULL;
-  }
-}
-
-const parser_t* range(const uint8_t lower, const uint8_t upper) { 
-  range_t *r = g_new(range_t, 1);
-  r->lower = lower; r->upper = upper;
-  parser_t *ret = g_new(parser_t, 1);
-  ret->fn = parse_range; ret->env = (void*)r;
-  return (const parser_t*)ret;
-}
 const parser_t* whitespace(const parser_t* p) { return NULL; }
 //const parser_t* action(const parser_t* p, /* fptr to action on AST */) { return NULL; }
 
 const parser_t* left_factor_action(const parser_t* p) { return NULL; }
 
-static parse_result_t* parse_negate(void *env, parse_state_t *state) {
-  parser_t *p = (parser_t*)env;
-  parse_result_t *result = do_parse(p, state);
-  if (NULL == result) {
-    uint8_t r = (uint8_t)read_bits(&state->input_stream, 8, false);
-    parsed_token_t *tok = g_new(parsed_token_t, 1);    
-    tok->token_type = TT_UINT; tok->uint = r;
+static parse_result_t* parse_charset(void *env, parse_state_t *state) { // match one byte against the charset in env; NULL if not a member
+  uint8_t in = read_bits(&state->input_stream, 8, false); // read 8 bits as an unsigned value
+  charset cs = (charset)env;
+
+  if (charset_isset(cs, in)) {
+    parsed_token_t *tok = g_new(parsed_token_t, 1);
+    tok->token_type = TT_UINT; tok->uint = in; // on a match the result token carries the byte itself
     return make_result(tok);    
-  } else {
+  } else
     return NULL;
-  }
 }
 
-const parser_t* negate(const parser_t* p) { 
-  assert(parse_ch == p->fn || parse_range == p->fn);
+const parser_t* range(const uint8_t lower, const uint8_t upper) { // parser accepting one byte in [lower, upper]
+  charset members = new_charset(); // zero-filled, so the set starts out empty
+  for (int byte = lower; byte <= upper; byte++) // int counter: no wrap-around when upper == 255
+    charset_set(members, byte, 1);
+  parser_t *parser = g_new(parser_t, 1);
+  parser->fn = parse_charset; parser->env = (void*)members;
+  return (const parser_t*)parser;
+}
+
+const parser_t* notin(const uint8_t *options, int count) { // parser accepting any byte NOT in options[0..count)
   parser_t *ret = g_new(parser_t, 1);
-  ret->fn = parse_negate; ret->env = (void*)p;
+  charset cs = new_charset();
+  for (int i = 0; i < 256; i++)
+    charset_set(cs, i, 1);          // start from the full set of byte values
+  for (int i = 0; i < count; i++)
+    charset_set(cs, options[i], 0); // remove each listed byte (previously cleared bit i, the loop index)
+
+  ret->fn = parse_charset; ret->env = (void*)cs;
   return (const parser_t*)ret;
 }
 
@@ -232,7 +231,10 @@ const parser_t* sequence(const parser_t* p_array[]) {
 
 static parse_result_t* parse_choice(void *env, parse_state_t *state) {
   sequence_t *s = (sequence_t*)env;
+  input_stream_t backup = state->input_stream;
   for (size_t i=0; i<s->len; ++i) {
+    if (i != 0)
+      state->input_stream = backup;
     parse_result_t *tmp = do_parse(s->p_array[i], state);
     if (NULL != tmp)
       return tmp;
diff --git a/src/internal.h b/src/internal.h
index 29eaeeb6..aa2d4a13 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -1,5 +1,6 @@
 #ifndef HAMMER_INTERNAL__H
 #define HAMMER_INTERNAL__H
+#include <glib.h>
 #include "hammer.h"
 
 #define false 0
@@ -10,6 +11,24 @@ typedef struct parser_cache_key {
   const parser_t *parser;
 } parser_cache_key_t;
 
+typedef unsigned int *charset;
+
+// A charset is a 256-bit set, one bit per byte value, packed into unsigned ints (8 bits per byte: uint8_t exists).
+static inline charset new_charset(void) {
+  return g_new0(unsigned int, 256 / (8 * sizeof(unsigned int)));
+}
+
+// Returns 1 if byte value pos is a member of cs, 0 otherwise.
+static inline int charset_isset(charset cs, uint8_t pos) {
+  return (cs[pos / (8 * sizeof(*cs))] >> (pos % (8 * sizeof(*cs)))) & 1u;
+}
+
+static inline void charset_set(charset cs, uint8_t pos, int val) { // add pos when val != 0, otherwise remove it
+  unsigned int *word = &cs[pos / (8 * sizeof(*cs))];
+  unsigned int bit = 1u << (pos % (8 * sizeof(*cs)));
+  *word = val ? (*word | bit) : (*word & ~bit);
+}
+
 // TODO(thequux): Set symbol visibility for these functions so that they aren't exported.
 
 long long read_bits(input_stream_t* state, int count, char signed_p);
-- 
GitLab