diff --git a/SConstruct b/SConstruct
index 4cf48a3a5eeaf91ae6ded46762f5cf7b8e48d595..a8f7ce8b9d39964458dea9fd1ee1fbe3d0a4b474 100644
--- a/SConstruct
+++ b/SConstruct
@@ -14,7 +14,11 @@ tools = ['default', 'scanreplace']
 if 'dotnet' in ARGUMENTS.get('bindings', []):
 	tools.append('csharp/mono')
 
-env = Environment(ENV = {'PATH' : os.environ['PATH'], 'PKG_CONFIG_PATH' : os.environ['PKG_CONFIG_PATH']},
+envvars = {'PATH' : os.environ['PATH']}
+if 'PKG_CONFIG_PATH' in os.environ:
+    envvars['PKG_CONFIG_PATH'] = os.environ['PKG_CONFIG_PATH']
+
+env = Environment(ENV = envvars,
                   variables = vars,
                   tools=tools,
                   toolpath=['tools'])
diff --git a/src/SConscript b/src/SConscript
index 38ace12a179f34bf540f7c9bf2322ce449583772..386a9a25e2b2dfb50616c595deb7cf7edf6594cc 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -69,7 +69,8 @@ ctests = ['t_benchmark.c',
           't_bitwriter.c',
           't_parser.c',
           't_grammar.c',
-          't_misc.c']
+          't_misc.c',
+	  't_regression.c']
 
 libhammer_shared = env.SharedLibrary('hammer', parsers + backends + misc_hammer_parts)
 libhammer_static = env.StaticLibrary('hammer', parsers + backends + misc_hammer_parts)
diff --git a/src/allocator.h b/src/allocator.h
index 803d89fe9bdbfd861a2ba86b5f216d5442e328fe..4a486936a058c0a619a83e7afdf0c5dfffc50d48 100644
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -33,11 +33,22 @@ typedef struct HAllocator_ {
 typedef struct HArena_ HArena ; // hidden implementation
 
 HArena *h_new_arena(HAllocator* allocator, size_t block_size); // pass 0 for default...
-#ifndef SWIG
-void* h_arena_malloc(HArena *arena, size_t count) __attribute__(( malloc, alloc_size(2) ));
+
+#if defined __llvm__
+# if __has_attribute(malloc)
+#   define ATTR_MALLOC(n) __attribute__((malloc))
+# else
+#   define ATTR_MALLOC(n)
+# endif
+#elif defined SWIG
+# define ATTR_MALLOC(n)
+#elif defined __GNUC__
+# define ATTR_MALLOC(n) __attribute__((malloc, alloc_size(n))) // n: 1-based position of the size argument
 #else
-void* h_arena_malloc(HArena *arena, size_t count);
+# define ATTR_MALLOC(n)
 #endif
+
+void* h_arena_malloc(HArena *arena, size_t count) ATTR_MALLOC(2);
 void h_arena_free(HArena *arena, void* ptr); // For future expansion, with alternate memory managers.
 void h_delete_arena(HArena *arena);
 
diff --git a/src/backends/packrat.c b/src/backends/packrat.c
index c1e422ed6e9fa42fe4130c11ad8a1f7e7c22c2a2..33082c6c278beb09b2abf767e5314d18ab471db4 100644
--- a/src/backends/packrat.c
+++ b/src/backends/packrat.c
@@ -33,11 +33,13 @@ static inline HParseResult* perform_lowlevel_parse(HParseState *state, const HPa
     if (tmp_res) {
       tmp_res->arena = state->arena;
       if (!state->input_stream.overrun) {
-	tmp_res->bit_length = ((state->input_stream.index - bak.index) << 3);
-	if (state->input_stream.endianness & BIT_BIG_ENDIAN)
-	  tmp_res->bit_length += state->input_stream.bit_offset - bak.bit_offset;
-	else
-	  tmp_res->bit_length += bak.bit_offset - state->input_stream.bit_offset;
+	size_t bit_length = h_input_stream_pos(&state->input_stream) - h_input_stream_pos(&bak);
+	if (tmp_res->bit_length == 0) { // Don't modify if forwarding.
+	  tmp_res->bit_length = bit_length;
+	}
+	if (tmp_res->ast && tmp_res->ast->bit_length != 0) {
+	  ((HParsedToken*)(tmp_res->ast))->bit_length = bit_length;
+	}
       } else
 	tmp_res->bit_length = 0;
     }
diff --git a/src/bitreader.c b/src/bitreader.c
index df8c4c3615fe9b36f02621945006adcbc981e60b..3627df5d6f9f228c8c9fe7b6e1b0c1c30b7e7de1 100644
--- a/src/bitreader.c
+++ b/src/bitreader.c
@@ -39,10 +39,7 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) {
   if (bits_left <= 64) { // Large enough to handle any valid count, but small enough that overflow isn't a problem.
     // not in danger of overflowing, so add in bits
     // add in number of bits...
-    if (state->endianness & BIT_BIG_ENDIAN)
-      bits_left = (bits_left << 3) - 8 + state->bit_offset;
-    else
-      bits_left = (bits_left << 3) - state->bit_offset;
+    bits_left = (bits_left << 3) - state->bit_offset - state->margin;
     if (bits_left < count) {
       if (state->endianness & BYTE_BIG_ENDIAN)
 	final_shift = count - bits_left;
@@ -54,7 +51,7 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) {
       final_shift = 0;
   }
   
-  if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0) {
+  if ((state->bit_offset & 0x7) == 0 && (count & 0x7) == 0 && (state->margin == 0)) {
     // fast path
     if (state->endianness & BYTE_BIG_ENDIAN) {
       while (count > 0) {
@@ -73,22 +70,24 @@ int64_t h_read_bits(HInputStream* state, int count, char signed_p) {
       int segment, segment_len;
       // Read a segment...
       if (state->endianness & BIT_BIG_ENDIAN) {
-	if (count >= state->bit_offset) {
-	  segment_len = state->bit_offset;
-	  state->bit_offset = 8;
-	  segment = state->input[state->index] & ((1 << segment_len) - 1);
+	if (count + state->bit_offset + state->margin >= 8) {
+	  segment_len = 8 - state->bit_offset - state->margin;
+	  segment = (state->input[state->index] >> state->margin) & ((1 << segment_len) - 1);
 	  state->index++;
+	  state->bit_offset = 0;
+	  state->margin = 0;
 	} else {
 	  segment_len = count;
-	  state->bit_offset -= count;
-	  segment = (state->input[state->index] >> state->bit_offset) & ((1 << segment_len) - 1);
+	  state->bit_offset += count;
+	  segment = (state->input[state->index] >> (8 - state->bit_offset)) & ((1 << segment_len) - 1);
 	}
       } else { // BIT_LITTLE_ENDIAN
-	if (count + state->bit_offset >= 8) {
-	  segment_len = 8 - state->bit_offset;
-	  segment = (state->input[state->index] >> state->bit_offset);
+	if (count + state->bit_offset + state->margin >= 8) {
+	  segment_len = 8 - state->bit_offset - state->margin;
+	  segment = (state->input[state->index] >> state->bit_offset) & ((1 << segment_len) - 1);
 	  state->index++;
 	  state->bit_offset = 0;
+	  state->margin = 0;
 	} else {
 	  segment_len = count;
 	  segment = (state->input[state->index] >> state->bit_offset) & ((1 << segment_len) - 1);
diff --git a/src/glue.h b/src/glue.h
index 1fe6ce46f453e911339e5ea3090e2436283f106a..6c1c56ca0e368bc407d846f342dd52ba934c9dda 100644
--- a/src/glue.h
+++ b/src/glue.h
@@ -11,7 +11,8 @@
 //
 // A few standard semantic actions are defined below. The H_ACT_APPLY macro
 // allows semantic actions to be defined by "partial application" of
-// a generic action to fixed paramters.
+// a generic action to fixed parameters. H_VALIDATE_APPLY is similar for
+// h_attr_bool.
 //
 // The definition of more complex semantic actions will usually consist of
 // extracting data from the given parse tree and constructing a token of custom
@@ -66,13 +67,13 @@
     h_attr_bool(h_action(def, act_ ## rule, NULL), validate_ ## rule, NULL)
 #define H_AVRULE(rule, def) HParser *rule = \
     h_action(h_attr_bool(def, validate_ ## rule, NULL), act_ ## rule, NULL)
-#define H_ADRULE(rule, def, data) HParser *rule =	\
+#define H_ADRULE(rule, def, data) HParser *rule =       \
     h_action(def, act_ ## rule, data)
-#define H_VDRULE(rule, def, data) HParser *rule =	\
+#define H_VDRULE(rule, def, data) HParser *rule =       \
     h_attr_bool(def, validate_ ## rule, data)
-#define H_VADRULE(rule, def, data) HParser *rule =		\
+#define H_VADRULE(rule, def, data) HParser *rule =              \
     h_attr_bool(h_action(def, act_ ## rule, data), validate_ ## rule, data)
-#define H_AVDRULE(rule, def, data) HParser *rule =		\
+#define H_AVDRULE(rule, def, data) HParser *rule =              \
     h_action(h_attr_bool(def, validate_ ## rule, data), act_ ## rule, data)
 
 
@@ -109,8 +110,14 @@ HParsedToken *h_act_ignore(const HParseResult *p, void* user_data);
 // Define 'myaction' as a specialization of 'paction' by supplying the leading
 // parameters.
 #define H_ACT_APPLY(myaction, paction, ...) \
-  HParsedToken *myaction(const HParseResult *p, void* user_data) {	\
-    return paction(__VA_ARGS__, p, user_data);				\
+  HParsedToken *myaction(const HParseResult *p, void* user_data) {      \
+    return paction(__VA_ARGS__, p, user_data);                          \
+  }
+
+// Likewise for validations: define 'myvalidation' as 'pvalidation' with its leading parameters supplied.
+#define H_VALIDATE_APPLY(myvalidation, pvalidation, ...)  \
+  bool myvalidation(HParseResult* p, void* user_data) {   \
+    return pvalidation(__VA_ARGS__, p, user_data);        \
   }
 
 
diff --git a/src/hammer.c b/src/hammer.c
index 2456bdcedb7c9c7a0b4e374e8b8146bf19603179..6bb9ebb4febe53668a91ae9617ba05f2c158023d 100644
--- a/src/hammer.c
+++ b/src/hammer.c
@@ -52,7 +52,7 @@ HParseResult* h_parse__m(HAllocator* mm__, const HParser* parser, const uint8_t*
   // Set up a parse state...
   HInputStream input_stream = {
     .index = 0,
-    .bit_offset = 8,
+    .bit_offset = 0,
     .overrun = 0,
     .endianness = BIT_BIG_ENDIAN | BYTE_BIG_ENDIAN,
     .length = length,
diff --git a/src/hammer.h b/src/hammer.h
index b0ce75d20d74d65b8a64e5a25694ef8696acf4ad..1c02b0548d0964afe47d984a767ac688c6caa7d0 100644
--- a/src/hammer.h
+++ b/src/hammer.h
@@ -99,6 +99,7 @@ typedef struct HParsedToken_ {
   HTokenData token_data;
 #endif
   size_t index;
+  size_t bit_length;
   char bit_offset;
 } HParsedToken;
 
diff --git a/src/internal.h b/src/internal.h
index 6c721eb03e3f790308b7539ea0abd3b9ae59f805..0c4d4dc2739953c3cfffa487ea3bd73993698ebd 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -70,6 +70,8 @@ typedef struct HInputStream_ {
   size_t index;
   size_t length;
   char bit_offset;
+  char margin; // Number of bits to ignore at the end of the current
+	       // byte toward which reads are progressing.
   char endianness;
   char overrun;
 } HInputStream;
@@ -295,6 +297,9 @@ extern HParserBackendVTable h__glr_backend_vtable;
 // TODO(thequux): Set symbol visibility for these functions so that they aren't exported.
 
 int64_t h_read_bits(HInputStream* state, int count, char signed_p);
+static inline size_t h_input_stream_pos(const HInputStream* state) { // Stream position in bits; the stream is only read, never modified.
+  return state->index * 8 + state->bit_offset + state->margin; // whole bytes, plus the current byte's bit_offset and margin
+}
 // need to decide if we want to make this public. 
 HParseResult* h_do_parse(const HParser* parser, HParseState *state);
 void put_cached(HParseState *ps, const HParser *p, HParseResult *cached);
diff --git a/src/parsers/endianness.c b/src/parsers/endianness.c
index 091e4c0142da577c47992ba45084af1f7e447ae9..e3f53ab8225a75bde08ff7e3dd456822e1234b86 100644
--- a/src/parsers/endianness.c
+++ b/src/parsers/endianness.c
@@ -11,19 +11,9 @@ static void switch_bit_order(HInputStream *input)
 {
     assert(input->bit_offset <= 8);
 
-    if((input->bit_offset % 8) != 0) {
-        // switching bit order in the middle of a byte
-        // we leave bit_offset untouched. this means that something like
-        //     le(bits(5)),le(bits(3))
-        // is equivalent to
-        //     le(bits(5),bits(3)) .
-        // on the other hand,
-        //     le(bits(5)),be(bits(5))
-        // will read the same 5 bits twice and discard the top 3.
-    } else {
-        // flip offset (0 <-> 8)
-        input->bit_offset = 8 - input->bit_offset;
-    }
+    char tmp = input->bit_offset;
+    input->bit_offset = input->margin;
+    input->margin = tmp;
 }
 
 static HParseResult *parse_endianness(void *env, HParseState *state)
diff --git a/src/parsers/parser_internal.h b/src/parsers/parser_internal.h
index ec97dd1b0696fcb69f4a17bfc7d4078138f4d355..9a3b6de3898b42336a84bfe565448c27315e29bb 100644
--- a/src/parsers/parser_internal.h
+++ b/src/parsers/parser_internal.h
@@ -18,6 +18,7 @@ static inline HParseResult* make_result(HArena *arena, HParsedToken *tok) {
   HParseResult *ret = h_arena_malloc(arena, sizeof(HParseResult));
   ret->ast = tok;
   ret->arena = arena;
+  ret->bit_length = 0; // This way it gets overridden in h_do_parse
   return ret;
 }
 
diff --git a/src/t_bitreader.c b/src/t_bitreader.c
index 40a7bb98369dd32696cb536cbd08c16b1a10c2b4..65235c1d36e3ed4406acee6ec93a524efe94aef9 100644
--- a/src/t_bitreader.c
+++ b/src/t_bitreader.c
@@ -4,14 +4,14 @@
 #include "internal.h"
 #include "test_suite.h"
 
-#define MK_INPUT_STREAM(buf,len,endianness_)   \
+#define MK_INPUT_STREAM(buf,len,endianness_)  \
   {					      \
-    .input = (uint8_t*)buf,					\
-      .length = len,						\
-      .index = 0,						\
-      .bit_offset = (((endianness_) & BIT_BIG_ENDIAN) ? 8 : 0),	\
-      .endianness = endianness_					\
-      }
+      .input = (uint8_t*)buf,		      \
+      .length = len,			      \
+      .index = 0,			      \
+      .bit_offset = 0,			      \
+      .endianness = endianness_		      \
+  }
 
 
 static void test_bitreader_ints(void) {
@@ -56,7 +56,6 @@ static void test_offset_largebits_le(void) {
   g_check_cmp_int32(h_read_bits(&is, 11, false), ==, 0x2D3);
 }
 
-
 void register_bitreader_tests(void)  {
   g_test_add_func("/core/bitreader/be", test_bitreader_be);
   g_test_add_func("/core/bitreader/le", test_bitreader_le);
diff --git a/src/t_bitwriter.c b/src/t_bitwriter.c
index 747c86f2a328d41f1e25bad6fb4c90de3df814e6..6b9b7051fa480b47e9cf173e29d865bdbc4a8943 100644
--- a/src/t_bitwriter.c
+++ b/src/t_bitwriter.c
@@ -24,7 +24,7 @@ void run_bitwriter_test(bitwriter_test_elem data[], char flags) {
     .input = buf,
     .index = 0,
     .length = len,
-    .bit_offset = (flags & BIT_BIG_ENDIAN) ? 8 : 0,
+    .bit_offset = 0,
     .endianness = flags,
     .overrun = 0
   };
diff --git a/src/t_regression.c b/src/t_regression.c
new file mode 100644
index 0000000000000000000000000000000000000000..e74f16b98a7d037b19b6ece386721830720ab2c1
--- /dev/null
+++ b/src/t_regression.c
@@ -0,0 +1,38 @@
+#include <glib.h>
+#include <stdint.h>
+#include "glue.h"
+#include "hammer.h"
+#include "test_suite.h"
+
+static void test_bug118(void) {
+  // Regression test for https://github.com/UpstandingHackers/hammer/issues/118
+  // Adapted from https://gist.github.com/mrdomino/c6bc91a7cb3b9817edb5
+  // Expected bit_length: header nibbles (2*4 or 3*4) + three 10-bit samples + 2 ignored bits.
+  HParseResult* p;
+  const uint8_t *input = (const uint8_t*)"\x69\x5A\x6A\x7A\x8A\x9A";
+
+#define MY_ENDIAN (BIT_BIG_ENDIAN | BYTE_LITTLE_ENDIAN)
+  H_RULE(nibble, h_with_endianness(MY_ENDIAN, h_bits(4, false)));
+  H_RULE(sample, h_with_endianness(MY_ENDIAN, h_bits(10, false)));
+#undef MY_ENDIAN
+  H_RULE(samples, h_sequence(h_repeat_n(sample, 3), h_ignore(h_bits(2, false)), NULL));
+  H_RULE(header_ok, h_sequence(nibble, nibble, NULL));
+  H_RULE(header_weird, h_sequence(nibble, nibble, nibble, NULL));
+  H_RULE(parser_ok, h_sequence(header_ok, samples, NULL));
+  H_RULE(parser_weird, h_sequence(header_weird, samples, NULL));
+
+  p = h_parse(parser_weird, input, 6);
+  g_check_cmp_int32(p != NULL, ==, 1); // record a failure instead of crashing on p-> below
+  if (p == NULL) { return; }
+  g_check_cmp_int32(p->bit_length, ==, 44); // 12 + 30 + 2
+  h_parse_result_free(p);
+  p = h_parse(parser_ok, input, 6);
+  g_check_cmp_int32(p != NULL, ==, 1);
+  if (p == NULL) { return; }
+  g_check_cmp_int32(p->bit_length, ==, 40); // 8 + 30 + 2
+  h_parse_result_free(p);
+}
+
+void register_regression_tests(void) { // Adds the regression test cases to the GLib test suite.
+  g_test_add_func("/core/regression/bug118", test_bug118);
+}
diff --git a/src/test_suite.c b/src/test_suite.c
index 81f86b2c5007f11375995ad50751dfcb4618b7f5..cba18e8db9ad4b1187a028c2a2326ae6c1026633 100644
--- a/src/test_suite.c
+++ b/src/test_suite.c
@@ -25,6 +25,7 @@ extern void register_parser_tests();
 extern void register_grammar_tests();
 extern void register_misc_tests();
 extern void register_benchmark_tests();
+extern void register_regression_tests();
 
 int main(int argc, char** argv) {
   g_test_init(&argc, &argv, NULL);
@@ -35,6 +36,7 @@ int main(int argc, char** argv) {
   register_parser_tests();
   register_grammar_tests();
   register_misc_tests();
+  register_regression_tests();
   if (g_test_slow() || g_test_perf())
     register_benchmark_tests();