Compare revisions

5b4ea546 · 5b4ea546 · 5b4ea546 · 5b4ea546 · 5b4ea546 · 5b4ea546
--- a/docs/hammerman.1
+++ b/docs/hammerman.1
+.TH HAMMER 1 2014-04-23  Hammer
+.SH NAME
+Hammer \- a bit-oriented parsing library
+.SH SYNOPSIS
+#include <hammer\/hammer.h>
+.SH DESCRIPTION
+.B Hammer
+is a parsing library. Like many modern parsing libraries, it provides a parser combinator interface for writing grammars as inline domain-specific languages, but Hammer also provides a variety of parsing backends. It's also bit-oriented rather than character-oriented, making it ideal for parsing binary data such as images, network packets, audio, and executables.
+
+Hammer is written in C, but will provide bindings for other languages. If you don't see a language you're interested in on the list, just ask.
+
+Hammer currently builds under Linux, OS X, and Windows.
+.SH NOTES
+Bit-oriented -- grammars can include single-bit flags or multi-bit constructs that span character boundaries, with no hassle
+
+Thread-safe, reentrant
+
+Benchmarking for parsing backends -- determine empirically which backend will be most time-efficient for your grammar
+
+    Parsing backends:
+        Packrat parsing
+        LL(k)
+        GLR
+        LALR
+        Regular expressions
+    Language bindings:
+        C++
+        Java (not currently building; give us a few days)
+        Python
+        Ruby
+        Perl
+        Go
+        PHP
+        .NET
+.SH EXAMPLE
+.nf
+ 1  #include <hammer/hammer.h>
+ 2  #include <stdio.h>
+ 3
+ 4  int main(int argc, char *argv[]) {
+ 5      uint8_t input[1024];
+ 6      size_t inputsize;
+ 7
+ 8      HParser *hello_parser = h_token("Hello World", 11);
+ 9
+10      inputsize = fread(input, 1, sizeof(input), stdin);
+11
+12      HParseResult *result = h_parse(hello_parser, input, inputsize);
+13      if(result) {
+14          printf("yay!\\n");
+15      } else {
+16          printf("boo!\\n");
+17      }
+18  }
+.fi
--- a/docs/hammerman.3
+++ b/docs/hammerman.3
+'\" t
+.\"     Title: hammer
+.\"    Author: [see the "AUTHOR" section]
+.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
+.\"      Date: 29 April 2014
+.\"    Manual: \ \&
+.\"    Source: \ \& 8.6.9
+.\"  Language: English
+.\"
+.TH "HAMMER" "3" "29 April 2014" "\ \& 8\&.6\&.9" "\ \&"
+.\" -----------------------------------------------------------------
+.\" * Define some portability stuff
+.\" -----------------------------------------------------------------
+.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.\" http://bugs.debian.org/507673
+.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html
+.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.\" -----------------------------------------------------------------
+.\" * set default formatting
+.\" -----------------------------------------------------------------
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+.\" -----------------------------------------------------------------
+.\" * MAIN CONTENT STARTS HERE *
+.\" -----------------------------------------------------------------
+.SH "NAME"
+Hammer \- a bit-oriented parsing library
+.SH "SYNOPSIS"
+.sp
+.B #include <hammer/hammer.h>
+.SH "DESCRIPTION"
+.sp
+.B Hammer(3)
+is a parsing library. Like many modern parsing libraries, it provides a parser combinator interface for  writing  grammars as  inline domain-specific languages, but Hammer also provides a variety of parsing backends. It's also bit-oriented rather  than character-oriented, making it ideal for parsing binary data such as images, network packets, audio, and executables.
+
+Hammer is written in C, but will provide bindings for other languages.  If you don't see a language you're interested in on the list, just ask.
+
+Hammer currently builds under Linux, OS X, and Windows.
+.SH "NOTES"
+Bit-oriented -- grammars can include single-bit flags or multi-bit constructs that span character boundaries, with no hassle
+
+Thread-safe, reentrant
+
+Benchmarking for parsing backends -- determine empirically which backend will be most time-efficient for your grammar
+
+    Parsing backends:
+        Packrat parsing
+        LL(k)
+        GLR
+        LALR
+        Regular expressions
+    Language bindings:
+        C++
+        Java (not currently building; give us a few days)
+        Python
+        Ruby
+        Perl
+        Go
+        PHP
+        .NET
+.SH "EXAMPLE"
+.nf
+ 1  #include <hammer/hammer.h>
+ 2  #include <stdio.h>
+ 3
+ 4  int main(int argc, char *argv[]) {
+ 5      uint8_t input[1024];
+ 6      size_t inputsize;
+ 7
+ 8      HParser *hello_parser = h_token("Hello World", 11);
+ 9
+10      inputsize = fread(input, 1, sizeof(input), stdin);
+11
+12      HParseResult *result = h_parse(hello_parser, input, inputsize);
+13      if(result) {
+14          printf("yay!\\n");
+15      } else {
+16          printf("boo!\\n");
+17      }
+18      h_parse_result_free(result);
+19      return 0 == result;
+20  }
+.fi
+.SH "AUTHOR"
+.sp
+Hammer was originally written by Meredith Patterson and TQ Hirsch\&. Many people have contributed to it\&.
+.SH "RESOURCES"
+.sp
+github: https://github\&.com/upstandinghackers/hammer/
+.SH "COPYING"
+.sp
+Free use of this software is granted under the terms of the GNU General Public License (GPL)\& v2.
--- a/examples/SConscript
+++ b/examples/SConscript
+from __future__ import absolute_import, division, print_function
+
 Import('env')

 example = env.Clone()
-example.Append(LIBS="hammer", LIBPATH="../src")
+
+if 'GPROF' in env and env['GPROF'] == 1:
+    hammer_lib_name="hammer_pg"
+else:
+    hammer_lib_name="hammer"
+
+example.Append(LIBS=hammer_lib_name, LIBPATH="../src")

 dns = example.Program('dns', ['dns.c', 'rr.c', 'dns_common.c'])
+ttuser = example.Program('ttuser', 'ttuser.c')
 base64 = example.Program('base64', 'base64.c')
 base64_sem1 = example.Program('base64_sem1', 'base64_sem1.c')
 base64_sem2 = example.Program('base64_sem2', 'base64_sem2.c')
-env.Alias("examples", [dns, base64, base64_sem1, base64_sem2])
\ No newline at end of file
+ties = example.Program('ties', ['ties.c', 'grammar.c'])
+env.Alias("examples", [dns, ttuser, base64, base64_sem1, base64_sem2, ties])
--- a/examples/base64.c
+++ b/examples/base64.c
@@ -45,7 +45,7 @@ int main(int argc, char **argv)
 {
    uint8_t input[102400];
    size_t inputsize;
-    const HParseResult *result;
+    HParseResult *result;

    init_parser();

@@ -57,6 +57,7 @@ int main(int argc, char **argv)
    if(result) {
        fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8);
        h_pprint(stdout, result->ast, 0, 0);
+        h_parse_result_free(result);
        return 0;
    } else {
        return 1;

--- a/examples/base64.py
+++ b/examples/base64.py
+#!/usr/bin/env python2
+
+# Example parser: Base64, syntax only.
+#
+# Demonstrates how to construct a Hammer parser that recognizes valid Base64
+# sequences.
+#
+# Note that no semantic evaluation of the sequence is performed, i.e. the
+# byte sequence being represented is not returned, or determined. See
+# base64_sem1.py and base64_sem2.py for examples how to attach appropriate
+# semantic actions to the grammar.
+
+from __future__ import absolute_import, division, print_function
+
+import sys
+
+import hammer as h
+
+
+def init_parser():
+    # CORE
+    digit = h.ch_range(0x30, 0x39)
+    alpha = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+
+    # AUX.
+    plus = h.ch(b'+')
+    slash = h.ch(b'/')
+    equals = h.ch(b'=')
+
+    bsfdig = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_(b'AEIMQUYcgkosw048')
+    bsfdig_2bit = h.in_(b'AQgw')
+    base64_3 = h.repeat_n(bsfdig, 4)
+    base64_2 = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)
+    base64_1 = h.sequence(bsfdig, bsfdig_2bit, equals, equals)
+    base64 = h.sequence(h.many(base64_3),
+                        h.optional(h.choice(base64_2, base64_1)))
+
+    return h.sequence(h.whitespace(base64), h.whitespace(h.end_p()))
+
+
+def main():
+    document = init_parser()
+
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = document.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    import sys
+
+    main()
--- a/examples/base64_sem1.c
+++ b/examples/base64_sem1.c
@@ -29,9 +29,9 @@ HParsedToken *act_bsfdig(const HParseResult *p, void* user_data)

    uint8_t c = H_CAST_UINT(p->ast);

-    if(c >= 0x40 && c <= 0x5A) // A-Z
+    if(c >= 0x41 && c <= 0x5A) // A-Z
        res->uint = c - 0x41;
-    else if(c >= 0x60 && c <= 0x7A) // a-z
+    else if(c >= 0x61 && c <= 0x7A) // a-z
        res->uint = c - 0x61 + 26;
    else if(c >= 0x30 && c <= 0x39) // 0-9
        res->uint = c - 0x30 + 52;
@@ -149,12 +149,13 @@ HParser *init_parser(void)

 #include <stdio.h>

+const HParser *parser;  // Allocated statically to suppress leak warnings
+
 int main(int argc, char **argv)
 {
    uint8_t input[102400];
    size_t inputsize;
-    const HParser *parser;
-    const HParseResult *result;
+    HParseResult *result;

    parser = init_parser();

@@ -166,6 +167,7 @@ int main(int argc, char **argv)
    if(result) {
        fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8);
        h_pprint(stdout, result->ast, 0, 0);
+        h_parse_result_free(result);
        return 0;
    } else {
        return 1;

--- a/examples/base64_sem1.py
+++ b/examples/base64_sem1.py
+#!/usr/bin/env python2
+
+# Example parser: Base64, with fine-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to grammar rules and piece by
+# piece transform the parse tree into the desired semantic representation,
+# in this case a sequence of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses fine-grained semantic actions that
+# transform the parse tree in small steps in a bottom-up fashion. Compare
+# base64_sem2.py for an alternative approach using a single top-level action.
+
+from __future__ import absolute_import, division, print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def act_bsfdig(p, user_data=None):
+    # FIXME See the note in init_parser()
+    c = p if isinstance(p, h.INTEGER_TYPES) else ord(p)
+
+    if 0x41 <= c <= 0x5A: # A-Z
+        return c - 0x41
+    elif 0x61 <= c <= 0x7A: # a-z
+        return c - 0x61 + 26
+    elif 0x30 <= c <= 0x39: # 0-9
+        return c - 0x30 + 52
+    elif c == b'+':
+        return 62
+    elif c == b'/':
+        return 63
+    else:
+        raise ValueError
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]
+
+def act_ignore(p, user_data=None):
+    return None
+
+act_bsfdig_4bit = act_bsfdig
+act_bsfdig_2bit = act_bsfdig
+
+act_equals      = act_ignore
+act_ws          = act_ignore
+
+act_document    = act_index0
+
+
+def act_base64_n(n, p, user_data=None):
+    """General-form action to turn a block of base64 digits into bytes.
+    """
+    res = [0]*n
+
+    x = 0
+    bits = 0
+    for i in range(0, n+1):
+        x <<= 6
+        x |= p[i] or 0
+        bits += 6
+
+    x >>= bits % 8 # align, i.e. cut off extra bits
+
+    for i in range(n):
+        item = x & 0xFF
+
+        res[n-1-i] = item   # output the last byte and
+        x >>= 8             # discard it
+
+    return tuple(res)
+
+
+act_base64_3 = functools.partial(act_base64_n, 3)
+act_base64_2 = functools.partial(act_base64_n, 2)
+act_base64_1 = functools.partial(act_base64_n, 1)
+
+
+def act_base64(p, user_data=None):
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+
+    res = []
+    
+    # concatenate base64_3 blocks
+    for elem in p[0]:
+        res.extend(elem)
+
+    # append one trailing base64_2 or _1 block
+    tok = p[1]
+    if isinstance(tok, tuple):
+        res.extend(tok)
+
+    return tuple(res)
+
+
+def init_parser():
+    """Return a parser with the grammar to be recognized.
+    """
+    # CORE
+
+    # This is a direct translation of the C example. In C the literal 0x30
+    # is interchangeable with the char literal '0' (note the single quotes).
+    # This is not the case in Python.
+    
+    # TODO In the interests of being more Pythonic settle on either string
+    #      literals, or integers
+    digit   = h.ch_range(0x30, 0x39)
+    alpha   = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space   = h.in_(b" \t\n\r\f\v")
+
+    # AUX.
+    plus    = h.ch(b'+')
+    slash   = h.ch(b'/')
+    equals  = h.action(h.ch(b'='), act_equals)
+
+    # bsfdig_4bit / bsfdig_2bit restrict the last digit before padding to
+    # values that are multiples of 4 / 16 under the act_bsfdig mapping.
+    bsfdig      = h.action(h.choice(alpha, digit, plus, slash), act_bsfdig)
+    bsfdig_4bit = h.action(h.in_(b"AEIMQUYcgkosw048"), act_bsfdig_4bit)
+    bsfdig_2bit = h.action(h.in_(b"AQgw"), act_bsfdig_2bit)
+    base64_3    = h.action(h.repeat_n(bsfdig, 4), act_base64_3)
+    base64_2    = h.action(h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals),
+                           act_base64_2)
+    base64_1    = h.action(h.sequence(bsfdig, bsfdig_2bit, equals, equals),
+                           act_base64_1)
+    base64      = h.action(h.sequence(h.many(base64_3),
+                                      h.optional(h.choice(base64_2,
+                                                          base64_1))),
+                           act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    #      But I can't get hammer to filter any value returned by act_ignore.
+    ws          = h.ignore(h.many(space))
+    document    = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                           act_document)
+
+    # BUG Sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+def main():
+    parser = init_parser()
+
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/base64_sem2.c
+++ b/examples/base64_sem2.c
@@ -31,9 +31,9 @@ uint8_t bsfdig_value(const HParsedToken *p)

    if(p && p->token_type == TT_UINT) {
        uint8_t c = p->uint;
-        if(c >= 0x40 && c <= 0x5A) // A-Z
+        if(c >= 0x41 && c <= 0x5A) // A-Z
            value = c - 0x41;
-        else if(c >= 0x60 && c <= 0x7A) // a-z
+        else if(c >= 0x61 && c <= 0x7A) // a-z
            value = c - 0x61 + 26;
        else if(c >= 0x30 && c <= 0x39) // 0-9
            value = c - 0x30 + 52;
@@ -153,12 +153,13 @@ const HParser *init_parser(void)

 #include <stdio.h>

+const HParser *parser;  // Allocated statically to suppress leak warnings
+
 int main(int argc, char **argv)
 {
    uint8_t input[102400];
    size_t inputsize;
-    const HParser *parser;
-    const HParseResult *result;
+    HParseResult *result;

    parser = init_parser();

@@ -170,6 +171,7 @@ int main(int argc, char **argv)
    if(result) {
        fprintf(stderr, "parsed=%" PRId64 " bytes\n", result->bit_length/8);
        h_pprint(stdout, result->ast, 0, 0);
+        h_parse_result_free(result);
        return 0;
    } else {
        return 1;

--- a/examples/base64_sem2.py
+++ b/examples/base64_sem2.py
+#!/usr/bin/env python2
+
+# Example parser: Base64, with coarse-grained semantic actions
+#
+# Demonstrates how to attach semantic actions to a grammar and transform the
+# parse tree into the desired semantic representation, in this case a sequence
+# of 8-bit values.
+#
+# Those rules using h.action get an attached action, which must be declared
+# (as a function).
+#
+# This variant of the example uses coarse-grained semantic actions,
+# transforming the entire parse tree in one big step. Compare base64_sem1.py
+# for an alternative approach using a fine-grained piece-by-piece
+# transformation.
+
+from __future__ import absolute_import, division, print_function
+
+import functools
+import sys
+
+import hammer as h
+
+
+# Semantic actions for the grammar below, each corresponds to an "ARULE".
+# They must be named act_<rulename>.
+
+def bsfdig_value(p):
+    """Return the numeric value of a parsed base64 digit.
+    """
+    c = p if isinstance(p, h.INTEGER_TYPES) else ord(p)
+    if c:
+        if 0x41 <= c <= 0x5A: # A-Z
+            return  c - 0x41
+        elif 0x61 <= c <= 0x7A: # a-z
+            return  c - 0x61 + 26
+        elif 0x30 <= c <= 0x39: # 0-9
+            return  c - 0x30 + 52
+        elif c == b'+':
+            return  62
+        elif c == b'/':
+            return  63
+    return 0
+
+def act_base64(p, user_data=None):
+    assert isinstance(p, tuple)
+    assert len(p) == 2
+    assert isinstance(p[0], tuple)
+
+    # grab b64_3 block sequence
+    # grab and analyze b64 end block (_2 or _1)
+    b64_3 = p[0]
+    b64_2 = p[1]
+    b64_1 = p[1]
+
+    if not isinstance(b64_2, tuple):
+        b64_1 = b64_2 = None
+    elif b64_2[2] == '=':
+        b64_2 = None
+    else:
+        b64_1 = None
+
+    # allocate result sequence
+    res = []
+
+    # concatenate base64_3 blocks
+    for digits in b64_3:
+        assert isinstance(digits, tuple)
+
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        x <<= 6; x |= bsfdig_value(digits[3])
+        res.append((x >> 16) & 0xFF)
+        res.append((x >> 8) & 0xFF)
+        res.append(x & 0xFF)
+
+    # append one trailing base64_2 or _1 block
+    if b64_2:
+        digits = b64_2
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        x <<= 6; x |= bsfdig_value(digits[2])
+        res.append((x >> 10) & 0xFF)
+        res.append((x >> 2) & 0xFF)
+    elif b64_1:
+        digits = b64_1
+        x = bsfdig_value(digits[0])
+        x <<= 6; x |= bsfdig_value(digits[1])
+        res.append((x >> 4) & 0xFF)
+
+    return tuple(res)
+
+# Hammer's Python bindings don't currently expose h_act_index or h_act_ignore
+
+def act_index0(p, user_data=None):
+    return p[0]
+
+def act_ignore(p, user_data=None):
+    return None
+
+act_ws          = act_ignore
+act_document    = act_index0
+
+
+def init_parser():
+    """Set up the parser with the grammar to be recognized.
+    """
+    # CORE
+    digit   = h.ch_range(0x30, 0x39)
+    alpha   = h.choice(h.ch_range(0x41, 0x5a), h.ch_range(0x61, 0x7a))
+    space   = h.in_(b" \t\n\r\f\v")
+
+    # AUX.
+    plus    = h.ch(b'+')
+    slash   = h.ch(b'/')
+    equals  = h.ch(b'=')
+
+    # Only the top-level base64 rule carries an action; the digit and block
+    # rules below return their raw tokens for act_base64 to decode.
+    bsfdig      = h.choice(alpha, digit, plus, slash)
+    bsfdig_4bit = h.in_(b"AEIMQUYcgkosw048")
+    bsfdig_2bit = h.in_(b"AQgw")
+    base64_3    = h.repeat_n(bsfdig, 4)
+    base64_2    = h.sequence(bsfdig, bsfdig, bsfdig_4bit, equals)
+    base64_1    = h.sequence(bsfdig, bsfdig_2bit, equals, equals)
+    base64      = h.action(h.sequence(h.many(base64_3),
+                                      h.optional(h.choice(base64_2,
+                                                          base64_1))),
+                           act_base64)
+
+    # TODO This is not quite the same as the C example, which uses act_ignore.
+    #      But I can't get hammer to filter any value returned by act_ignore.
+    ws          = h.ignore(h.many(space))
+    document    = h.action(h.sequence(ws, base64, ws, h.end_p()),
+                           act_document)
+
+    # BUG Sometimes inputs that should parse just don't.
+    # It *seemed* to happen mostly with things like "bbbbaaaaBA==".
+    # Using fewer actions seemed to make it less likely.
+
+    return document
+
+
+def main():
+    parser = init_parser()
+
+    s = sys.stdin.read()
+    inputsize = len(s)
+    print('inputsize=%i' % inputsize, file=sys.stderr)
+    print('input=%s' % s, file=sys.stderr, end='')
+
+    result = parser.parse(s)
+
+    if result:
+        #print('parsed=%i bytes', result.bit_length/8, file=sys.stderr)
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/grammar.c
+++ b/examples/grammar.c
+// Generates a system of equations for generating functions from a grammar.
+//
+// (c) 2015 Mikael Vejdemo-Johansson <mikael@johanssons.org>
+//
+
+// If a desugared parser has user_data set, the generating function systems will try
+// to interpret it as a string:
+//
+// If this string for an h_ch starts with the character 0, then that character
+// will have weight 0 in the generating function.
+//
+// Use the remaining string to set the preferred name of that parser in the
+// generating function.
+//
+
+#include <inttypes.h>
+#include "../src/backends/contextfree.h"
+#include "../src/backends/lr.h"
+#include "grammar.h"
+#include <stdio.h>
+
+const char *nonterminal_name(const HCFGrammar *g, const HCFChoice *nt) {
+  // Prefer a user-supplied name: user_data is read as a C string and is
+  // returned verbatim when its first character is printable ASCII other
+  // than a space.
+  // NOTE(review): the file header describes a leading '0' marker on
+  // user_data; it is not stripped here -- confirm whether nonterminals are
+  // ever tagged that way.
+  if(nt->user_data != NULL) {
+    char *user_str = (char*)(nt->user_data);
+    if(*user_str > ' ' && *user_str < 127) {
+      // The printable check already excludes the empty string, so the
+      // former "empty string" fallback could never run and was dropped.
+      return user_str;
+    }
+  }
+
+  // Otherwise build a base-26 name (A, B, ..., Z, BA, ...) from the number
+  // stored for nt in g->nts.  buf is static, so the returned pointer is
+  // overwritten by the next call that takes this path.
+  static char buf[16] = {0}; // 14 characters in base 26 are enough for 64 bits
+
+  // find nt's number in g
+  size_t n = (uintptr_t)h_hashtable_get(g->nts, nt);
+
+  // NB the start symbol (number 0) is always "A".
+  int i;
+  for(i=14; i>=0 && (n>0 || i==14); i--) {
+    buf[i] = 'A' + n%26;
+    n = n/26;   // shift one digit
+  }
+
+  return buf+i+1;
+}
+
+
+
+void readsequence(FILE *file, uint32_t *count, uint32_t *length,
+                  const HCFGrammar *g, const HCFSequence *seq) {
+  // Tally up numbers of choices and lengths of emitted strings for one
+  // sequence; immediately emit any nonterminals encountered.
+  //
+  //   file   - receives "1" followed by "*name" for every nonterminal
+  //   count  - multiplied by the size of every charset in the sequence
+  //   length - incremented for every character that is not weight-0
+  //
+  // The caller appends the scalar and the power of t afterwards.
+  fprintf(file, "1");
+
+  // seq->items is NULL-terminated; an empty sequence leaves the GF at 1.
+  for(HCFChoice **x = seq->items; *x; x++) {
+    switch((*x)->type) {
+    case HCF_CHAR: {
+      // Inspect this item's own user_data (not the first item's): a tag
+      // starting with the character '0' gives the terminal weight 0.
+      const char *tag = (const char*)(*x)->user_data;
+      if(!(tag != NULL && *tag == '0')) {
+        (*length)++;
+      }
+      break;
+    }
+    case HCF_END:
+      break;
+    case HCF_CHARSET: {
+      // Count the members of this charset only; the counter must start
+      // from zero for every charset in the sequence.
+      // NOTE(review): a charset scales count but does not add to length --
+      // confirm that is the intended weight.
+      HCharset cs = (*x)->charset;
+      unsigned int cscount = 0;
+      for(unsigned int i=0; i<256; i++) {
+        if (charset_isset(cs, i)) {
+          cscount++;
+        }
+      }
+      *count *= cscount;
+      break;
+    }
+    default: // HCF_CHOICE, non-terminal symbol
+      fprintf(file, "*%s", nonterminal_name(g, *x));
+      break;
+    }
+  }
+}
+
+// For each nt in g->nts
+//     For each choice in nt->key->seq
+//          For all elements in sequence
+//              Accumulate counts 
+//              Accumulate string lengths
+//              Emit count*t^length
+void h_pprint_gfeqns(FILE *file, const HCFGrammar *g) {
+  // Prints two lines of SageMath input to file: a polynomial ring over QQ
+  // in t and one variable per nonterminal of g, then an ideal with one
+  // generator "name - (sum of monomials)" per nonterminal.  Prints nothing
+  // when g has no nonterminals.
+  if (g->nts->used < 1) {
+    return;
+  }
+
+  size_t i;
+  HHashTableEntry *hte;
+
+  // emit the SageMath ring init string
+  // iterate over g->nts, output symbols
+  fprintf(file, "ring.<t");
+  for(i=0; i < g->nts->capacity; i++) {
+    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
+      if (hte->key == NULL) {
+        continue;
+      }
+      const HCFChoice *nt = hte->key;
+      fprintf(file, ",%s", nonterminal_name(g, nt));
+    }
+  }
+  fprintf(file, "> = QQ[]\n");
+
+  // iterate over g->nts
+  // emit a Sage ideal definition
+  int emitted=0;
+  fprintf(file, "ID = ring.ideal(");
+  for(i=0; i < g->nts->capacity; i++) {
+    for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
+      if (hte->key == NULL) {
+        continue;
+      }
+
+      const HCFChoice *nt = hte->key;
+      const char *ntn = nonterminal_name(g, nt);
+      if(*ntn == 0) {
+        // nothing is printed for an empty name, so no separator either
+        continue;
+      }
+
+      if(emitted>0) {
+        fprintf(file, ",");
+      }
+      emitted++;
+
+      fprintf(file, "%s - (", ntn);
+
+      for(HCFSequence **seq = nt->seq; *seq; seq++) {
+        if (seq != nt->seq) {
+          fprintf(file, " + ");
+        }
+        uint32_t count=1, length=0;
+        readsequence(file, &count, &length, g, *seq);
+
+        // Append the scalar (when it is not 1) and the power of t (when it
+        // is not t^0).  Nothing is appended when count is 0.
+        if(count > 1) {
+          fprintf(file, "*%" PRIu32, count);
+        }
+        if(count >= 1) {
+          if(length == 1) {
+            fprintf(file, "*t");
+          } else if(length > 1) {
+            fprintf(file, "*t^%" PRIu32, length);
+          }
+        }
+      }
+
+      fprintf(file, ")");
+    }
+  }
+  fprintf(file, ")\n");
+}
--- a/examples/grammar.h
+++ b/examples/grammar.h
+// Generates a system of equations for generating functions from a grammar.
+//
+// (c) 2015 Mikael Vejdemo-Johansson <mikael@johanssons.org>
+//
+
+// Currently makes no attempt at elegance or at caching information; it
+// just prints the generating functions to a provided FILE*.
+//
+
+
+// If a desugared parser has user_data set, the generating function systems will try
+// to interpret it as a string:
+//
+// If this string for an h_ch starts with the character 0, then that character
+// will have weight 0 in the generating function.
+//
+// Use the remaining string to set the preferred name of that parser in the
+// generating function.
+//
+
+#ifndef HAMMER_GRAMMAR__H
+#define HAMMER_GRAMMAR__H
+
+#include "../src/backends/contextfree.h"
+#include "../src/backends/lr.h"
+
+
+// Filched from cfgrammar.c this function extracts the name from user_data if it
+// is set; otherwise assigns a name automatically from its position in some
+// ordering of non-terminals.
+const char *nonterminal_name(const HCFGrammar *g, const HCFChoice *nt);
+
+// This function prints out the monomial generated by a single HCFSequence.
+// It returns the resulting exponent for t in length and the number of alternatives
+// accumulated in count. The monomial is (mostly) printed out to the provided FILE*;
+// the caller is responsible for adding a scalar and a power of t to the printout.
+void readsequence(FILE *file, uint32_t *count, uint32_t *length,
+		  const HCFGrammar *g, const HCFSequence *seq);
+
+// This function walks through a grammar and generates an equation for each
+// production rule. The results are printed out to the provided FILE*.
+void h_pprint_gfeqns(FILE *file, const HCFGrammar *g);
+
+
+
+#endif
--- a/examples/ties.c
+++ b/examples/ties.c
+// Intention: read in a parser, generate the system of equations for its
+// generating functions
+//
+
+#include <inttypes.h>
+#include "../src/backends/contextfree.h"
+#include "../src/backends/lr.h"
+#include "grammar.h"
+#include <stdio.h>
+
+
+HAllocator *mm__;
+
+HParser* cfExample() {
+  HParser *n = h_ch('n');
+  HParser *E = h_indirect();
+  HParser *T = h_choice(h_sequence(h_ch('('), E, h_ch(')'), NULL), n, NULL);
+  HParser *E_ = h_choice(h_sequence(E, h_ch('-'), T, NULL), T, NULL);
+  h_bind_indirect(E, E_);
+  return E;
+}
+
+// The tie knot parsers below would work better if we could patch the gen.function
+// code above to allow user specification of non-default byte string "lengths",
+// so that U symbols don't contribute with factors of t to the gen. function.
+//
+// Alternatively: use multivariate generating functions to spit out different
+// variables for different terminals. This gets really messy with bigger alphabets.
+
+HParser* finkmao() {
+  HParser *L = h_ch('L');
+  HParser *R = h_ch('R');
+  HParser *C = h_ch('C');
+  HParser *U = h_ch('U');
+  HParser *Lnext = h_indirect();
+  HParser *Rnext = h_indirect();
+  HParser *Cnext = h_indirect();
+  HParser *L_ = h_choice(h_sequence(R, Rnext, NULL),
+			 h_sequence(C, Cnext, NULL),
+			 h_sequence(R, C, U, NULL), NULL);
+  HParser *R_ = h_choice(h_sequence(L, Lnext, NULL),
+			 h_sequence(C, Cnext, NULL),
+			 h_sequence(L, C, U, NULL), NULL);
+  HParser *C_ = h_choice(h_sequence(R, Rnext, NULL),
+			 h_sequence(L, Lnext, NULL), NULL);
+  h_bind_indirect(Lnext, L_);
+  h_bind_indirect(Rnext, R_);
+  h_bind_indirect(Cnext, C_);
+  HParser *tie = h_sequence(L, Lnext, NULL);
+
+  h_desugar_augmented(mm__, tie);
+
+  L->desugared->user_data = "L";
+  R->desugared->user_data = "R";
+  C->desugared->user_data = "C";
+  Lnext->desugared->user_data = "Ln";
+  Rnext->desugared->user_data = "Rn";
+  Cnext->desugared->user_data = "Cn";
+  tie->desugared->user_data = "tie";
+  U->desugared->user_data = "0U";
+  
+  return tie;
+}
+
+HParser* finkmaoTW() {
+  HParser *T = h_ch('T');
+  HParser *W = h_ch('W');
+  HParser *U = h_ch('U');
+  HParser *prefix = h_choice(T, W, h_epsilon_p(),
+			     NULL);
+  HParser *pair = h_choice(h_sequence(T, T, NULL),
+			   h_sequence(W, T, NULL),
+			   h_sequence(T, W, NULL),
+			   h_sequence(W, W, NULL), NULL);
+  HParser *tuck = h_choice(h_sequence(T, T, U, NULL),
+			   h_sequence(W, W, U, NULL),
+			   NULL);
+  HParser *pairstar = h_indirect();
+  HParser *pstar_ = h_choice(h_sequence(pair, pairstar, NULL),
+			     h_epsilon_p(),
+			      NULL);
+  h_bind_indirect(pairstar, pstar_);
+
+  HParser* tie = h_sequence(prefix, pairstar, tuck, NULL);
+  h_desugar_augmented(mm__, tie);
+
+  
+  T->desugared->user_data = "T";
+  W->desugared->user_data = "W";
+  U->desugared->user_data = "0U";
+  prefix->desugared->user_data = "prefix";
+  pair->desugared->user_data = "pair";
+  tuck->desugared->user_data = "tuck";
+  pstar_->desugared->user_data = "pairstar";
+  tie->desugared->user_data = "tie";
+  
+  return tie;
+}
+
+HParser* depth1TW() {
+  HParser *T = h_ch('T');
+  HParser *W = h_ch('W');
+  HParser *U = h_ch('U');
+  HParser *prefix = h_choice(T, W, h_epsilon_p(), NULL);
+  HParser *pair = h_choice(h_sequence(T, T, NULL),
+			   h_sequence(W, T, NULL),
+			   h_sequence(T, W, NULL),
+			   h_sequence(W, W, NULL), NULL);
+  HParser *tuck = h_choice(h_sequence(T, T, U, NULL),
+			   h_sequence(W, W, U, NULL),
+			   NULL);
+  HParser *tuckpairstar = h_indirect();
+  HParser *tpstar_ = h_choice(h_sequence(pair, tuckpairstar, NULL),
+			      h_sequence(tuck, tuckpairstar, NULL),
+			      h_epsilon_p(),
+			      NULL);
+  h_bind_indirect(tuckpairstar, tpstar_);
+  HParser *tie = h_choice(h_sequence(prefix, tuckpairstar, tuck, NULL), NULL);
+
+  h_desugar_augmented(mm__, tie);
+  
+  T->desugared->user_data = "T";
+  W->desugared->user_data = "W";
+  U->desugared->user_data = "0U";
+  prefix->desugared->user_data = "prefix";
+  pair->desugared->user_data = "pair";
+  tuck->desugared->user_data = "tuck";
+  tpstar_->desugared->user_data = "tuckpairstar";
+  tie->desugared->user_data = "tie";
+
+  return tie;
+}
+
+HParser* depth1() {  /* builds a grammar over 'L', 'R', 'C', 'U' with three mutually recursive rules; returns its start rule "tie" */
+  HParser *L = h_ch('L');
+  HParser *R = h_ch('R');
+  HParser *C = h_ch('C');
+  HParser *U = h_ch('U');
+  HParser *lastR = h_indirect();  /* placeholders, bound to R_, L_ and C_ further down */
+  HParser *lastL = h_indirect();
+  HParser *lastC = h_indirect();
+  HParser *R_ = h_choice(h_sequence(L, R, lastR, NULL),
+			 h_sequence(C, R, lastR, NULL),
+			 h_sequence(L, C, lastC, NULL),
+			 h_sequence(L, C, U, lastC, NULL),
+			 h_sequence(L, C, U, NULL),
+			 h_sequence(C, L, lastL, NULL),
+			 h_sequence(C, L, U, lastL, NULL),
+			 h_sequence(C, L, U, NULL),
+			 NULL);  /* body of lastR */
+  HParser *L_ = h_choice(h_sequence(R, L, lastR, NULL),  /* NOTE(review): ends in L but continues with lastR; R_ pairs "...R" with lastR -- confirm lastL was not intended */
+			 h_sequence(C, L, lastR, NULL),  /* NOTE(review): same question as the line above */
+			 h_sequence(R, C, lastC, NULL),
+			 h_sequence(R, C, U, lastC, NULL),
+			 h_sequence(R, C, U, NULL),
+			 h_sequence(C, R, lastR, NULL),
+			 h_sequence(C, R, U, lastR, NULL),
+			 h_sequence(C, R, U, NULL),
+			 NULL);  /* body of lastL */
+  HParser *C_ = h_choice(h_sequence(L, C, lastR, NULL),  /* NOTE(review): ends in C but continues with lastR -- confirm lastC was not intended */
+			 h_sequence(R, C, lastR, NULL),  /* NOTE(review): same question as the line above */
+			 h_sequence(L, R, lastR, NULL),
+			 h_sequence(L, R, U, lastR, NULL),
+			 h_sequence(L, R, U, NULL),
+			 h_sequence(R, L, lastL, NULL),
+			 h_sequence(R, L, U, lastL, NULL),
+			 h_sequence(R, L, U, NULL),
+			 NULL);  /* body of lastC */
+  h_bind_indirect(lastR, R_);
+  h_bind_indirect(lastL, L_);
+  h_bind_indirect(lastC, C_);
+  HParser* tie = h_choice(h_sequence(L, lastL, NULL),
+		  h_sequence(R, lastR, NULL),
+		  h_sequence(C, lastC, NULL),
+		  NULL);  /* tie ::= L lastL | R lastR | C lastC */
+
+  h_desugar_augmented(mm__, tie);  /* presumably fills in the ->desugared pointers dereferenced below -- confirm */
+
+  L->desugared->user_data = "L";  /* from here on: tag each desugared node with a name string */
+  R->desugared->user_data = "R";
+  C->desugared->user_data = "C";
+  U->desugared->user_data = "0U";  /* NOTE(review): tag is "0U" rather than "U"; the reason is not visible here */
+  lastL ->desugared->user_data = "Ln";  /* the indirect parsers are tagged; R_, L_ and C_ themselves are not */
+  lastR->desugared->user_data = "Rn";
+  lastC->desugared->user_data = "Cn";
+  tie->desugared->user_data = "tie";
+
+  return tie;
+}
+
+HParser* depthNTW() {  /* builds a grammar over 'T', 'W', 'U' using two families of mutually recursive rules (tw0..tw2, wt0..wt2); returns its start rule "tie" */
+  HParser *T = h_ch('T');
+  HParser *W = h_ch('W');
+  HParser *U = h_ch('U');
+  HParser *prefix = h_choice(T, W, h_epsilon_p(), NULL);  /* prefix ::= T | W | epsilon */
+  HParser *pair = h_choice(h_sequence(T, T, NULL),
+			   h_sequence(W, T, NULL),
+			   h_sequence(T, W, NULL),
+			   h_sequence(W, W, NULL), NULL);  /* pair ::= TT | WT | TW | WW */
+  HParser *tstart = h_indirect();  /* placeholders; each is bound to its rule body after all bodies are built */
+  HParser *tw0 = h_indirect();
+  HParser *tw1 = h_indirect();
+  HParser *tw2 = h_indirect();
+  HParser *wstart = h_indirect();
+  HParser *wt0 = h_indirect();
+  HParser *wt1 = h_indirect();
+  HParser *wt2 = h_indirect();
+  
+  HParser *T_ = h_choice(h_sequence(T, T, tw2, U, NULL),
+			 h_sequence(T, W, tw0, U, NULL),
+			 NULL);  /* body of tstart */
+  HParser *tw0_ = h_choice(h_sequence(T, T, tw2, U, NULL),
+			   h_sequence(T, W, tw0, U, NULL),
+			   h_sequence(W, T, tw0, U, NULL),
+			   h_sequence(W, W, tw1, U, NULL),
+			   h_sequence(tstart, tw2, U, NULL),
+			   h_sequence(wstart, tw1, U, NULL),
+			   NULL);  /* body of tw0 */
+  HParser *tw1_ = h_choice(h_sequence(T, T, tw0, U, NULL),
+			   h_sequence(T, W, tw1, U, NULL),
+			   h_sequence(W, T, tw1, U, NULL),
+			   h_sequence(W, W, tw2, U, NULL),
+			   h_sequence(tstart, tw0, U, NULL),
+			   h_sequence(wstart, tw2, U, NULL),
+			   NULL);  /* body of tw1 */
+  HParser *tw2_ = h_choice(h_sequence(T, T, tw1, U, NULL),
+			   h_sequence(T, W, tw2, U, NULL),
+			   h_sequence(W, T, tw2, U, NULL),
+			   h_sequence(W, W, tw0, U, NULL),
+			   h_sequence(tstart, tw1, U, NULL),
+			   h_sequence(wstart, tw0, U, NULL),
+			   h_epsilon_p(),
+			   NULL);  /* body of tw2; the only tw rule with an epsilon alternative */
+  
+  HParser *W_ = h_choice(h_sequence(W, W, wt2, U, NULL),
+			 h_sequence(W, T, wt0, U, NULL),
+			 NULL);  /* body of wstart; same shape as T_ with T and W swapped */
+  HParser *wt0_ = h_choice(h_sequence(W, W, wt2, U, NULL),
+			   h_sequence(W, T, wt0, U, NULL),
+			   h_sequence(T, W, wt0, U, NULL),
+			   h_sequence(T, T, wt1, U, NULL),
+			   h_sequence(wstart, wt2, U, NULL),
+			   h_sequence(tstart, wt1, U, NULL),
+			   NULL);  /* body of wt0 */
+  HParser *wt1_ = h_choice(h_sequence(W, W, wt0, U, NULL),
+			   h_sequence(W, T, wt1, U, NULL),
+			   h_sequence(T, W, wt1, U, NULL),
+			   h_sequence(T, T, wt2, U, NULL),
+			   h_sequence(wstart, wt0, U, NULL),
+			   h_sequence(tstart, wt2, U, NULL),
+			   NULL);  /* body of wt1 */
+  HParser *wt2_ = h_choice(h_sequence(W, W, wt1, U, NULL),
+			   h_sequence(W, T, wt2, U, NULL),
+			   h_sequence(T, W, wt2, U, NULL),
+			   h_sequence(T, T, wt0, U, NULL),
+			   h_sequence(wstart, wt1, U, NULL),
+			   h_sequence(tstart, wt0, U, NULL),
+			   h_epsilon_p(),
+			   NULL);  /* body of wt2; the only wt rule with an epsilon alternative */
+
+  h_bind_indirect(tstart, T_);
+  h_bind_indirect(tw0, tw0_);
+  h_bind_indirect(tw1, tw1_);
+  h_bind_indirect(tw2, tw2_);
+  h_bind_indirect(wstart, W_);
+  h_bind_indirect(wt0, wt0_);
+  h_bind_indirect(wt1, wt1_);
+  h_bind_indirect(wt2, wt2_);
+  HParser *tuck = h_choice(tstart, wstart, NULL);  /* tuck ::= tstart | wstart */
+
+  HParser *tuckpairstar = h_indirect();  /* placeholder so the rule bound below can refer to itself */
+  HParser *tpstar_ = h_choice(h_sequence(pair, tuckpairstar, NULL),
+			      h_sequence(tuck, tuckpairstar, NULL),
+			      h_epsilon_p(),
+			      NULL);  /* tuckpairstar ::= pair tuckpairstar | tuck tuckpairstar | epsilon */
+  h_bind_indirect(tuckpairstar, tpstar_);
+			      
+  HParser *tie = h_choice(h_sequence(prefix, tuckpairstar, tuck, NULL), NULL);  /* tie ::= prefix tuckpairstar tuck */
+
+  h_desugar_augmented(mm__, tie);  /* presumably fills in the ->desugared pointers dereferenced below -- confirm */
+
+  T->desugared->user_data = "T";  /* from here on: tag desugared nodes with name strings; the tw and wt rules get no tag */
+  W->desugared->user_data = "W";
+  U->desugared->user_data = "0U";  /* NOTE(review): tag is "0U" rather than "U"; the reason is not visible here */
+  prefix->desugared->user_data = "prefix";
+  pair->desugared->user_data = "pair";
+  tuck->desugared->user_data = "tuck";
+  tpstar_->desugared->user_data = "tuckpairstar";  /* tagged through the bound choice, not the indirect parser */
+  tie->desugared->user_data = "tie";
+
+  return tie;
+}
+
+
+/*
+ * Entry point: build the finkmao() grammar, turn it into a context-free
+ * grammar, and print its generating-function equations followed by the
+ * grammar itself on stdout.  Exits with status 1 if h_cfgrammar_() fails.
+ */
+int main(int argc, char **argv) {
+  HParser *start;
+  HCFGrammar *grammar;
+
+  /* the grammar builders allocate through the global mm__ */
+  mm__ = &system_allocator;
+
+  start = finkmao();
+  grammar = h_cfgrammar_(mm__, h_desugar_augmented(mm__, start));
+  if (!grammar) {
+    fprintf(stderr, "h_cfgrammar failed\n");
+    return 1;
+  }
+
+  printf("\n==== Generating functions ====\n");
+  h_pprint_gfeqns(stdout, grammar);
+
+  printf("\n==== Grammar ====\n");
+  h_pprint_grammar(stdout, grammar, 0);
+
+  return 0;
+}
--- a/examples/ttuser.c
+++ b/examples/ttuser.c
+/*
+ * Example parser that demonstrates the use of user-defined token types.
+ *
+ * Note the custom printer function that hooks into h_pprint().
+ */
+
+#include "../src/hammer.h"
+#include "../src/glue.h"
+
+
+/*
+ * custom tokens
+ */
+
+HTokenType TT_SUBJ, TT_PRED, TT_OBJ, TT_ADJ, TT_ADVC;
+
+/*
+ * Printer callback registered for every custom token type in this example.
+ * A custom token carries the wrapped token in tok->user; printing is
+ * delegated to h_pprint() on that wrapped token.
+ *
+ * Contract for pretty-printers:
+ *
+ *  - After every newline you emit, emit 'indent' spaces.
+ *  - The first line of output gets no indent.
+ *  - Never end the output with a newline.
+ *  - Sub-objects are indented by 'indent' + 'delta'.
+ */
+void
+pprint(FILE *stream, const HParsedToken *tok, int indent, int delta)
+{
+	const HParsedToken *wrapped = tok->user;
+
+	/* a wrapped sequence is started on a fresh, indented line */
+	if (wrapped->token_type == TT_SEQUENCE)
+		fprintf(stream, "\n%*s", indent, "");
+
+	h_pprint(stream, wrapped, indent, delta);
+}
+
+/* XXX define unamb_sub as well */
+
+/*
+ * Register the five custom token types with Hammer.  Each one is given a
+ * display name, NULL for the second argument, and the shared pprint()
+ * printer.  Registration happens in the order listed in the table.
+ */
+void
+init(void)
+{
+	static const struct {
+		HTokenType *slot;	/* global that receives the new type id */
+		const char *name;	/* name passed to h_allocate_token_new() */
+	} custom[] = {
+		{ &TT_SUBJ, "subject" },
+		{ &TT_PRED, "predicate" },
+		{ &TT_OBJ,  "object" },
+		{ &TT_ADJ,  "adjective" },
+		{ &TT_ADVC, "adverbial clause" },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof custom / sizeof custom[0]; i++)
+		*custom[i].slot = h_allocate_token_new(custom[i].name, NULL, pprint);
+}
+
+
+/*
+ * semantic actions
+ *
+ * Normally these would be more interesting, but for this example, we just wrap
+ * our tokens in their intended types.
+ */
+HParsedToken *act_subj(const HParseResult *p, void *u) {  /* wraps the parsed AST in a SUBJ token; 'u' is unused */
+	return H_MAKE(SUBJ, (void *)p->ast);  /* cast drops const from p->ast; presumably H_MAKE maps SUBJ to TT_SUBJ -- confirm in glue.h */
+}
+HParsedToken *act_pred(const HParseResult *p, void *u) {  /* same, wrapping as PRED */
+	return H_MAKE(PRED, (void *)p->ast);
+}
+HParsedToken *act_obj(const HParseResult *p, void *u) {  /* same, wrapping as OBJ */
+	return H_MAKE(OBJ, (void *)p->ast);
+}
+HParsedToken *act_adj(const HParseResult *p, void *u) {  /* same, wrapping as ADJ */
+	return H_MAKE(ADJ, (void *)p->ast);
+}
+HParsedToken *act_advc(const HParseResult *p, void *u) {  /* same, wrapping as ADVC */
+	return H_MAKE(ADVC, (void *)p->ast);
+}
+
+
+/*
+ * grammar
+ */
+
+HParser *
+build_parser(void)  /* builds the example sentence grammar and returns its top-level rule 'sentnc' */
+{
+	/* words */
+	#define W(X)	h_whitespace(h_literal((const uint8_t *)(#X)))  /* stringifies X and wraps the literal in h_whitespace() */
+	H_RULE(art,	h_choice(W(a), W(the), NULL));
+	H_RULE(noun,	h_choice(W(cat), W(dog), W(fox), W(tiger), W(lion),
+			    W(bear), W(fence), W(tree), W(car), W(cow), NULL));
+	H_RULE(verb,	h_choice(W(eats), W(jumps), W(falls), NULL));
+	H_ARULE(adj,	h_choice(W(quick), W(slow), W(happy), W(lazy), W(cyan),
+			    W(magenta), W(yellow), W(black), W(brown), NULL));  /* presumably H_ARULE attaches act_adj by name -- confirm in glue.h */
+	H_RULE(adverb,	h_choice(W(with), W(over), W(after), NULL));
+	#undef W
+
+	/* phrases */
+	H_RULE(nphrase,	h_sequence(art, h_many(adj), noun, NULL));  /* article, any number of adjectives, noun */
+
+	/* sentence structure */
+	H_ARULE(subj,	nphrase);
+	H_ARULE(pred,	verb);
+	H_ARULE(obj,	nphrase);
+	H_ARULE(advc,	h_sequence(adverb, nphrase, NULL));
+	H_RULE(sentnc,	h_sequence(subj, pred,
+			    h_optional(obj), h_optional(advc), NULL));  /* object and adverbial clause are both optional */
+
+	return sentnc;
+}
+
+
+/*
+ * main routine: read, parse, print
+ *
+ * input e.g.:
+ * "the quick brown fox jumps the fence with a cyan lion"
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+/*
+ * Read all of stdin (at most sizeof(input) bytes), parse it with the
+ * example grammar, pretty-print the resulting AST on stdout and report
+ * the number of bytes consumed on stderr.
+ *
+ * Returns 0 on success and 1 on a read error, oversized input, or a
+ * failed parse.
+ */
+int
+main(int argc, char **argv)
+{
+	uint8_t input[1024];
+	size_t sz;
+	const HParser *parser;
+	const HParseResult *result;
+
+	init();
+	parser = build_parser();
+
+	sz = fread(input, 1, sizeof(input), stdin);
+	/*
+	 * A short read caused by an I/O error also leaves EOF unset; report
+	 * it as what it is instead of as "too much input".
+	 */
+	if (ferror(stdin)) {
+		fprintf(stderr, "error reading input\n");
+		return 1;
+	}
+	/* EOF not reached means the buffer filled before the input ended */
+	if (!feof(stdin)) {
+		fprintf(stderr, "too much input\n");
+		return 1;
+	}
+
+	result = h_parse(parser, input, sz);
+	if (!result) {
+		fprintf(stderr, "no parse\n");
+		return 1;
+	}
+
+	h_pprintln(stdout, result->ast);
+	/* bit_length is in bits; divide by 8 to report bytes */
+	fprintf(stderr, "consumed %" PRId64 "/%zu bytes.\n",
+	    result->bit_length / 8, sz);
+	return 0;
+}
--- a/lib/test-suite
+++ b/lib/test-suite
@@ -19,7 +19,7 @@
 token {
  parser token("95\xa2");
  test "95\xa2" --> "95\xa2";
-  test "95\xa2" --> fail;
+  test "95\xa3" --> fail;
 }

 ch {
@@ -87,7 +87,7 @@ uint8 {
 }

 int_range {
-  parser int_range(uint8(), 0x3, 0x10);
+  parser int_range(uint8(), 0x3, 0xa);
  test <05> --> u0x05;
  test <0b> --> fail;
 }
@@ -215,7 +215,7 @@ many1 {
  test "daabbabadef" --> fail;
 }

-repeat-n {
+repeat_n {
  parser repeat_n(choice(ch('a'),ch('b')),0x2);
  test "adef" --> fail;
  test "abdef" --> ['a','b'];
@@ -270,24 +270,27 @@ and {
 }

 not {
-  parser sequence(ch('a'), choice(token('+'), token("++")), ch('b'));
+  parser sequence(ch('a'), choice(token("+"), token("++")), ch('b'));
  test "a+b" --> ['a',"+",'b'];
  test "a++b" --> fail;

-  parser sequence(ch('a'), choice(sequence(token('+'), not(ch('+'))),
+  parser sequence(ch('a'), choice(sequence(token("+"), not(ch('+'))),
                                  token("++")),
                  ch('b'));
  test "a+b" --> ['a', ["+"], 'b'];
  test "a++b" --> ['a', "++", 'b'];
 }

-leftrec {
-  subparser $lr = choice(sequence($lr, ch('a')), epsilon_p());
-  parser $lr;
-  test "a" --> ['a'];
-  test "aa" --> [['a'],'a'];
-  test "aaa" --> [[['a'],'a'],'a'];
-}
+## This doesn't work for some reason; it segfaults. We'll leave it for
+## later.
+#
+#leftrec {
+#  subparser $lr = choice(sequence($lr, ch('a')), epsilon_p());
+#  parser $lr;
+#  test "a" --> ['a'];
+#  #test "aa" --> [['a'],'a'];
+#  #test "aaa" --> [[['a'],'a'],'a'];
+#}

 rightrec {
  subparser $rr = choice(sequence(ch('a'), $rr), epsilon_p());
@@ -296,17 +299,17 @@ rightrec {
  test "aa" --> ['a',['a']];
  test "aaa" --> ['a',['a',['a']]];
 }
-
-ambiguous {
-  subparser $d = ch('d');
-  subparser $p = ch('+');
-  subparser $e = choice(sequence($e, $p, $e), $d);
-  # TODO: implement action/h_act_flatten
-  parser $e;
-  
-  test "d" --> 'd';
-  test "d+d" --> ['d','+','d'];
-  test "d+d+d" --> [['d','+','d'],'+','d'];
-}
+## Only for GLR
+#ambiguous {
+#  subparser $d = ch('d');
+#  subparser $p = ch('+');
+#  subparser $e = choice(sequence($e, $p, $e), $d);
+#  # TODO: implement action/h_act_flatten
+#  parser $e;
+#  
+#  test "d" --> 'd';
+#  test "d+d" --> ['d','+','d'];
+#  test "d+d+d" --> [['d','+','d'],'+','d'];
+#}


--- a/lib/tsgencsharp.pl
+++ b/lib/tsgencsharp.pl
+% -*- prolog -*-
+% Run with:
+% $ swipl -q  -t halt -g tsgencsharp:prolog tsgencsharp.pl >output-file
+% Note: this needs to be run from the lib/ directory.
+
+% So,
+% swipl -q  -t halt -g tsgencsharp:prolog tsgencsharp.pl >../src/bindings/dotnet/test/hammer_tests.cs 
+
+
+:- module(tsgencsharp,
+          [gen_ts/2]).
+
+:- expects_dialect(swi).
+:- use_module(tsparser).
+
+% TODO: build a Box-like pretty-printer
+
+format_parser_name(Name, Result) :-  % Result: code list "Hammer." followed by Name with its first code mapped through to_upper
+    atom_codes(Name, [CInit|CName]),  % fails for an empty Name (no first code)
+    code_type(RInit, to_upper(CInit)),  % presumably upper-cases the initial (ch -> Ch) -- confirm code_type/2 direction
+    append("Hammer.", [RInit|CName], Result), !.
+
+format_test_name(Name, Result) :-  % Result: code list "Test" followed by Name with its first code mapped through to_upper
+    atom_codes(Name, [CInit|CName]),
+    code_type(RInit, to_upper(CInit)),  % presumably upper-cases the initial -- confirm code_type/2 direction
+    append("Test", [RInit|CName], Result), !.
+
+indent(0) --> "", !.  % indent(N)//: emits N indentation units; zero units emit nothing
+indent(N) -->
+    {N > 0},
+    "    ",  % one unit is four spaces in the C# output
+    {Np is N - 1},
+    indent(Np).
+
+% pp_char_guts(+Code)//
+% Emits Code as it must appear inside a C# string or char literal:
+%  - double quote, single quote and backslash are backslash-escaped;
+%  - other printable ASCII (0x20..0x7E) is emitted verbatim;
+%  - everything else is emitted as \u00HH.
+% \u takes exactly four hex digits, whereas C#'s \x escape is
+% variable-length (1-4 digits) and would swallow a following hex-digit
+% character (e.g. \x09 followed by 'a' reads as U+009A).
+% Assumes Code is a byte (0..255); larger values make the clause fail,
+% as they did before.
+pp_char_guts(0x22) -->
+    "\\\"", !.
+pp_char_guts(0x27) -->
+    "\\'", !.
+pp_char_guts(0x5C) -->
+    "\\\\", !.
+pp_char_guts(A) -->
+    { A >= 0x20, A < 0x7F } ->
+    [A];
+    "\\u00",
+    { H is A >> 4, L is A /\ 0xF,
+      code_type(Hc, xdigit(H)),
+      code_type(Lc, xdigit(L)) },
+    [Hc,Lc].
+
+pp_hexnum_guts(0) --> !.  % pp_hexnum_guts(N)//: hex digits of N, most significant first; emits nothing for 0
+pp_hexnum_guts(A) -->
+    { L is A /\ 0xF,
+      H is A >> 4,
+      code_type(Lc, xdigit(L)) },
+    pp_hexnum_guts(H),  % higher nibbles are emitted before the low one
+    [Lc], !.
+pp_string_guts([]) --> !.  % pp_string_guts(Codes)//: emits each code through pp_char_guts//1
+pp_string_guts([X|Xs]) -->
+    pp_char_guts(X),
+    pp_string_guts(Xs), !.
+
+pp_parser_args([]) --> !.  % pp_parser_args(Args)//: comma-separated argument list; empty list emits nothing
+pp_parser_args([X|Rest]) -->
+    pp_parser(X),  % first argument has no leading separator
+    pp_parser_args_rest(Rest).
+pp_parser_args_rest([]) --> !.  % remaining arguments, each preceded by ", "
+pp_parser_args_rest([X|Xs]) -->
+    ", ",
+    pp_parser(X),
+    pp_parser_args_rest(Xs).
+
+pp_parser(parser(Name, Args)) -->  % pp_parser(Term)//: emits the C# expression for one parser-description term
+    !,
+    {format_parser_name(Name,Fname)},
+    Fname,
+    "(",
+    pp_parser_args(Args),
+    ")".
+pp_parser(string(Str)) --> !,  % double-quoted string literal
+    "\"",
+    pp_string_guts(Str),
+    "\"", !.
+pp_parser(num(0)) --> "0", !.  % zero is special-cased because pp_hexnum_guts//1 emits nothing for 0
+pp_parser(num(Num)) --> !,  % other integers as 0x... hex, with a leading '-' when negative
+    ( {Num < 0} ->
+      "-0x", {RNum is -Num}; "0x", {RNum = Num} ),
+    pp_hexnum_guts(RNum).
+pp_parser(char(C)) --> !,  % single-quoted char literal
+    "'", pp_char_guts(C), "'", !.
+
+pp_parser(ref(Name)) -->  % reference to a subparser variable, named sp_<Name>
+    {atom_codes(Name,CName)},
+    "sp_", CName, !.
+
+
+pp_parser(A) -->  % catch-all: report the unrecognised term and fail
+    { writef("WTF is a %w?\n", [A]),
+      !, fail
+    }.
+
+pp_test_elem(decl, parser(_)) --> !.  % pp_test_elem(Phase, Elem)//: output for one test-case element in phase decl, init or exec
+pp_test_elem(init, parser(_)) --> !.
+pp_test_elem(exec, parser(P)) -->  % exec: assign the parser under test
+    !, indent(3),
+    "parser = ",
+    pp_parser(P),
+    ";\n".
+pp_test_elem(decl, subparser(Name,_)) -->  % decl: declare the subparser as an indirect parser
+    !, indent(3),
+    "IndirectParser ", pp_parser(ref(Name)),
+    " = Hammer.Indirect();\n".
+pp_test_elem(init, subparser(Name, Parser)) -->  % init: bind the indirect parser to its definition
+    !, indent(3),
+    pp_parser(ref(Name)), ".Bind(",
+    pp_parser(Parser),
+    ");\n".
+pp_test_elem(exec, subparser(_,_)) --> !.
+pp_test_elem(decl, test(_,_)) --> !.
+pp_test_elem(init, test(_,_)) --> !.
+pp_test_elem(decl, testFail(_)) --> !.
+pp_test_elem(init, testFail(_)) --> !.
+pp_test_elem(exec, test(Str, Result)) -->  % exec: positive test, input string and expected result
+    !, indent(3),
+    "  CheckParseOK(parser, ", pp_parser(string(Str)),
+    ", ",
+    pp_parse_result(Result),
+    ");\n".
+pp_test_elem(exec, testFail(Str)) -->  % exec: negative test, input string only
+    !, indent(3),
+    "  CheckParseFail(parser, ", pp_parser(string(Str)),
+    ");\n".
+
+% pp_test_elem(_, _) --> !.
+
+pp_result_seq([]) --> !.  % pp_result_seq(Results)//: comma-separated expected results; empty list emits nothing
+pp_result_seq([X|Xs]) --> !,
+    pp_parse_result(X),
+    pp_result_seq_r(Xs).
+pp_result_seq_r([]) --> !.  % remaining results, each preceded by ", "
+pp_result_seq_r([X|Xs]) --> !,
+    ", ",
+    pp_parse_result(X),
+    pp_result_seq_r(Xs).
+
+pp_byte_seq([]) --> !.  % pp_byte_seq(Codes)//: comma-separated numeric literals, one per code
+pp_byte_seq([X|Xs]) --> !,
+    pp_parser(num(X)),
+    pp_byte_seq_r(Xs).
+pp_byte_seq_r([]) --> !.  % remaining codes, each preceded by ", "
+pp_byte_seq_r([X|Xs]) --> !,
+    ", ",
+    pp_parser(num(X)),
+    pp_byte_seq_r(Xs).
+
+pp_parse_result(char(C)) --> !,  % pp_parse_result(Term)//: C# expression for an expected parse result
+    %"(System.UInt64)",
+    pp_parser(char(C)).
+pp_parse_result(seq(Args)) --> !,  % sequence -> object[] initialiser
+    "new object[]{ ", pp_result_seq(Args), "}".
+pp_parse_result(none) --> !,
+    "null".
+pp_parse_result(uint(V)) --> !,  % unsigned -> cast to System.UInt64
+    "(System.UInt64)", pp_parser(num(V)).
+pp_parse_result(sint(V)) --> !,  % signed -> cast to System.Int64; value parenthesised so a leading '-' stays inside the cast
+    "(System.Int64)(", pp_parser(num(V)), ")".
+pp_parse_result(string(A)) --> !,  % string -> byte[] initialiser with one numeric literal per code
+    "new byte[]{ ", pp_byte_seq(A), "}".
+%pp_parse_result(A) -->
+%    "\x1b[1;31m",
+%    {with_output_to(codes(C), write(A))},
+%    C,
+%    "\x1b[0m".
+
+
+pp_test_elems(_, []) --> !.  % pp_test_elems(Phase, Elems)//: runs pp_test_elem//2 over every element for one phase
+pp_test_elems(Phase, [X|Xs]) -->
+    !,
+    pp_test_elem(Phase,X),
+    pp_test_elems(Phase,Xs).
+
+pp_test_case(testcase(Name, Elems)) -->  % emits one [Test]-annotated C# method for a test case
+    !,
+    indent(2), "[Test]\n",
+    { format_test_name(Name, TName) },
+    indent(2), "public void ", TName, "() {\n",
+    indent(3), "Parser parser;\n",
+    pp_test_elems(decl, Elems),  % three passes over the same elements: declarations, bindings, then assignments and checks
+    pp_test_elems(init, Elems),
+    pp_test_elems(exec, Elems),
+    indent(2), "}\n".
+
+
+pp_test_cases([]) --> !.  % pp_test_cases(Cases)//: emits every test case in order
+pp_test_cases([A|As]) -->
+    pp_test_case(A),
+    pp_test_cases(As).
+
+pp_test_suite(Suite) -->  % emits the whole C# file: namespace, NUnit using, and a partial HammerTest fixture holding all cases
+    "namespace Hammer.Test {\n",
+    indent(1), "using NUnit.Framework;\n",
+    %indent(1), "using Hammer;\n",
+    indent(1), "[TestFixture]\n",
+    indent(1), "public partial class HammerTest {\n",
+    pp_test_cases(Suite),
+    indent(1), "}\n",
+    "}\n".
+
+gen_ts(Foo,Str) :-  % Str is the generated C# source (as produced by phrase/2) for the parsed test suite Foo
+    phrase(pp_test_suite(Foo),Str).
+
+prolog :-  % entry point named by the -g option in the header comment: read the suite, generate, write to stdout
+    read_tc(A),  % read_tc/1 is presumably provided by the tsparser module -- confirm
+    gen_ts(A, Res),
+    writef("%s", [Res]).
--- a/lib/tsgenruby.pl
+++ b/lib/tsgenruby.pl
+% -*- prolog -*-
+% Run with:
+% $ swipl -q  -t halt -g tsgenruby:prolog tsgenruby.pl >output-file
+% Note: this needs to be run from the lib/ directory.
+
+% So, from the ruby directory
+% (cd ../../../lib && swipl -q -t halt -g tsgenruby:prolog tsgenruby.pl ) >test/autogen_test.rb
+
+
+
+:- module(tsgenruby,
+          [gen_ts/2]).
+
+:- expects_dialect(swi).
+:- use_module(tsparser).
+:- use_module(library(record)).
+
+:- record testsuite_state(parser_no:integer = 0, test_no:integer=0).
+% TODO: build a Box-like pretty-printer
+
+to_title_case([], []) :- !.  % to_title_case(In, Out): drops '_' and '-' separators and maps the code after each one through to_upper
+to_title_case([WSep,S0|Ss], [R0|Rs]) :-
+        memberchk(WSep, "_-"), !,
+        code_type(R0, to_upper(S0)),  % presumably upper-cases S0 -- confirm code_type/2 direction
+        to_title_case(Ss,Rs).
+to_title_case([S0|Ss], [S0|Rs]) :-  % NOTE(review): a separator as the very last code matches no clause, so the call fails
+        \+ memberchk(S0, "_-"),
+        !, to_title_case(Ss,Rs).
+
+format_parser_name(Name, Result) :-  % Result: code list "h." followed by Name unchanged
+    atom_codes(Name, CName),
+    append("h.", CName, Result), !.
+
+format_test_name(Name, Result) :-  % Result: code list "Test" followed by the title-cased Name
+    atom_codes(Name, CName),
+    to_title_case([0x5f|CName], RName),  % a '_' is prepended so the first code of Name is treated as following a separator
+    append("Test", RName, Result), !.
+
+indent(0) --> "", !.  % indent(N)//: emits N indentation units; zero units emit nothing
+indent(N) -->
+    {N > 0},
+    "  ",  % one unit is two spaces in the Ruby output
+    {Np is N - 1},
+    indent(Np).
+
+% pp_char_guts(+Code)//
+% Emits Code as it must appear inside a double-quoted Ruby string:
+%  - double quote, single quote, backslash and '#' are backslash-escaped
+%    (a raw backslash would start an escape sequence, and a raw '#'
+%    followed by '{', '$' or '@' would start interpolation);
+%  - other printable ASCII (0x20..0x7E) is emitted verbatim;
+%  - everything else is emitted as \xHH.
+% Assumes Code is a byte (0..255); larger values make the clause fail,
+% as they did before.
+pp_char_guts(0x22) -->
+    "\\\"", !.
+pp_char_guts(0x27) -->
+    "\\'", !.
+pp_char_guts(0x5C) -->
+    "\\\\", !.
+pp_char_guts(0x23) -->
+    "\\#", !.
+pp_char_guts(A) -->
+    { A >= 0x20, A < 0x7F } ->
+    [A];
+    "\\x",
+    { H is A >> 4, L is A /\ 0xF,
+      code_type(Hc, xdigit(H)),
+      code_type(Lc, xdigit(L)) },
+    [Hc,Lc].
+
+pp_hexnum_guts(0) --> !.  % pp_hexnum_guts(N)//: hex digits of N, most significant first; emits nothing for 0
+pp_hexnum_guts(A) -->
+    { L is A /\ 0xF,
+      H is A >> 4,
+      code_type(Lc, xdigit(L)) },
+    pp_hexnum_guts(H),  % higher nibbles are emitted before the low one
+    [Lc], !.
+pp_string_guts([]) --> !.  % pp_string_guts(Codes)//: emits each code through pp_char_guts//1
+pp_string_guts([X|Xs]) -->
+    pp_char_guts(X),
+    pp_string_guts(Xs), !.
+
+pp_parser_args([]) --> !.  % pp_parser_args(Args)//: comma-separated argument list; empty list emits nothing
+pp_parser_args([X|Rest]) -->
+    pp_parser(X),  % first argument has no leading separator
+    pp_parser_args_rest(Rest).
+pp_parser_args_rest([]) --> !.  % remaining arguments, each preceded by ", "
+pp_parser_args_rest([X|Xs]) -->
+    ", ",
+    pp_parser(X),
+    pp_parser_args_rest(Xs).
+
+pp_parser(parser(Name, Args)) -->  % pp_parser(Term)//: emits the Ruby expression for one parser-description term
+        !,
+        {format_parser_name(Name,Fname)},
+        Fname,
+        ({Args \= []} ->  % the parenthesised argument list is omitted entirely when there are no arguments
+        
+         "(", pp_parser_args(Args), ")"
+        ; "") .
+pp_parser(string(Str)) --> !,  % double-quoted string literal
+    "\"",
+    pp_string_guts(Str),
+    "\"", !.
+pp_parser(num(0)) --> "0", !.  % zero is special-cased because pp_hexnum_guts//1 emits nothing for 0
+pp_parser(num(Num)) --> !,  % other integers as 0x... hex, with a leading '-' when negative
+    ( {Num < 0} ->
+      "-0x", {RNum is -Num}; "0x", {RNum = Num} ),
+    pp_hexnum_guts(RNum).
+pp_parser(char(C)) --> !,
+        pp_parser(num(C)), ".chr". % Ruby is encoding-aware; this is a
+                                   % more reasonable implementation
+
+pp_parser(ref(Name)) -->  % reference to a subparser instance variable, named @sp_<Name>
+    {atom_codes(Name,CName)},
+    "@sp_", CName, !.
+
+
+pp_parser(A) -->  % catch-all: report the unrecognised term and fail
+    { writef("WTF is a %w?\n", [A]),
+      !, fail
+    }.
+
+upd_state_test_elem(parser(_), OldSt, NewSt) :- !,  % upd_state_test_elem(Elem, Old, New): advance the counters for one element; a parser bumps parser_no
+        testsuite_state_parser_no(OldSt, OldRNo),
+        NewRNo is OldRNo + 1,
+        set_parser_no_of_testsuite_state(NewRNo, OldSt, NewSt).
+upd_state_test_elem(test(_, _), OldSt, NewSt) :- !,  % a positive test bumps test_no
+        testsuite_state_test_no(OldSt, OldTNo),
+        NewTNo is OldTNo + 1,
+        set_test_no_of_testsuite_state(NewTNo, OldSt, NewSt).
+upd_state_test_elem(testFail(_), OldSt, NewSt) :- !,  % a negative test bumps the same test_no counter
+        testsuite_state_test_no(OldSt, OldTNo),
+        NewTNo is OldTNo + 1,
+        set_test_no_of_testsuite_state(NewTNo, OldSt, NewSt).
+upd_state_test_elem(_, St, St).  % any other element leaves the state unchanged
+
+curparser_name(St) --> !,  % emits "@parser_<N>", N being the state's current parser_no
+        { testsuite_state_parser_no(St, RNo),
+          format(string(X), "@parser_~w", RNo) },
+        X.
+curtest_name(St) --> !,  % emits "test_<N>", N being the state's current test_no
+        { testsuite_state_test_no(St, RNo),
+          format(string(X), "test_~w", RNo) },
+        X.
+
+pp_test_elem(decl, parser(_), _) --> !.  % pp_test_elem(Phase, Elem, State)//: output for one test-case element in phase decl, init or exec
+pp_test_elem(init, parser(P), St) -->  % init: assign the parser to its numbered instance variable
+    !, indent(2),
+    curparser_name(St), " = ",
+    pp_parser(P),
+    "\n".
+pp_test_elem(exec, parser(_), _) --> !.
+pp_test_elem(decl, subparser(Name,_), _) -->  % decl: create the subparser as an indirect parser
+    !, indent(2),
+    pp_parser(ref(Name)),
+    " = ",
+    pp_parser(parser(indirect,[])),
+    "\n".
+pp_test_elem(init, subparser(Name, Parser), _) -->  % init: bind the indirect parser to its definition
+    !, indent(2),
+    pp_parser(ref(Name)), ".bind ",
+    pp_parser(Parser),
+    "\n".
+pp_test_elem(exec, subparser(_,_), _) --> !.
+pp_test_elem(decl, test(_,_), _) --> !.
+pp_test_elem(init, test(_,_), _) --> !.
+pp_test_elem(decl, testFail(_), _) --> !.
+pp_test_elem(init, testFail(_), _) --> !.
+pp_test_elem(exec, test(Str, Result), St) -->  % exec: one numbered test method asserting a successful parse
+    !,
+    "\n",
+    indent(1), "def ", curtest_name(St), "\n",
+    indent(2), "assert_parse_ok ", curparser_name(St), ", ", pp_parser(string(Str)),
+    ", ",
+    pp_parse_result(Result),
+    "\n",
+    indent(1), "end\n".
+pp_test_elem(exec, testFail(Str), St) -->  % exec: one numbered test method asserting a failed parse
+    !,
+    "\n",
+    indent(1), "def ", curtest_name(St), "\n",
+    indent(2), "refute_parse_ok ", curparser_name(St), ", ", pp_parser(string(Str)), "\n",
+    indent(1), "end\n".
+
+% pp_test_elem(_, _) --> !.
+
+pp_result_seq([]) --> !.  % pp_result_seq(Results)//: comma-separated expected results; empty list emits nothing
+pp_result_seq([X|Xs]) --> !,
+    pp_parse_result(X),
+    pp_result_seq_r(Xs).
+pp_result_seq_r([]) --> !.  % remaining results, each preceded by ", "
+pp_result_seq_r([X|Xs]) --> !,
+    ", ",
+    pp_parse_result(X),
+    pp_result_seq_r(Xs).
+
+pp_byte_seq([]) --> !.  % pp_byte_seq(Codes)//: comma-separated numeric literals, one per code
+pp_byte_seq([X|Xs]) --> !,
+    pp_parser(num(X)),
+    pp_byte_seq_r(Xs).
+pp_byte_seq_r([]) --> !.  % remaining codes, each preceded by ", "
+pp_byte_seq_r([X|Xs]) --> !,
+    ", ",
+    pp_parser(num(X)),
+    pp_byte_seq_r(Xs).
+
+pp_parse_result(char(C)) --> !,  % pp_parse_result(Term)//: Ruby expression for an expected parse result
+    %"(System.UInt64)",
+    pp_parser(char(C)).
+pp_parse_result(seq(Args)) --> !,  % sequence -> array literal
+    "[", pp_result_seq(Args), "]".
+pp_parse_result(none) --> !,
+    "nil".
+pp_parse_result(uint(V)) --> !,  % unsigned and signed integers are both emitted as plain numeric literals
+        pp_parser(num(V)).
+pp_parse_result(sint(V)) --> !,
+        pp_parser(num(V)).
+pp_parse_result(string(A)) --> !,  % string -> double-quoted string literal
+        pp_parser(string(A)).
+
+%pp_parse_result(A) -->
+%    "\x1b[1;31m",
+%    {with_output_to(codes(C), write(A))},
+%    C,
+%    "\x1b[0m".
+
+
+pp_test_elems(Phase, Elems) -->  % pp_test_elems(Phase, Elems)//: emit every element for one phase, starting from the default counter state
+        { default_testsuite_state(State) },
+        pp_test_elems(Phase, Elems, State).
+pp_test_elems(_, [], _) --> !.
+pp_test_elems(Phase, [X|Xs], St) -->
+    !,
+    { upd_state_test_elem(X, St, NewSt) },  % counters are advanced before the element is printed, so names use the new numbers
+    %{NewSt = St},
+    pp_test_elem(Phase,X, NewSt),
+    pp_test_elems(Phase,Xs, NewSt).
+
+pp_test_case(testcase(Name, Elems)) -->  % emits one Minitest::Test subclass for a test case
+    !,
+    { format_test_name(Name, TName) },
+    indent(0), "class ", TName, " < Minitest::Test\n",
+    indent(1), "def setup\n",
+    indent(2), "super\n",
+    indent(2), "h = Hammer::Parser\n",
+    pp_test_elems(decl, Elems),  % decl and init output goes inside setup
+    pp_test_elems(init, Elems),
+    indent(1), "end\n",
+    pp_test_elems(exec, Elems),  % exec output (one method per test) follows setup
+    indent(0), "end\n\n".
+
+
+pp_test_cases([]) --> !.  % pp_test_cases(Cases)//: emits every test case in order
+pp_test_cases([A|As]) -->
+    pp_test_case(A),
+    pp_test_cases(As).
+
+pp_test_suite(Suite) -->  % emits the whole Ruby file: three require lines followed by one class per test case
+    "require 'bundler/setup'\n",
+    "require 'minitest/autorun'\n",
+    "require 'hammer'\n",
+    pp_test_cases(Suite).
+
+gen_ts(Foo,Str) :-  % Str is the generated Ruby source (as produced by phrase/2) for the parsed test suite Foo
+    phrase(pp_test_suite(Foo),Str).
+
+prolog :-  % entry point named by the -g option in the header comment: read the suite, generate, write to stdout
+    read_tc(A),  % read_tc/1 is presumably provided by the tsparser module -- confirm
+    gen_ts(A, Res),
+    writef("%s", [Res]).
--- a/lib/testgen.pl
+++ b/lib/testgen.pl
--- a/src/SConscript
+++ b/src/SConscript
 # -*- python -*-
+
+from __future__ import absolute_import, division, print_function
+
 import os.path
+
 Import('env testruns')

+# Bump this if you break binary compatibility (e.g. renumber backends)
+hammer_shlib_version = "1.0.0"
+
 dist_headers = [
-    "hammer.h",
-    "allocator.h",
-    "glue.h",
-    "internal.h"
+    'hammer.h',
+    'allocator.h',
+    'compiler_specifics.h',
+    'glue.h',
+    'internal.h',
+    'platform.h'
 ]

 parsers_headers = [
-    "parsers/parser_internal.h"
+    'parsers/parser_internal.h'
 ]

 backends_headers = [
-    "backends/regex.h",
-    "backends/contextfree.h"
+    'backends/regex.h',
+    'backends/contextfree.h',
+    'backends/missing.h',
+    'backends/params.h'
 ]

 parsers = ['parsers/%s.c'%s for s in
           ['action',
            'and',
            'attr_bool',
+            'bind',
            'bits',
+            'bytes',
            'butnot',
            'ch',
            'charset',
            'choice',
            'difference',
            'end',
+            'endianness',
            'epsilon',
            'ignore',
            'ignoreseq',
@@ -38,14 +52,17 @@ parsers = ['parsers/%s.c'%s for s in
            'not',
            'nothing',
            'optional',
+            'permutation',
            'sequence',
            'token',
            'unimplemented',
            'whitespace',
-            'xor']] 
+            'xor',
+            'value',
+            'seek']]

 backends = ['backends/%s.c' % s for s in
-            ['packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0']]
+            ['missing', 'packrat', 'llk', 'regex', 'glr', 'lalr', 'lr', 'lr0', 'params']]

 misc_hammer_parts = [
    'allocator.c',
@@ -59,34 +76,79 @@ misc_hammer_parts = [
    'hammer.c',
    'pprint.c',
    'registry.c',
-    'system_allocator.c']
+    'system_allocator.c',
+    'sloballoc.c']
+
+if env['PLATFORM'] == 'win32':
+    misc_hammer_parts += [
+        'platform_win32.c',
+        'tsearch.c',
+    ]
+else:
+    misc_hammer_parts += ['platform_bsdlike.c']

 ctests = ['t_benchmark.c',
          't_bitreader.c',
          't_bitwriter.c',
          't_parser.c',
          't_grammar.c',
-          't_misc.c']
+          't_misc.c',
+          't_mm.c',
+          't_names.c',
+          't_regression.c']
+
+
+static_library_name = 'hammer'
+build_shared_library=True
+if env['PLATFORM'] == 'win32':
+    # FIXME(windows): symbols in hammer are not exported yet, a shared lib would be useless
+    build_shared_library=False
+    # prevent collision between .lib from dll and .lib for static lib
+    static_library_name = 'hammer_s'
+
+if 'GPROF' in env and env['GPROF'] == 1:
+    # Disable the shared library (it won't work with gprof) and rename the static one
+    build_shared_library=False
+    static_library_name = 'hammer_pg'
+
+# Markers for later
+libhammer_static = None
+libhammer_shared = None
+
+if build_shared_library:
+    libhammer_shared = env.SharedLibrary('hammer', parsers + backends + misc_hammer_parts, \
+                                     SHLIBVERSION=hammer_shlib_version)
+libhammer_static = env.StaticLibrary(static_library_name, parsers + backends + misc_hammer_parts)

-libhammer_shared = env.SharedLibrary('hammer', parsers + backends + misc_hammer_parts)
-libhammer_static = env.StaticLibrary('hammer', parsers + backends + misc_hammer_parts)
-Default(libhammer_shared, libhammer_static)
+if libhammer_shared is not None:
+    Default(libhammer_shared, libhammer_static)
+    env.Install('$libpath', [libhammer_static, libhammer_shared])
+else:
+    Default(libhammer_static)
+    env.Install('$libpath', [libhammer_static])

-env.Install("$libpath", [libhammer_static, libhammer_shared])
-env.Install("$incpath", dist_headers)
-env.Install("$parsersincpath", parsers_headers)
-env.Install("$backendsincpath", backends_headers)
-env.Install("$pkgconfigpath", "../../../libhammer.pc")
+env.Install('$incpath', dist_headers)
+env.Install('$parsersincpath', parsers_headers)
+env.Install('$backendsincpath', backends_headers)
+env.Install('$pkgconfigpath', '../../../libhammer.pc')

-testenv = env.Clone()
-testenv.ParseConfig('pkg-config --cflags --libs glib-2.0')
-testenv.Append(LIBS=['hammer'], LIBPATH=['.'])
-ctestexec = testenv.Program('test_suite', ctests + ['test_suite.c'])
-ctest = Alias('testc', [ctestexec], "".join(["env LD_LIBRARY_PATH=", os.path.dirname(ctestexec[0].path), " ", ctestexec[0].path]))
-AlwaysBuild(ctest)
-testruns.append(ctest)
+if GetOption('with_tests'):
+    testenv = env.Clone()
+    testenv.ParseConfig('pkg-config --cflags --libs glib-2.0')
+    if libhammer_shared is not None:
+        testenv.Append(LIBS=['hammer'])
+    else:
+        testenv.Append(LIBS=[static_library_name])
+    testenv.Prepend(LIBPATH=['.'])
+    ctestexec = testenv.Program('test_suite', ctests + ['test_suite.c'], LINKFLAGS='--coverage' if testenv.GetOption('coverage') else None)
+    ctest = Alias('testc', [ctestexec], ''.join(['env LD_LIBRARY_PATH=', os.path.dirname(ctestexec[0].path), ' ', ctestexec[0].path]))
+    AlwaysBuild(ctest)
+    testruns.append(ctest)

-Export("libhammer_static libhammer_shared")
+if libhammer_shared is not None:
+    Export('libhammer_static libhammer_shared')
+else:
+    Export('libhammer_static')

 for b in env['bindings']:
-    env.SConscript(["bindings/%s/SConscript" % b])
+    env.SConscript(['bindings/%s/SConscript' % b])
--- a/src/allocator.c
+++ b/src/allocator.c
@@ -18,6 +18,7 @@
 #include <string.h>
 #include <stdint.h>
 #include <sys/types.h>
+#include <setjmp.h>

 #include "hammer.h"
 #include "internal.h"
@@ -28,28 +29,56 @@ struct arena_link {
  // For efficiency, we should probably allocate the arena links in 
  // their own slice, and link to a block directly. That can be
  // implemented later, though, with no change in interface.
-  struct arena_link *next; // It is crucial that this be the first item; so that 
-                           // any arena link can be casted to struct arena_link**.
-
+  struct arena_link *next;
  size_t free;
  size_t used;
  uint8_t rest[];
-} ;
+};

 struct HArena_ {
  struct arena_link *head;
  struct HAllocator_ *mm__;
+  /* does mm__ zero blocks for us? */
+  bool malloc_zeros;
  size_t block_size;
  size_t used;
  size_t wasted;
+#ifdef DETAILED_ARENA_STATS
+  size_t mm_malloc_count, mm_malloc_bytes;
+  size_t memset_count, memset_bytes;
+  size_t arena_malloc_count, arena_malloc_bytes;
+  size_t arena_su_malloc_count, arena_su_malloc_bytes;
+  size_t arena_si_malloc_count, arena_si_malloc_bytes;
+  size_t arena_lu_malloc_count, arena_lu_malloc_bytes;
+  size_t arena_li_malloc_count, arena_li_malloc_bytes;
+#endif
+
+  jmp_buf *except;
 };

+static void * h_arena_malloc_raw(HArena *arena, size_t size, bool need_zero);
+
+/*
+ * Allocate 'size' bytes through the allocator mm__.  On failure the error
+ * is reported via h_platform_errx() with status 1; the allocator's result
+ * is what gets returned.
+ */
+void* h_alloc(HAllocator* mm__, size_t size) {
+  void *block = mm__->alloc(mm__, size);
+  if (block != NULL)
+    return block;
+
+  h_platform_errx(1, "memory allocation failed (%zuB requested)\n", size);
+  return block;
+}
+
+/*
+ * Resize the block 'ptr' to 'size' bytes through the allocator mm__.  On
+ * failure the error is reported via h_platform_errx() with status 1; the
+ * allocator's result is what gets returned.
+ */
+void* h_realloc(HAllocator* mm__, void* ptr, size_t size) {
+  void *block = mm__->realloc(mm__, ptr, size);
+  if (block != NULL)
+    return block;
+
+  h_platform_errx(1, "memory reallocation failed (%zuB requested)\n", size);
+  return block;
+}
+
 HArena *h_new_arena(HAllocator* mm__, size_t block_size) {
  if (block_size == 0)
    block_size = 4096;
  struct HArena_ *ret = h_new(struct HArena_, 1);
-  struct arena_link *link = (struct arena_link*)mm__->alloc(mm__, sizeof(struct arena_link) + block_size);
-  memset(link, 0, sizeof(struct arena_link) + block_size);
+  struct arena_link *link = (struct arena_link*)h_alloc(mm__, sizeof(struct arena_link) + block_size);
+  assert(ret != NULL);
+  assert(link != NULL);
  link->free = block_size;
  link->used = 0;
  link->next = NULL;
@@ -57,41 +86,154 @@ HArena *h_new_arena(HAllocator* mm__, size_t block_size) {
  ret->block_size = block_size;
  ret->used = 0;
  ret->mm__ = mm__;
+#ifdef DETAILED_ARENA_STATS
+  ret->mm_malloc_count = 2;
+  ret->mm_malloc_bytes = sizeof(*ret) + sizeof(struct arena_link) + block_size;
+  ret->memset_count = 0;
+  ret->memset_bytes = 0;
+  ret->arena_malloc_count = ret->arena_malloc_bytes = 0;
+  ret->arena_su_malloc_count = ret->arena_su_malloc_bytes = 0;
+  ret->arena_si_malloc_count = ret->arena_si_malloc_bytes = 0;
+  ret->arena_lu_malloc_count = ret->arena_lu_malloc_bytes = 0;
+  ret->arena_li_malloc_count = ret->arena_li_malloc_bytes = 0;
+#endif
+  /* XXX provide a mechanism to indicate mm__ returns zeroed blocks */
+  ret->malloc_zeros = false;
  ret->wasted = sizeof(struct arena_link) + sizeof(struct HArena_) + block_size;
+  ret->except = NULL;
  return ret;
 }

-void* h_arena_malloc(HArena *arena, size_t size) {
+/*
+ * Store the jump buffer that alloc_block() longjmp()s to when the backing
+ * allocator returns NULL. Passing NULL leaves no jump target, in which
+ * case alloc_block() reports the failure through h_platform_errx().
+ */
+void h_arena_set_except(HArena *arena, jmp_buf *except)
+{
+  arena->except = except;
+}
+
+/*
+ * Request size bytes from the arena's backing allocator.
+ *
+ * On failure: if a jump target was installed with h_arena_set_except(),
+ * longjmp() to it with value 1; otherwise report the failure through
+ * h_platform_errx().
+ */
+static void *alloc_block(HArena *arena, size_t size)
+{
+  void *block = arena->mm__->alloc(arena->mm__, size);
+  if (!block) {
+    if (arena->except)
+      longjmp(*arena->except, 1);
+    /* %zu matches size_t; a cast to unsigned int would misreport requests above UINT_MAX */
+    h_platform_errx(1, "memory allocation failed (%zuB requested)\n", size);
+  }
+  return block;
+}
+
+/* Allocate size bytes from the arena without requesting zero-fill. */
+void * h_arena_malloc_noinit(HArena *arena, size_t size) {
+  return h_arena_malloc_raw(arena, size, false);
+}
+
+/* Allocate size bytes from the arena and request that they be zero-filled. */
+void * h_arena_malloc(HArena *arena, size_t size) {
+  return h_arena_malloc_raw(arena, size, true);
+}
+
+/*
+ * Common implementation behind h_arena_malloc() and h_arena_malloc_noinit().
+ *
+ * Serves the request from one of three places:
+ *   - the free tail of the head block, when size fits there;
+ *   - a dedicated block linked in just behind the head, when size exceeds
+ *     the arena's block_size;
+ *   - a fresh ordinary block that becomes the new head, otherwise.
+ *
+ * When need_zero is set and the arena is not flagged malloc_zeros, the
+ * returned region is cleared with memset(). Failure of the backing
+ * allocator is handled inside alloc_block().
+ */
+static void * h_arena_malloc_raw(HArena *arena, size_t size,
+                                 bool need_zero) {
+  struct arena_link *link = NULL;
+  void *ret = NULL;
+
  if (size <= arena->head->free) {
-    // fast path..
-    void* ret = arena->head->rest + arena->head->used;
+    /* fast path.. */
+    ret = arena->head->rest + arena->head->used;
    arena->used += size;
    arena->wasted -= size;
    arena->head->used += size;
    arena->head->free -= size;
-    return ret;
+
+#ifdef DETAILED_ARENA_STATS
+    ++(arena->arena_malloc_count);
+    arena->arena_malloc_bytes += size;
+    if (need_zero) {
+      ++(arena->arena_si_malloc_count);
+      arena->arena_si_malloc_bytes += size;
+    } else {
+      ++(arena->arena_su_malloc_count);
+      arena->arena_su_malloc_bytes += size;
+    }
+#endif
  } else if (size > arena->block_size) {
-    // We need a new, dedicated block for it, because it won't fit in a standard sized one.
-    // This involves some annoying casting...
+    /*
+     * We need a new, dedicated block for it, because it won't fit in a
+     * standard sized one.
+     *
+     * NOTE:
+     *
+     * We used to do a silly casting dance to treat blocks like this
+     * as special cases and make the used/free fields part of the allocated
+     * block, but the old code was not really proper portable C and depended
+     * on a bunch of implementation-specific behavior.  We could have done it
+     * better with a union in struct arena_link, but the memory savings is
+     * only 0.39% for a 64-bit machine, a 4096-byte block size and all
+     * large allocations *only just one byte* over the block size, so I
+     * question the utility of it.  We do still slip the large block in
+     * one position behind the list head so it doesn't cut off a partially
+     * filled list head.
+     *
+     * -- andrea
+     */
+    link = alloc_block(arena, size + sizeof(struct arena_link));
+    assert(link != NULL);
+#ifdef DETAILED_ARENA_STATS
+    /* count this backing-allocator request too, as the ordinary-block path does */
+    ++(arena->mm_malloc_count);
+    arena->mm_malloc_bytes += size + sizeof(struct arena_link);
+#endif
    arena->used += size;
-    arena->wasted += sizeof(struct arena_link*);
-    void* link = arena->mm__->alloc(arena->mm__, size + sizeof(struct arena_link*));
-    memset(link, 0, size + sizeof(struct arena_link*));
-    *(struct arena_link**)link = arena->head->next;
-    arena->head->next = (struct arena_link*)link;
-    return (void*)(((uint8_t*)link) + sizeof(struct arena_link*));
+    arena->wasted += sizeof(struct arena_link);
+    link->used = size;
+    link->free = 0;
+    link->next = arena->head->next;
+    arena->head->next = link;
+    ret = link->rest;
+
+#ifdef DETAILED_ARENA_STATS
+    ++(arena->arena_malloc_count);
+    arena->arena_malloc_bytes += size;
+    if (need_zero) {
+      ++(arena->arena_li_malloc_count);
+      arena->arena_li_malloc_bytes += size;
+    } else {
+      ++(arena->arena_lu_malloc_count);
+      arena->arena_lu_malloc_bytes += size;
+    }
+#endif
  } else {
-    // we just need to allocate an ordinary new block.
-    struct arena_link *link = (struct arena_link*)arena->mm__->alloc(arena->mm__, sizeof(struct arena_link) + arena->block_size);
-    memset(link, 0, sizeof(struct arena_link) + arena->block_size);
+    /* we just need to allocate an ordinary new block. */
+    link = alloc_block(arena, sizeof(struct arena_link) + arena->block_size);
+    assert(link != NULL);
+#ifdef DETAILED_ARENA_STATS
+    ++(arena->mm_malloc_count);
+    arena->mm_malloc_bytes += sizeof(struct arena_link) + arena->block_size;
+#endif
    link->free = arena->block_size - size;
    link->used = size;
    link->next = arena->head;
    arena->head = link;
    arena->used += size;
    arena->wasted += sizeof(struct arena_link) + arena->block_size - size;
-    return link->rest;
+    ret = link->rest;
+
+#ifdef DETAILED_ARENA_STATS
+    ++(arena->arena_malloc_count);
+    arena->arena_malloc_bytes += size;
+    if (need_zero) {
+      ++(arena->arena_si_malloc_count);
+      arena->arena_si_malloc_bytes += size;
+    } else {
+      ++(arena->arena_su_malloc_count);
+      arena->arena_su_malloc_bytes += size;
+    }
+#endif
  }
+
+  /*
+   * Zeroize if necessary
+   */
+  if (need_zero && !(arena->malloc_zeros)) {
+    memset(ret, 0, size);
+#ifdef DETAILED_ARENA_STATS
+    ++(arena->memset_count);
+    arena->memset_bytes += size;
+#endif
+  }
+
+  return ret;
 }

 void h_arena_free(HArena *arena, void* ptr) {
@@ -115,4 +257,49 @@ void h_delete_arena(HArena *arena) {
+/*
+ * Copy the arena's counters into *stats: used and wasted always, plus the
+ * detailed per-category counters when DETAILED_ARENA_STATS is defined.
+ */
 void h_allocator_stats(HArena *arena, HArenaStats *stats) {
  stats->used = arena->used;
  stats->wasted = arena->wasted;
+#ifdef DETAILED_ARENA_STATS
+  stats->mm_malloc_count = arena->mm_malloc_count;
+  stats->mm_malloc_bytes = arena->mm_malloc_bytes;
+  stats->memset_count = arena->memset_count;
+  stats->memset_bytes = arena->memset_bytes;
+  stats->arena_malloc_count = arena->arena_malloc_count;
+  stats->arena_malloc_bytes = arena->arena_malloc_bytes;
+  stats->arena_su_malloc_count = arena->arena_su_malloc_count;
+  stats->arena_su_malloc_bytes = arena->arena_su_malloc_bytes;
+  stats->arena_si_malloc_count = arena->arena_si_malloc_count;
+  stats->arena_si_malloc_bytes = arena->arena_si_malloc_bytes;
+  stats->arena_lu_malloc_count = arena->arena_lu_malloc_count;
+  stats->arena_lu_malloc_bytes = arena->arena_lu_malloc_bytes;
+  stats->arena_li_malloc_count = arena->arena_li_malloc_count;
+  stats->arena_li_malloc_bytes = arena->arena_li_malloc_bytes;
+#endif
+}
+
+/*
+ * Resize an arena allocation by allocating a fresh region of n bytes and
+ * copying the old contents into it.
+ *
+ * arena: arena that ptr was allocated from.
+ * ptr:   existing allocation, or NULL to simply allocate n bytes.
+ * n:     requested size of the new allocation.
+ *
+ * Returns the new allocation; bytes beyond the copied prefix are not
+ * initialized. A non-NULL ptr must lie within one of the arena's blocks
+ * (asserted).
+ */
+void* h_arena_realloc(HArena *arena, void* ptr, size_t n) {
+  struct arena_link *link;
+  const uint8_t *old = ptr;
+  void* ret;
+  size_t ncopy;
+
+  /* like realloc(): with no old allocation there is nothing to find or copy */
+  if (old == NULL)
+    return h_arena_malloc_noinit(arena, n);
+
+  // XXX this is really wasteful, but maybe better than nothing?
+  //
+  // first, we walk the blocks to find our ptr. since we don't know how large
+  // the original allocation was, we must always make a new one and copy as
+  // much data from the old block as there could have been.
+
+  // the bounds are computed on uint8_t pointers: arithmetic on void * is a
+  // GNU extension, not standard C.
+  for (link = arena->head; link; link = link->next) {
+    if (old >= link->rest && old <= link->rest + link->used)
+      break;	/* found it */
+  }
+  assert(link != NULL);
+
+  // everything from ptr to the end of the block's used region may belong
+  // to the old allocation
+  ncopy = (size_t)(link->rest + link->used - old);
+  if (n < ncopy)
+    ncopy = n;
+
+  ret = h_arena_malloc_noinit(arena, n);
+  assert(ret != NULL);
+  memcpy(ret, old, ncopy);
+  h_arena_free(arena, ptr);
+
+  return ret;
 }
--- a/src/allocator.h
+++ b/src/allocator.h
@@ -18,6 +18,27 @@
 #ifndef HAMMER_ALLOCATOR__H__
 #define HAMMER_ALLOCATOR__H__
 #include <sys/types.h>
+#include <setjmp.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * ATTR_MALLOC(n): annotate an allocator-like function whose n-th parameter
+ * (1-based) is the requested size in bytes. Under __llvm__ only the malloc
+ * attribute is applied (when available); under SWIG and unknown compilers
+ * the macro expands to nothing.
+ */
+#if defined __llvm__
+# if __has_attribute(malloc)
+#   define ATTR_MALLOC(n) __attribute__((malloc))
+# else
+#   define ATTR_MALLOC(n)
+# endif
+#elif defined SWIG
+# define ATTR_MALLOC(n)
+#elif defined __GNUC__
+# define ATTR_MALLOC(n) __attribute__((malloc, alloc_size(n)))
+#else
+# define ATTR_MALLOC(n)
+#endif
+
+/* #define DETAILED_ARENA_STATS */

 // TODO(thequux): Turn this into an "HAllocatorVtable", and add a wrapper that also takes an environment pointer.
 typedef struct HAllocator_ {
@@ -26,23 +47,49 @@ typedef struct HAllocator_ {
  void (*free)(struct HAllocator_* allocator, void* ptr);
 } HAllocator;

+void* h_alloc(HAllocator* allocator, size_t size) ATTR_MALLOC(2);
+void* h_realloc(HAllocator* allocator, void* ptr, size_t size);
+
 typedef struct HArena_ HArena ; // hidden implementation

 HArena *h_new_arena(HAllocator* allocator, size_t block_size); // pass 0 for default...
-#ifndef SWIG
-void* h_arena_malloc(HArena *arena, size_t count) __attribute__(( malloc, alloc_size(2) ));
-#else
-void* h_arena_malloc(HArena *arena, size_t count);
-#endif
+
+void* h_arena_malloc_noinit(HArena *arena, size_t count) ATTR_MALLOC(2);
+void* h_arena_malloc(HArena *arena, size_t count) ATTR_MALLOC(2);
+void* h_arena_realloc(HArena *arena, void* ptr, size_t count);
 void h_arena_free(HArena *arena, void* ptr); // For future expansion, with alternate memory managers.
 void h_delete_arena(HArena *arena);
+void h_arena_set_except(HArena *arena, jmp_buf *except);

+/* Snapshot of an arena's counters, as filled in by h_allocator_stats(). */
 typedef struct {
  size_t used;
  size_t wasted;
+#ifdef DETAILED_ARENA_STATS
+  /* requests made to the backing allocator */
+  size_t mm_malloc_count;
+  size_t mm_malloc_bytes;
+  /* zero-fill passes performed by the arena */
+  size_t memset_count;
+  size_t memset_bytes;
+  /* all allocations served by the arena */
+  size_t arena_malloc_count;
+  size_t arena_malloc_bytes;
+  /* small, uninited */
+  size_t arena_su_malloc_count;
+  size_t arena_su_malloc_bytes;
+  /* small, inited */
+  size_t arena_si_malloc_count;
+  size_t arena_si_malloc_bytes;
+  /* large, uninited */
+  size_t arena_lu_malloc_count;
+  size_t arena_lu_malloc_bytes;
+  /* large, inited */
+  size_t arena_li_malloc_count;
+  size_t arena_li_malloc_bytes;
+#endif
 } HArenaStats;

 void h_allocator_stats(HArena *arena, HArenaStats *stats);

+#ifdef __cplusplus
+}
+#endif

-#endif // #ifndef LIB_ALLOCATOR__H__
+#endif // #ifndef HAMMER_ALLOCATOR__H__
No results found