diff --git a/izodparse/ok.py b/izodparse/ok.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee916463a6a7ed964a267dccccafeb57fd792811
--- /dev/null
+++ b/izodparse/ok.py
@@ -0,0 +1,8 @@
+"Testing utilities."
+
+def ok(a, b): assert a == b, (a, b)  # on mismatch, the AssertionError carries both values
+
+def please_be(a, b):  # like ok(), but raises ValueError rather than relying on assert
+    if a != b:
+        raise ValueError(a, '!=', b)
+
diff --git a/izodparse/pdftour.py b/izodparse/pdftour.py
index 3a48cbc93d0de76b9a3d5c8f43a1c587fde18976..fdb9c9cb320e58a817ad3829b7d57bb83edd3552 100755
--- a/izodparse/pdftour.py
+++ b/izodparse/pdftour.py
@@ -55,7 +55,8 @@ We can attempt parsing at that point with an ad-hoc custom grammar:
 
 We can get a debug dump of a grammar:
 
-    >>> print(pdftour.show_grammar(pdftour.xref_header))
+    >>> from izodparse.peg import show_grammar
+    >>> print(show_grammar(pdftour.xref_header))
     S: (((('xref' eol) ex0) ' ') ex0) eol
     eol: ('\r\n' | '\r') | '\n'
     ex0: '+' | '-' | '' digit digit*
@@ -97,7 +98,7 @@ page contents:
     b'\x00)', b'\x00\x0f', b'\x00\x0f', b'\x00,', b'\x00\x10',
     b'\x00\x10', b'\x00-', b'\x00\x11', b'\x00\x11', b'\x00.',
     b'\x00\x12', b'\x00\x12', b'\x00/', b'\x00\x13...
-    >>> print(pdftour.show_grammar(pdftour.csranges_to_grammar(_[0])))
+    >>> print(show_grammar(pdftour.csranges_to_grammar(_[0])))
     S: ([\x00-\xff] [\x00-\xff])*
 
 Also, we can pull out page contents:
@@ -114,15 +115,6 @@ Also, we can pull out page contents:
     0 G
     0 188.5 m
 
-Too slow to use in practice (30 kilobytes/sec), but hopefully
-validates our understanding of the problem and communicates it more
-clearly than a lower-level language would.
-
-A pitfall I've run into a lot with the PEG parser in here is that if
-you try to parse a Unicode string with a byte-string grammar or vice
-versa, you just get silent failures to parse.  This is a specific case
-of the more general problem with silent failures.
-
 Still unimplemented:
 
 - CMap bfranges that *use* arrays;
@@ -150,458 +142,12 @@ You should have received a copy of the GNU General Public License
 along with izodparse.  If not, see <http://www.gnu.org/licenses/>.
 
 """
-import sys, types, functools, operator, zlib
-
-
-class memoprop:
-    """"Simplified, non-multithreaded version of functools.cached_property.
-
-    For Pythons earlier than 3.8.  Doesn't support __slots__, custom
-    __dict__, etc.
-
-    """
-    def __init__(self, func):
-        self.func = func
-
-    def __get__(self, instance, cls):
-        cache = instance.__dict__
-        if self not in cache:
-            cache[self] = self.func(instance)
-        return cache[self]
-
-def please_be(a, b):
-    if a != b:
-        raise ValueError(a, '!=', b)
-
-
-### Packrat parsing engine with predictive lookahead parsing ###
-
-# First, some debugging utilities:
-def represent_cset(cs):
-    "Debugging helper for understanding first sets; takes a set() of ints or chars."
-    if any(isinstance(c, str) for c in cs):
-        return represent_uset(cs)
-    
-    ranges = []
-    for bt in sorted(cs):
-        if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs):
-            ranges[0:0] = [(bt, bt)]
-            continue
-            
-        if ranges and bt == ranges[-1][-1] + 1:
-            ranges[-1] = ranges[-1][0], bt
-        else:
-            ranges.append((bt, bt))
-
-    rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b))
-                 for a, b in ranges]
-    return '[%s]' % repr(b''.join(rangereps))[2:-1]
-
-def represent_uset(cs):
-    "Unicode version of represent_cset."
-    ranges = []
-    for c in sorted(cs):
-        if c == '-' and not (',' in cs and '.' in cs): # previous and following characters
-            ranges[0:0] = [(c, c)]
-            continue
-
-        c = ord(c)
-        if ranges and c == ranges[-1][-1] + 1:
-            ranges[-1] = ranges[-1][0], c
-        else:
-            ranges.append((c, c))
-
-    rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b)
-                 for a, b in ranges]
-    return '[%s]' % ''.join(rangereps)
-
-
-class GrammarVisitor:
-    """Compute global properties of a grammar, like what expressions are referenced twice.
-
-    This is useful for printing out a readable representation of a
-    grammar.  (Also, though we don't use this fact, singly-referenced
-    parsing expressions can never gain any benefit from the memo
-    table.)
-
-    This version of Visitor deviates a bit from the standard pattern,
-    because our grammars are cyclic, so simply having the client
-    peremptorily invoke .accept on each of its children would loop
-    infinitely.  Instead, we have the client invoke .inform_arc, which
-    allows the Visitor to decide whether to proceed to invoking
-    .accept.  And then the .visit or .visit_type method doesn't need
-    to exist at all; .accept merely invokes .inform_arc for each
-    child.  (This has the bug that if the root node has no children,
-    we'll never know about it.)
-
-    """
-    # A few nits that could be cleaned up:
-    # - negative charsets would make things like `regchar:
-    #   [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable.
-    #   This would involve shunting ^ to be not first if it's first I guess.
-    # - `[[]]` is not okay.  `-` gets pulled to the front; `]` should
-    #   be too.  Not sure how to handle set(['-', ']']) but it hasn't arisen yet.
-    # - It would be beneficial to take advantage of the associativity
-    #   of | and + to reduce parens.  That might be better than the
-    #   current precedence scheme.
-
-    def __init__(self):
-        self.nodes = set()
-        self.parents_of = {}
-        self.roots = set()
-
-    def inform_arc(self, parent, child):
-        # In the normal case, we got here because we invoked .accept
-        # on parent, so it'll already be in .nodes; but not for the
-        # initial entry point.
-        if parent not in self.nodes:
-            self.roots.add(parent)
-            self.nodes.add(parent)
-
-        if child not in self.parents_of:
-            self.parents_of[child] = []
-        self.parents_of[child].append(parent)
-
-        if child not in self.nodes:
-            self.nodes.add(child)
-            child.accept(self)
-
-    def show(self, rootname='S'):
-        "Dump out a human-readable grammar."
-        toplevel = (list(self.roots) +
-                    [n for n in self.nodes - self.roots
-                     if n.name is not None
-                     or len(self.parents_of[n]) != 1])
-
-        # Assign names to all top-level nodes so they can be referenced.
-        names = {}
-        i = 0
-
-        for n in toplevel:
-            if n.name is None and n == toplevel[0]:
-                names[n] = rootname
-            elif n.name is None:
-                names[n] = 'ex%d' % i
-                i += 1
-            else:
-                names[n] = n.name
-
-        # Traverse the grammar a second time and output the
-        # description of each top-level rule.
-        output = []
-        for n in toplevel:
-            output.append('%s: ' % names[n])
-            output.append(self.pprint(n, names, 0, 0, top=True))
-            output.append('\n')
-
-        return ''.join(output)
-
-    def pprint(self, prod, names, precedence_left, precedence_right, top=False):
-        "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly."
-        if prod in names and not top:
-            return names[prod]
-
-        if isinstance(prod, Any):
-            return '%s*' % self.pprint(prod.body, names, 20, 20)
-        elif isinstance(prod, Cat):
-            v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5),
-                           self.pprint(prod.b, names, 5, precedence_right))
-            return v if precedence_left <= 5 > precedence_right else '(%s)' % v
-        elif isinstance(prod, Charset):
-            return represent_cset(prod.first)
-        elif isinstance(prod, Alt):
-            v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10),
-                             self.pprint(prod.b, names, 10, precedence_right))
-            return v if precedence_left <= 10 > precedence_right else '(%s)' % v
-        elif isinstance(prod, Lit):
-            return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8'))
-        elif isinstance(prod, Thunk):
-            return self.pprint(prod.forced, names, precedence_left, precedence_right)
-        elif isinstance(prod, Tell):
-            return '@'
-
-        else:
-            return str(prod)
-
-
-def show_grammar(prod, rootname='S'):
-    v = GrammarVisitor()
-    prod.accept(v)
-    return v.show(rootname)
-
-
-class Parse:
-    "A Packrat parse of a string or byte string."
-    def __init__(self, s, trace=lambda *args: None):
-        "s is the string to parse.  Pass trace=print to enable verbose logging."
-        self.s = s
-        self.memos = {}
-        self.trace = trace
-
-    def do(self, pos, ex):
-        "Attempt to parse using production/parsing expression `ex` starting at `pos`."
-        k = pos, ex
-        self.trace("parsing %s at %d" % (ex, pos))
-        if k in self.memos:
-            self.trace("memo hit")
-            return self.memos[k]
-        result = ex.parse(self, pos)
-        if result and ex.xform:
-            result = result[0], ex.xform(result[1])
-
-        # if len(self.memos) > 16384:
-        #     self.memos.clear()
-        self.memos[k] = result
-        self.trace("%s returns %s" % (ex, result))
-        return result
-
-
-class Prod:
-    "Base class for grammar productions, i.e., parsing expression types."
-    xform = None # Hook for post-processing.  This turns out to be the wrong thing
-    name = None  # Human-readable tag for debugging output
-
-    def __str__(self):
-        return self.debugstr
-
-    @memoprop
-    def debugstr(self):
-        "Used for tracing."
-        first = represent_cset(self.first)
-        if self.name:
-            return self.name + first
-        else:
-            return '<%s%s>' % (self.__class__.__name__, first)
-
-    def __add__(self, other):
-        "Concatenation of parsing expressions."
-        return Cat(self, as_prod(other))
-    def __radd__(self, other):
-        return as_prod(other) + self
-
-    def __or__(self, other):
-        "Alternation (ordered choice) of parsing expressions."
-        return Alt(self, as_prod(other))
-    def __ror__(self, other):
-        return as_prod(other) | self
-
-    def some(self):
-        """One or more repetitions.
-
-        The possibility of overwriting its .xform shows why .xform is bad.
-        """
-        result = self + Any(self)
-        result.xform = lambda d: [d[0]] + d[1]
-        return result
-    
-
-def as_prod(datum):
-    "Coerce an arbitrary thing into a grammar production."
-    if isinstance(datum, Prod):
-        return datum
-
-    if isinstance(datum, bytes) or isinstance(datum, str):
-        return Lit(datum)
-
-    if isinstance(datum, list):
-        if len(datum) == 1:
-            return as_prod(datum[0])
-        return as_prod(datum[0]) | datum[1:] 
-
-    if isinstance(datum, types.FunctionType):
-        return Thunk(datum)
-
-    raise ValueError(datum)
-
-
-class Lit(Prod):
-    "A parsing expression that matches a literal string or byte string."
-    def __init__(self, s):
-        self.s = s
-        self.first = {s[0]} if s else set()
-        self.nullable = not s
-
-    def parse(self, parse, pos):
-        npos = pos + len(self.s)
-        if parse.s[pos:npos] == self.s:
-            return npos, self.s
-
-    def accept(self, visitor):
-        pass
-
-def ok(a, b): assert a == b, (a, b)
-ok(Parse("hello").do(0, Lit("hel")), (3, "hel"))
-ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel"))
-ok(Parse("hello").do(0, Lit("hec")), None)
-
-
-class Cat(Prod):
-    "A parsing expression that matches the concatenation of two productions."
-    def __init__(self, a, b):
-        self.a, self.b = a, b
-
-    def parse(self, parse, pos):
-        a = parse.do(pos, self.a)
-        if a is None:
-            return None
-        b = parse.do(a[0], self.b)
-        if b is None:
-            return None
-        return b[0], (a[1], b[1])
-
-    # These properties, used for predictive parsing, are lazily
-    # computed so that you can finish constructing a cyclic graph
-    # using Thunks before evaluating them.
-    @memoprop
-    def nullable(self):
-        return self.a.nullable and self.b.nullable
-
-    @memoprop
-    def first(self):
-        return self.a.first | self.b.first if self.a.nullable else self.a.first
-
-    def accept(self, visitor):
-        visitor.inform_arc(self, self.a)
-        visitor.inform_arc(self, self.b)
-
-assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll'))
-assert Parse("hello").do(0, Lit('he') + 'lc') is None
-
-
-class Alt(Prod):
-    "Matches the ordered-choice alternation of two productions."
-    def __init__(self, a, b):
-        self.a, self.b = a, b
-
-    def parse(self, parse, pos):
-        c = parse.s[pos] if pos < len(parse.s) else None
-        # This form of lookahead increases speed by only about 30%
-        # to about 32 microseconds per byte.
-        if self.a.nullable or c in self.a.first:
-            a = parse.do(pos, self.a)
-            if a:
-                return a
-
-        if self.b.nullable or c in self.b.first:
-            return parse.do(pos, self.b)
-
-        return None
-
-    @memoprop
-    def nullable(self):
-        return self.a.nullable or self.b.nullable
-
-    @memoprop
-    def first(self):
-        return self.a.first | self.b.first
-
-    def accept(self, visitor):
-        visitor.inform_arc(self, self.a)
-        visitor.inform_arc(self, self.b)
-
-assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h')
-assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h')
-assert Parse("hello").do(0, Lit('x') | 'y') == None
-assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h')
-assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he')
-
-
-# To avoid Python's recursion depth when parsing nontrivial inputs,
-# this is not defined in terms of Alt and Thunk, though in theory it
-# could be.  It turns out to be simpler this way because you don't
-# have to post-process a cons list into a Python list.
-class Any(Prod):
-    "Kleene-closure parsing expression: matches zero or more repetitions of a production."
-    def __init__(self, body):
-        self.body = body
-        self.nullable = True
-
-    @memoprop
-    def first(self):
-        return self.body.first
-
-    def parse(self, parse, pos):
-        results = []
-        while True:
-            # Note, not taking advantage of predictive parsing in this
-            # case.  Maybe I should.
-            kid = parse.do(pos, self.body)
-            if not kid:
-                return pos, results
-            results.append(kid[1])
-            pos = kid[0]
-
-    def accept(self, visitor):
-        visitor.inform_arc(self, self.body)
-
-
-assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la'])
-assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, [])
-            
-
-class Thunk(Prod):
-    """A production whose definition is deferred until later.
-
-    This allows the construction of cyclic grammars.  In theory it
-    also allows the grammar to be constructed lazily and thus grow
-    during the parse.
-    """
-    def __init__(self, body):
-        self.body = body
-
-    @memoprop
-    def forced(self):
-        """Memoized property that ensures we only evaluate our body once.
-        
-        This doesn't affect performance, apparently, but semantically
-        it ensures that the meaning of the production doesn't change
-        over time.  It still doesn't prevent you from creating lazily
-        computed infinite grammars, though...
-
-        """
-        return self.body()
-
-    def parse(self, parse, pos):
-        return self.forced.parse(parse, pos)
-
-    @property
-    def nullable(self):
-        return self.forced.nullable
-
-    @property
-    def first(self):
-        return self.forced.first
-
-    def accept(self, visitor):
-        visitor.inform_arc(self, self.forced)
-
-
-class Charset(Prod):
-    "A parsing expression that matches any byte or character from a set()."
-    def __init__(self, cset):
-        self.first = cset
-        self.nullable = False
-
-    def parse(self, parse, pos):
-        if pos >= len(parse.s):
-            return None
-        c = parse.s[pos]
-        if c in self.first:
-            return (pos + 1, c)
-
-    def accept(self, visitor):
-        pass
-
-class Tell(Prod):
-    "Consumes no characters but 'parses' the current parse position."
-    first = set()
-    nullable = True
-
-    def parse(self, parse, pos):
-        return pos, pos
+import functools
+import operator
+import zlib
 
-    def accept(self, visitor):
-        pass
+from .peg import Any, Charset, Lit, Thunk, Tell, Parse, as_prod
+from .ok import ok, please_be
 
 
 ### PDF and PostScript and CMap parsing ###
diff --git a/izodparse/peg.py b/izodparse/peg.py
new file mode 100644
index 0000000000000000000000000000000000000000..42cfaa3f47ce01f52286a08fe773fdf767d1409e
--- /dev/null
+++ b/izodparse/peg.py
@@ -0,0 +1,477 @@
+#!/usr/bin/python3
+r"""A small packrat PEG parsing engine.
+
+Too slow to use in practice (30 kilobytes/sec), but hopefully
+validates our understanding of the problem and communicates it more
+clearly than a lower-level language would.
+
+A pitfall I've run into a lot with the PEG parser in here is that if
+you try to parse a Unicode string with a byte-string grammar or vice
+versa, you just get silent failures to parse.  This is a specific case
+of the more general problem with silent failures.
+
+This file is part of izodparse.
+
+izodparse is free software: you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation, either version 3 of the License, or (at your
+option) any later version.
+
+izodparse is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with izodparse.  If not, see <http://www.gnu.org/licenses/>.
+
+"""
+import types
+
+from .ok import ok, please_be
+
+
+class memoprop:
+    """Simplified, non-multithreaded version of functools.cached_property.
+
+    For Pythons earlier than 3.8.  Doesn't support __slots__, custom
+    __dict__, etc.
+    """
+    def __init__(self, func):
+        self.func = func
+
+    def __get__(self, instance, cls):
+        if instance is None:  # looked up on the class: return the descriptor itself
+            return self
+        if self not in instance.__dict__:  # cache is keyed by this descriptor object
+            instance.__dict__[self] = self.func(instance)
+        return instance.__dict__[self]
+
+
+### Packrat parsing engine with predictive lookahead parsing ###
+
+# First, some debugging utilities:
+def represent_cset(cs):
+    "Debugging helper for understanding first sets; takes a set() of ints or chars."
+    if any(isinstance(c, str) for c in cs):
+        return represent_uset(cs)
+    
+    ranges = []
+    for bt in sorted(cs):
+        if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs):
+            ranges[0:0] = [(bt, bt)]
+            continue
+            
+        if ranges and bt == ranges[-1][-1] + 1:
+            ranges[-1] = ranges[-1][0], bt
+        else:
+            ranges.append((bt, bt))
+
+    rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b))
+                 for a, b in ranges]
+    return '[%s]' % repr(b''.join(rangereps))[2:-1]
+
+def represent_uset(cs):
+    "Unicode version of represent_cset."
+    ranges = []
+    for c in sorted(cs):
+        if c == '-' and not (',' in cs and '.' in cs): # previous and following characters
+            ranges[0:0] = [(c, c)]
+            continue
+
+        c = ord(c)
+        if ranges and c == ranges[-1][-1] + 1:
+            ranges[-1] = ranges[-1][0], c
+        else:
+            ranges.append((c, c))
+
+    rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b)
+                 for a, b in ranges]
+    return '[%s]' % ''.join(rangereps)
+
+
+class GrammarVisitor:
+    """Compute global properties of a grammar, like what expressions are referenced twice.
+
+    This is useful for printing out a readable representation of a
+    grammar.  (Also, though we don't use this fact, singly-referenced
+    parsing expressions can never gain any benefit from the memo
+    table.)
+
+    This version of Visitor deviates a bit from the standard pattern,
+    because our grammars are cyclic, so simply having the client
+    peremptorily invoke .accept on each of its children would loop
+    infinitely.  Instead, we have the client invoke .inform_arc, which
+    allows the Visitor to decide whether to proceed to invoking
+    .accept.  And then the .visit or .visit_type method doesn't need
+    to exist at all; .accept merely invokes .inform_arc for each
+    child.  (This has the bug that if the root node has no children,
+    we'll never know about it.)
+
+    """
+    # A few nits that could be cleaned up:
+    # - negative charsets would make things like `regchar:
+    #   [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable.
+    #   This would involve shunting ^ to be not first if it's first I guess.
+    # - `[[]]` is not okay.  `-` gets pulled to the front; `]` should
+    #   be too.  Not sure how to handle set(['-', ']']) but it hasn't arisen yet.
+    # - It would be beneficial to take advantage of the associativity
+    #   of | and + to reduce parens.  That might be better than the
+    #   current precedence scheme.
+
+    def __init__(self):
+        self.nodes = set()
+        self.parents_of = {}
+        self.roots = set()
+
+    def inform_arc(self, parent, child):
+        # In the normal case, we got here because we invoked .accept
+        # on parent, so it'll already be in .nodes; but not for the
+        # initial entry point.
+        if parent not in self.nodes:
+            self.roots.add(parent)
+            self.nodes.add(parent)
+
+        if child not in self.parents_of:
+            self.parents_of[child] = []
+        self.parents_of[child].append(parent)
+
+        if child not in self.nodes:
+            self.nodes.add(child)
+            child.accept(self)
+
+    def show(self, rootname='S'):
+        "Dump out a human-readable grammar."
+        toplevel = (list(self.roots) +
+                    [n for n in self.nodes - self.roots
+                     if n.name is not None
+                     or len(self.parents_of[n]) != 1])
+
+        # Assign names to all top-level nodes so they can be referenced.
+        names = {}
+        i = 0
+
+        for n in toplevel:
+            if n.name is None and n == toplevel[0]:
+                names[n] = rootname
+            elif n.name is None:
+                names[n] = 'ex%d' % i
+                i += 1
+            else:
+                names[n] = n.name
+
+        # Traverse the grammar a second time and output the
+        # description of each top-level rule.
+        output = []
+        for n in toplevel:
+            output.append('%s: ' % names[n])
+            output.append(self.pprint(n, names, 0, 0, top=True))
+            output.append('\n')
+
+        return ''.join(output)
+
+    def pprint(self, prod, names, precedence_left, precedence_right, top=False):
+        "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly."
+        if prod in names and not top:
+            return names[prod]
+
+        if isinstance(prod, Any):
+            return '%s*' % self.pprint(prod.body, names, 20, 20)
+        elif isinstance(prod, Cat):
+            v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5),
+                           self.pprint(prod.b, names, 5, precedence_right))
+            return v if precedence_left <= 5 > precedence_right else '(%s)' % v
+        elif isinstance(prod, Charset):
+            return represent_cset(prod.first)
+        elif isinstance(prod, Alt):
+            v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10),
+                             self.pprint(prod.b, names, 10, precedence_right))
+            return v if precedence_left <= 10 > precedence_right else '(%s)' % v
+        elif isinstance(prod, Lit):  # undecodable bytes are escaped rather than raising
+            s = prod.s if isinstance(prod.s, str) else prod.s.decode('utf-8', 'backslashreplace')
+            return repr(s)
+        elif isinstance(prod, Thunk):
+            return self.pprint(prod.forced, names, precedence_left, precedence_right)
+        elif isinstance(prod, Tell):
+            return '@'
+        else:
+            return str(prod)
+
+
+def show_grammar(prod, rootname='S'):
+    v = GrammarVisitor()
+    prod.accept(v)  # NOTE: a childless root (e.g. a bare Lit) never calls inform_arc, so '' is returned
+    return v.show(rootname)
+
+
+class Parse:
+    "A Packrat parse of a string or byte string."
+    def __init__(self, s, trace=lambda *args: None):
+        "s is the string to parse.  Pass trace=print to enable verbose logging."
+        self.s = s
+        self.memos = {}
+        self.trace = trace
+
+    def do(self, pos, ex):
+        "Attempt to parse using production/parsing expression `ex` starting at `pos`."
+        k = pos, ex
+        self.trace("parsing %s at %d" % (ex, pos))
+        if k in self.memos:
+            self.trace("memo hit")
+            return self.memos[k]
+        result = ex.parse(self, pos)
+        if result and ex.xform:
+            result = result[0], ex.xform(result[1])
+
+        # if len(self.memos) > 16384:
+        #     self.memos.clear()
+        self.memos[k] = result
+        self.trace("%s returns %s" % (ex, result))
+        return result
+
+
+class Prod:
+    "Base class for grammar productions, i.e., parsing expression types."
+    xform = None # Hook for post-processing.  This turns out to be the wrong thing
+    name = None  # Human-readable tag for debugging output
+
+    def __str__(self):
+        return self.debugstr
+
+    @memoprop
+    def debugstr(self):
+        "Used for tracing."
+        first = represent_cset(self.first)
+        if self.name:
+            return self.name + first
+        else:
+            return '<%s%s>' % (self.__class__.__name__, first)
+
+    def __add__(self, other):
+        "Concatenation of parsing expressions."
+        return Cat(self, as_prod(other))
+    def __radd__(self, other):
+        return as_prod(other) + self
+
+    def __or__(self, other):
+        "Alternation (ordered choice) of parsing expressions."
+        return Alt(self, as_prod(other))
+    def __ror__(self, other):
+        return as_prod(other) | self
+
+    def some(self):
+        """One or more repetitions.
+
+        The possibility of overwriting its .xform shows why .xform is bad.
+        """
+        result = self + Any(self)
+        result.xform = lambda d: [d[0]] + d[1]
+        return result
+    
+
+def as_prod(datum):
+    "Coerce a Prod, str/bytes literal, list of alternatives, or function (thunk) into a production."
+    if isinstance(datum, Prod):
+        return datum
+
+    if isinstance(datum, (bytes, str)):
+        return Lit(datum)
+
+    if isinstance(datum, list) and datum:  # [] falls through to the ValueError below
+        if len(datum) == 1:
+            return as_prod(datum[0])
+        return as_prod(datum[0]) | datum[1:]
+
+    if isinstance(datum, types.FunctionType):
+        return Thunk(datum)
+
+    raise ValueError(datum)
+
+
+class Lit(Prod):
+    "A parsing expression that matches a literal string or byte string."
+    def __init__(self, s):
+        self.s = s
+        self.first = {s[0]} if s else set()
+        self.nullable = not s
+
+    def parse(self, parse, pos):
+        npos = pos + len(self.s)
+        if parse.s[pos:npos] == self.s:
+            return npos, self.s
+
+    def accept(self, visitor):
+        pass
+
+ok(Parse("hello").do(0, Lit("hel")), (3, "hel"))
+ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel"))
+ok(Parse("hello").do(0, Lit("hec")), None)
+
+
+class Cat(Prod):
+    "A parsing expression that matches the concatenation of two productions."
+    def __init__(self, a, b):
+        self.a, self.b = a, b
+
+    def parse(self, parse, pos):
+        a = parse.do(pos, self.a)
+        if a is None:
+            return None
+        b = parse.do(a[0], self.b)
+        if b is None:
+            return None
+        return b[0], (a[1], b[1])
+
+    # These properties, used for predictive parsing, are lazily
+    # computed so that you can finish constructing a cyclic graph
+    # using Thunks before evaluating them.
+    @memoprop
+    def nullable(self):
+        return self.a.nullable and self.b.nullable
+
+    @memoprop
+    def first(self):
+        return self.a.first | self.b.first if self.a.nullable else self.a.first
+
+    def accept(self, visitor):
+        visitor.inform_arc(self, self.a)
+        visitor.inform_arc(self, self.b)
+
+assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll'))
+assert Parse("hello").do(0, Lit('he') + 'lc') is None
+
+
+class Alt(Prod):
+    "Matches the ordered-choice alternation of two productions."
+    def __init__(self, a, b):
+        self.a, self.b = a, b
+
+    def parse(self, parse, pos):
+        c = parse.s[pos] if pos < len(parse.s) else None
+        # This form of lookahead increases speed by only about 30%
+        # to about 32 microseconds per byte.
+        if self.a.nullable or c in self.a.first:
+            a = parse.do(pos, self.a)
+            if a:
+                return a
+
+        if self.b.nullable or c in self.b.first:
+            return parse.do(pos, self.b)
+
+        return None
+
+    @memoprop
+    def nullable(self):
+        return self.a.nullable or self.b.nullable
+
+    @memoprop
+    def first(self):
+        return self.a.first | self.b.first
+
+    def accept(self, visitor):
+        visitor.inform_arc(self, self.a)
+        visitor.inform_arc(self, self.b)
+
+assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h')
+assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h')
+assert Parse("hello").do(0, Lit('x') | 'y') is None
+assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h')  # ordered choice: first match wins, not longest
+assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he')
+
+
+# To avoid Python's recursion depth when parsing nontrivial inputs,
+# this is not defined in terms of Alt and Thunk, though in theory it
+# could be.  It turns out to be simpler this way because you don't
+# have to post-process a cons list into a Python list.
+class Any(Prod):
+    "Kleene-closure parsing expression: matches zero or more repetitions of a production."
+    def __init__(self, body):
+        self.body = body
+        self.nullable = True
+
+    @memoprop
+    def first(self):
+        return self.body.first
+
+    def parse(self, parse, pos):
+        results = []
+        while True:
+            # Note, not taking advantage of predictive parsing in this
+            # case.  Maybe I should.
+            kid = parse.do(pos, self.body)
+            if not kid or kid[0] == pos:  # failure, or an empty match that would loop forever
+                return pos, results
+            results.append(kid[1])
+            pos = kid[0]
+
+    def accept(self, visitor):
+        visitor.inform_arc(self, self.body)
+
+
+assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la'])
+assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, [])
+            
+
+class Thunk(Prod):
+    """A production whose definition is deferred until later.
+
+    This allows the construction of cyclic grammars.  In theory it
+    also allows the grammar to be constructed lazily and thus grow
+    during the parse.
+    """
+    def __init__(self, body):
+        self.body = body
+
+    @memoprop
+    def forced(self):
+        """Memoized property that ensures we only evaluate our body once.
+        
+        This doesn't affect performance, apparently, but semantically
+        it ensures that the meaning of the production doesn't change
+        over time.  It still doesn't prevent you from creating lazily
+        computed infinite grammars, though...
+
+        """
+        return self.body()
+
+    def parse(self, parse, pos):
+        return self.forced.parse(parse, pos)
+
+    @property
+    def nullable(self):
+        return self.forced.nullable
+
+    @property
+    def first(self):
+        return self.forced.first
+
+    def accept(self, visitor):
+        visitor.inform_arc(self, self.forced)
+
+
+class Charset(Prod):
+    "A parsing expression that matches any byte or character from a set()."
+    def __init__(self, cset):
+        self.first = cset
+        self.nullable = False
+
+    def parse(self, parse, pos):
+        if pos >= len(parse.s):
+            return None
+        c = parse.s[pos]
+        if c in self.first:
+            return (pos + 1, c)
+
+    def accept(self, visitor):
+        pass
+
+class Tell(Prod):
+    "Consumes no characters but 'parses' the current parse position."
+    first = set()
+    nullable = True
+
+    def parse(self, parse, pos):
+        return pos, pos
+
+    def accept(self, visitor):
+        pass
diff --git a/plans.org b/plans.org
index 996d0b4e8129a3ef865dba3b7f908319a872e771..9eedac711decc78e95358947cdb5b7b0a481cb17 100644
--- a/plans.org
+++ b/plans.org
@@ -1,80 +1,87 @@
-* DONE split out pdftour into separate repo.
-charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok)
-sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better.
-punch? (a tiny hammer, too popular).  Bec?  (de corbin, but too
-popular) martel?  (could be, but common surname) maillet? (could be,
-but common surname) otsuchi?  (would be fine) totokia?  (would be
-fine) mere? (too common) patu? (too common)
+* things I am thinking of doing [3/14]
+** DONE split out pdftour into separate repo.
+ charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok)
+ sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better.
+ punch? (a tiny hammer, too popular).  Bec?  (de corbin, but too
+ popular) martel?  (could be, but common surname) maillet? (could be,
+ but common surname) otsuchi?  (would be fine) totokia?  (would be
+ fine) mere? (too common) patu? (too common)
 
-For now it's 1zodparse.  No, izodparse, so it's a valid Python module
-name.
+ For now it's 1zodparse.  No, izodparse, so it's a valid Python module
+ name.
 
-How do I git filter-branch?  I want pdftour.py and parsecmaps.py.  or
-maybe git-filter-repo?  no, don't have it.  --prune-empty?  --all?
-Can I use git ls-files -z | egrep -zv 'parsecmap|pdftour' | xargs -0
-git rm?
+ How do I git filter-branch?  I want pdftour.py and parsecmaps.py.  Or
+ maybe git-filter-repo?  No, don't have it.  --prune-empty?  --all?
+ Can I use git ls-files -z | egrep -zv 'parsecmap|pdftour' | xargs -0
+ git rm?
 
-maybe
+ maybe
 
-    git filter-branch --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -0 git rm' --all
+     git filter-branch --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -0 git rm' --all
 
-no, that gives me a usage warning...
+ no, that gives me a usage warning...
 
-    git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -r0 git rm' -- --all
+     git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsemap|pdftour" | xargs -r0 git rm' -- --all
 
-almost!
+ almost!
 
-    FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsecmap|pdftour" | xargs -r0 git rm' -- --all
+     FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty --index-filter 'git ls-files -z | egrep -zv "parsecmap|pdftour" | xargs -r0 git rm' -- --all
 
-Almost but now I have this merge commit:
+ Almost but now I have this merge commit:
 
-    user (1):
-	  Merge /media/usb0/stuff/text_parse
+     user (1):
+	   Merge /media/usb0/stuff/text_parse
 
-In
-https://stackoverflow.com/questions/9803294/prune-empty-merge-commits-from-history-in-git-repository
-we find this suggestion:
+ In
+ https://stackoverflow.com/questions/9803294/prune-empty-merge-commits-from-history-in-git-repository
+ we find this suggestion:
 
-    compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root HEAD
-    Successfully rebased and updated detached HEAD.
-    compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog
-    Kragen Javier Sitaker (10):
-	  Add initial spike of parsing CMaps
-	  Remove Python3.8 dependency from parsecmaps.py
-	  Add readable debug display of grammars
-	  Add PDF file traversal skeleton to parsecmaps
-	  Enable parsecmaps to navigate PDF file graph structure
-	  Actually get a CMap out of a PDF with pdftour
-	  Implement stream object decompression in pdftour
-	  Update pdftour comments a bit
-	  Update pdftour comments and make it handle PDF comments
-	  Clean up pdftour code slightly
+     compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root HEAD
+     Successfully rebased and updated detached HEAD.
+     compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog
+     Kragen Javier Sitaker (10):
+	   Add initial spike of parsing CMaps
+	   Remove Python3.8 dependency from parsecmaps.py
+	   Add readable debug display of grammars
+	   Add PDF file traversal skeleton to parsecmaps
+	   Enable parsecmaps to navigate PDF file graph structure
+	   Actually get a CMap out of a PDF with pdftour
+	   Implement stream object decompression in pdftour
+	   Update pdftour comments a bit
+	   Update pdftour comments and make it handle PDF comments
+	   Clean up pdftour code slightly
 
-    compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root master
-    Successfully rebased and updated refs/heads/master.
-    compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog
-    Kragen Javier Sitaker (10):
-	  Add initial spike of parsing CMaps
-	  Remove Python3.8 dependency from parsecmaps.py
-	  Add readable debug display of grammars
-	  Add PDF file traversal skeleton to parsecmaps
-	  Enable parsecmaps to navigate PDF file graph structure
-	  Actually get a CMap out of a PDF with pdftour
-	  Implement stream object decompression in pdftour
-	  Update pdftour comments a bit
-	  Update pdftour comments and make it handle PDF comments
-	  Clean up pdftour code slightly
+     compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git rebase --root master
+     Successfully rebased and updated refs/heads/master.
+     compu@compu-HP-Pavilion-14-Notebook-PC:~/1zodparse$ git shortlog
+     Kragen Javier Sitaker (10):
+	   Add initial spike of parsing CMaps
+	   Remove Python3.8 dependency from parsecmaps.py
+	   Add readable debug display of grammars
+	   Add PDF file traversal skeleton to parsecmaps
+	   Enable parsecmaps to navigate PDF file graph structure
+	   Actually get a CMap out of a PDF with pdftour
+	   Implement stream object decompression in pdftour
+	   Update pdftour comments a bit
+	   Update pdftour comments and make it handle PDF comments
+	   Clean up pdftour code slightly
 
-This apparently changes GIT_COMMITTER_DATE but I don't care.
-
-* DONE make izodparse an installable Python package
+ This apparently changes GIT_COMMITTER_DATE but I don't care.
+** TODO sanitize the history by removing this file?
+This file has a bunch of stuff in it about my local machine config.
+** TODO upload to gitlab
+** DONE make izodparse an installable Python package
 Man, I forgot all about the distutils/setuptools mess.
-* TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries
-* TODO fix nested parentheses parsing
-* TODO make xrefs, etc., lazy properties
-* TODO use nested dicts for labeled properties of concatenations instead of tuples
-* TODO make parse failures report where the problem is and what was expected
-* TODO add pretty-printing of PDF dictionaries
-* TODO examine text encodings in PDF files
-* TODO add graphviz display, maybe inline in Jupyter
-* TODO find XObjects with fonts
+** DONE split out parsing-engine stuff from PDF/PS stuff
+** TODO make xrefs, etc., lazy properties
+** TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries [0/3]
+*** TODO find it
+*** TODO copy it
+*** TODO see what its xrefs look like
+** TODO fix nested parentheses parsing
+** TODO use nested dicts for labeled properties of concatenations instead of tuples
+** TODO make parse failures report where the problem is and what was expected
+** TODO add pretty-printing of PDF dictionaries
+** TODO examine text encodings in PDF files
+** TODO add graphviz display, maybe inline in Jupyter
+** TODO find XObjects with fonts