diff --git a/parsecmaps.py b/parsecmaps.py
index f827f783c9c20d97a8ae0cf8ad0c91ffb71cc378..494a90e569918567038c479e5943931eff9a9fd0 100755
--- a/parsecmaps.py
+++ b/parsecmaps.py
@@ -97,6 +97,16 @@ class GrammarVisitor:
     we'll never know about it.)
 
     """
+    # A few nits that could be cleaned up:
+    # - negative charsets would make things like `regchar:
+    #   [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable.
+    #   This would involve shunting ^ away from the first position
+    #   when it would otherwise land there (see the sketch after this
+    #   list).
+    # - `[[]]` is not okay.  `-` gets pulled to the front; `]` should
+    #   be too.  Not sure how to handle set(['-', ']']), but it hasn't
+    #   arisen yet.
+    # - It would be beneficial to take advantage of the associativity
+    #   of | and + to reduce parens.  That might be better than the
+    #   current precedence scheme.
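+    #
+    #   (Sketch of the first point, assuming I've complemented
+    #   correctly: the regchar class above is just the complement of
+    #   ws plus delim, so it could print as
+    #   [^\x00\t\n\x0c\r ()<>[\]{}/%].)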
+
     def __init__(self):
         self.nodes = set()
         self.parents_of = {}
@@ -138,7 +148,8 @@ class GrammarVisitor:
             else:
                 names[n] = n.name
 
-        # Traverse the grammar a second time to produce output.
+        # Traverse the grammar a second time and output the
+        # description of each top-level rule.
         output = []
         for n in toplevel:
             output.append('%s: ' % names[n])
@@ -167,7 +178,7 @@ class GrammarVisitor:
         elif isinstance(prod, Lit):
             return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8'))
         elif isinstance(prod, Thunk):
-            return self.pprint(prod.body(), names, precedence_left,  precedence_right)
+            return self.pprint(prod.forced, names, precedence_left, precedence_right)
 
         else:
             return str(prod)
@@ -396,24 +407,33 @@ class Thunk(Prod):
     during the parse.
     """
     def __init__(self, body):
-        # XXX make .body itself a memoprop?
         self.body = body
 
+    @memoprop
+    def forced(self):
+        """Memoized property that ensures we only evaluate our body once.
+
+        This doesn't affect performance, apparently, but semantically
+        it ensures that the meaning of the production doesn't change
+        over time.  It still doesn't prevent you from creating lazily
+        computed infinite grammars, though...
+
+        """
+        return self.body()
+
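+    # (`memoprop` isn't shown in this diff; the semantics assumed
+    # here are those of a memoizing non-data descriptor, roughly:
+    #
+    #     class memoprop:
+    #         def __init__(self, f):
+    #             self.f = f
+    #         def __get__(self, obj, owner=None):
+    #             if obj is None:
+    #                 return self
+    #             value = obj.__dict__[self.f.__name__] = self.f(obj)
+    #             return value
+    #
+    # so after the first access the cached instance attribute shadows
+    # the descriptor.)
+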
     def parse(self, parse, pos):
-        body = self.body()
-        parse.trace('body is %s' % body)
-        return body.parse(parse, pos)
+        return self.forced.parse(parse, pos)
 
     @property
     def nullable(self):
-        return self.body().nullable
+        return self.forced.nullable
 
     @property
     def first(self):
-        return self.body().first
+        return self.forced.first
 
     def accept(self, visitor):
-        visitor.inform_arc(self, self.body())
+        visitor.inform_arc(self, self.forced)
 
 
 class Charset(Prod):
@@ -433,12 +453,17 @@ class Charset(Prod):
         pass
 
 
+ws = set(b'\0\t\n\014\r ')
+wschar = Charset(ws)        # Whitespace character
+wschar.name = 'wschar'
+
+digit = Charset(set(b'0123456789'))
+digit.name = 'digit'
+integer = [b'+', b'-', b''] + digit.some()
+integer.xform = lambda d: ('int', int(d[0] + bytes(d[1])))
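+# (Sanity-check sketch, mirroring the doctest style on Pdf below:
+# Parse(b'-42').do(0, integer) should yield (3, ('int', -42)).)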
+
 def ps_grammar():
     "Construct a grammar that tokenizes a subset of PostScript/PDF."
-    ws = set(b'\0\t\n\014\r ')
-    wschar = Charset(ws)        # Whitespace character
-    wschar.name = 'wschar'
-
     delim = set(b'()<>[]{}/%')
     delimchar = Charset(delim)  # Delimiter character
 
@@ -455,9 +480,6 @@ def ps_grammar():
     name.name = 'name'
     name.xform = lambda d: ('name', bytes(d[1]))
 
-    integer = [b'+', b'-', b''] + Charset(set(b'0123456789')).some()
-    integer.xform = lambda d: ('int', int(d[0] + bytes(d[1])))
-
     # XXX no real-number support yet
 
     dictdelim = Lit(b'<<') | b'>>'
@@ -541,24 +563,125 @@ def csranges_to_grammar(csranges):
     return Any(as_prod(alternatives))
 
 
+def please_be(a, b):
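+    "Raise ValueError unless a == b; a tiny assertion helper."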
+    if a != b:
+        raise ValueError(a, '!=', b)
+
+xref_header = Lit(b'xref\n') + integer + wschar.some() + integer + b'\n' # XXX too strict
+# XXX This xform clearly indicates that I need to rethink how
+# concatenation works for parsing expressions.
+xref_header.xform = lambda d: (d[0][0][0][1][1], d[0][1][1])
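+# (Concatenation nests its parse results to the left, so the value d
+# here is ((((b'xref\n', ('int', first)), [ws...]), ('int', size)),
+# b'\n'), from which the xform digs out (first, size).)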
+
+class Pdf:
+    """Comprehends enough of the PDF format to facilitate exploration.
+
+    Still very incomplete.
+
+    Example:
+    >>> p = parsecmaps.read_pdf('../Descargas/1.2754649.pdf')
+    >>> p.read(p.xrefs.offset_of(55))
+    b'55 0 obj\n<</Type/Font/FirstChar 37/FontDescriptor 116 0 R/BaseFo'
+    >>> p.read(p.xrefs.offset_of(72))
+    b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 nul'
+    >>> p.read(p.xrefs.offset_of(72), size=256)
+    b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 null]/Rect[394.024017 55.089005 409.314026 65.729996]/Subtype/Link>>\nendobj\n73 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 43 253 null]/Rect[464.366028 55.089005 475.665009 65.729996]/Subty'
+    >>> p.xrefs[72]
+    b'0000350098 00000 n \n'
+    >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer)
+    (350102, ((('int', 72), [32]), ('int', 0)))
+    >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer + b' obj')
+    (350106, (((('int', 72), [32]), ('int', 0)), b' obj'))
+
+    """
+    def __init__(self, blob):
+        self.blob = blob
+        self.parser = Parse(blob)
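+        # A PDF trailer ends with "startxref\n<offset>\n%%EOF"; take
+        # the offset after the file's last "startxref".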
+        sx = blob.rindex(b'startxref')
+        self.xref_start = int(blob[sx:].split()[1])
+        # XXX there could be many sections
+        self.xrefs = XrefSection(self, self.xref_start)
+
+    def read(self, offset, size=64):
+        return self.blob[offset:offset+size]
+
+    def parse(self, offset, ex):
+        return self.parser.do(offset, ex)
+
+class XrefSection:
+    def __init__(self, pdf, offset):
+        self.pdf = pdf
+        # XXX this is a subsection, bozo, there could be more than one
+        self.offset, (self.first, self.size) = pdf.parse(offset, xref_header)
+
+    def __getitem__(self, oid):
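+        # Classic cross-reference table entries are exactly 20 bytes each.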
+        if not self.first <= oid < self.first + self.size:
+            raise IndexError(oid, self.first, self.size)
+        return self.pdf.read(self.offset + 20 * (oid - self.first), size=20)
+
+    def offset_of(self, oid):
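+        # The first 10 bytes of an entry are its zero-padded byte-offset field.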
+        return int(self[oid][:10])
+
+def read_pdf(filename):
+    with open(filename, 'rb') as fo:
+        return Pdf(fo.read())
+
+
+def all_pages(pdf, whine=lambda *args: None):
+    # XXX these structures aren't implemented yet
+    return page_descendants(pdf.catalog['Pages'], whine)
+
+def page_descendants(pages, whine=lambda *args: None):
+    for kid in pages['Kids']:
+        if kid.isa('Page'):
+            yield kid
+        elif kid.isa('Pages'):
+            yield from page_descendants(kid, whine)
+        else:
+            whine('what even is', kid)
+
+def cmaps_for_pages(pages, whine=lambda *args: None):
+    # Find all the CMap streams.  Really we probably want to know the
+    # CMap for a particular font on a particular page.  And /Type1 and
+    # /Type3 fonts don't have a CMap; we use the CMap for a /Type0
+    # font to do what the /Encoding does for the others.  But my
+    # immediate objective here is to suck out the CMap streams so I
+    # can bang more on the CMap parsing code above.
+    seen_cmaps = set()
+    seen_fonts = set()
+    for page in pages:
+        for font in page['Resources']['Font']:
+            if font in seen_fonts:
+                continue
+            seen_fonts.add(font)
+
+            if font.is_subtype('Type0'):
+                cmap = font['ToUnicode']
+                if cmap in seen_cmaps:
+                    continue
+                seen_cmaps.add(cmap)
+                yield cmap
+
+            else:
+                whine('not yet handling non-Unicode font', font)
+
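+# (Intended usage once the structures above exist; `catalog` and the
+# dict-like page/font objects here are still hypothetical:
+#
+#     pdf = read_pdf('some.pdf')
+#     for cmap in cmaps_for_pages(all_pages(pdf, print), print):
+#         print(cmap)
+# )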
+
 if __name__ == '__main__':
     import cgitb; cgitb.enable(format='text')
 
     print(show_grammar(ps_grammar(), 'postscript'))
-    print()
 
     with open(sys.argv[1], 'rb') as fo:
         data = fo.read()
 
     result = tokenize_ps(data)
+
     if result:
         for kind, val in result[1]:
             print("%10s %r" % (kind, val))
         a, b, c = buildtables(result[1])
         print('csranges', a)
         cg = csranges_to_grammar(a)
-        print(cg)
-        print(show_grammar(cg))
+        print(show_grammar(cg, 'character_code'))
         print(Parse(b'\0T\0h\0i\0s\0 \0c\0o\0s\0t\0s\0 \0001\0000\1\162').do(0, cg))
         print('bfchars', b)
         print('bfranges', c)