#!/usr/bin/python3
"""Explore PDF file structure, at least enough to parse a CMap file, hopefully.

For example, we can navigate the PDF structure.

    >>> import pdftour
    >>> p = pdftour.read_pdf('../Descargas/1.2754649.pdf')
    >>> p.catalog
    << /Type /Catalog  /Pages 13 0 R >>

Indirect object references are by default followed implicitly.  The
debug representation is a mix of Python and PostScript syntax:

    >>> p.catalog['Pages']
    << /Count 4  /Type /Pages  /ITXT b'5.1.2'  /Kids [11 0 R 38 0 R 74 0 R 86 0 R] >>
    >>> len(p.catalog['Pages']['Kids'])
    4
    >>> p.catalog['Pages']['Kids'][0].isa('Page')
    True
    >>> p.catalog['Pages']['Kids'][0]
    << /Type /Page  /Contents [5 0 R 12 0 R 6 0 R]  /Parent 13 0 R  /Resources << /XObject << /img0 14 0 R  /img1 15 0 R >>  /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]  /Font << /F1 16 0 R  /Xi0 1 0 R  /F2 17 0 R >> >>  /Annots [18 0 R 19 0 R 20 0 R 21 0 R 22 0 R 23 0 R 24 0 R 25 0 R 26 0 R 27 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R 36 0 R 37 0 R]  /MediaBox [0 0 612 792] >>
    >>> _['Resources']
    << /XObject << /img0 14 0 R  /img1 15 0 R >>  /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]  /Font << /F1 16 0 R  /Xi0 1 0 R  /F2 17 0 R >> >>
    >>> _['Font']
    << /F1 16 0 R  /Xi0 1 0 R  /F2 17 0 R >>
    >>> f = _
    >>> f.keys()
    dict_keys(['F1', 'Xi0', 'F2'])
    >>> f['F1']
    << /Type /Font  /DescendantFonts [98 0 R]  /ToUnicode 99 0 R  /BaseFont /MFRGCI+ArialMT  /Subtype /Type0  /Encoding /Identity-H >>
    >>> f['Xi0']
    << /Type /Font  /BaseFont /Helvetica  /Subtype /Type1  /Encoding /WinAnsiEncoding >>
    >>> f['F2']
    << /Type /Font  /BaseFont /Helvetica  /Subtype /Type1  /Encoding /WinAnsiEncoding >>

Those fonts look the same, but they're distinct objects:

    >>> f.lstat('Xi0')
    ('ref', (('int', 1), ('int', 0)))
    >>> f.lstat('F2')
    ('ref', (('int', 17), ('int', 0)))

We can see the underlying bytes in the file:

    >>> p.read(p.xrefs.offset_of(17), 128)
    b'17 0 obj\n<</Type/Font/BaseFont/Helvetica/Subtype/Type1/Encoding/WinAnsiEncoding>>\nendobj\n18 0 obj\n<</Border[0 0 0]/Rect[56 696 5'

We can attempt parsing at that point with an ad-hoc custom grammar:

    >>> p.parse(p.xrefs.offset_of(17), pdftour.integer + pdftour.wschar.some() + pdftour.integer + b' obj\n<<' + pdftour.pdf_obj.some())
    (247827, ((((('int', 17), [32]), ('int', 0)), b' obj\n<<'),
    [('name', b'Type'), ('name', b'Font'), ('name', b'BaseFont'),
    ('name', b'Helvetica'), ('name', b'Subtype'), ('name', b'Type1'),
    ('name', b'Encoding'), ('name', b'WinAnsiEncoding')]))

We can get a debug dump of a grammar:

    >>> print(pdftour.show_grammar(pdftour.xref_header))
    S: (((('xref' eol) ex0) ' ') ex0) eol
    eol: ('\r\n' | '\r') | '\n'
    ex0: '+' | '-' | '' digit digit*
    digit: [0-9]

XXX the display of ex0 is incorrect

We can pull CMaps out of the PDF, and parse them, and compile their
code space ranges to a grammar for tokenizing text strings found in
page contents:

    >>> list(pdftour.cmaps_for_pages(pdftour.all_pages(p)))
    [<< /Filter /FlateDecode  /Length 568 >>
    stream
    (byte 247162...)]
    >>> p.catalog['Pages']['Kids'][0]['Resources']['Font']['F1']['ToUnicode']
    << /Filter /FlateDecode  /Length 568 >>
    stream
    (byte 247162...)
    >>> cmap = _.read()
    >>> import sys
    >>> sys.stdout.buffer.write(cmap[:100])
    /CIDInit /ProcSet findresource begin
    12 dict begin
    begincmap
    /CIDSystemInfo
    << /Registry (TTX+0)
    /Or100
    >>> length, tokens = pdftour.tokenize_ps(cmap)
    >>> length, len(cmap)
    (1569, 1569)
    >>> len(tokens)
    238
    >>> tokens[:8]
    [('name', b'CIDInit'), ('name', b'ProcSet'), ('op', b'findresource'), ('op', b'begin'), ('int', 12), ('op', b'dict'), ('op', b'begin'), ('op', b'begincmap')]
    >>> pdftour.buildtables(tokens)
    ([b'\x00\x00', b'\xff\xff'], [], [b'\x00\x03', b'\x00\x03', b'\x00
    ', b'\x00\x0b', b'\x00\x0b', b'\x00(', b'\x00\x0c', b'\x00\x0c',
    b'\x00)', b'\x00\x0f', b'\x00\x0f', b'\x00,', b'\x00\x10',
    b'\x00\x10', b'\x00-', b'\x00\x11', b'\x00\x11', b'\x00.',
    b'\x00\x12', b'\x00\x12', b'\x00/', b'\x00\x13...
    >>> print(pdftour.show_grammar(pdftour.csranges_to_grammar(_[0])))
    S: ([\x00-\xff] [\x00-\xff])*

Also, we can pull out page contents:

    >>> sys.stdout.buffer.write(p.catalog['Pages']['Kids'][0]['Contents'][1].read()[:256])
    q
    q 490 0 0 60 56 696 cm /img0 Do Q
    q
    0.3141 0 0 0.31746 56 696 cm
    q
    0 J
    0 j
    1 w
    0 G
    0 188.5 m

Too slow to use in practice (30 kilobytes/sec), but hopefully
validates our understanding of the problem and communicates it more
clearly than a lower-level language would.

A pitfall I've run into a lot with the PEG parser in here is that if
you try to parse a Unicode string with a byte-string grammar or vice
versa, you just get silent failures to parse.

Still unimplemented:

- CMap bfranges that *use* arrays;
- decoding of parenthesized strings;
- nesting of parenthesized strings;
- actually decoding a tokenized character using the tables extracted from the CMap;
- compressed object streams and xref streams;
- multiple xref sections;
- multiple xref subsections;
- pulling out the character strings being drawn from the page contents.

"""
import sys, types, functools, operator, zlib


class memoprop:
    """"Simplified, non-multithreaded version of functools.cached_property.

    For Pythons earlier than 3.8.  Doesn't support __slots__, custom
    __dict__, etc.

    """
    def __init__(self, func):
        self.func = func

    def __get__(self, instance, cls):
        cache = instance.__dict__
        if self not in cache:
            cache[self] = self.func(instance)
        return cache[self]

def please_be(a, b):
    if a != b:
        raise ValueError(a, '!=', b)


### Packrat parsing engine with predictive lookahead parsing ###

# First, some debugging utilities:
def represent_cset(cs):
    "Debugging helper for understanding first sets; takes a set() of ints or chars."
    if any(isinstance(c, str) for c in cs):
        return represent_uset(cs)
    
    ranges = []
    for bt in sorted(cs):
        if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs):
            ranges[0:0] = [(bt, bt)]
            continue
            
        if ranges and bt == ranges[-1][-1] + 1:
            ranges[-1] = ranges[-1][0], bt
        else:
            ranges.append((bt, bt))

    rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b))
                 for a, b in ranges]
    return '[%s]' % repr(b''.join(rangereps))[2:-1]

def represent_uset(cs):
    "Unicode version of represent_cset."
    ranges = []
    for c in sorted(cs):
        if c == '-' and not (',' in cs and '.' in cs): # previous and following characters
            ranges[0:0] = [(c, c)]
            continue

        c = ord(c)
        if ranges and c == ranges[-1][-1] + 1:
            ranges[-1] = ranges[-1][0], c
        else:
            ranges.append((c, c))

    rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b)
                 for a, b in ranges]
    return '[%s]' % ''.join(rangereps)


class GrammarVisitor:
    """Compute global properties of a grammar, like what expressions are referenced twice.

    This is useful for printing out a readable representation of a
    grammar.  (Also, though we don't use this fact, singly-referenced
    parsing expressions can never gain any benefit from the memo
    table.)

    This version of Visitor deviates a bit from the standard pattern,
    because our grammars are cyclic, so simply having the client
    peremptorily invoke .accept on each of its children would loop
    infinitely.  Instead, we have the client invoke .inform_arc, which
    allows the Visitor to decide whether to proceed to invoking
    .accept.  And then the .visit or .visit_type method doesn't need
    to exist at all; .accept merely invokes .inform_arc for each
    child.  (This has the bug that if the root node has no children,
    we'll never know about it.)

    """
    # A few nits that could be cleaned up:
    # - negative charsets would make things like `regchar:
    #   [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable.
    #   This would involve shunting ^ to be not first if it's first I guess.
    # - `[[]]` is not okay.  `-` gets pulled to the front; `]` should
    #   be too.  Not sure how to handle set(['-', ']']) but it hasn't arisen yet.
    # - It would be beneficial to take advantage of the associativity
    #   of | and + to reduce parens.  That might be better than the
    #   current precedence scheme.

    def __init__(self):
        self.nodes = set()
        self.parents_of = {}
        self.roots = set()

    def inform_arc(self, parent, child):
        # In the normal case, we got here because we invoked .accept
        # on parent, so it'll already be in .nodes; but not for the
        # initial entry point.
        if parent not in self.nodes:
            self.roots.add(parent)
            self.nodes.add(parent)

        if child not in self.parents_of:
            self.parents_of[child] = []
        self.parents_of[child].append(parent)

        if child not in self.nodes:
            self.nodes.add(child)
            child.accept(self)

    def show(self, rootname='S'):
        "Dump out a human-readable grammar."
        toplevel = (list(self.roots) +
                    [n for n in self.nodes - self.roots
                     if n.name is not None
                     or len(self.parents_of[n]) != 1])

        # Assign names to all top-level nodes so they can be referenced.
        names = {}
        i = 0

        for n in toplevel:
            if n.name is None and n == toplevel[0]:
                names[n] = rootname
            elif n.name is None:
                names[n] = 'ex%d' % i
                i += 1
            else:
                names[n] = n.name

        # Traverse the grammar a second time and output the
        # description of each top-level rule.
        output = []
        for n in toplevel:
            output.append('%s: ' % names[n])
            output.append(self.pprint(n, names, 0, 0, top=True))
            output.append('\n')

        return ''.join(output)

    def pprint(self, prod, names, precedence_left, precedence_right, top=False):
        "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly."
        if prod in names and not top:
            return names[prod]

        if isinstance(prod, Any):
            return '%s*' % self.pprint(prod.body, names, 20, 20)
        elif isinstance(prod, Cat):
            v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5),
                           self.pprint(prod.b, names, 5, precedence_right))
            return v if precedence_left <= 5 > precedence_right else '(%s)' % v
        elif isinstance(prod, Charset):
            return represent_cset(prod.first)
        elif isinstance(prod, Alt):
            v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10),
                             self.pprint(prod.b, names, 10, precedence_right))
            return v if precedence_left <= 10 > precedence_right else '(%s)' % v
        elif isinstance(prod, Lit):
            return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8'))
        elif isinstance(prod, Thunk):
            return self.pprint(prod.forced, names, precedence_left, precedence_right)
        elif isinstance(prod, Tell):
            return '@'

        else:
            return str(prod)


def show_grammar(prod, rootname='S'):
    v = GrammarVisitor()
    prod.accept(v)
    return v.show(rootname)


class Parse:
    "A Packrat parse of a string or byte string."
    def __init__(self, s, trace=lambda *args: None):
        "s is the string to parse.  Pass trace=print to enable verbose logging."
        self.s = s
        self.memos = {}
        self.trace = trace

    def do(self, pos, ex):
        "Attempt to parse using production/parsing expression `ex` starting at `pos`."
        k = pos, ex
        self.trace("parsing %s at %d" % (ex, pos))
        if k in self.memos:
            self.trace("memo hit")
            return self.memos[k]
        result = ex.parse(self, pos)
        if result and ex.xform:
            result = result[0], ex.xform(result[1])

        # if len(self.memos) > 16384:
        #     self.memos.clear()
        self.memos[k] = result
        self.trace("%s returns %s" % (ex, result))
        return result


class Prod:
    "Base class for grammar productions, i.e., parsing expression types."
    xform = None # Hook for post-processing.  This turns out to be the wrong thing
    name = None  # Human-readable tag for debugging output

    def __str__(self):
        return self.debugstr

    @memoprop
    def debugstr(self):
        "Used for tracing."
        first = represent_cset(self.first)
        if self.name:
            return self.name + first
        else:
            return '<%s%s>' % (self.__class__.__name__, first)

    def __add__(self, other):
        "Concatenation of parsing expressions."
        return Cat(self, as_prod(other))
    def __radd__(self, other):
        return as_prod(other) + self

    def __or__(self, other):
        "Alternation (ordered choice) of parsing expressions."
        return Alt(self, as_prod(other))
    def __ror__(self, other):
        return as_prod(other) | self

    def some(self):
        """One or more repetitions.

        The possibility of overwriting its .xform shows why .xform is bad.
        """
        result = self + Any(self)
        result.xform = lambda d: [d[0]] + d[1]
        return result
    

def as_prod(datum):
    "Coerce an arbitrary thing into a grammar production."
    if isinstance(datum, Prod):
        return datum

    if isinstance(datum, bytes) or isinstance(datum, str):
        return Lit(datum)

    if isinstance(datum, list):
        if len(datum) == 1:
            return as_prod(datum[0])
        return as_prod(datum[0]) | datum[1:] 

    if isinstance(datum, types.FunctionType):
        return Thunk(datum)

    raise ValueError(datum)


class Lit(Prod):
    "A parsing expression that matches a literal string or byte string."
    def __init__(self, s):
        self.s = s
        self.first = {s[0]} if s else set()
        self.nullable = not s

    def parse(self, parse, pos):
        npos = pos + len(self.s)
        if parse.s[pos:npos] == self.s:
            return npos, self.s

    def accept(self, visitor):
        pass

def ok(a, b): assert a == b, (a, b)
ok(Parse("hello").do(0, Lit("hel")), (3, "hel"))
ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel"))
ok(Parse("hello").do(0, Lit("hec")), None)


class Cat(Prod):
    "A parsing expression that matches the concatenation of two productions."
    def __init__(self, a, b):
        self.a, self.b = a, b

    def parse(self, parse, pos):
        a = parse.do(pos, self.a)
        if a is None:
            return None
        b = parse.do(a[0], self.b)
        if b is None:
            return None
        return b[0], (a[1], b[1])

    # These properties, used for predictive parsing, are lazily
    # computed so that you can finish constructing a cyclic graph
    # using Thunks before evaluating them.
    @memoprop
    def nullable(self):
        return self.a.nullable and self.b.nullable

    @memoprop
    def first(self):
        return self.a.first | self.b.first if self.a.nullable else self.a.first

    def accept(self, visitor):
        visitor.inform_arc(self, self.a)
        visitor.inform_arc(self, self.b)

assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll'))
assert Parse("hello").do(0, Lit('he') + 'lc') is None


class Alt(Prod):
    "Matches the ordered-choice alternation of two productions."
    def __init__(self, a, b):
        self.a, self.b = a, b

    def parse(self, parse, pos):
        c = parse.s[pos] if pos < len(parse.s) else None
        # This form of lookahead increases speed by only about 30%
        # to about 32 microseconds per byte.
        if self.a.nullable or c in self.a.first:
            a = parse.do(pos, self.a)
            if a:
                return a

        if self.b.nullable or c in self.b.first:
            return parse.do(pos, self.b)

        return None

    @memoprop
    def nullable(self):
        return self.a.nullable or self.b.nullable

    @memoprop
    def first(self):
        return self.a.first | self.b.first

    def accept(self, visitor):
        visitor.inform_arc(self, self.a)
        visitor.inform_arc(self, self.b)

assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h')
assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h')
assert Parse("hello").do(0, Lit('x') | 'y') == None
assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h')
assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he')


# To avoid Python's recursion depth when parsing nontrivial inputs,
# this is not defined in terms of Alt and Thunk, though in theory it
# could be.  It turns out to be simpler this way because you don't
# have to post-process a cons list into a Python list.
class Any(Prod):
    "Kleene-closure parsing expression: matches zero or more repetitions of a production."
    def __init__(self, body):
        self.body = body
        self.nullable = True

    @memoprop
    def first(self):
        return self.body.first

    def parse(self, parse, pos):
        results = []
        while True:
            # Note, not taking advantage of predictive parsing in this
            # case.  Maybe I should.
            kid = parse.do(pos, self.body)
            if not kid:
                return pos, results
            results.append(kid[1])
            pos = kid[0]

    def accept(self, visitor):
        visitor.inform_arc(self, self.body)


assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la'])
assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, [])
            

class Thunk(Prod):
    """A production whose definition is deferred until later.

    This allows the construction of cyclic grammars.  In theory it
    also allows the grammar to be constructed lazily and thus grow
    during the parse.
    """
    def __init__(self, body):
        self.body = body

    @memoprop
    def forced(self):
        """Memoized property that ensures we only evaluate our body once.
        
        This doesn't affect performance, apparently, but semantically
        it ensures that the meaning of the production doesn't change
        over time.  It still doesn't prevent you from creating lazily
        computed infinite grammars, though...

        """
        return self.body()

    def parse(self, parse, pos):
        return self.forced.parse(parse, pos)

    @property
    def nullable(self):
        return self.forced.nullable

    @property
    def first(self):
        return self.forced.first

    def accept(self, visitor):
        visitor.inform_arc(self, self.forced)


class Charset(Prod):
    "A parsing expression that matches any byte or character from a set()."
    def __init__(self, cset):
        self.first = cset
        self.nullable = False

    def parse(self, parse, pos):
        if pos >= len(parse.s):
            return None
        c = parse.s[pos]
        if c in self.first:
            return (pos + 1, c)

    def accept(self, visitor):
        pass

class Tell(Prod):
    "Consumes no characters but 'parses' the current parse position."
    first = set()
    nullable = True

    def parse(self, parse, pos):
        return pos, pos

    def accept(self, visitor):
        pass


### PDF and PostScript and CMap parsing ###

ws = set(b'\0\t\n\014\r ')
wschar = Charset(ws)        # Whitespace character
wschar.name = 'wschar'
eol = Lit(b'\r\n') | b'\r' | b'\n'
eol.name = 'eol'

digit = Charset(set(b'0123456789'))
digit.name = 'digit'
integer = [b'+', b'-', b''] + digit.some()
integer.xform = lambda d: ('int', int(d[0] + bytes(d[1])))

delim = set(b'()<>[]{}/%')
delimchar = Charset(delim)  # Delimiter character

any_byte = set(range(256))
reg = any_byte - delim - ws # "Regular" character
regchar = Charset(reg)
regchar.name = 'regchar'

name = b'/' + Any(regchar)  # say, /Type or /Page
name.name = 'name'
name.xform = lambda d: ('name', bytes(d[1]))

# XXX this will fail to recognize comments that end at the end of the
# file without a newline
comment = b'%' + Any(Charset(any_byte - set(b'\r\n'))) + eol

def drop_ws(prod):
    prod = prod + Any(wschar | comment)
    prod.xform = lambda d: d[0]
    return prod

def decode_hex(d):
    s = bytes(d[0][1])
    if len(s) % 2 != 0:
        s += b'0'
    return ('str', bytes(int(s[i:i+2], 16) for i in range(0, len(s), 2)))

hexstring = b'<' + Any(drop_ws(Charset(b'0123456789abcdefABCDEF'))) + b'>'
hexstring.xform = decode_hex

string_element = ( Charset(any_byte - set(br'\()'))
                 | (lambda: parenstring)
                 | b'\\' + Charset(any_byte)
                 )
parenstring = b'(' + Any(string_element) + b')'
parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \()
ok(Parse(b'()').do(0, parenstring), (2, ('str', b'')))
ok(Parse(b'(hi)').do(0, parenstring), (4, ('str', b'hi')))
# XXX this one won't work until I write code to un-nest the paren strings
#ok(Parse(b'(hi(x))').do(0, parenstring), (7, ('str', b'hi(x)')))


def ps_grammar():
    "Construct a grammar that tokenizes a subset of PostScript/PDF."
    op = regchar.some() + b''   # operator. XXX this +b'' is to allow us to compose xforms
    op.name = 'op'
    op.xform = lambda d: ('op', bytes(d[0]))

    # XXX no real-number support yet

    dictdelim = Lit(b'<<') | b'>>'
    dictdelim.xform = lambda d: ('dd', d)
    arraydelim = Charset(set(b'[]'))
    arraydelim.xform = lambda d: ('ad', d)

    tokens = Any(drop_ws(integer | op | name| dictdelim | arraydelim | parenstring | hexstring))
    root = Any(wschar) + tokens
    root.xform = lambda d: d[1]
    return root

def tokenize_ps(ps):
    return Parse(ps).do(0, ps_grammar())


# XXX move these to the bottom
def buildtables(tokens):
    "Given a sequence of tokens from a CMap, pull out the tables they represent."
    csranges = []
    bfchars = []
    bfranges = []
    sizes = {}

    def n_strings_back(n):
        for j in range(i-n, i):
            assert tokens[j][0] == 'str'
            yield tokens[j][1]

    for i, tok in enumerate(tokens):
        if tok[0] == 'op':
            op = tok[1]
            if op in [b'begincodespacerange', b'beginbfchar', b'beginbfrange']:
                assert tokens[i-1][0] == 'int'
                sizes[op] = tokens[i-1][1]
            elif op == b'endcodespacerange':
                csranges.extend(n_strings_back(2*sizes[b'begincodespacerange']))
            elif op == b'endbfchar':
                bfchars.extend(n_strings_back(2*sizes[b'beginbfchar']))
            elif op == b'endbfrange':
                bfranges.extend(n_strings_back(3*sizes[b'beginbfrange']))

    return csranges, bfchars, bfranges


def csranges_to_grammar(csranges):
    "Compile the csranges from buildtables into a grammar we can use to tokenize strings."
    alternatives = []
    for i in range(0, len(csranges), 2):
        ranges = zip(csranges[i], csranges[i+1])
        cs = [Charset(set(c for c in range(startbyte, endbyte+1)))
              for startbyte, endbyte in ranges]
        prod = functools.reduce(operator.add, cs)
        prod.xform = bytes
        alternatives.append(prod)
            
    return Any(as_prod(alternatives))


xref_header = Lit(b'xref') + eol + integer + b' ' + integer + eol # clause 7.5.4
# XXX This xform clearly indicates that I need to rethink how
# concatenation works for parsing expressions.
xref_header.xform = lambda d: (d[0][0][0][1][1], d[0][1][1])
dictionary = Thunk(lambda: drop_ws(b'<<') + Any(drop_ws(name) + pdf_obj) + drop_ws(b'>>'))
dictionary.xform = lambda d: ('dict', {k[1].decode('utf-8'): v for k, v in d[0][1]})
array = Thunk(lambda: drop_ws(b'[') + Any(pdf_obj) + drop_ws(b']'))
array.xform = lambda d: ('array', d[0][1])
ref = drop_ws(integer) + drop_ws(integer) + drop_ws(b'R')
ref.xform = lambda d: ('ref', d[0])
# XXX integer and name should drop their own damn ws
pdf_obj = drop_ws(dictionary | name | ref | hexstring | array | integer | parenstring)

stream = dictionary + b'stream' + eol + Tell()
stream.xform = lambda d: ('stream', (d[0][0][0], d[1]))

endingobj = pdf_obj + drop_ws(b'endobj')
endingobj.xform = lambda d: d[0]
# XXX whitespace after 'obj' is not required according to clause
# 7.3.10; probably also endobj should require to be followed by a delimiter char
indirect_obj = drop_ws(integer) + drop_ws(integer) + drop_ws(b'obj') + (stream | endingobj)
indirect_obj.xform = lambda d: d[1]

class Pdf:
    """Comprehends enough of the PDF format to facilitate exploration.

    Still very incomplete.

    """
    def __init__(self, blob):
        self.blob = blob
        self.parser = Parse(blob)
        sx = blob.rindex(b'startxref')
        self.xref_start = int(blob[sx:].split()[1])
        # XXX there could be many sections
        self.xrefs = XrefSection(self, self.xref_start)
        self.trailer_plumbing = self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary)
        self.trailer = porcelainize(self, self.trailer_plumbing[1][1])
        self.catalog = self.trailer['Root']

    def read(self, offset, size=64):
        return self.blob[offset:offset+size]

    def parse(self, offset, ex):
        return self.parser.do(offset, ex)

    def get_indirect_obj(self, oid, generation=0):
        "Returns a plumbing object."
        result = self.parse(self.xrefs.offset_of(oid), indirect_obj)
        offset, plumb = result
        return plumb

    def dereference(self, plumb):
        """Given a plumbing object, follow indirect object refs if necessary.

        Returns another plumbing object.  Returns ('null', None) for
        dangling (XXX this is broken) or circular links.

        """
        seen = set()
        while plumb[0] == 'ref':
            if plumb in seen:
                return 'null', None
            seen.add(plumb)
            _, ((_, oid), (_, gen)) = plumb
            plumb = self.get_indirect_obj(oid, gen)

        return plumb

class XrefSection:
    def __init__(self, pdf, offset):
        self.pdf = pdf
        # XXX this is a subsection, bozo, there could be more than one
        self.offset, (self.first, self.size) = pdf.parse(offset, xref_header)
        self.end = self.offset + 20 * self.size

    def __getitem__(self, oid):
        if not self.first <= oid < self.first + self.size:
            raise RangeError(oid, self.first, self.size)
        return self.pdf.read(self.offset + 20 * (oid - self.first), size=20)

    def offset_of(self, oid):
        # XXX check that it's n, not f (clause 7.5.4)
        return int(self[oid][:10])


class Porcelain:
    """A family of façade objects to provide convenient access to PDF objects."""
    def __init__(self, pdf, d):
        self.pdf = pdf
        self.contents = d


class PorcelainDictionary(Porcelain):
    def __repr__(self):
        return '<< %s >>' % '  '.join('/%s %r' % (k, porcelainize(self.pdf, v)) for k, v in self.contents.items())

    def keys(self):
        return self.contents.keys()

    def __iter__(self):
        return iter(self.keys())

    def get(self, key, default=None):
        return self[key] if key in self else default

    def __contains__(self, key):
        return key in self.contents

    def __getitem__(self, key):
        return porcelainize(self.pdf, self.pdf.dereference(self.contents[key]))

    def lstat(self, key):
        return self.contents.get(key)

    def isa(self, typename):
        return self['Type'].name == typename

    def is_subtype(self, subtypename):
        return self['Subtype'].name == subtypename


class PorcelainArray(Porcelain):
    def __repr__(self):
        return '[%s]' % ' '.join(repr(porcelainize(self.pdf, x)) for x in self.contents)

    def __getitem__(self, i):
        return porcelainize(self.pdf, self.pdf.dereference(self.contents[i]))

    def lstat(self, key):
        # XXX porcelain ref?
        return self.contents[key]

    def __len__(self):
        return len(self.contents)


class PorcelainName(Porcelain):
    def __init__(self, pdf, name):
        self.pdf = pdf
        # XXX need to decode hex bytes
        self.name = name.decode('utf-8')

    def __repr__(self):
        return '/%s' % self.name

    def __hash__(self):
        return hash(self.name) * hash(self.pdf)

    def __eq__(self, other):
        return (isinstance(other, PorcelainName)
                and (self.pdf, self.name) == (other.pdf, other.name))


class PorcelainRef(Porcelain):
    """Usually we avoid instantiating this."""
    def __repr__(self):
        return '%r %r R' % (self.contents[0][1], self.contents[1][1])

    def __call__(self):
        # XXX this is unimpressive code and signals the need for a
        # rethinking
        return self.pdf.dereference(('ref', data))


class PorcelainStream:
    def __init__(self, pdf, plumb):
        self.pdf = pdf
        (_, header), self.offset = plumb
        self.header = PorcelainDictionary(pdf, header)

    def __repr__(self):
        return '%r\nstream\n(byte %d...)' % (self.header, self.offset)

    def read_raw(self):
        return self.pdf.read(self.offset, self.header['Length'])

    def read(self):
        data = self.read_raw()

        filters = self.header.get('Filter', [])
        if isinstance(filters, PorcelainName):
            filters = [filters]
        decoders = {'FlateDecode': zlib.decompress}
        for filtername in filters:
            data = decoders[filtername.name](data)

        return data

porcelain_classes = {
    'dict': PorcelainDictionary,
    'array': PorcelainArray,
    'name': PorcelainName,
    'ref': PorcelainRef,
    'stream': PorcelainStream,
    'str': lambda pdf, s: s,
    'int': lambda pdf, s: s,
}

def porcelainize(pdf, plumb):
    "Add a porcelain façade to a plumbing object, which is just a tuple."
    return porcelain_classes[plumb[0]](pdf, plumb[1])


def read_pdf(filename):
    with open(filename, 'rb') as fo:
        return Pdf(fo.read())


### Stuff for walking the PDF tree and pulling out CMaps ###

def all_pages(pdf, whine=lambda *args: None):
    return page_descendants(pdf.catalog['Pages'], whine)

def page_descendants(pages, whine=lambda *args: None):
    for kid in pages['Kids']:
        if kid.isa('Page'):
            yield kid
        elif kid.isa('Pages'):
            yield from page_descendants(kid, whine)
        else:
            whine('what even is', kid)

def cmaps_for_pages(pages, whine=lambda *args: None):
    # Find all the CMap streams.  Really we probably want to know the
    # CMap for a particular font on a particular page.  And /Type1 and
    # /Type3 fonts don't have a CMap; we use the CMap for a /Type0
    # font to do what the /Encoding does for the others.  But my
    # immediate objective here is to suck out the CMap streams so I
    # can bang more on the CMap parsing code above.
    seen_cmaps = set()
    seen_fonts = set()
    for page in pages:
        fonts = page['Resources']['Font']
        for name in fonts:
            font = fonts[name]
            if font in seen_fonts:
                continue
            seen_fonts.add(font)

            if font.is_subtype('Type0'):
                cmap = font['ToUnicode']
                if cmap in seen_cmaps:
                    continue
                seen_cmaps.add(cmap)
                yield cmap

            else:
                whine('not yet handling non-Unicode font', font)


if __name__ == '__main__':
    import cgitb; cgitb.enable(format='text')

    print(show_grammar(ps_grammar(), 'postscript'))

    with open(sys.argv[1], 'rb') as fo:
        data = fo.read()

    result = tokenize_ps(data)

    if result:
        for kind, val in result[1]:
            print("%10s %r" % (kind, val))
        a, b, c = buildtables(result[1])
        print('csranges', a)
        cg = csranges_to_grammar(a)
        print(show_grammar(cg, 'character_code'))
        print(Parse(b'\0T\0h\0i\0s\0 \0c\0o\0s\0t\0s\0 \0001\0000\1\162').do(0, cg))
        print('bfchars', b)
        print('bfranges', c)