#!/usr/bin/python3 """Explore PDF file structure, at least enough to parse a CMap file, hopefully. For example, we can navigate the PDF structure. >>> import pdftour >>> p = pdftour.read_pdf('../Descargas/1.2754649.pdf') >>> p.catalog << /Type /Catalog /Pages 13 0 R >> Indirect object references are by default followed implicitly. The debug representation is a mix of Python and PostScript syntax: >>> p.catalog['Pages'] << /Count 4 /Type /Pages /ITXT b'5.1.2' /Kids [11 0 R 38 0 R 74 0 R 86 0 R] >> >>> len(p.catalog['Pages']['Kids']) 4 >>> p.catalog['Pages']['Kids'][0].isa('Page') True >>> p.catalog['Pages']['Kids'][0] << /Type /Page /Contents [5 0 R 12 0 R 6 0 R] /Parent 13 0 R /Resources << /XObject << /img0 14 0 R /img1 15 0 R >> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI] /Font << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> >> /Annots [18 0 R 19 0 R 20 0 R 21 0 R 22 0 R 23 0 R 24 0 R 25 0 R 26 0 R 27 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R 36 0 R 37 0 R] /MediaBox [0 0 612 792] >> >>> _['Resources'] << /XObject << /img0 14 0 R /img1 15 0 R >> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI] /Font << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> >> >>> _['Font'] << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> >>> f = _ >>> f.keys() dict_keys(['F1', 'Xi0', 'F2']) >>> f['F1'] << /Type /Font /DescendantFonts [98 0 R] /ToUnicode 99 0 R /BaseFont /MFRGCI+ArialMT /Subtype /Type0 /Encoding /Identity-H >> >>> f['Xi0'] << /Type /Font /BaseFont /Helvetica /Subtype /Type1 /Encoding /WinAnsiEncoding >> >>> f['F2'] << /Type /Font /BaseFont /Helvetica /Subtype /Type1 /Encoding /WinAnsiEncoding >> Those fonts look the same, but they're distinct objects: >>> f.lstat('Xi0') ('ref', (('int', 1), ('int', 0))) >>> f.lstat('F2') ('ref', (('int', 17), ('int', 0))) We can see the underlying bytes in the file: >>> p.read(p.xrefs.offset_of(17), 128) b'17 0 obj\n<</Type/Font/BaseFont/Helvetica/Subtype/Type1/Encoding/WinAnsiEncoding>>\nendobj\n18 0 obj\n<</Border[0 0 0]/Rect[56 696 5' We can attempt parsing at that point with an ad-hoc custom grammar: >>> p.parse(p.xrefs.offset_of(17), pdftour.integer + pdftour.wschar.some() + pdftour.integer + b' obj\n<<' + pdftour.pdf_obj.some()) (247827, ((((('int', 17), [32]), ('int', 0)), b' obj\n<<'), [('name', b'Type'), ('name', b'Font'), ('name', b'BaseFont'), ('name', b'Helvetica'), ('name', b'Subtype'), ('name', b'Type1'), ('name', b'Encoding'), ('name', b'WinAnsiEncoding')])) We can get a debug dump of a grammar: >>> print(pdftour.show_grammar(pdftour.xref_header)) S: (((('xref' eol) ex0) ' ') ex0) eol eol: ('\r\n' | '\r') | '\n' ex0: '+' | '-' | '' digit digit* digit: [0-9] XXX the display of ex0 is incorrect We can pull CMaps out of the PDF, and parse them, and compile their code space ranges to a grammar for tokenizing text strings found in page contents: >>> list(pdftour.cmaps_for_pages(pdftour.all_pages(p))) [<< /Filter /FlateDecode /Length 568 >> stream (byte 247162...)] >>> p.catalog['Pages']['Kids'][0]['Resources']['Font']['F1']['ToUnicode'] << /Filter /FlateDecode /Length 568 >> stream (byte 247162...) >>> cmap = _.read() >>> import sys >>> sys.stdout.buffer.write(cmap[:100]) /CIDInit /ProcSet findresource begin 12 dict begin begincmap /CIDSystemInfo << /Registry (TTX+0) /Or100 >>> length, tokens = pdftour.tokenize_ps(cmap) >>> length, len(cmap) (1569, 1569) >>> len(tokens) 238 >>> tokens[:8] [('name', b'CIDInit'), ('name', b'ProcSet'), ('op', b'findresource'), ('op', b'begin'), ('int', 12), ('op', b'dict'), ('op', b'begin'), ('op', b'begincmap')] >>> pdftour.buildtables(tokens) ([b'\x00\x00', b'\xff\xff'], [], [b'\x00\x03', b'\x00\x03', b'\x00 ', b'\x00\x0b', b'\x00\x0b', b'\x00(', b'\x00\x0c', b'\x00\x0c', b'\x00)', b'\x00\x0f', b'\x00\x0f', b'\x00,', b'\x00\x10', b'\x00\x10', b'\x00-', b'\x00\x11', b'\x00\x11', b'\x00.', b'\x00\x12', b'\x00\x12', b'\x00/', b'\x00\x13... >>> print(pdftour.show_grammar(pdftour.csranges_to_grammar(_[0]))) S: ([\x00-\xff] [\x00-\xff])* Also, we can pull out page contents: >>> sys.stdout.buffer.write(p.catalog['Pages']['Kids'][0]['Contents'][1].read()[:256]) q q 490 0 0 60 56 696 cm /img0 Do Q q 0.3141 0 0 0.31746 56 696 cm q 0 J 0 j 1 w 0 G 0 188.5 m Too slow to use in practice (30 kilobytes/sec), but hopefully validates our understanding of the problem and communicates it more clearly than a lower-level language would. A pitfall I've run into a lot with the PEG parser in here is that if you try to parse a Unicode string with a byte-string grammar or vice versa, you just get silent failures to parse. Still unimplemented: - CMap bfranges that *use* arrays; - decoding of parenthesized strings; - nesting of parenthesized strings; - actually decoding a tokenized character using the tables extracted from the CMap; - compressed object streams and xref streams; - multiple xref sections; - multiple xref subsections; - pulling out the character strings being drawn from the page contents. """ import sys, types, functools, operator, zlib class memoprop: """"Simplified, non-multithreaded version of functools.cached_property. For Pythons earlier than 3.8. Doesn't support __slots__, custom __dict__, etc. """ def __init__(self, func): self.func = func def __get__(self, instance, cls): cache = instance.__dict__ if self not in cache: cache[self] = self.func(instance) return cache[self] def please_be(a, b): if a != b: raise ValueError(a, '!=', b) ### Packrat parsing engine with predictive lookahead parsing ### # First, some debugging utilities: def represent_cset(cs): "Debugging helper for understanding first sets; takes a set() of ints or chars." if any(isinstance(c, str) for c in cs): return represent_uset(cs) ranges = [] for bt in sorted(cs): if bt == b'-'[0] and not (bt - 1 in cs and bt + 1 in cs): ranges[0:0] = [(bt, bt)] continue if ranges and bt == ranges[-1][-1] + 1: ranges[-1] = ranges[-1][0], bt else: ranges.append((bt, bt)) rangereps = [bytes((b,)) if a == b else bytes((a, ord('-'), b)) for a, b in ranges] return '[%s]' % repr(b''.join(rangereps))[2:-1] def represent_uset(cs): "Unicode version of represent_cset." ranges = [] for c in sorted(cs): if c == '-' and not (',' in cs and '.' in cs): # previous and following characters ranges[0:0] = [(c, c)] continue c = ord(c) if ranges and c == ranges[-1][-1] + 1: ranges[-1] = ranges[-1][0], c else: ranges.append((c, c)) rangereps = [chr(b) if a == b else chr(a) + '-' + chr(b) for a, b in ranges] return '[%s]' % ''.join(rangereps) class GrammarVisitor: """Compute global properties of a grammar, like what expressions are referenced twice. This is useful for printing out a readable representation of a grammar. (Also, though we don't use this fact, singly-referenced parsing expressions can never gain any benefit from the memo table.) This version of Visitor deviates a bit from the standard pattern, because our grammars are cyclic, so simply having the client peremptorily invoke .accept on each of its children would loop infinitely. Instead, we have the client invoke .inform_arc, which allows the Visitor to decide whether to proceed to invoking .accept. And then the .visit or .visit_type method doesn't need to exist at all; .accept merely invokes .inform_arc for each child. (This has the bug that if the root node has no children, we'll never know about it.) """ # A few nits that could be cleaned up: # - negative charsets would make things like `regchar: # [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]` a lot more readable. # This would involve shunting ^ to be not first if it's first I guess. # - `[[]]` is not okay. `-` gets pulled to the front; `]` should # be too. Not sure how to handle set(['-', ']']) but it hasn't arisen yet. # - It would be beneficial to take advantage of the associativity # of | and + to reduce parens. That might be better than the # current precedence scheme. def __init__(self): self.nodes = set() self.parents_of = {} self.roots = set() def inform_arc(self, parent, child): # In the normal case, we got here because we invoked .accept # on parent, so it'll already be in .nodes; but not for the # initial entry point. if parent not in self.nodes: self.roots.add(parent) self.nodes.add(parent) if child not in self.parents_of: self.parents_of[child] = [] self.parents_of[child].append(parent) if child not in self.nodes: self.nodes.add(child) child.accept(self) def show(self, rootname='S'): "Dump out a human-readable grammar." toplevel = (list(self.roots) + [n for n in self.nodes - self.roots if n.name is not None or len(self.parents_of[n]) != 1]) # Assign names to all top-level nodes so they can be referenced. names = {} i = 0 for n in toplevel: if n.name is None and n == toplevel[0]: names[n] = rootname elif n.name is None: names[n] = 'ex%d' % i i += 1 else: names[n] = n.name # Traverse the grammar a second time and output the # description of each top-level rule. output = [] for n in toplevel: output.append('%s: ' % names[n]) output.append(self.pprint(n, names, 0, 0, top=True)) output.append('\n') return ''.join(output) def pprint(self, prod, names, precedence_left, precedence_right, top=False): "Produce a representation of a parsing expression. Violations of encapsulation willy-nilly." if prod in names and not top: return names[prod] if isinstance(prod, Any): return '%s*' % self.pprint(prod.body, names, 20, 20) elif isinstance(prod, Cat): v = '%s %s' % (self.pprint(prod.a, names, precedence_left, 5), self.pprint(prod.b, names, 5, precedence_right)) return v if precedence_left <= 5 > precedence_right else '(%s)' % v elif isinstance(prod, Charset): return represent_cset(prod.first) elif isinstance(prod, Alt): v = '%s | %s' % (self.pprint(prod.a, names, precedence_left, 10), self.pprint(prod.b, names, 10, precedence_right)) return v if precedence_left <= 10 > precedence_right else '(%s)' % v elif isinstance(prod, Lit): return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8')) elif isinstance(prod, Thunk): return self.pprint(prod.forced, names, precedence_left, precedence_right) elif isinstance(prod, Tell): return '@' else: return str(prod) def show_grammar(prod, rootname='S'): v = GrammarVisitor() prod.accept(v) return v.show(rootname) class Parse: "A Packrat parse of a string or byte string." def __init__(self, s, trace=lambda *args: None): "s is the string to parse. Pass trace=print to enable verbose logging." self.s = s self.memos = {} self.trace = trace def do(self, pos, ex): "Attempt to parse using production/parsing expression `ex` starting at `pos`." k = pos, ex self.trace("parsing %s at %d" % (ex, pos)) if k in self.memos: self.trace("memo hit") return self.memos[k] result = ex.parse(self, pos) if result and ex.xform: result = result[0], ex.xform(result[1]) # if len(self.memos) > 16384: # self.memos.clear() self.memos[k] = result self.trace("%s returns %s" % (ex, result)) return result class Prod: "Base class for grammar productions, i.e., parsing expression types." xform = None # Hook for post-processing. This turns out to be the wrong thing name = None # Human-readable tag for debugging output def __str__(self): return self.debugstr @memoprop def debugstr(self): "Used for tracing." first = represent_cset(self.first) if self.name: return self.name + first else: return '<%s%s>' % (self.__class__.__name__, first) def __add__(self, other): "Concatenation of parsing expressions." return Cat(self, as_prod(other)) def __radd__(self, other): return as_prod(other) + self def __or__(self, other): "Alternation (ordered choice) of parsing expressions." return Alt(self, as_prod(other)) def __ror__(self, other): return as_prod(other) | self def some(self): """One or more repetitions. The possibility of overwriting its .xform shows why .xform is bad. """ result = self + Any(self) result.xform = lambda d: [d[0]] + d[1] return result def as_prod(datum): "Coerce an arbitrary thing into a grammar production." if isinstance(datum, Prod): return datum if isinstance(datum, bytes) or isinstance(datum, str): return Lit(datum) if isinstance(datum, list): if len(datum) == 1: return as_prod(datum[0]) return as_prod(datum[0]) | datum[1:] if isinstance(datum, types.FunctionType): return Thunk(datum) raise ValueError(datum) class Lit(Prod): "A parsing expression that matches a literal string or byte string." def __init__(self, s): self.s = s self.first = {s[0]} if s else set() self.nullable = not s def parse(self, parse, pos): npos = pos + len(self.s) if parse.s[pos:npos] == self.s: return npos, self.s def accept(self, visitor): pass def ok(a, b): assert a == b, (a, b) ok(Parse("hello").do(0, Lit("hel")), (3, "hel")) ok(Parse(b"hello").do(0, Lit(b"hel")), (3, b"hel")) ok(Parse("hello").do(0, Lit("hec")), None) class Cat(Prod): "A parsing expression that matches the concatenation of two productions." def __init__(self, a, b): self.a, self.b = a, b def parse(self, parse, pos): a = parse.do(pos, self.a) if a is None: return None b = parse.do(a[0], self.b) if b is None: return None return b[0], (a[1], b[1]) # These properties, used for predictive parsing, are lazily # computed so that you can finish constructing a cyclic graph # using Thunks before evaluating them. @memoprop def nullable(self): return self.a.nullable and self.b.nullable @memoprop def first(self): return self.a.first | self.b.first if self.a.nullable else self.a.first def accept(self, visitor): visitor.inform_arc(self, self.a) visitor.inform_arc(self, self.b) assert Parse("hello").do(0, Lit('he') + 'll') == (4, ('he', 'll')) assert Parse("hello").do(0, Lit('he') + 'lc') is None class Alt(Prod): "Matches the ordered-choice alternation of two productions." def __init__(self, a, b): self.a, self.b = a, b def parse(self, parse, pos): c = parse.s[pos] if pos < len(parse.s) else None # This form of lookahead increases speed by only about 30% # to about 32 microseconds per byte. if self.a.nullable or c in self.a.first: a = parse.do(pos, self.a) if a: return a if self.b.nullable or c in self.b.first: return parse.do(pos, self.b) return None @memoprop def nullable(self): return self.a.nullable or self.b.nullable @memoprop def first(self): return self.a.first | self.b.first def accept(self, visitor): visitor.inform_arc(self, self.a) visitor.inform_arc(self, self.b) assert Parse("hello").do(0, Lit('h') | 'x') == (1, 'h') assert Parse("hello").do(0, Lit('x') | 'h') == (1, 'h') assert Parse("hello").do(0, Lit('x') | 'y') == None assert Parse("hello").do(0, Lit('h') | 'he') == (1, 'h') assert Parse("hello").do(0, Lit('he') | 'h') == (2, 'he') # To avoid Python's recursion depth when parsing nontrivial inputs, # this is not defined in terms of Alt and Thunk, though in theory it # could be. It turns out to be simpler this way because you don't # have to post-process a cons list into a Python list. class Any(Prod): "Kleene-closure parsing expression: matches zero or more repetitions of a production." def __init__(self, body): self.body = body self.nullable = True @memoprop def first(self): return self.body.first def parse(self, parse, pos): results = [] while True: # Note, not taking advantage of predictive parsing in this # case. Maybe I should. kid = parse.do(pos, self.body) if not kid: return pos, results results.append(kid[1]) pos = kid[0] def accept(self, visitor): visitor.inform_arc(self, self.body) assert Parse("lalala!").do(0, Any(Lit('la'))) == (6, ['la', 'la', 'la']) assert Parse("lalala!").do(0, Any(Lit('al'))) == (0, []) class Thunk(Prod): """A production whose definition is deferred until later. This allows the construction of cyclic grammars. In theory it also allows the grammar to be constructed lazily and thus grow during the parse. """ def __init__(self, body): self.body = body @memoprop def forced(self): """Memoized property that ensures we only evaluate our body once. This doesn't affect performance, apparently, but semantically it ensures that the meaning of the production doesn't change over time. It still doesn't prevent you from creating lazily computed infinite grammars, though... """ return self.body() def parse(self, parse, pos): return self.forced.parse(parse, pos) @property def nullable(self): return self.forced.nullable @property def first(self): return self.forced.first def accept(self, visitor): visitor.inform_arc(self, self.forced) class Charset(Prod): "A parsing expression that matches any byte or character from a set()." def __init__(self, cset): self.first = cset self.nullable = False def parse(self, parse, pos): if pos >= len(parse.s): return None c = parse.s[pos] if c in self.first: return (pos + 1, c) def accept(self, visitor): pass class Tell(Prod): "Consumes no characters but 'parses' the current parse position." first = set() nullable = True def parse(self, parse, pos): return pos, pos def accept(self, visitor): pass ### PDF and PostScript and CMap parsing ### ws = set(b'\0\t\n\014\r ') wschar = Charset(ws) # Whitespace character wschar.name = 'wschar' eol = Lit(b'\r\n') | b'\r' | b'\n' eol.name = 'eol' digit = Charset(set(b'0123456789')) digit.name = 'digit' integer = [b'+', b'-', b''] + digit.some() integer.xform = lambda d: ('int', int(d[0] + bytes(d[1]))) delim = set(b'()<>[]{}/%') delimchar = Charset(delim) # Delimiter character any_byte = set(range(256)) reg = any_byte - delim - ws # "Regular" character regchar = Charset(reg) regchar.name = 'regchar' name = b'/' + Any(regchar) # say, /Type or /Page name.name = 'name' name.xform = lambda d: ('name', bytes(d[1])) # XXX this will fail to recognize comments that end at the end of the # file without a newline comment = b'%' + Any(Charset(any_byte - set(b'\r\n'))) + eol def drop_ws(prod): prod = prod + Any(wschar | comment) prod.xform = lambda d: d[0] return prod def decode_hex(d): s = bytes(d[0][1]) if len(s) % 2 != 0: s += b'0' return ('str', bytes(int(s[i:i+2], 16) for i in range(0, len(s), 2))) hexstring = b'<' + Any(drop_ws(Charset(b'0123456789abcdefABCDEF'))) + b'>' hexstring.xform = decode_hex string_element = ( Charset(any_byte - set(br'\()')) | (lambda: parenstring) | b'\\' + Charset(any_byte) ) parenstring = b'(' + Any(string_element) + b')' parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \() ok(Parse(b'()').do(0, parenstring), (2, ('str', b''))) ok(Parse(b'(hi)').do(0, parenstring), (4, ('str', b'hi'))) # XXX this one won't work until I write code to un-nest the paren strings #ok(Parse(b'(hi(x))').do(0, parenstring), (7, ('str', b'hi(x)'))) def ps_grammar(): "Construct a grammar that tokenizes a subset of PostScript/PDF." op = regchar.some() + b'' # operator. XXX this +b'' is to allow us to compose xforms op.name = 'op' op.xform = lambda d: ('op', bytes(d[0])) # XXX no real-number support yet dictdelim = Lit(b'<<') | b'>>' dictdelim.xform = lambda d: ('dd', d) arraydelim = Charset(set(b'[]')) arraydelim.xform = lambda d: ('ad', d) tokens = Any(drop_ws(integer | op | name| dictdelim | arraydelim | parenstring | hexstring)) root = Any(wschar) + tokens root.xform = lambda d: d[1] return root def tokenize_ps(ps): return Parse(ps).do(0, ps_grammar()) # XXX move these to the bottom def buildtables(tokens): "Given a sequence of tokens from a CMap, pull out the tables they represent." csranges = [] bfchars = [] bfranges = [] sizes = {} def n_strings_back(n): for j in range(i-n, i): assert tokens[j][0] == 'str' yield tokens[j][1] for i, tok in enumerate(tokens): if tok[0] == 'op': op = tok[1] if op in [b'begincodespacerange', b'beginbfchar', b'beginbfrange']: assert tokens[i-1][0] == 'int' sizes[op] = tokens[i-1][1] elif op == b'endcodespacerange': csranges.extend(n_strings_back(2*sizes[b'begincodespacerange'])) elif op == b'endbfchar': bfchars.extend(n_strings_back(2*sizes[b'beginbfchar'])) elif op == b'endbfrange': bfranges.extend(n_strings_back(3*sizes[b'beginbfrange'])) return csranges, bfchars, bfranges def csranges_to_grammar(csranges): "Compile the csranges from buildtables into a grammar we can use to tokenize strings." alternatives = [] for i in range(0, len(csranges), 2): ranges = zip(csranges[i], csranges[i+1]) cs = [Charset(set(c for c in range(startbyte, endbyte+1))) for startbyte, endbyte in ranges] prod = functools.reduce(operator.add, cs) prod.xform = bytes alternatives.append(prod) return Any(as_prod(alternatives)) xref_header = Lit(b'xref') + eol + integer + b' ' + integer + eol # clause 7.5.4 # XXX This xform clearly indicates that I need to rethink how # concatenation works for parsing expressions. xref_header.xform = lambda d: (d[0][0][0][1][1], d[0][1][1]) dictionary = Thunk(lambda: drop_ws(b'<<') + Any(drop_ws(name) + pdf_obj) + drop_ws(b'>>')) dictionary.xform = lambda d: ('dict', {k[1].decode('utf-8'): v for k, v in d[0][1]}) array = Thunk(lambda: drop_ws(b'[') + Any(pdf_obj) + drop_ws(b']')) array.xform = lambda d: ('array', d[0][1]) ref = drop_ws(integer) + drop_ws(integer) + drop_ws(b'R') ref.xform = lambda d: ('ref', d[0]) # XXX integer and name should drop their own damn ws pdf_obj = drop_ws(dictionary | name | ref | hexstring | array | integer | parenstring) stream = dictionary + b'stream' + eol + Tell() stream.xform = lambda d: ('stream', (d[0][0][0], d[1])) endingobj = pdf_obj + drop_ws(b'endobj') endingobj.xform = lambda d: d[0] # XXX whitespace after 'obj' is not required according to clause # 7.3.10; probably also endobj should require to be followed by a delimiter char indirect_obj = drop_ws(integer) + drop_ws(integer) + drop_ws(b'obj') + (stream | endingobj) indirect_obj.xform = lambda d: d[1] class Pdf: """Comprehends enough of the PDF format to facilitate exploration. Still very incomplete. """ def __init__(self, blob): self.blob = blob self.parser = Parse(blob) sx = blob.rindex(b'startxref') self.xref_start = int(blob[sx:].split()[1]) # XXX there could be many sections self.xrefs = XrefSection(self, self.xref_start) self.trailer_plumbing = self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary) self.trailer = porcelainize(self, self.trailer_plumbing[1][1]) self.catalog = self.trailer['Root'] def read(self, offset, size=64): return self.blob[offset:offset+size] def parse(self, offset, ex): return self.parser.do(offset, ex) def get_indirect_obj(self, oid, generation=0): "Returns a plumbing object." result = self.parse(self.xrefs.offset_of(oid), indirect_obj) offset, plumb = result return plumb def dereference(self, plumb): """Given a plumbing object, follow indirect object refs if necessary. Returns another plumbing object. Returns ('null', None) for dangling (XXX this is broken) or circular links. """ seen = set() while plumb[0] == 'ref': if plumb in seen: return 'null', None seen.add(plumb) _, ((_, oid), (_, gen)) = plumb plumb = self.get_indirect_obj(oid, gen) return plumb class XrefSection: def __init__(self, pdf, offset): self.pdf = pdf # XXX this is a subsection, bozo, there could be more than one self.offset, (self.first, self.size) = pdf.parse(offset, xref_header) self.end = self.offset + 20 * self.size def __getitem__(self, oid): if not self.first <= oid < self.first + self.size: raise RangeError(oid, self.first, self.size) return self.pdf.read(self.offset + 20 * (oid - self.first), size=20) def offset_of(self, oid): # XXX check that it's n, not f (clause 7.5.4) return int(self[oid][:10]) class Porcelain: """A family of façade objects to provide convenient access to PDF objects.""" def __init__(self, pdf, d): self.pdf = pdf self.contents = d class PorcelainDictionary(Porcelain): def __repr__(self): return '<< %s >>' % ' '.join('/%s %r' % (k, porcelainize(self.pdf, v)) for k, v in self.contents.items()) def keys(self): return self.contents.keys() def __iter__(self): return iter(self.keys()) def get(self, key, default=None): return self[key] if key in self else default def __contains__(self, key): return key in self.contents def __getitem__(self, key): return porcelainize(self.pdf, self.pdf.dereference(self.contents[key])) def lstat(self, key): return self.contents.get(key) def isa(self, typename): return self['Type'].name == typename def is_subtype(self, subtypename): return self['Subtype'].name == subtypename class PorcelainArray(Porcelain): def __repr__(self): return '[%s]' % ' '.join(repr(porcelainize(self.pdf, x)) for x in self.contents) def __getitem__(self, i): return porcelainize(self.pdf, self.pdf.dereference(self.contents[i])) def lstat(self, key): # XXX porcelain ref? return self.contents[key] def __len__(self): return len(self.contents) class PorcelainName(Porcelain): def __init__(self, pdf, name): self.pdf = pdf # XXX need to decode hex bytes self.name = name.decode('utf-8') def __repr__(self): return '/%s' % self.name def __hash__(self): return hash(self.name) * hash(self.pdf) def __eq__(self, other): return (isinstance(other, PorcelainName) and (self.pdf, self.name) == (other.pdf, other.name)) class PorcelainRef(Porcelain): """Usually we avoid instantiating this.""" def __repr__(self): return '%r %r R' % (self.contents[0][1], self.contents[1][1]) def __call__(self): # XXX this is unimpressive code and signals the need for a # rethinking return self.pdf.dereference(('ref', data)) class PorcelainStream: def __init__(self, pdf, plumb): self.pdf = pdf (_, header), self.offset = plumb self.header = PorcelainDictionary(pdf, header) def __repr__(self): return '%r\nstream\n(byte %d...)' % (self.header, self.offset) def read_raw(self): return self.pdf.read(self.offset, self.header['Length']) def read(self): data = self.read_raw() filters = self.header.get('Filter', []) if isinstance(filters, PorcelainName): filters = [filters] decoders = {'FlateDecode': zlib.decompress} for filtername in filters: data = decoders[filtername.name](data) return data porcelain_classes = { 'dict': PorcelainDictionary, 'array': PorcelainArray, 'name': PorcelainName, 'ref': PorcelainRef, 'stream': PorcelainStream, 'str': lambda pdf, s: s, 'int': lambda pdf, s: s, } def porcelainize(pdf, plumb): "Add a porcelain façade to a plumbing object, which is just a tuple." return porcelain_classes[plumb[0]](pdf, plumb[1]) def read_pdf(filename): with open(filename, 'rb') as fo: return Pdf(fo.read()) ### Stuff for walking the PDF tree and pulling out CMaps ### def all_pages(pdf, whine=lambda *args: None): return page_descendants(pdf.catalog['Pages'], whine) def page_descendants(pages, whine=lambda *args: None): for kid in pages['Kids']: if kid.isa('Page'): yield kid elif kid.isa('Pages'): yield from page_descendants(kid, whine) else: whine('what even is', kid) def cmaps_for_pages(pages, whine=lambda *args: None): # Find all the CMap streams. Really we probably want to know the # CMap for a particular font on a particular page. And /Type1 and # /Type3 fonts don't have a CMap; we use the CMap for a /Type0 # font to do what the /Encoding does for the others. But my # immediate objective here is to suck out the CMap streams so I # can bang more on the CMap parsing code above. seen_cmaps = set() seen_fonts = set() for page in pages: fonts = page['Resources']['Font'] for name in fonts: font = fonts[name] if font in seen_fonts: continue seen_fonts.add(font) if font.is_subtype('Type0'): cmap = font['ToUnicode'] if cmap in seen_cmaps: continue seen_cmaps.add(cmap) yield cmap else: whine('not yet handling non-Unicode font', font) if __name__ == '__main__': import cgitb; cgitb.enable(format='text') print(show_grammar(ps_grammar(), 'postscript')) with open(sys.argv[1], 'rb') as fo: data = fo.read() result = tokenize_ps(data) if result: for kind, val in result[1]: print("%10s %r" % (kind, val)) a, b, c = buildtables(result[1]) print('csranges', a) cg = csranges_to_grammar(a) print(show_grammar(cg, 'character_code')) print(Parse(b'\0T\0h\0i\0s\0 \0c\0o\0s\0t\0s\0 \0001\0000\1\162').do(0, cg)) print('bfchars', b) print('bfranges', c)