diff --git a/parsecmaps.py b/parsecmaps.py
index f827f783c9c20d97a8ae0cf8ad0c91ffb71cc378..494a90e569918567038c479e5943931eff9a9fd0 100755
--- a/parsecmaps.py
+++ b/parsecmaps.py
@@ -97,6 +97,16 @@ class GrammarVisitor:
     we'll never know about it.)
     """

+    # A few nits that could be cleaned up:
+    # - negative charsets would make things like `regchar:
+    #   [\x01-\x08\x0b\x0e-\x1f!-$&-'*-.0-;=?-Z\\^-z|~-\xff]`
+    #   a lot more readable.  This would involve shunting ^ to be
+    #   not first if it's first, I guess.
+    # - `[[]]` is not okay.  `-` gets pulled to the front; `]` should be
+    #   too.  Not sure how to handle set(['-', ']']); it hasn't arisen yet.
+    # - It would be beneficial to take advantage of the associativity
+    #   of | and + to reduce parens.  That might be better than the
+    #   current precedence scheme.
     def __init__(self):
        self.nodes = set()
        self.parents_of = {}
@@ -138,7 +148,8 @@ class GrammarVisitor:
         else:
             names[n] = n.name

-        # Traverse the grammar a second time to produce output.
+        # Traverse the grammar a second time and output the
+        # description of each top-level rule.
         output = []
         for n in toplevel:
             output.append('%s: ' % names[n])
@@ -167,7 +178,7 @@ class GrammarVisitor:
         elif isinstance(prod, Lit):
             return repr(prod.s) if isinstance(prod.s, str) else repr(prod.s.decode('utf-8'))
         elif isinstance(prod, Thunk):
-            return self.pprint(prod.body(), names, precedence_left, precedence_right)
+            return self.pprint(prod.forced, names, precedence_left, precedence_right)
         else:
             return str(prod)
@@ -396,24 +407,33 @@ class Thunk(Prod):
     during the parse.

     """
     def __init__(self, body):
-        # XXX make .body itself a memoprop?
         self.body = body

+    @memoprop
+    def forced(self):
+        """Memoized property that ensures we only evaluate our body once.
+
+        This doesn't affect performance, apparently, but semantically
+        it ensures that the meaning of the production doesn't change
+        over time.  It still doesn't prevent you from creating lazily
+        computed infinite grammars, though...
+
+        """
+        return self.body()
+
     def parse(self, parse, pos):
-        body = self.body()
-        parse.trace('body is %s' % body)
-        return body.parse(parse, pos)
+        return self.forced.parse(parse, pos)

     @property
     def nullable(self):
-        return self.body().nullable
+        return self.forced.nullable

     @property
     def first(self):
-        return self.body().first
+        return self.forced.first

     def accept(self, visitor):
-        visitor.inform_arc(self, self.body())
+        visitor.inform_arc(self, self.forced)

 class Charset(Prod):
@@ -433,12 +453,17 @@ class Charset(Prod):
     pass


+ws = set(b'\0\t\n\014\r ')
+wschar = Charset(ws)  # Whitespace character
+wschar.name = 'wschar'
+
+digit = Charset(set(b'0123456789'))
+digit.name = 'digit'
+integer = [b'+', b'-', b''] + digit.some()
+integer.xform = lambda d: ('int', int(d[0] + bytes(d[1])))
+
 def ps_grammar():
     "Construct a grammar that tokenizes a subset of PostScript/PDF."
-    ws = set(b'\0\t\n\014\r ')
-    wschar = Charset(ws)  # Whitespace character
-    wschar.name = 'wschar'
-
     delim = set(b'()<>[]{}/%')
     delimchar = Charset(delim)  # Delimiter character

@@ -455,9 +480,6 @@ def ps_grammar():
     name.name = 'name'
     name.xform = lambda d: ('name', bytes(d[1]))

-    integer = [b'+', b'-', b''] + Charset(set(b'0123456789')).some()
-    integer.xform = lambda d: ('int', int(d[0] + bytes(d[1])))
-
     # XXX no real-number support yet

     dictdelim = Lit(b'<<') | b'>>'
@@ -541,24 +563,125 @@ def csranges_to_grammar(csranges):
     return Any(as_prod(alternatives))

+def please_be(a, b):
+    if a != b:
+        raise ValueError(a, '!=', b)
+
+xref_header = Lit(b'xref\n') + integer + wschar.some() + integer + b'\n'  # XXX too strict
+# XXX This xform clearly indicates that I need to rethink how
+# concatenation works for parsing expressions.
+xref_header.xform = lambda d: (d[0][0][0][1][1], d[0][1][1])
+
+class Pdf:
+    """Comprehends enough of the PDF format to facilitate exploration.
+
+    Still very incomplete.
+
+    Example:
+    >>> p = parsecmaps.read_pdf('../Descargas/1.2754649.pdf')
+    >>> p.read(p.xrefs.offset_of(55))
+    b'55 0 obj\n<</Type/Font/FirstChar 37/FontDescriptor 116 0 R/BaseFo'
+    >>> p.read(p.xrefs.offset_of(72))
+    b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 nul'
+    >>> p.read(p.xrefs.offset_of(72), size=256)
+    b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 null]/Rect[394.024017 55.089005 409.314026 65.729996]/Subtype/Link>>\nendobj\n73 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 43 253 null]/Rect[464.366028 55.089005 475.665009 65.729996]/Subty'
+    >>> p.xrefs[72]
+    b'0000350098 00000 n \n'
+    >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer)
+    (350102, ((('int', 72), [32]), ('int', 0)))
+    >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer + b' obj')
+    (350106, (((('int', 72), [32]), ('int', 0)), b' obj'))
+
+    """
+    def __init__(self, blob):
+        self.blob = blob
+        self.parser = Parse(blob)
+        sx = blob.rindex(b'startxref')
+        self.xref_start = int(blob[sx:].split()[1])
+        # XXX there could be many sections
+        self.xrefs = XrefSection(self, self.xref_start)
+
+    def read(self, offset, size=64):
+        return self.blob[offset:offset+size]
+
+    def parse(self, offset, ex):
+        return self.parser.do(offset, ex)
+
+class XrefSection:
+    def __init__(self, pdf, offset):
+        self.pdf = pdf
+        # XXX this is a subsection, bozo; there could be more than one
+        self.offset, (self.first, self.size) = pdf.parse(offset, xref_header)
+
+    def __getitem__(self, oid):
+        if not self.first <= oid < self.first + self.size:
+            raise IndexError(oid, self.first, self.size)
+        return self.pdf.read(self.offset + 20 * (oid - self.first), size=20)
+
+    def offset_of(self, oid):
+        return int(self[oid][:10])
+
+def read_pdf(filename):
+    with open(filename, 'rb') as fo:
+        return Pdf(fo.read())
+
+
+def all_pages(pdf, whine=lambda *args: None):
+    # XXX these structures aren't implemented yet
+    return page_descendants(pdf.catalog['Pages'], whine)
+
+def page_descendants(pages, whine=lambda *args: None):
+    for kid in pages['Kids']:
+        if kid.isa('Page'):
+            yield kid
+        elif kid.isa('Pages'):
+            yield from page_descendants(kid, whine)
+        else:
+            whine('what even is', kid)
+
+def cmaps_for_pages(pages, whine=lambda *args: None):
+    # Find all the CMap streams.  Really we probably want to know the
+    # CMap for a particular font on a particular page.
+    # And /Type1 and /Type3 fonts don't have a CMap; we use the CMap
+    # for a /Type0 font to do what the /Encoding does for the others.
+    # But my immediate objective here is to suck out the CMap streams
+    # so I can bang more on the CMap parsing code above.
+    seen_cmaps = set()
+    seen_fonts = set()
+    for page in pages:
+        for font in page['Resources']['Font']:
+            if font in seen_fonts:
+                continue
+            seen_fonts.add(font)
+
+            if font.is_subtype('Type0'):
+                cmap = font['ToUnicode']
+                if cmap in seen_cmaps:
+                    continue
+                seen_cmaps.add(cmap)
+                yield cmap
+
+            else:
+                whine('not yet handling non-Unicode font', font)
+
+
 if __name__ == '__main__':
     import cgitb; cgitb.enable(format='text')
     print(show_grammar(ps_grammar(), 'postscript'))
-    print()
     with open(sys.argv[1], 'rb') as fo:
         data = fo.read()

     result = tokenize_ps(data)
+
     if result:
         for kind, val in result[1]:
             print("%10s %r" % (kind, val))
         a, b, c = buildtables(result[1])
         print('csranges', a)
         cg = csranges_to_grammar(a)
-        print(cg)
-        print(show_grammar(cg))
+        print(show_grammar(cg, 'character_code'))
         print(Parse(b'\0T\0h\0i\0s\0 \0c\0o\0s\0t\0s\0 \0001\0000\1\162').do(0, cg))
         print('bfchars', b)
         print('bfranges', c)
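
The Thunk.forced change above leans on a memoprop decorator that is
defined elsewhere in parsecmaps.py and doesn't appear in this diff.
For readers without the full file, here is a minimal sketch of such a
memoizing-property decorator, assuming only that it caches the computed
value on the instance; the real one may differ in detail:

    import functools

    def memoprop(method):
        # Hypothetical stand-in for parsecmaps.py's memoprop: a
        # property whose body runs at most once per instance, with
        # the result cached in a private attribute.
        attr = '_memo_' + method.__name__

        @property
        @functools.wraps(method)
        def getter(self):
            if not hasattr(self, attr):
                setattr(self, attr, method(self))
            return getattr(self, attr)

        return getter

With that behavior, thunk.forced is thunk.forced always holds, which is
exactly what the new docstring is after: the production a Thunk stands
for is fixed at first use and can't change over the course of a parse.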
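
Since integer and wschar are now module-level, they can be exercised on
their own.  Judging from the Parse(...).do(pos, expression) calls in the
Pdf docstring, which return (end_position, transformed_value), the
hoisted production should behave roughly like this (an untested sketch,
not output captured from a real run):

    >>> Parse(b'-17 0 obj').do(0, integer)
    (3, ('int', -17))
    >>> Parse(b'42').do(0, integer)  # the b'' alternative matches the absent sign
    (2, ('int', 42))

The xform explains the shape of the value: d[0] is the sign literal
(possibly b''), d[1] is the list of digit byte values that .some()
produced, and int(d[0] + bytes(d[1])) glues them back into a number.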
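
Pdf.__init__ finds the cross-reference table by scanning from the end
of the file.  A classic PDF ends with a trailer dictionary, the
startxref keyword, the decimal byte offset of the xref table, and
%%EOF, so the logic can be checked against a hypothetical tail:

    # Hypothetical last few lines of a PDF file.
    tail = b'trailer\n<</Size 120/Root 1 0 R>>\nstartxref\n350002\n%%EOF\n'
    sx = tail.rindex(b'startxref')   # rindex: the *last* occurrence wins,
                                     # the live one in an incrementally
                                     # updated file with several of them
    xref_start = int(tail[sx:].split()[1])
    assert xref_start == 350002

The "XXX there could be many sections" comment is about exactly that
incremental-update case: each update appends its own xref section, and
a complete reader has to chase the /Prev links between them.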
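
XrefSection's indexing arithmetic works because classic xref entries
are fixed-width records: a 10-digit byte offset, a space, a 5-digit
generation number, a space, 'n' (in use) or 'f' (free), and a two-byte
line ending, 20 bytes in all.  That is why __getitem__ can seek
straight to self.offset + 20 * (oid - self.first).  Decoding the entry
for object 72 from the doctest above by hand:

    entry = b'0000350098 00000 n \n'  # p.xrefs[72] from the docstring
    assert len(entry) == 20
    assert int(entry[:10]) == 350098  # the offset offset_of(72) returns
    assert int(entry[11:16]) == 0     # generation number
    assert entry[17:18] == b'n'       # in use, not on the free list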