From c8ba6bf55647136879833d0c0ada6865c82e33d9 Mon Sep 17 00:00:00 2001
From: Kragen Javier Sitaker <xentrac@special-circumstanc.es>
Date: Thu, 1 Jul 2021 03:02:57 -0300
Subject: [PATCH] Clean up pdftour code slightly

---
 pdftour.py | 93 +++++++++++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 46 deletions(-)

diff --git a/pdftour.py b/pdftour.py
index b043658..676408e 100755
--- a/pdftour.py
+++ b/pdftour.py
@@ -120,7 +120,8 @@ clearly than a lower-level language would.
 
 A pitfall I've run into a lot with the PEG parser in here is that if
 you try to parse a Unicode string with a byte-string grammar or vice
-versa, you just get silent failures to parse.
+versa, you just get silent failures to parse. This is a specific case
+of the more general problem with silent failures.
 
 Still unimplemented:
 
@@ -656,7 +657,7 @@ def ps_grammar():
     arraydelim = Charset(set(b'[]'))
     arraydelim.xform = lambda d: ('ad', d)
 
-    tokens = Any(drop_ws(integer | op | name| dictdelim | arraydelim | parenstring | hexstring))
+    tokens = Any(drop_ws(integer | op | name | dictdelim | arraydelim | parenstring | hexstring))
     root = Any(wschar) + tokens
     root.xform = lambda d: d[1]
     return root
@@ -664,49 +665,7 @@ def ps_grammar():
 def tokenize_ps(ps):
     return Parse(ps).do(0, ps_grammar())
 
-
-# XXX move these to the bottom
-def buildtables(tokens):
-    "Given a sequence of tokens from a CMap, pull out the tables they represent."
-    csranges = []
-    bfchars = []
-    bfranges = []
-    sizes = {}
-
-    def n_strings_back(n):
-        for j in range(i-n, i):
-            assert tokens[j][0] == 'str'
-            yield tokens[j][1]
-
-    for i, tok in enumerate(tokens):
-        if tok[0] == 'op':
-            op = tok[1]
-            if op in [b'begincodespacerange', b'beginbfchar', b'beginbfrange']:
-                assert tokens[i-1][0] == 'int'
-                sizes[op] = tokens[i-1][1]
-            elif op == b'endcodespacerange':
-                csranges.extend(n_strings_back(2*sizes[b'begincodespacerange']))
-            elif op == b'endbfchar':
-                bfchars.extend(n_strings_back(2*sizes[b'beginbfchar']))
-            elif op == b'endbfrange':
-                bfranges.extend(n_strings_back(3*sizes[b'beginbfrange']))
-
-    return csranges, bfchars, bfranges
-
-
-def csranges_to_grammar(csranges):
-    "Compile the csranges from buildtables into a grammar we can use to tokenize strings."
-    alternatives = []
-    for i in range(0, len(csranges), 2):
-        ranges = zip(csranges[i], csranges[i+1])
-        cs = [Charset(set(c for c in range(startbyte, endbyte+1)))
-              for startbyte, endbyte in ranges]
-        prod = functools.reduce(operator.add, cs)
-        prod.xform = bytes
-        alternatives.append(prod)
-
-    return Any(as_prod(alternatives))
-
+# PDF constructs not used in PS:
 xref_header = Lit(b'xref') + eol + integer + b' ' + integer + eol  # clause 7.5.4
 
 # XXX This xform clearly indicates that I need to rethink how
@@ -777,6 +736,7 @@ class Pdf:
 
         return plumb
 
+
 class XrefSection:
     def __init__(self, pdf, offset):
         self.pdf = pdf
@@ -917,7 +877,7 @@ def read_pdf(filename):
         return Pdf(fo.read())
 
 
-### Stuff for walking the PDF tree and pulling out CMaps ###
+### CMap handling, including stuff for walking the PDF tree to find CMaps ###
 
 def all_pages(pdf, whine=lambda *args: None):
     return page_descendants(pdf.catalog['Pages'], whine)
@@ -958,6 +918,47 @@ def cmaps_for_pages(pages, whine=lambda *args: None):
         else:
             whine('not yet handling non-Unicode font', font)
 
+def buildtables(tokens):
+    "Given a sequence of tokens from a CMap, pull out the tables they represent."
+    csranges = []
+    bfchars = []
+    bfranges = []
+    sizes = {}
+
+    def n_strings_back(n):
+        for j in range(i-n, i):
+            assert tokens[j][0] == 'str'
+            yield tokens[j][1]
+
+    for i, tok in enumerate(tokens):
+        if tok[0] == 'op':
+            op = tok[1]
+            if op in [b'begincodespacerange', b'beginbfchar', b'beginbfrange']:
+                assert tokens[i-1][0] == 'int'
+                sizes[op] = tokens[i-1][1]
+            elif op == b'endcodespacerange':
+                csranges.extend(n_strings_back(2*sizes[b'begincodespacerange']))
+            elif op == b'endbfchar':
+                bfchars.extend(n_strings_back(2*sizes[b'beginbfchar']))
+            elif op == b'endbfrange':
+                bfranges.extend(n_strings_back(3*sizes[b'beginbfrange']))
+
+    return csranges, bfchars, bfranges
+
+
+def csranges_to_grammar(csranges):
+    "Compile the csranges from buildtables into a grammar we can use to tokenize strings."
+    alternatives = []
+    for i in range(0, len(csranges), 2):
+        ranges = zip(csranges[i], csranges[i+1])
+        cs = [Charset(set(c for c in range(startbyte, endbyte+1)))
+              for startbyte, endbyte in ranges]
+        prod = functools.reduce(operator.add, cs)
+        prod.xform = bytes
+        alternatives.append(prod)
+
+    return Any(as_prod(alternatives))
+
 if __name__ == '__main__':
     import cgitb; cgitb.enable(format='text')
 
-- 
GitLab
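
Note on the str-vs-bytes pitfall mentioned in the first hunk: in
Python 3, comparing str against bytes is never an error, merely
False, so a byte-string grammar matched against a Unicode string
fails at every position instead of raising. Here is a minimal
self-contained sketch of the failure mode and one way to make it
loud; the names match_lit and checked_parse are illustrative, not
part of pdftour's API:

    def match_lit(text, pos, lit=b'xref'):
        "Return the end position if lit occurs at pos, else None."
        if text[pos:pos + len(lit)] == lit:
            return pos + len(lit)
        return None  # str input always lands here: 'xref' != b'xref'

    assert match_lit(b'xref 0 7', 0) == 4    # bytes input: matches
    assert match_lit('xref 0 7', 0) is None  # str input: silent failure

    def checked_parse(text, grammar_expects=bytes):
        # One way to fail loudly: check the input type up front.
        # (Sketch only; pdftour's Parse class might do this in its
        # constructor instead.)
        if not isinstance(text, grammar_expects):
            raise TypeError('grammar expects %s, got %s' %
                            (grammar_expects.__name__, type(text).__name__))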
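
Note on the two functions being moved to the bottom: buildtables
returns csranges as a flat list of alternating low/high byte strings
(each endcodespacerange operator pops the 2n preceding string
tokens), and csranges_to_grammar then steps through that list two at
a time, zipping each low/high pair byte by byte into per-byte
Charsets. A combinator-free sketch of the same tokenizing technique,
using a hypothetical Shift-JIS-style codespace with one- and two-byte
codes (split_codes and its names are illustrative, not part of
pdftour):

    def split_codes(data, csranges):
        "Split data into codes whose bytes fall inside some codespace range."
        codes, pos = [], 0
        while pos < len(data):
            for i in range(0, len(csranges), 2):
                low, high = csranges[i], csranges[i + 1]
                code = data[pos:pos + len(low)]
                if len(code) == len(low) and all(
                        lo <= b <= hi for b, lo, hi in zip(code, low, high)):
                    codes.append(code)
                    pos += len(code)
                    break
            else:
                raise ValueError('no codespace range matches at offset %d' % pos)
        return codes

    # The codespace <00> <80> plus <8140> <9FFC>, flattened the way
    # buildtables would return it:
    ranges = [b'\x00', b'\x80', b'\x81\x40', b'\x9f\xfc']
    assert split_codes(b'A\x82\xa0B', ranges) == [b'A', b'\x82\xa0', b'B']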