From e39478bbbd833babcff066bb639a08b4cb337695 Mon Sep 17 00:00:00 2001 From: Kragen Javier Sitaker <xentrac@special-circumstanc.es> Date: Thu, 8 Jul 2021 00:54:13 -0300 Subject: [PATCH] Make PDF trailer parsing lazy This facilitates exploring PDF files that I can't actually parse yet; I can still use the Pdf object to look at parts of the file. For example: >>> d = pdf.read('../Descargas/dercuano.20191230.pdf') >>> d.trailer Traceback (most recent call last): ... File "/home/compu/izodparse/izodparse/pdf.py", line 202, in <lambda> parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \() TypeError: 'tuple' object cannot be interpreted as an integer >>> d.get_indirect_obj(440) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/home/compu/izodparse/izodparse/pdf.py", line 290, in get_indirect_obj offset, plumb = result TypeError: cannot unpack non-iterable NoneType object >>> d.xrefs <izodparse.pdf.XrefSection object at 0x7feaef606a30> >>> d.xrefs[440] b'0000095363 00000 n\r\n' >>> d.xrefs.offset_of(440) 95363 >>> d.read(_) b'440 0 obj\r\n<< /Border [ 0 0 .1 ] /C [ .6 .6 1 ] /Contents (notes' In this case there are two separate problems: I need to fix paren-string parsing for the trailer, and I need to be able to read fractions to read the .6. --- izodparse/ok.py | 8 -------- izodparse/pdf.py | 17 +++++++++++++---- izodparse/peg.py | 23 ++--------------------- izodparse/util.py | 24 ++++++++++++++++++++++++ plans.org | 7 +++++-- 5 files changed, 44 insertions(+), 35 deletions(-) delete mode 100644 izodparse/ok.py create mode 100644 izodparse/util.py diff --git a/izodparse/ok.py b/izodparse/ok.py deleted file mode 100644 index ee91646..0000000 --- a/izodparse/ok.py +++ /dev/null @@ -1,8 +0,0 @@ -"Testing utilities." 
- -def ok(a, b): assert a == b, (a, b) - -def please_be(a, b): - if a != b: - raise ValueError(a, '!=', b) - diff --git a/izodparse/pdf.py b/izodparse/pdf.py index 71b1e56..1d81799 100755 --- a/izodparse/pdf.py +++ b/izodparse/pdf.py @@ -148,7 +148,7 @@ import zlib from . import peg from .peg import Any, Charset, Lit, Thunk, Parse -from .ok import ok +from .util import ok, memoprop ### PDF and PostScript and CMap parsing ### @@ -265,9 +265,18 @@ class Pdf: self.xref_start = int(blob[sx:].split()[1]) # XXX there could be many sections self.xrefs = XrefSection(self, self.xref_start) - self.trailer_plumbing = self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary) - self.trailer = porcelainize(self, self.trailer_plumbing[1][1]) - self.catalog = self.trailer['Root'] + + @memoprop + def trailer_plumbing(self): + return self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary) + + @memoprop + def trailer(self): + return porcelainize(self, self.trailer_plumbing[1][1]) + + @memoprop + def catalog(self): + return self.trailer['Root'] def read(self, offset, size=64): return self.blob[offset:offset+size] diff --git a/izodparse/peg.py b/izodparse/peg.py index 42cfaa3..5e4889c 100644 --- a/izodparse/peg.py +++ b/izodparse/peg.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -r"""A small PEG parser generator. +r"""Packrat parsing engine with predictive lookahead parsing. Too slow to use in practice (30 kilobytes/sec), but hopefully validates our understanding of the problem and communicates it more @@ -28,28 +28,9 @@ along with izodparse. If not, see <http://www.gnu.org/licenses/>. """ import types -from .ok import ok, please_be +from .util import ok, please_be, memoprop -class memoprop: - """"Simplified, non-multithreaded version of functools.cached_property. - - For Pythons earlier than 3.8. Doesn't support __slots__, custom - __dict__, etc. 
- - """ - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - cache = instance.__dict__ - if self not in cache: - cache[self] = self.func(instance) - return cache[self] - - -### Packrat parsing engine with predictive lookahead parsing ### - # First, some debugging utilities: def represent_cset(cs): "Debugging helper for understanding first sets; takes a set() of ints or chars." diff --git a/izodparse/util.py b/izodparse/util.py new file mode 100644 index 0000000..7613208 --- /dev/null +++ b/izodparse/util.py @@ -0,0 +1,24 @@ +"Some basic utilities for testing and lazy evaluation." + +def ok(a, b): assert a == b, (a, b) + +def please_be(a, b): + if a != b: + raise ValueError(a, '!=', b) + +class memoprop: + """Simplified, non-multithreaded version of functools.cached_property. + + For Pythons earlier than 3.8. Doesn't support __slots__, custom + __dict__, etc. + + """ + def __init__(self, func): + self.func = func + + def __get__(self, instance, cls): + cache = instance.__dict__ + if self not in cache: + cache[self] = self.func(instance) + return cache[self] + diff --git a/plans.org b/plans.org index 9eedac7..94a8419 100644 --- a/plans.org +++ b/plans.org @@ -1,4 +1,4 @@ -* things I am thinking of doing [3/14] +* things I am thinking of doing [4/15] ** DONE split out pdftour into separate repo. charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok) sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better. @@ -73,7 +73,10 @@ This file has a bunch of stuff in it about my local machine config. ** DONE make izodparse an installable Python package Man, I forgot all about the distutils/setuptools mess. ** DONE split out parsing-engine stuff from PDF/PS stuff -** TODO make xrefs, etc., lazy properties +** DONE make trailer lazy +This way I can open, for example, Dercuano, even though I can't parse +its trailer yet. 
+** TODO add PDF support for fractions ** TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries [0/3] *** TODO find it *** TODO copy it -- GitLab