From e39478bbbd833babcff066bb639a08b4cb337695 Mon Sep 17 00:00:00 2001
From: Kragen Javier Sitaker <xentrac@special-circumstanc.es>
Date: Thu, 8 Jul 2021 00:54:13 -0300
Subject: [PATCH] Make PDF trailer parsing lazy

This facilitates exploring PDF files that I can't actually parse yet;
I can still use the Pdf object to look at parts of the file.  For example:

    >>> d = pdf.read('../Descargas/dercuano.20191230.pdf')
    >>> d.trailer
    Traceback (most recent call last):
    ...
      File "/home/compu/izodparse/izodparse/pdf.py", line 202, in <lambda>
	parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \()
    TypeError: 'tuple' object cannot be interpreted as an integer
    >>> d.get_indirect_obj(440)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/compu/izodparse/izodparse/pdf.py", line 290, in get_indirect_obj
	offset, plumb = result
    TypeError: cannot unpack non-iterable NoneType object
    >>> d.xrefs
    <izodparse.pdf.XrefSection object at 0x7feaef606a30>
    >>> d.xrefs[440]
    b'0000095363 00000 n\r\n'
    >>> d.xrefs.offset_of(440)
    95363
    >>> d.read(_)
    b'440 0 obj\r\n<< /Border [ 0 0 .1 ] /C [ .6 .6 1 ] /Contents (notes'

In this case there are two separate problems: I need to fix
paren-string parsing for the trailer, and I need to be able to read
fractions to read the .6.
---
 izodparse/ok.py   |  8 --------
 izodparse/pdf.py  | 17 +++++++++++++----
 izodparse/peg.py  | 23 ++---------------------
 izodparse/util.py | 24 ++++++++++++++++++++++++
 plans.org         |  7 +++++--
 5 files changed, 44 insertions(+), 35 deletions(-)
 delete mode 100644 izodparse/ok.py
 create mode 100644 izodparse/util.py

diff --git a/izodparse/ok.py b/izodparse/ok.py
deleted file mode 100644
index ee91646..0000000
--- a/izodparse/ok.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"Testing utilities."
-
-def ok(a, b): assert a == b, (a, b)
-
-def please_be(a, b):
-    if a != b:
-        raise ValueError(a, '!=', b)
-
diff --git a/izodparse/pdf.py b/izodparse/pdf.py
index 71b1e56..1d81799 100755
--- a/izodparse/pdf.py
+++ b/izodparse/pdf.py
@@ -148,7 +148,7 @@ import zlib
 
 from . import peg
 from .peg import Any, Charset, Lit, Thunk, Parse
-from .ok import ok
+from .util import ok, memoprop
 
 
 ### PDF and PostScript and CMap parsing ###
@@ -265,9 +265,18 @@ class Pdf:
         self.xref_start = int(blob[sx:].split()[1])
         # XXX there could be many sections
         self.xrefs = XrefSection(self, self.xref_start)
-        self.trailer_plumbing = self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary)
-        self.trailer = porcelainize(self, self.trailer_plumbing[1][1])
-        self.catalog = self.trailer['Root']
+
+    @memoprop
+    def trailer_plumbing(self):  # lazy: parses `trailer` + dictionary starting at self.xrefs.end on first access
+        return self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary)
+
+    @memoprop
+    def trailer(self):  # lazy: porcelainized view of the parsed trailer; a parse failure surfaces here, not in __init__
+        return porcelainize(self, self.trailer_plumbing[1][1])
+
+    @memoprop
+    def catalog(self):  # lazy: the trailer's 'Root' entry; accessing it forces the trailer to be parsed
+        return self.trailer['Root']
 
     def read(self, offset, size=64):
         return self.blob[offset:offset+size]
diff --git a/izodparse/peg.py b/izodparse/peg.py
index 42cfaa3..5e4889c 100644
--- a/izodparse/peg.py
+++ b/izodparse/peg.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python3
-r"""A small PEG parser generator.
+r"""Packrat parsing engine with predictive lookahead parsing.
 
 Too slow to use in practice (30 kilobytes/sec), but hopefully
 validates our understanding of the problem and communicates it more
@@ -28,28 +28,9 @@ along with izodparse.  If not, see <http://www.gnu.org/licenses/>.
 """
 import types
 
-from .ok import ok, please_be
+from .util import ok, please_be, memoprop
 
 
-class memoprop:
-    """"Simplified, non-multithreaded version of functools.cached_property.
-
-    For Pythons earlier than 3.8.  Doesn't support __slots__, custom
-    __dict__, etc.
-
-    """
-    def __init__(self, func):
-        self.func = func
-
-    def __get__(self, instance, cls):
-        cache = instance.__dict__
-        if self not in cache:
-            cache[self] = self.func(instance)
-        return cache[self]
-
-
-### Packrat parsing engine with predictive lookahead parsing ###
-
 # First, some debugging utilities:
 def represent_cset(cs):
     "Debugging helper for understanding first sets; takes a set() of ints or chars."
diff --git a/izodparse/util.py b/izodparse/util.py
new file mode 100644
index 0000000..7613208
--- /dev/null
+++ b/izodparse/util.py
@@ -0,0 +1,24 @@
+"Some basic utilities for testing and lazy evaluation."
+
+def ok(a, b): assert a == b, (a, b)  # test helper: on mismatch the AssertionError carries both values
+
+def please_be(a, b):  # equality check that raises ValueError rather than relying on `assert`
+    if a != b:
+        raise ValueError(a, '!=', b)
+
+class memoprop:
+    """Simplified, non-multithreaded version of functools.cached_property.
+
+    For Pythons earlier than 3.8.  Doesn't support __slots__, custom
+    __dict__, etc.
+
+    """
+    def __init__(self, func):
+        self.func = func  # the decorated method; run on first attribute access
+
+    def __get__(self, instance, cls):
+        cache = instance.__dict__  # results are stored on the instance itself
+        if self not in cache:  # keyed by this descriptor object, not by attribute name
+            cache[self] = self.func(instance)
+        return cache[self]
+
diff --git a/plans.org b/plans.org
index 9eedac7..94a8419 100644
--- a/plans.org
+++ b/plans.org
@@ -1,4 +1,4 @@
-* things I am thinking of doing [3/14]
+* things I am thinking of doing [4/15]
 ** DONE split out pdftour into separate repo.
  charpy? (too popular) rebound? (too popular) schmidtconcrete? (ok)
  sclerometer? (wrong test) izod? (seems okay) 1zod or iz0d? better.
@@ -73,7 +73,10 @@ This file has a bunch of stuff in it about my local machine config.
 ** DONE make izodparse an installable Python package
 Man, I forgot all about the distutils/setuptools mess.
 ** DONE split out parsing-engine stuff from PDF/PS stuff
-** TODO make xrefs, etc., lazy properties
+** DONE make trailer lazy
+This way I can open, for example, Dercuano, even though I can't parse
+its trailer yet.
+** TODO add PDF support for fractions
 ** TODO examine example PDF file with compressed object streams and no fonts in page resource dictionaries [0/3]
 *** TODO find it
 *** TODO copy it
-- 
GitLab