From 76bf656776f059f083f6476d51c398530d5dec35 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Fri, 29 Nov 2019 21:03:12 +0100
Subject: [PATCH] parse xref streams without looking at /Length

---
 pdf.c | 56 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/pdf.c b/pdf.c
index 35fcc92..5711efb 100644
--- a/pdf.c
+++ b/pdf.c
@@ -72,6 +72,15 @@ dictentry(const HCountedArray *dict, const char *key)
 }
 
 
+/*
+ * auxiliary global data structure needed by the parser
+ */
+struct Env {
+	const char *input;	/* start of input; NOTE(review): main() stores a const uint8_t * here */
+	size_t sz;		/* size of the input; act_rest uses it as a byte count */
+};
+
+
 /*
  * semantic actions
  */
@@ -198,6 +207,15 @@ act_octal(const HParseResult *p, void *u)
 
 #define act_xrefs h_act_last
 
+HParsedToken *
+act_rest(const HParseResult *p, void *env)	/* yields the rest of the input as a bytes token */
+{
+	struct Env *aux = env;				/* the (input,sz) pair passed as user data */
+	size_t offset = H_CAST_UINT(p->ast) / 8;	/* position token; /8 presumably bits -> bytes */
+	/* NOTE(review): assumes offset <= aux->sz and is relative to aux->input -- confirm */
+	return H_MAKE_BYTES(aux->input + offset, aux->sz - offset);
+}
+
 /* stream semantics (defined further below) */
 bool validate_xrstm(HParseResult *, void *);
 HParsedToken *act_xrstm(const HParseResult *, void *);
@@ -212,11 +230,12 @@ HParser *p_pdfdbg;
 HParser *p_startxref;
 HParser *p_xref;
 
-/* continuation for h_bind() */
+/* continuations for h_bind() */
 HParser *kstream(HAllocator *, const HParsedToken *, void *);
+HParser *kxstream(HAllocator *, const HParsedToken *, void *);
 
 void
-init_parser(const char *input)
+init_parser(struct Env *aux)
 {
 	/* lines */
 	H_RULE(cr,	p_mapch('\r', '\n'));	/* semantic value: \n */
@@ -329,7 +348,7 @@ init_parser(const char *input)
 	/* streams */
 	H_RULE(stmbeg,	SEQ(dict, KW("stream"), OPT(cr), lf));
 	H_RULE(stmend,	SEQ(OPT(eol), LIT("endstream")));
-	H_RULE(stream,	h_left(h_bind(stmbeg, kstream, (void *)input), stmend));
+	H_RULE(stream,	h_left(h_bind(stmbeg, kstream, aux), stmend));
 		// XXX is whitespace allowed between the eol and "endstream"?
 
 	H_RULE(obj_,	CHX(ref, null, boole, real, intg, name, string,
@@ -349,7 +368,7 @@ init_parser(const char *input)
 	H_RULE(objdef,	SEQ(pnat, nat, KW("obj"), indobj, KW("endobj")));
 	H_RULE(body,	h_many(objdef));	// XXX object streams
 
-	/* cross-reference table */
+	/* cross-reference section */
 	H_RULE(xreol,	CHX(SEQ(sp, cr), SEQ(sp, lf), crlf));
 		// ^ XXX does the real world follow this rule?! cf. loop.pdf
 	H_RULE(xrtyp,	CHX(h_ch('n'), h_ch('f')));
@@ -360,7 +379,11 @@ init_parser(const char *input)
 	H_RULE(xrhead,	SEQ(xrnat, IGN(sp), xrnat, nl));
 	H_RULE(xrsub,	SEQ(xrhead, h_many(xrent)));
 	H_ARULE(xrefs,	SEQ(KW("xref"), nl, h_many(xrsub)));
-	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), stream, KW("endobj")));
+
+	/* cross-reference streams */
+	H_RULE(rest,	h_action(h_tell(), act_rest, aux));
+	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), stmbeg, rest));
+		// XXX skip however much we consumed and check for "endstream endobj"?
 
 	/* trailer */
 	H_RULE(startxr,	SEQ(nl, KW("startxref"), nl,
@@ -368,7 +391,7 @@ init_parser(const char *input)
 			    LIT("%%EOF"), CHX(nl, end)));
 		// XXX should lws be allowed before EOF marker?
 		// NB: lws before xref offset is allowed, cf. p.48 (example 4)
-	H_RULE(xr_td,	CHX(SEQ(xrefs, KW("trailer"), dict), xrstm));
+	H_RULE(xr_td,	SEQ(xrefs, KW("trailer"), dict));
 
 	H_RULE(tail,	SEQ(body, h_optional(xr_td), startxr));
 	H_RULE(pdf,	SEQ(header, h_many1(tail), end));
@@ -379,10 +402,15 @@ init_parser(const char *input)
 	p_pdf = pdf;
 	p_pdfdbg = pdfdbg;
 	p_startxref = startxr;
-	p_xref = xr_td;
+	p_xref = CHX(xr_td, xrstm);
 }
 
-/* combine current position combined with env=(input,sz) into HBytes */
+
+/*
+ * stream object handling incl. cross-reference streams
+ */
+
+/* combine current position with env=(input,sz) into HBytes */
 HParsedToken *
 act_ks_bytes(const HParseResult *p, void *env)
 {
@@ -405,7 +433,7 @@ act_ks_bytes(const HParseResult *p, void *env)
 HParser *
 kstream(HAllocator *mm__, const HParsedToken *x, void *env)
 {
-	const char *input = env;
+	struct Env *aux = env;
 	const HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
 	const HCountedArray *dict = H_CAST_SEQ(dict_t);
 	const HParsedToken *v = NULL;
@@ -422,7 +450,7 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env)
 
 	/* dummy struct to hold the pair (input,sz) */
 	HBytes *bytes = h_alloc(mm__, sizeof(HBytes));
-	bytes->token = input;
+	bytes->token = aux->input;
 	bytes->len = sz;
 
 	HParser *tell = h_tell__m(mm__);
@@ -473,7 +501,7 @@ validate_xrstm(HParseResult *p, void *u)
  * interpret a cross-reference stream and return it in the same form as other
  * cross-reference sections:
  *
- * p = pnat nat (dict bytes)
+ * p = (pnat nat (dict ...) bytes)
  * result = (xrefs dict)
  */
 HParsedToken *
@@ -482,7 +510,7 @@ act_xrstm(const HParseResult *p, void *u)
 	HParsedToken *bytes, *dict, *result;
 
 	dict = H_INDEX_TOKEN(p->ast, 2, 0);
-	bytes = H_INDEX_TOKEN(p->ast, 2, 1);
+	bytes = H_INDEX_TOKEN(p->ast, 3);
 
 	// XXX decode XRefStm
 
@@ -597,6 +625,7 @@ end:
 int
 main(int argc, char *argv[])
 {
+	struct Env aux;
 	HParseResult *res = NULL;
 	const HParsedToken **xrefs;
 	const uint8_t *input;
@@ -622,7 +651,8 @@ main(int argc, char *argv[])
 		err(1, "mmap");
 
 	/* build parsers */
-	init_parser(input);
+	aux = (struct Env){input, sz};
+	init_parser(&aux);
 
 	/* parse all cross-reference sections and trailer dictionaries */
 	xrefs = parse_xrefs(input, sz, &nxrefs);
-- 
GitLab