From 0f96144c7e56bfd64c9a1d4c2d1267c74176cb67 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Mon, 2 Dec 2019 21:11:07 +0100
Subject: [PATCH] write the parser for (uncompressed) xref streams - untested

---
 pdf.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 207 insertions(+), 11 deletions(-)

diff --git a/pdf.c b/pdf.c
index 64d0b5a..60feefe 100644
--- a/pdf.c
+++ b/pdf.c
@@ -21,18 +21,23 @@
  * some helpers
  */
 
+HParser *p_fail;
+HParser *p_epsilon;
+HParser *p_return_0;
+HParser *p_return_1;
+
 /* a combinator to parse a given character but return a different value */
 
 HParsedToken *
-act_mapch(const HParseResult *p, void *u)
+act_return_uint(const HParseResult *p, void *u)
 {
-	return H_MAKE_UINT((uint8_t)(uintptr_t)u);
+	return H_MAKE_UINT((uint64_t)(uintptr_t)u);
 }
 
 HParser *
 p_mapch(uint8_t c, uint8_t v)
 {
-	return h_action(h_ch(c), act_mapch, (void *)(uintptr_t)v);
+	return h_action(h_ch(c), act_return_uint, (void *)(uintptr_t)v);
 }
 
 /* a parser that just returns a given token */
@@ -46,9 +51,7 @@ act_return(const HParseResult *p, void *u)
 HParser *
 p_return__m(HAllocator *mm__, const HParsedToken *tok)
 {
-	HParser *eps  = h_epsilon_p__m(mm__);
-
-	return h_action__m(mm__, eps, act_return, (void *)tok);
+	return h_action__m(mm__, p_epsilon, act_return, (void *)tok);
 }
 
 /* a helper to look up a value in a dictionary */
@@ -71,6 +74,14 @@ dictentry(const HCountedArray *dict, const char *key)
 	return NULL;
 }
 
+bool
+validate_eq_uint(HParseResult *p, void *u)
+{
+	const HParsedToken *v = p->ast;
+	return (v != NULL && v->token_type == TT_UINT &&
+	    v->uint == (uint64_t)(uintptr_t)u);
+}
+
 
 /*
  * auxiliary global data structure needed by the parser
@@ -382,7 +393,8 @@ init_parser(struct Env *aux)
 
 	/* cross-reference streams */
 	H_RULE(rest,	h_action(h_tell(), act_rest, aux));
-	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), stmbeg, rest));
+	H_RULE(xstream,	h_bind(SEQ(stmbeg, rest), kxstream, aux));
+	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), xstream));
 		// XXX skip however much we consumed and check for "endstream endobj"?
 
 	/* trailer */
@@ -399,10 +411,17 @@ init_parser(struct Env *aux)
 	/* debug parser to consume as much as possible */
 	H_RULE(pdfdbg,	SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
 
+
+	/* global parser variables */
 	p_pdf = pdf;
 	p_pdfdbg = pdfdbg;
 	p_startxref = startxr;
 	p_xref = CHX(xr_td, xrstm);
+
+	p_fail = h_nothing_p();
+	p_epsilon = epsilon;
+	p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
+	p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
 }
 
 
@@ -469,7 +488,7 @@ fail:
 		fprintf(stderr, "stream /Length negative\n");
 #endif
 	//h_pprintln(stderr, p);	// XXX debug
-	return h_nothing_p__m(mm__);
+	return p_fail;
 }
 
 /*
@@ -501,7 +520,7 @@ validate_xrstm(HParseResult *p, void *u)
  * interpret a cross-reference stream and return it in the same form as other
  * cross-reference sections:
  *
- * p = (pnat nat (dict ...) bytes)
+ * p = (pnat nat (dict ...) xrefs)
  * result = (xrefs dict)
  */
 HParsedToken *
@@ -512,8 +531,6 @@ act_xrstm(const HParseResult *p, void *u)
 	dict = H_INDEX_TOKEN(p->ast, 2, 0);
 	bytes = H_INDEX_TOKEN(p->ast, 3);
 
-	// XXX decode XRefStm
-
 	result = H_MAKE_SEQN(2);
 	result->seq->elements[0] = bytes;
 	result->seq->elements[1] = dict;
@@ -521,6 +538,185 @@ act_xrstm(const HParseResult *p, void *u)
 	return result;
 }
 
+HParser *
+p_xrefsub__m(HAllocator *mm__, size_t base, size_t n, HParser *p_entry)
+{
+	return h_repeat_n__m(mm__, p_entry, n);
+}
+
+/* x = ((dict ...) bytes) */
+HParser *
+kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	//struct Env *aux = env;
+	const HParsedToken *v, *dict_t;
+	const HParseResult *res;
+	HCountedArray *dict;
+	HBytes bytes;
+	size_t W[3];
+	size_t Size, Wn, Wskip;
+	const uint8_t *data;
+	size_t sz;
+	HParser *p_field[3], *p_entry, **p_subs, *p_xrefdata;
+
+	dict_t = H_INDEX_TOKEN(x, 0, 0);
+	dict = H_CAST_SEQ(dict_t);
+	bytes = H_INDEX_BYTES(x, 1);
+
+	/*
+	 * what follows is a horrible bunch of code that builds, from the
+	 * entries /W, /Index, and /Size in the stream dictionary, a parser for
+	 * the cross-reference data itself.
+	 *
+	 * in short, every cross-reference entry consists of (as of PDF 1.7)
+	 * three fields, but it could be more. /W gives the widths (in bytes)
+	 * of these fields. the /Index specifies the division of the data into
+	 * subsections; it is an array of natural numbers that in pairs specify
+	 * the base object number and length of each subsection - analogous to
+	 * the subsection headers in classic xref sections.
+	 *
+	 * when /Index is missing, a default value of [0 Size] is defined,
+	 * where Size is the value of the /Size field. as in normal trailer
+	 * dictionaries, it specifies the total size of the (entire)
+	 * cross-reference table.
+	 *
+	 * when /W states a width of 0 for a field, that field is not present
+	 * in the data and a default value should be used "if there is one".
+	 * most notably, the first field determines the "type" of the entry,
+	 * analogous to the 'n' and 'f' tags in classic xref sections; a width
+	 * of 0 for the first field is specified to mean that every entry is of
+	 * type 1 (= "n"). that type, in particular, specifies a default of 0
+	 * for field 3 (generation). in fact, these are the only defaults
+	 * defined by ISO 32000-1:2008 (PDF 1.7).
+	 *
+	 *   entry type  field no.  default value
+	 *               1 (type)   1
+	 *   1 ("n")     3 (gen.)   0
+	 */
+
+	/* Size (required) - total size of xref table */
+	v = dictentry(dict, "Size");
+	if (v == NULL || v->token_type != TT_SINT)
+		goto fail;
+	if (v->sint < 1 || (Size = (size_t)v->sint) < 1)
+		goto fail;
+
+	/* W (required) - field widths for each xref entry */
+	v = dictentry(dict, "W");
+	if (v == NULL || v->token_type != TT_SEQUENCE)
+		goto fail;
+	if ((Wn = v->seq->used) < 3)
+		goto fail;
+	Wskip = 0;
+	for (size_t i = 0; i < Wn; i++) {
+		if (v->seq->elements[i]->token_type != TT_SINT ||
+		    v->seq->elements[i]->sint < 0)
+			goto fail;
+		if (i < 3) {
+			if (v->seq->elements[i]->sint > 8)
+				goto fail;	/* can't take >64 bits */
+			W[i] = (size_t)v->seq->elements[i]->sint;
+		} else {
+			if (v->seq->elements[i]->sint > SIZE_MAX - Wskip)
+				goto fail;	/* overflow */
+			Wskip += v->seq->elements[i]->sint;
+		}
+	}
+	if (Wskip > SIZE_MAX / 8)
+		goto fail;
+
+	/*
+	 * build the parser for one xref entry.
+	 *
+	 * in summary, the only sensible forms for /W are:
+	 *
+	 *   [t x y] with t,x,y > 0  full general form
+	 *   [0 x y] with x,y > 0    only type-1 ("in use") entries
+	 *   [0 x 0] with x > 0      only type-1 entries, only offsets
+	 *
+	 * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed
+	 * by the spec; as long as all entries are of type 1, the xref data can
+	 * be interpreted without ambiguity.
+	 *
+	 * in fact, every nonsensical form is possible as long as there are 0
+	 * entries.
+	 *
+	 * we realize this mess by just initializing the default parser to
+	 * p_fail and replacing the known cases afterwards.
+	 */
+	for (size_t i = 0; i < 3; i++) {
+		if (W[i] == 0)
+			p_field[i] = p_fail;	/* no known default */
+		else
+			p_field[i] = h_bits__m(mm__, W[i] * 8, false);
+	}
+	/* known default cases: */
+	if (W[0] == 0)
+		p_field[0] = p_return_1;	/* all type 1 */
+	if (W[2] == 0) {
+		p_field[2] = p_return_0;	/* all generation 0 */
+		if (W[0] > 0) {
+			/* type field *must* be 1 */
+			p_field[0] = h_attr_bool__m(mm__, p_field[0],
+			    validate_eq_uint, (void *)1);
+		}
+	}
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2],
+	    h_skip__m(mm__, Wskip * 8), NULL);
+
+	/* Index (optional) - subsections [base count ...] */
+	v = dictentry(dict, "Index");
+	if (v == NULL) {
+		/* default: [0 Size] */
+		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
+		p_subs[0] = h_repeat_n__m(mm__, p_entry, Size);
+		p_subs[1] = NULL;
+	} else if (v->token_type != TT_SEQUENCE) {
+		goto fail;
+	} else {
+		size_t nsubs = v->seq->used / 2;
+
+		/* build a parser for each subsection */
+		if (nsubs >= SIZE_MAX / sizeof(HParser *))
+			goto fail;
+		p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *));
+		for (size_t i = 0; i < nsubs; i++) {
+			HParsedToken *base = v->seq->elements[2 * i];
+			HParsedToken *n = v->seq->elements[2 * i + 1];
+
+			if (base->token_type != TT_SINT || base->sint < 0 ||
+			    n->token_type != TT_SINT || n->sint < 0 ||
+			    n->sint > SIZE_MAX)
+				goto fail;
+
+			p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint,
+			    p_entry);
+		}
+		p_subs[nsubs] = NULL;
+	}
+	p_xrefdata = h_sequence__ma(mm__, (void **)p_subs);
+
+	/* Filter (optional) - XXX */
+	v = dictentry(dict, "Filter");
+	if (v != NULL)
+		goto fail;
+	data = bytes.token;
+	sz = bytes.len;
+
+	res = h_parse__m(mm__, p_xrefdata, data, sz);
+	if (res == NULL)
+		goto fail;
+
+	HParser *dict_p = p_return__m(mm__, dict_t);
+	HParser *xref_p = p_return__m(mm__, res->ast);
+	HParser *skip_p = h_skip__m(mm__, bytes.len * 8);
+
+	return h_sequence__m(mm__, dict_p, xref_p, skip_p, NULL);
+fail:
+	return p_fail;
+}
+
+
 /*
  * main program
  */
-- 
GitLab