From e1eaaf0ac0400c9eb9dd07fcbaf95227c3cb1c46 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <>
Date: Tue, 3 Dec 2019 20:52:41 +0100
Subject: [PATCH] add zlib decoding

 Makefile |   2 +-
 pdf.c    | 165 +++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 143 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index 7322086..68b09f3 100644
--- a/Makefile
+++ b/Makefile
@@ -18,4 +18,4 @@ test: pdf
 pdf: pdf.c
-	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $> -lhammer
+	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $> -lhammer -lz
diff --git a/pdf.c b/pdf.c
index c5bb67a..63863af 100644
--- a/pdf.c
+++ b/pdf.c
@@ -54,6 +54,13 @@ p_return__m(HAllocator *mm__, const HParsedToken *tok)
 	return h_action__m(mm__, p_epsilon, act_return, (void *)tok);
+/* a helper to compare an HBytes to a string */
+bytes_eq(HBytes b, const char *s)
+	return strncmp(s, b.token, b.len) == 0 && b.len == strlen(s);
 /* a helper to look up a value in a dictionary */
 const HParsedToken *
 dictentry(const HCountedArray *dict, const char *key)
@@ -62,12 +69,15 @@ dictentry(const HCountedArray *dict, const char *key)
 	HBytes k;
 	size_t len;
+	if (dict == NULL)
+		return NULL;
 	len = strlen(key);
 	for (size_t i = 0; i < dict->used; i++) {
 		ent = dict->elements[i];
 		k = H_INDEX_BYTES(ent, 0);
-		if (k.len == len && strncmp(key, k.token, k.len) == 0)
+		if (k.len == len && bytes_eq(k, key))
 			return H_INDEX_TOKEN(ent, 1);
@@ -429,6 +439,10 @@ init_parser(struct Env *aux)
  * stream object handling incl. cross-reference streams
+#include <inttypes.h>
+#include <zlib.h>
+#include <err.h>
 /* combine current position with env=(input,sz) into HBytes */
 HParsedToken *
 act_ks_bytes(const HParseResult *p, void *env)
@@ -504,23 +518,129 @@ validate_xrstm(HParseResult *p, void *u)
 #if 0
 	if (v == NULL)
-		fprintf(stderr, "stream dict has not /Type\n");
+		fprintf(stderr, "stream dict has no /Type\n");
 	else if (v->token_type != TT_BYTES)
 		fprintf(stderr, "stream /Type is no name object\n");
-	else if (v->bytes.len == 4 && strncmp("XRef", v->bytes.token, v->bytes.len) == 0)
+	else if (bytes_eq(v->bytes, "XRef"))
 		return true;
 	return false;
-	return (v != NULL && v->token_type == TT_BYTES && v->bytes.len == 4 &&
-	    strncmp("XRef", v->bytes.token, v->bytes.len) == 0);
+	return (v != NULL && v->token_type == TT_BYTES &&
+	    bytes_eq(v->bytes, "XRef"));
+struct Predictor {
+	int num;	/* default: 1 (no prediction) */
+	int colors;	/* default: 1 */
+	int bpc;	/* bits per component; default: 8 */
+	int columns;	/* default: 1 */
+HParseResult *
+FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p)
+	size_t const BUFSIZE = 8 * 1024;
+	uint8_t *buf;
+	HSuspendedParser *sp;
+	HParseResult *res;
+	const HParsedToken *v;
+	size_t sz;
+	bool done;
+	z_stream strm = {0};
+	int ret;
+	struct Predictor pred = {1, 1, 8, 1};
+	/* determine the predictor algorithm to use (if any) */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT || v->sint < 0)	\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num, "Predictor");
+	SETPARM(pred.colors, "Colors");
+	SETPARM(pred.bpc, "BitsPerComponent");
+	SETPARM(pred.columns, "Columns");
+	#undef SETPARM
+	if (pred.num != 1) {	// XXX
+		fprintf(stderr, "FlateDecode: /Predictor %d unimplemented\n",
+		    pred.num);
+		return NULL;
+	}
+	// XXX pass our allocator (mm__) to zlib via strm.zalloc/zfree/opaque
+	ret = inflateInit(&strm);
+	if (ret != Z_OK)
+		errx(1, "inflateInit: %s (%d)", strm.msg, ret);
+	buf = h_alloc(mm__, BUFSIZE);
+	sp = h_parse_start__m(mm__, p);
+	assert(sp != NULL);
+	done = false;
+	strm.avail_in = b.len;
+	strm.next_in = (unsigned char *)b.token;
+	do {
+		strm.avail_out = BUFSIZE;
+		strm.next_out = buf;
+		ret = inflate(&strm, Z_NO_FLUSH);
+		if (ret != Z_STREAM_END && ret != Z_OK) {
+			fprintf(stderr, "inflate: %s (%d)\n", strm.msg, ret);
+			break;
+		}
+		sz = BUFSIZE - strm.avail_out;
+		done = h_parse_chunk(sp, buf, sz);
+	} while (!done && ret == Z_OK);
+	res = h_parse_finish(sp);
+		// XXX should an inflate error always yield NULL, even if the parser accepted the partial output?
+	inflateEnd(&strm);
+	mm__->free(mm__, buf);
+	return res;
+ * decode the byte stream 'b' according to metadata in its stream dictionary
+ * 'd' and parse the result with 'p'.
+ */
+HParseResult *
+parse_stream(HAllocator  *mm__, HCountedArray *d, HBytes b, HParser *p)
+	HParseResult *(*filter)(HAllocator *, HCountedArray *, HBytes, HParser *);
+	HCountedArray *parms = NULL;
+	const HParsedToken *v;
+	v = dictentry(d, "Filter");
+	if (v == NULL)
+		return h_parse__m(mm__, p, b.token, b.len);
+	/* compile to a context-free (CF) backend, here LL(k), to enable incremental parsing */
+	if (h_compile(p, PB_LLk, NULL) == -1)
+		errx(1, "xref data parser: LL(1) compile failed");
+	if (v->token_type == TT_SEQUENCE)
+		return NULL;	// XXX filter chains not supported, yet
+	assert(v->token_type == TT_BYTES);
+	if (bytes_eq(v->bytes, "FlateDecode"))
+		filter = FlateDecode;
+	else
+		return NULL;		/* filter not supported */
+	v = dictentry(d, "DecodeParms");
+	if (v && v->token_type == TT_SEQUENCE)
+		parms = v->seq;
+	return filter(mm__, parms, b, p);
  * interpret a cross-reference stream and return it in the same form as other
  * cross-reference sections:
- * p = (pnat nat (dict ...) xrefs)
+ * p = (pnat nat (dict xrefs))
  * result = (xrefs dict)
 HParsedToken *
@@ -529,7 +649,7 @@ act_xrstm(const HParseResult *p, void *u)
 	HParsedToken *bytes, *dict, *result;
 	dict = H_INDEX_TOKEN(p->ast, 2, 0);
-	bytes = H_INDEX_TOKEN(p->ast, 3);
+	bytes = H_INDEX_TOKEN(p->ast, 2, 1);
 	result = H_MAKE_SEQN(2);
 	result->seq->elements[0] = bytes;
@@ -555,8 +675,6 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 	HBytes bytes;
 	size_t W[3];
 	size_t Size, Wn, Wskip;
-	const uint8_t *data;
-	size_t sz;
 	HParser *p_field[3], *p_entry, **p_subs, *p_xrefdata;
 	dict_t = H_INDEX_TOKEN(x, 0, 0);
@@ -661,15 +779,16 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 			    validate_eq_uint, (void *)1);
-	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2],
-	    h_skip__m(mm__, Wskip * 8), NULL);
+	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
+		goto fail;
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
 	/* Index (optional) - subsections [base count ...] */
 	v = dictentry(dict, "Index");
 	if (v == NULL) {
 		/* default: [0 Size] */
 		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
-		p_subs[0] = h_repeat_n__m(mm__, p_entry, Size);
+		p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry);
 		p_subs[1] = NULL;
 	} else if (v->token_type != TT_SEQUENCE) {
 		goto fail;
@@ -696,20 +815,20 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 	p_xrefdata = h_sequence__ma(mm__, (void **)p_subs);
-	/* Filter (optional) - XXX */
-	v = dictentry(dict, "Filter");
-	if (v != NULL)
-		goto fail;
-	data = bytes.token;
-	sz = bytes.len;
+	/* set bytes.len to Length if present (and not indirect); XXX not checked against the bytes actually available */
+	v = dictentry(dict, "Length");
+	if (v != NULL && v->token_type == TT_SINT && v->sint >= 0)
+		bytes.len = v->sint;
-	res = h_parse__m(mm__, p_xrefdata, data, sz);
+	/* decode and parse the stream data */
+	res = parse_stream(mm__, dict, bytes, p_xrefdata);
 	if (res == NULL)
 		goto fail;
 	HParser *dict_p = p_return__m(mm__, dict_t);
 	HParser *xref_p = p_return__m(mm__, res->ast);
 	HParser *skip_p = h_skip__m(mm__, bytes.len * 8);
+		// XXX skip only as much as parse_stream consumed
 	return h_sequence__m(mm__, dict_p, xref_p, skip_p, NULL);
@@ -722,9 +841,6 @@ fail:
 #include <stdio.h>
-#include <inttypes.h>
-#include <err.h>
-#include <errno.h>
 #include <stdlib.h>	/* realloc() */
 #include <fcntl.h>	/* open() */
 #include <unistd.h>	/* lseek() */
@@ -768,7 +884,9 @@ parse_xrefs(const char *input, size_t sz, size_t *nxrefs)
 	offset = H_INDEX_UINT(res->ast, 0);
 	for (;;) {
-		res = h_parse(p_xref, input + offset, sz - offset);
+		//res = h_parse(p_xref, input + offset, sz - offset);
+		HParser *p = h_right(h_seek(offset * 8, SEEK_SET), p_xref);	// XXX
+		res = h_parse(p, input, sz);
 		if (res == NULL) {
 			fprintf(stderr, "%s: error parsing xref section at "
 			    "position %zu (0x%zx)\n", infile, offset, offset);
@@ -852,6 +970,7 @@ main(int argc, char *argv[])
 	/* parse all cross-reference sections and trailer dictionaries */
 	xrefs = parse_xrefs(input, sz, &nxrefs);
+	(void)xrefs;	// shut up, gcc
 	/* run the main parser */
 	res = h_parse(p_pdf, input, sz);