From 44c4319455903176b6a48d5b03602b10b58d4d80 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Fri, 24 Jan 2020 18:19:18 +0100
Subject: [PATCH] represent xref entries semantically

---
 pdf.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 125 insertions(+), 14 deletions(-)

diff --git a/pdf.c b/pdf.c
index dd68556..85bae4c 100644
--- a/pdf.c
+++ b/pdf.c
@@ -54,6 +54,13 @@ p_return__m(HAllocator *mm__, const HParsedToken *tok)
 	return h_action__m(mm__, p_epsilon, act_return, (void *)tok);
 }
 
+/* a parser that just returns a given uint (x rides in the action's user pointer) */
+HParser *
+p_return_uint__m(HAllocator *mm__, uint64_t x)
+{
+	return h_action__m(mm__, p_epsilon, act_return_uint, (void *)(uintptr_t)x);	/* via uintptr_t: no size-mismatched int-to-pointer cast */
+}
+
 /* a helper to compare an HBytes to a string */
 bool
 bytes_eq(HBytes b, const char *s)
@@ -108,6 +115,44 @@ struct Env {
 };
 
 
+/*
+ * custom token types
+ */
+HTokenType TT_XREntry;	/* assigned in init_parser() via h_allocate_token_new() */
+
+typedef struct {
+	enum {XR_FREE, XR_INUSE, XR_OBJSTM} type;	/* values 0/1/2 matter: act_xrstment stores a parsed number here */
+	union {	/* which member is valid is selected by type */
+		struct { size_t next, ngen; } f;	/* free */
+		struct { size_t offs, gen; } n;		/* inuse */
+		struct { size_t stm, idx; } o;		/* objstm */
+	};
+} XREntry;
+
+void	/* pretty-printer registered for TT_XREntry in init_parser() */
+pp_xrentry(FILE *stream, const HParsedToken *tok, int indent, int delta)
+{
+	XREntry *xr = H_CAST(XREntry, tok);
+
+	switch (xr->type) {	/* prints one JSON-style object, no newline; indent and delta are unused */
+	case XR_FREE:
+		fprintf(stream, "{ \"type\":\"free\", \"next\":%zu, "
+		    "\"ngen\":%zu }", xr->f.next, xr->f.ngen);
+		break;
+	case XR_INUSE:
+		fprintf(stream, "{ \"type\":\"inuse\", \"offs\":%zu, "
+		    "\"gen\":%zu }", xr->n.offs, xr->n.gen);
+		break;
+	case XR_OBJSTM:
+		fprintf(stream, "{ \"type\":\"objstm\", \"stm\":%zu, "
+		    "\"idx\":%zu }", xr->o.stm, xr->o.idx);
+		break;
+	default:
+		assert(!"reached");	/* type outside the enum; the check is compiled out under NDEBUG */
+	}
+}
+
+
 /*
  * semantic actions
  */
@@ -138,6 +183,7 @@ act_nat(const HParseResult *p, void *u)
 	uint64_t x = 0;
 	HCountedArray *seq = H_CAST_SEQ(p->ast);
 
+	// XXX check for overflow
 	for (size_t i = 0; i < seq->used; i++)
 		x = x*10 + H_CAST_UINT(seq->elements[i]);
 
@@ -153,6 +199,7 @@ act_pnat(const HParseResult *p, void *u)
 	uint64_t x = H_FIELD_UINT(0);
 	HCountedArray *seq = H_FIELD_SEQ(1);
 
+	// XXX check for overflow
 	for (size_t i = 0; i < seq->used; i++)
 		x = x*10 + H_CAST_UINT(seq->elements[i]);
 	
@@ -165,6 +212,7 @@ act_intg(const HParseResult *p, void *u)
 	int64_t x = 0;
 	HCountedArray *seq = H_FIELD_SEQ(1);
 
+	// XXX check for overflow
 	for (size_t i = 0; i < seq->used; i++)
 		x = x*10 + H_CAST_UINT(seq->elements[i]);
 
@@ -184,6 +232,7 @@ act_real(const HParseResult *p, void *u)
 	HCountedArray *whole = H_FIELD_SEQ(1, 0);
 	HCountedArray *fract = H_FIELD_SEQ(1, 2);
 
+	// XXX check for overflow
 	for (size_t i = 0; i < whole->used; i++)
 		x = x*10 + H_CAST_UINT(whole->elements[i]);
 	for (size_t i = 0; i < fract->used; i++)
@@ -232,6 +281,56 @@ act_octal(const HParseResult *p, void *u)
 	return H_MAKE_UINT(x);
 }
 
+HParsedToken *	/* semantic action for the xrent rule: builds an XREntry token */
+act_xrent(const HParseResult *p, void *u)
+{
+	XREntry *xr = H_ALLOC(XREntry);
+	char c = H_FIELD_UINT(2);	/* third field of the sequence; presumably the xrtyp character */
+
+	switch (c) {
+	case 'f':
+		xr->type = XR_FREE;
+		xr->f.next = H_FIELD_UINT(0);
+		xr->f.ngen = H_FIELD_UINT(1);
+		break;
+	case 'n':
+		xr->type = XR_INUSE;
+		xr->n.offs = H_FIELD_UINT(0);
+		xr->n.gen = H_FIELD_UINT(1);
+		break;
+	default:
+		assert(!"reached");	/* the xrtyp rule only accepts 'n' or 'f' */
+	}
+
+	return H_MAKE(XREntry, xr);
+}
+
+HParsedToken *	/* semantic action attached to p_entry in kxstream(): builds an XREntry token */
+act_xrstment(const HParseResult *p, void *u)
+{
+	XREntry *xr = H_ALLOC(XREntry);
+
+	xr->type = H_FIELD_UINT(0);	/* first field used directly as the enum value (XR_FREE=0, XR_INUSE=1, XR_OBJSTM=2) */
+	switch (xr->type) {
+	case XR_FREE:
+		xr->f.next = H_FIELD_UINT(1);
+		xr->f.ngen = H_FIELD_UINT(2);
+		break;
+	case XR_INUSE:
+		xr->n.offs = H_FIELD_UINT(1);
+		xr->n.gen = H_FIELD_UINT(2);
+		break;
+	case XR_OBJSTM:
+		xr->o.stm = H_FIELD_UINT(1);
+		xr->o.idx = H_FIELD_UINT(2);
+		break;
+	default:
+		assert(!"reached");	/* NOTE(review): type comes from parsed input; values > 2 abort here (unchecked under NDEBUG) - confirm intended */
+	}
+
+	return H_MAKE(XREntry, xr);
+}
+
 #define act_xrefs h_act_last
 
 HParsedToken *
@@ -264,6 +363,8 @@ HParser *kxstream(HAllocator *, const HParsedToken *, void *);
 void
 init_parser(struct Env *aux)
 {
+	TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry);
+
 	/* lines */
 	H_RULE(cr,	p_mapch('\r', '\n'));	/* semantic value: \n */
 	H_RULE(lf,	h_ch('\n'));		/* semantic value: \n */
@@ -401,7 +502,7 @@ init_parser(struct Env *aux)
 	H_RULE(xrtyp,	CHX(h_ch('n'), h_ch('f')));
 	H_ARULE(xroff,	REP(digit, 10));
 	H_ARULE(xrgen,	REP(digit, 5));
-	H_RULE(xrent,	SEQ(xroff, IGN(sp), xrgen, IGN(sp), xrtyp, IGN(xreol)));
+	H_ARULE(xrent,	SEQ(xroff, IGN(sp), xrgen, IGN(sp), xrtyp, IGN(xreol)));
 	H_ARULE(xrnat,	h_many1(digit));
 	H_RULE(xrhead,	SEQ(xrnat, IGN(sp), xrnat, nl));
 	H_RULE(xrsub,	SEQ(xrhead, h_many(xrent)));
@@ -445,7 +546,6 @@ init_parser(struct Env *aux)
  * stream object handling incl. cross-reference streams
  */
 
-#include <inttypes.h>
 #include <limits.h>	/* INT_MAX */
 #include <zlib.h>
 #include <err.h>
@@ -801,9 +901,16 @@ act_xrstm(const HParseResult *p, void *u)
 }
 
 HParser *
-p_xrefsub__m(HAllocator *mm__, size_t base, size_t n, HParser *p_entry)
+p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry)
 {
-	return h_repeat_n__m(mm__, p_entry, n);
+	HParser *ret_base, *ret_count, *p_header, *p_entries;	/* result: (header entries), same nesting as the xrsub rule */
+
+	ret_base  = p_return_uint__m(mm__, base);
+	ret_count = p_return_uint__m(mm__, count);
+	p_header  = h_sequence__m(mm__, ret_base, ret_count, NULL);	/* header: base, then count */
+	p_entries = h_repeat_n__m(mm__, p_entry, count);	/* p_entry repeated count times */
+
+	return h_sequence__m(mm__, p_header, p_entries, NULL);
 }
 
 /* x = ((dict ...) bytes) */
@@ -856,10 +963,9 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 
 	/* Size (required) - total size of xref table */
 	v = dictentry(dict, "Size");
-	if (v == NULL || v->token_type != TT_SINT)
-		goto fail;
-	if ((Size = v->sint) < 1)
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 1)
 		goto fail;
+	Size = v->sint;
 
 	/* W (required) - field widths for each xref entry */
 	v = dictentry(dict, "W");
@@ -869,17 +975,20 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 		goto fail;
 	Wskip = 0;
 	for (size_t i = 0; i < Wn; i++) {
-		if (v->seq->elements[i]->token_type != TT_SINT ||
-		    v->seq->elements[i]->sint < 0)
+		HTokenType tt = v->seq->elements[i]->token_type;
+		int64_t w = v->seq->elements[i]->sint;
+
+		if (tt != TT_SINT || w < 0)
 			goto fail;
 		if (i < 3) {
-			if (v->seq->elements[i]->sint > 8)
-				goto fail;	/* can't take >64 bits */
-			W[i] = (size_t)v->seq->elements[i]->sint;
+			/* we can't take >64 bits and want to use size_t */
+			if (w > 8 || w > sizeof(size_t))
+				goto fail;
+			W[i] = (size_t)w;
 		} else {
-			if (v->seq->elements[i]->sint > SIZE_MAX - Wskip)
+			if (w > SIZE_MAX - Wskip)
 				goto fail;	/* overflow */
-			Wskip += v->seq->elements[i]->sint;
+			Wskip += w;
 		}
 	}
 	if (Wskip > SIZE_MAX / 8)
@@ -924,6 +1033,7 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
 		goto fail;
 	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
+	p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL);
 
 	/* Index (optional) - subsections [base count ...] */
 	v = dictentry(dict, "Index");
@@ -983,6 +1093,7 @@ fail:
  */
 
 #include <stdio.h>
+#include <inttypes.h>
 #include <stdlib.h>	/* realloc() */
 #include <fcntl.h>	/* open() */
 #include <unistd.h>	/* lseek() */
-- 
GitLab