From fbbe953faea40f1f1e35ccd4c753fba0498cac50 Mon Sep 17 00:00:00 2001
From: "sumit.ray@baesystems.com" <sumit.ray@baesystems.com>
Date: Mon, 28 Jun 2021 23:05:21 -0400
Subject: [PATCH] Working through processing object streams
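
Reorganize stream handling in pdf.c: move ktxtstream and text_extract
ahead of the stream machinery, add a kcontentstream continuation and
decode_contentstream to decode /Length-delimited content streams, add
parse_item/resolve_item for xref-based object lookup, and have
parse_xrefs store its results in the Env (aux->xrefs, aux->nxrefs).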

---
 Makefile |    2 +-
 pdf.c    | 1622 +++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 1108 insertions(+), 516 deletions(-)

diff --git a/Makefile b/Makefile
index 6154e1d..4c7b12c 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -std=c99 -Wall -Werror -DLOG
 # lib@ -> ../hammer/build/opt/src
 HAMMER_INCLUDE = .
 HAMMER_LIB = ./lib
-CFLAGS += -I$(HAMMER_INCLUDE)
+CFLAGS += -I$(HAMMER_INCLUDE)  -g  -pg  # (-pg :: profile using gprof) (-g :: debug info)
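+# e.g. run the built binary once to produce gmon.out, then: gprof <binary> gmon.out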
 LDFLAGS += -L$(HAMMER_LIB)
 SOURCES = pdf.c lzw-lib.c
 
diff --git a/pdf.c b/pdf.c
index b2508a3..40c6441 100644
--- a/pdf.c
+++ b/pdf.c
@@ -1090,6 +1090,11 @@ act_page(const HParseResult *p, void *u)
 	return (HParsedToken *)p->ast;
 }
 
+HParsedToken *
+act_dictobj(const HParseResult *p, void *u)
+{
+	return (HParsedToken *)p->ast;
+}
 
 
 /*
@@ -1710,11 +1715,130 @@ act_txtobj(const HParseResult *p, void *u)
 
 
 
+/*
+ * This continuation takes the text stream and saves it in the environment for further
+ * processing, e.g. writing it out to a file with the same name as the pdf input filename
+ * but with a .psectxt suffix.
+ * It does not consume any input and returns a parser that yields the token.
+ *
+ * x = (txtobj ...)
+ */
+HParser *
+ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+
+	struct Env *aux = env;
+
+
+	assert (x->token_type == TT_SEQUENCE);
+	int n_tobjs = x->seq->used;
+
+	for (int n=0; n<n_tobjs; n++) {
+
+		assert(x->seq->elements[n]->token_type == TT_TextEntry);
+		TextEntry *tste = H_CAST(TextEntry, x->seq->elements[n]);
+		struct textstr *tstr = NULL;
+		/*
+		 *  To save all of the operators along with the text string, we have to walk
+		 *  through all of the tokens and keep a table of pointers to them
+		 *  For now, just keep a pointer to the text string in the environment
+		 *
+		 */
+		switch (tste->type) {
+		case TW_Tj:
+		case TW_Tq:
+		case TW_Tqq:
+			tstr = &tste->tstr;
+			break;
+		case TW_TJ:
+			tstr = &tste->tarray.flattened;
+			break;
+		default:
+			fprintf(stderr, "ktxtstream:: Text token type '%u' ignored\n",
+					tste->type);
+			continue;	/* tstr is NULL here; skip this entry */
+		}
+
+		fprintf(stdout, "ktxtstream: Value = %.*s\n", (int)tstr->nchars, tstr->text);
+
+
+		// store the string in the environment
+		// not sure whether we need to actually store the string in malloc'ed area
+		// currently, we are reusing the token memory previously created
+		struct textnode *txtnd = (struct textnode *) malloc(
+				sizeof(struct textnode));
+		txtnd->tstr = tstr;
+		txtnd->next = NULL;
+		if (aux->txthead == NULL)
+			aux->txthead = txtnd;
+		if (aux->txttail == NULL)
+			aux->txttail = txtnd;
+		else {
+			aux->txttail->next = txtnd;
+			aux->txttail       = txtnd;
+		}
+		aux->ntextobjs += 1;
+
+	}
+
+	return p_return__m(mm__, x);
+}
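+
+/*
+ * A minimal usage sketch (illustrative; assumes a 'txtstream' parser that
+ * yields the (txtobj ...) sequence this continuation expects):
+ *
+ *	HParser *txt = h_bind(txtstream, ktxtstream, aux);
+ *
+ * h_bind() calls the continuation with the parsed token and 'aux'; parsing
+ * then continues with whatever parser the continuation returns.
+ */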
+
+
+
+
+
+/*
+ * This utility extracts the text stream from the global environment
+ * writes it out to a file with the same name as the pdf input filename
+ * but with a .psectxt suffix.
+ */
+void
+text_extract(const struct Env *aux)
+{
+    fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs);
+	fprintf(stdout, "text_extract:: %s\n", aux->infile);
+
+	int infnlen = strlen(aux->infile);
+	int sfxlen = strlen(".psectxt");
+	int namelen = infnlen + sfxlen + 1;
+
+	char *outfn = (char *) malloc(sizeof(char) * namelen);
+	if (outfn == NULL) {
+		fprintf(stderr, "text_extract:: h_arena_realloc() failed");
+		return;
+	}
+	memcpy(outfn, aux->infile, infnlen);
+	memcpy(&outfn[infnlen], ".psectxt", sfxlen);
+	outfn[namelen-1] = '\0'; // null terminate the string
+
+	// open the file for writing
+	FILE *stream;
+	if (!(stream = fopen(outfn, "w"))) {
+		fprintf(stderr,
+				"text_extract:: Failed to open file '%s' for writing\n", outfn);
+		return;
+	}
+	struct textnode *curr = aux->txthead;
+	for (int i = 0; i < aux->ntextobjs; i++) {
+		fprintf(stdout, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
+		fprintf(stream, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
+		curr = curr->next;
+	}
+	fclose(stream);
+	free(outfn);
+	return;
+}
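+
+/*
+ * For example, an input of "doc.pdf" produces "doc.pdf.psectxt": the
+ * suffix is appended to the full input filename rather than replacing
+ * the extension.
+ */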
+
+
+
+
 
 
 
 
 
+// *********************************************************************
+// DEBUG
 
 
 // Utility -- Handles simplistic approach to UTF-16
@@ -1740,9 +1864,6 @@ char convert2char(unsigned int b1)
 }
 
 
-// *********************************************************************
-// DEBUG
-
 HParsedToken *
 act_txtbegin_(const HParseResult *p, void *u)
 {
@@ -1761,6 +1882,8 @@ act_txtend(const HParseResult *p, void *u)
   return (HParsedToken *)p->ast;
 }
 
+// *********************************************************************
+
 
 /*
  * ********************************************************************
@@ -1769,6 +1892,11 @@ act_txtend(const HParseResult *p, void *u)
  */
 
 
+
+
+
+
+
 /*
  * input grammar
  */
@@ -1793,6 +1921,7 @@ HParser *p_textbegin;
 HParser *p_textstream;
 HParser *p_trailer;
 HParser *p_page;
+HParser *p_dictobj;
 
 
 /* continuations for h_bind() */
@@ -1826,6 +1955,7 @@ init_parser(struct Env *aux)
 	TT_XREntry      = h_allocate_token_new("XREntry", NULL, pp_xrentry);
 	TT_Ref          = h_allocate_token_new("Ref", NULL, pp_ref);
 	TT_Dict         = h_allocate_token_new("Dict", NULL, pp_dict);
+	TT_TextEntry    = h_allocate_token_new("TextEntry", NULL, pp_textentry);
 
 	/* lines */
 	H_RULE(cr,	p_mapch('\r', '\n'));	/* semantic value: \n */
@@ -2176,13 +2306,13 @@ init_parser(struct Env *aux)
                                                                                    \
                                                                                    \
 	// Page Tree
-	H_ARULE(contentstream, h_middle(stmbeg, h_many1(h_uint8()), stmend));
+	H_RULE(contentstream, h_left(h_bind(stmbeg, kcontentstream, aux), stmend));
 //	H_ARULE(contentstream, h_middle(stmbeg, h_many(SEQ(h_not(stmend), h_uint8())), stmend));
-	H_ARULE(pgcontents, CHX(array, contentstream));
-	H_ARULE(page, SEQ(ws, npair, wel, KW("obj"), ws, pgcontents,
+//	H_ARULE(pgcontents, CHX(array, contentstream));
+	H_ARULE(page, SEQ(ws, npair, wel, KW("obj"), ws, contentstream,
+			OPT(ws), OPT(lws), KW("endobj")));
+	H_ARULE(dictobj, SEQ(ws, npair, wel, KW("obj"), ws, CHX(contentstream, dict),
 			OPT(ws), OPT(lws), KW("endobj")));
-//	H_ARULE(page, CHX(ref, array));
-	p_page = page;
 
 
 
@@ -2202,6 +2332,8 @@ init_parser(struct Env *aux)
 	/* text parser variables */                                                  \
 	p_textbegin  = txtbegin;                                                       \
 	p_textstream = txtstream;                                                      \
+	p_page       = page;
+	p_dictobj    = CHX(dictobj, contentstream);
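+	/* p_dictobj: a full indirect object (stream or dict) or a bare content stream */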
 
 	p_fail = h_nothing_p();
 	p_epsilon = epsilon;
@@ -2325,6 +2457,8 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx)
 	/*
 	 * decode the stream and find the target object in it
 	 */
+	// SR:: ??? This seems wrong ???
+	// -- The only path through this function is the one through the parser
 	return NULL;	// XXX
 }
 
@@ -2596,6 +2730,7 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 	res = h_parse_finish(sp);
 		// XXX always return NULL on error?
 #else
+	fprintf (stdout, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out);
 	res = h_parse(p, pred.out, pred.nout);
 	free(pred.out);
 #endif
@@ -2608,64 +2743,222 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 	return res;
 }
 
-/* LZW helpers */
 
-typedef struct
-{
-	uint8_t *lzw_buf;
-	size_t total_buf_size;
-	size_t write_head;
-	size_t write_tail;
-	uint8_t write_checksum;
-	size_t eof_loc;
 
-	HBytes *input_stream;
-	size_t read_head;
-	size_t read_tail;
-	uint8_t read_checksum;
-} lzwspec;
+HParseResult *
+FlateDecode2(const Dict *parms, HBytes b, HParser *p)
+{
+	size_t const BUFSIZE = 8 * 1024;
+	uint8_t *buf;
+#ifdef ITERATIVE // XXX
+	HSuspendedParser *sp;
+#endif
+	HParseResult *res;
+	const HParsedToken *v;
+	size_t sz;
+	int done;
+	z_stream strm = {0};
+	int ret;
+	struct predictor pred = {1, 1, 8, 1};
+	int (*depredict)(struct predictor *, uint8_t *, size_t);
 
-lzwspec *cur_lzw_spec;
+	/* set up the predictor (if any) */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT || v->sint < 0)	\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num,	"Predictor");
+	SETPARM(pred.colors,	"Colors");
+	SETPARM(pred.bpc,	"BitsPerComponent");
+	SETPARM(pred.columns,	"Columns");
+	#undef SETPARM
+	if (pred.num == 1)
+		depredict = depred_none;
+	else {
+		if (pred.num >= 10 && pred.num <= 15)
+			depredict = depred_png;
+		else if (pred.num == 2) {
+			/* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */
+			if (pred.bpc == 8) {
+				pred.predfun = pp_sub;	/* predict left */
+				depredict = depred_png;
+			} else {
+				// XXX add general TIFF predictor (bpc != 8)
+				fprintf(stderr, "FlateDecode: /Predictor %d "
+				    "not supported for /BitsPerComponent %d\n",
+				    pred.num, pred.bpc);
+				return NULL;
+			}
+		} else {
+			fprintf(stderr, "FlateDecode: /Predictor %d"
+			    " not supported\n", pred.num);
+			return NULL;
+		}
 
-/* used by write_lzw_buffer to get more space for decoding if needed */
-void
-grow_lzw_buffer(size_t amount)
-{
-	uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t));
-	if(ret_buf != NULL)
-	{
-		cur_lzw_spec->total_buf_size += amount;
-		cur_lzw_spec->lzw_buf = ret_buf;
-	}
-	else
-	{
-		fprintf(stderr, "LZWDecode: h_arena_realloc() failed");
-		return;
+		/* allocate row buffer */
+		if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) {
+			fprintf(stderr, "FlateDecode: overflow\n");
+			return NULL;
+		}
+		pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8;
+		pred.buf = calloc(1, pred.rowsz);
+		if (pred.buf == NULL)
+			err(1, "FlateDecode");
 	}
-}
 
-lzwspec *
-new_lzw_spec(HBytes *bytes)
-{
-	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
-	lzwspec *ret = malloc(sizeof(lzwspec));
-	memset(ret, 0, sizeof(lzwspec));
-	ret->input_stream = bytes;
-	ret->lzw_buf = malloc(BUFSIZE);
-	ret->total_buf_size = BUFSIZE;
-	return ret;
-}
+	/* set up zlib */
+	// XXX pass our allocator to zlib
+	ret = inflateInit(&strm);
+	if (ret != Z_OK)
+		errx(1, "inflateInit: %s (%d)", strm.msg, ret);
+	buf = malloc(BUFSIZE);
+	if (buf == NULL)
+		err(1, "FlateDecode");
 
-void
-delete_lzw_spec(lzwspec *spec)
-{
-	free(spec->lzw_buf);
-	free(spec);
-}
+#ifdef ITERATIVE // XXX
+	/* initialize target parser */
+	sp = h_parse_start(p);
+	assert(sp != NULL);
+	pred.sp = sp;
+#endif
 
-void
-bind_lzw_spec(lzwspec *spec)
-{
+	done = 0;
+	strm.avail_in = b.len;
+	strm.next_in = (unsigned char *)b.token;
+	do {
+		strm.avail_out = BUFSIZE;
+		strm.next_out = buf;
+
+		ret = inflate(&strm, Z_NO_FLUSH);
+		if (ret != Z_STREAM_END && ret != Z_OK) {
+			fprintf(stderr, "inflate: %s (%d)\n", strm.msg, ret);
+			break;
+		}
+
+		sz = BUFSIZE - strm.avail_out;
+		done = depredict(&pred, buf, sz);
+	} while (done == 0 && ret == Z_OK);
+
+#ifdef ITERATIVE // XXX
+	res = h_parse_finish(sp);
+		// XXX always return NULL on error?
+#else
+	// decoded stream in pred.out
+//	FILE *decodef = fopen ("flatecode.out", "w");
+//	fprintf (decodef, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out);
+	fprintf (stdout, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out);
+	unsigned char *fdec = pred.out;
+//	char _l;
+	int i;
+	for (i=0; i<(pred.nout/2); ++i)
+	{
+		convert2char(*fdec);
+//		_l = convert2char(*fdec);
+//		fprintf(decodef, " %c-%d ", _l, _l);
+		fdec ++;
+	}
+
+	res = h_parse(p_textbegin, pred.out, pred.nout);
+	if ((res != NULL) && (res->ast != NULL)) {
+		/* let's make sure if the stream has text strings */
+		const HParsedToken *tstr = H_INDEX_TOKEN(res->ast, 0);
+		if (bytes_eq(tstr->bytes, "BT")) {
+			fprintf (stdout, "decode_stream:: Found a text stream\n");
+			res = h_parse(p, pred.out, pred.nout);
+			if (res == NULL) {
+				fprintf(stderr, "decode_stream::Text String parse failed!!\n");
+			}
+		}
+	}
+	res = h_parse(p, pred.out, pred.nout);
+//	exit(0);
+
+	// TODO:: Refactor across all filters
+	// Create a byte parser to return the decoded stream
+	if (res == NULL) {
+		H_RULE(bytes_p, h_many1(h_token((const uint8_t*)pred.out, (size_t)pred.nout)));
+		res = h_parse(bytes_p, pred.out, pred.nout);  // return the decoded stream
+	}
+
+	free(pred.out);
+#endif
+	inflateEnd(&strm);
+	free(pred.buf);
+	free(buf);
+
+	if (done == -1)
+		return NULL;
+	return res;
+}
+
+
+
+
+
+
+
+/* LZW helpers */
+
+typedef struct
+{
+	uint8_t *lzw_buf;
+	size_t total_buf_size;
+	size_t write_head;
+	size_t write_tail;
+	uint8_t write_checksum;
+	size_t eof_loc;
+
+	HBytes *input_stream;
+	size_t read_head;
+	size_t read_tail;
+	uint8_t read_checksum;
+} lzwspec;
+
+lzwspec *cur_lzw_spec;
+
+/* used by write_lzw_buffer to get more space for decoding if needed */
+void
+grow_lzw_buffer(size_t amount)
+{
+	uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t));
+	if(ret_buf != NULL)
+	{
+		cur_lzw_spec->total_buf_size += amount;
+		cur_lzw_spec->lzw_buf = ret_buf;
+	}
+	else
+	{
+		fprintf(stderr, "LZWDecode: h_arena_realloc() failed");
+		return;
+	}
+}
+
+lzwspec *
+new_lzw_spec(HBytes *bytes)
+{
+	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+	lzwspec *ret = malloc(sizeof(lzwspec));
+	memset(ret, 0, sizeof(lzwspec));
+	ret->input_stream = bytes;
+	ret->lzw_buf = malloc(BUFSIZE);
+	ret->total_buf_size = BUFSIZE;
+	return ret;
+}
+
+void
+delete_lzw_spec(lzwspec *spec)
+{
+	free(spec->lzw_buf);
+	free(spec);
+}
+
+void
+bind_lzw_spec(lzwspec *spec)
+{
 	cur_lzw_spec = spec;
 }
 
@@ -2988,344 +3281,280 @@ p_take__m(HAllocator *mm__, size_t n, struct Env *aux)
 	return h_left__m(mm__, bytes, skip);
 }
 
-HParser *p_xrefdata__m(HAllocator *, const Dict *);
+
+// Parser for object streams
 HParser *p_objstm__m(HAllocator *, const Dict *);
 
-HParser *
-p_stream_data__m(HAllocator *mm__, const Dict *dict)
+struct streamspec {
+	Dict *dict;		/* stream dictionary */
+	HParser *parser;	/* data parser */
+};
+
+
+
+/*
+ * ********************************************************************
+ * Start Catalog parsing
+ * ********************************************************************
+ */
+
+/*
+ * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
+ * and parse the result with 'p'.
+ */
+HParseResult *
+decode_contentstream(const Dict *d, HBytes b, HParser *p)
 {
+	HParseResult *(*filter)(const Dict *, HBytes, HParser *);
+	const Dict *parms = NULL;
 	const HParsedToken *v;
+	HParseResult *res = NULL;
 
-	v = dictentry(dict, "Type");
-	if (v == NULL || v->token_type != TT_BYTES)	// XXX -> custom type
-		return NULL;				/* no /Type field */
 
-	/* interpret known stream types */
-	if (bytes_eq(v->bytes, "XRef"))
-		return p_xrefdata__m(mm__, dict);
-#ifndef NOOBJSTM
-	if (bytes_eq(v->bytes, "ObjStm"))
-		return p_objstm__m(mm__, dict);
-#endif
-	if (bytes_eq(v->bytes, "XObject")) {
+	/*
+	 *  Check if there is additional information in the dictionary
+	 *  that we should use to process the content stream
+	 *
+	 *  If the data in the stream is encoded, a filter will be specified in
+	 *  the dictionary that must be used to decode the data first
+	 *
+	 *  TODO:: Handle arrays of filters (chained) and their decode parameters
+	 */
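+	/*
+	 * For reference, a typical Flate-compressed stream dictionary looks
+	 * like this (illustrative values):
+	 *
+	 *	<< /Length 128 /Filter /FlateDecode
+	 *	   /DecodeParms << /Predictor 12 /Columns 4 >> >>
+	 */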
+	v = dictentry(d, "Filter"); // look for a filter
+
+	if (v != NULL) { // data is encoded
+
+
+		if (v->token_type != TT_BYTES) {
+			// XXX TT_SEQUENCE would be a filter chain; that’s not supported, yet.
+			// But it might also be something bogus, in which case we should fail.
+			return NULL;
+		}
+
+		if (bytes_eq(v->bytes, "FlateDecode"))
+			filter = FlateDecode;
+		else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+			filter = ASCIIHexDecode;
+		else if (bytes_eq(v->bytes, "ASCII85Decode"))
+			filter = ASCII85Decode;
+		else if (bytes_eq(v->bytes, "RunLengthDecode"))
+			filter = RunLengthDecode;
+		else if (bytes_eq(v->bytes, "LZWDecode"))
+			filter = LZWDecode;
+		else {		/* filter not supported */
+			fprintf(stderr, "decode_stream:: Unsupported Filter [%.*s\n]",
+					(int)v->bytes.len, v->bytes.token);
+			return NULL; /* Treat the stream as a byte array */
+		}
+		/* Check for parameters for the filter */
+		v = dictentry(d, "DecodeParms");
+		if (v && v->token_type == TT_Dict)
+			parms = v->user;
+
+		res = filter(parms, b, p);
+
+		/* Debug */
+		if (res){
+			fprintf(stdout, "decode_stream: parsed token type is = %u\n", res->ast->token_type);
+		}
+	} /* The dictionary provided direction for processing the stream */
+
+	/*
+	 * It may be that we should always process the stream as a content
+	 * stream, but it is not yet clear that covers all cases.
+	 */
+	else { // content stream is not encoded
 		/*
-		 *  TODO:: external objects can be images, forms, or postscript objects
-		 *  We are not handling them at the moment
+		 * We know it is a stream and has a length
+		 * Have to find out what kind of content stream it is
+		 * For now, just check for text string (stream)
+		 * Parse the text stream object
+		 * Note: the stream can have text streams embedded in other stream content
+		 * and can also have in-between content
 		 */
-		fprintf (stdout, "p_stream_data__m: XObject parsing is not yet supported!\n");
-		return NULL;
+		res = h_parse(p_textbegin, b.token, b.len);
+		if ((res != NULL) && (res->ast != NULL)) {
+			/* let's make sure */
+			const HParsedToken *tstr = H_INDEX_TOKEN(res->ast, 0);
+			if (bytes_eq(tstr->bytes, "BT")) {
+				fprintf (stdout, "decode_stream:: Found a text stream\n");
+				res = h_parse(p, b.token, b.len);
+				if (res == NULL) {
+					fprintf(stderr, "decode_stream::Text String parse failed!!\n");
+				}
+			}
+		}
 	}
-	return NULL;					/* unrecognized type */
+
+	/*
+	 * There are other parameters that can be passed in the dictionary
+	 * They are not being handled currently
+	 */
+	const int numOptKeys = 3;
+	const char *optionalKeys[3] = { "F", "FDecodeParms", "DL" };
+	for (int i=0; i<numOptKeys; i++) {
+		v = dictentry(d, optionalKeys[i]);
+		if (v) fprintf(stderr, "decode_contentstream:: Unsupported key [%s]\n", optionalKeys[i]);
+	}
+	return res;
 }
 
-struct streamspec {
-	Dict *dict;		/* stream dictionary */
-	HParser *parser;	/* data parser */
-};
 
 HParsedToken *
-act_ks_value(const HParseResult *p, void *u)
+act_kcontentstream_value(const HParseResult *p, void *u)
 {
 	struct streamspec *spec = u;
 	HBytes bytes = H_CAST_BYTES(p->ast);
 	HParseResult *res;
 
 	/* decode and parse the stream data */
-	res = decode_stream(spec->dict, bytes, spec->parser);
-	if (res == NULL) {
-		HBytes b = {NULL, 0};
-		const HParsedToken *v = dictentry(spec->dict, "Type");
-		if (v != NULL && v->token_type == TT_BYTES) {
-			b.token = v->bytes.token;
-			b.len   = v->bytes.len;
-		}
-		if (b.len > INT_MAX)
-			b.len = INT_MAX;
-		fprintf(stderr, "parse error in stream (%*s)\n",
-		    (int)b.len, b.token);
-		// XXX return the undecoded stream (p->ast)?
+	res = decode_contentstream(spec->dict, bytes, spec->parser);
+	if (!res) {
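+		/* decode/parse failed: fall back to the raw parse result (undecoded bytes) */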
+		res = (HParseResult *)p;
 	}
 
 	return H_MAKE(HParseResult, res);
 }
 
-/*
- * This continuation takes the stream dictionary (as first element of x) and
- * should return a parser that consumes exactly the bytes that make up the
- * stream data.
- *
- * x = (dict ...)
- */
-HParser *
-kstream(HAllocator *mm__, const HParsedToken *x, void *env)
-{
-	struct Env *aux = env;
-	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
-	Dict *dict = H_CAST(Dict, dict_t);
-	const HParsedToken *v = NULL;
-	HParser *bytes_p, *dict_p, *value_p;
-	struct streamspec *spec;
-	size_t sz;
-
-	/* look for the Length entry */
-	v = dictentry(dict, "Length");
-	v = resolve(aux, v);		/* resolve indirect references */
-	if (v == NULL || v->token_type != TT_SINT || v->sint < 0)
-		goto fail;
-	sz = (size_t)v->sint;
-
-	//fprintf(stderr, "parsing stream object, length %zu.\n", sz);	// XXX debug
-
-	dict_p	= p_return__m(mm__, dict_t);
-	bytes_p = p_take__m(mm__, sz, aux);
-
-	spec = h_alloc(mm__, sizeof(struct streamspec));
-	spec->dict = dict;
-	spec->parser = p_stream_data__m(mm__, dict);
-	if (spec->parser != NULL)
-		value_p = h_action__m(mm__, bytes_p, act_ks_value, spec);
-	else
-		value_p = bytes_p;
-
-	return h_sequence__m(mm__, dict_p, value_p, NULL);
-fail:
 #if 0
-	if (v == NULL)
-		fprintf(stderr, "stream /Length missing\n");
-	else if (v -> token_type != TT_SINT)
-		fprintf(stderr, "stream /Length not an integer\n");
-	else if (v < 0)
-		fprintf(stderr, "stream /Length negative\n");
+typedef struct {
+	enum {XR_FREE, XR_INUSE, XR_OBJSTM} type;
+	union {
+		struct { size_t next, ngen; } f;	/* free */
+		struct { size_t offs, gen; } n;		/* inuse */
+		struct { size_t stm, idx; } o;		/* objstm */
+	};
+	const HParsedToken *obj;
+} XREntry;
+
+typedef struct { size_t nr, gen; } Ref;
 #endif
-	//h_pprintln(stderr, p);	// XXX debug
-	return p_fail;
-}
 
-HParser *
-p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry)
+const HParsedToken *
+parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p)
 {
-	HParser *ret_base, *ret_count, *p_header, *p_entries;
+	HParseResult *res;
+	size_t def_nr, def_gen;
 
-	ret_base  = p_return_uint__m(mm__, base);
-	ret_count = p_return_uint__m(mm__, count);
-	p_header  = h_sequence__m(mm__, ret_base, ret_count, NULL);
-	p_entries = h_repeat_n__m(mm__, p_entry, count);
+	if (offset >= aux->sz) {
+		fprintf(stderr, "%s: position %zu (%#zx) for object %zu %zu is "
+		    "out of bounds\n", aux->infile, offset, offset, nr, gen);
+		return NULL;
+	}
 
-	return h_sequence__m(mm__, p_header, p_entries, NULL);
+	if (p == NULL) {
+		fprintf(stderr, "parse_item: Unexpected request to parse object!!\n");
+		return NULL;
+	}
+	HParser *pItem = h_right(h_seek(offset * 8, SEEK_SET), p);	// h_seek() addresses bits, hence * 8
+	res = h_parse(pItem, aux->input, aux->sz);
+	if (res == NULL) {
+		fprintf(stderr, "%s: error parsing object %zu %zu at position "
+		    "%zu (%#zx)\n", aux->infile, nr, gen, offset, offset);
+		return NULL;
+	}
+	assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE);
+	/* res->ast = ((nr gen) obj) */
+
+	def_nr = H_INDEX_UINT(res->ast, 0, 0);
+	def_gen = H_INDEX_UINT(res->ast, 0, 1);
+	if (def_nr != nr || def_gen != gen) {
+		fprintf(stderr, "%s: object ID mismatch at position %zu "
+		    "(%#zx): sought %zu %zu, found %zu %zu.\n", aux->infile,
+		    offset, offset, nr, gen, def_nr, def_gen);
+		return NULL;
+	}
+
+	return H_INDEX_TOKEN(res->ast, 1);
 }
 
-HParser *
-p_xrefdata__m(HAllocator *mm__, const Dict *dict)
+const HParsedToken *
+parse_objstm_item(struct Env *aux, size_t nr, size_t stm_nr, size_t idx, size_t *offset, HParser *p)
 {
-	const HParsedToken *v;
-	HParser *p_field[3], *p_entry, **p_subs;
-	size_t W[3];
-	size_t Size, Wn, Wskip;
+	XREntry *ent;
+	const HParsedToken *stm;
+
+	*offset = 0; // initialize the offset
 
 	/*
-	 * what follows is a horrible bunch of code that builds, from the
-	 * entries /W, /Index, and /Size in the stream dictionary, a parser for
-	 * the cross-reference data itself.
-	 *
-	 * in short, every cross-reference entry consists of (as of PDF 2.0)
-	 * three fields, but it could be more. /W gives the widths (in bytes)
-	 * of these fields. the /Index specifies the division of the data into
-	 * subsections; it is an array of natural numbers that in pairs specify
-	 * the base object number and length of each subsection - analogous to
-	 * the subsection headers in classic xref sections.
-	 *
-	 * when /Index is missing, a default value of [0 Size] is defined,
-	 * where Size is the value of the /Size field. as in normal trailer
-	 * dictionaries, it specifies the total size of the (entire)
-	 * cross-reference table.
-	 *
-	 * when /W states a width of 0 for a field, that field is not present
-	 * in the data and a default value should be used "if there is one".
-	 * most notably, the first field determines the "type" of the entry,
-	 * analogous to the 'n' and 'f' tags in classic xref sections; a width
-	 * of 0 for the first field is specified to mean that every entry is of
-	 * type 1 (= "n"). that type, in particular, specifies a default of 0
-	 * for field 3 (generation). in fact, these are the only defaults
-	 * defined by ISO 32000-1:2008 (PDF 1.7).
-	 *
-	 *   entry type  field no.  default value
-	 *               1 (type)   1
-	 *   1 ("n")     3 (gen.)   0
+	 * acquire the stream object
 	 */
 
-	/* Size (required) - total size of xref table */
-	v = dictentry(dict, "Size");
-	if (v == NULL || v->token_type != TT_SINT || v->sint < 1)
-		return p_fail;
-	Size = v->sint;
-
-	/* W (required) - field widths for each xref entry */
-	v = dictentry(dict, "W");
-	if (v == NULL || v->token_type != TT_SEQUENCE)
-		return p_fail;
-	if ((Wn = v->seq->used) < 3)
-		return p_fail;
-	Wskip = 0;
-	for (size_t i = 0; i < Wn; i++) {
-		HTokenType tt = v->seq->elements[i]->token_type;
-		int64_t w = v->seq->elements[i]->sint;
+	ent = lookup_xref(aux, stm_nr, 0);
+	if (ent == NULL)
+		return NULL;		/* stream not found */
 
-		if (tt != TT_SINT || w < 0)
-			return p_fail;
-		if (i < 3) {
-			/* we can't take >64 bits and want to use size_t */
-			if (w > 8 || (uint64_t)w > sizeof(size_t))
-				return p_fail;
-			W[i] = (size_t)w;
-		} else {
-			if ((uint64_t)w > SIZE_MAX - Wskip)
-				return p_fail;	/* overflow */
-			Wskip += w;
+	switch (ent->type)
+	{
+	case XR_FREE:
+		return NULL;		/* stream deleted */
+	case XR_INUSE:
+		if (ent->n.gen != 0)
+			return NULL;	/* stream replaced */
+		if (ent->obj == NULL) {
+			/*
+			 * decode the stream and find the target object in it
+			 */
+			ent->obj = parse_item(aux, stm_nr, 0, ent->n.offs, p);
+			*offset = ent->n.offs;
 		}
+		break;
+	case XR_OBJSTM:
+		return NULL;		/* invalid: nested streams */
 	}
-	if (Wskip > SIZE_MAX / 8)
-		return p_fail;
 
-	/*
-	 * build the parser for one xref entry.
-	 *
-	 * in summary, the only sensible forms for /W are:
-	 *
-	 *   [t x y] with t,x,y > 0  full general form
-	 *   [0 x y] with x,y > 0    only type-1 ("in use") entries
-	 *   [0 x 0] with x > 0      only type-1 entries, only offsets
-	 *
-	 * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed
-	 * by the spec; as long as all entries are of type 1, the xref data can
-	 * be interpreted without ambiguity.
-	 *
-	 * in fact, every nonsensical form is possible as long as there are 0
-	 * entries.
-	 *
-	 * we realize this mess by just initializing the default parser to
-	 * p_fail and and replacing the known cases afterwards.
-	 */
-	for (size_t i = 0; i < 3; i++) {
-		if (W[i] == 0)
-			p_field[i] = p_fail;	/* no known default */
-		else
-			p_field[i] = h_bits__m(mm__, W[i] * 8, false);
-	}
-	/* known default cases: */
-	if (W[0] == 0)
-		p_field[0] = p_return_1;	/* all type 1 */
-	if (W[2] == 0) {
-		p_field[2] = p_return_0;	/* all generation 0 */
-		#if 0
-		/* XXX
-		 * i've seen a 0-width field 3 used with values of 1 (inuse)
-		 * and 2 (objstm) in field 1, implying "objstm idx 0" for the
-		 * latter case.
-		 */
-		if (W[0] > 0) {
-			/* type field *must* be 1 */
-			p_field[0] = h_attr_bool__m(mm__, p_field[0],
-			    validate_eq_uint, (void *)1);
-		}
-		#endif
+	if ((stm = ent->obj) == NULL) {
+		fprintf(stderr, "%s: error parsing object stream at position "
+		    "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs);
+		return NULL;
 	}
-	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
-		return p_fail;
-	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
-	p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL);
 
-	/* Index (optional) - subsections [base count ...] */
-	v = dictentry(dict, "Index");
-	if (v == NULL) {
-		/* default: [0 Size] */
-		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
-		p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry);
-		p_subs[1] = NULL;
-	} else if (v->token_type != TT_SEQUENCE) {
-		return p_fail;
-	} else {
-		size_t nsubs = v->seq->used / 2;
-
-		/* build a parser for each subsection */
-		if (nsubs >= SIZE_MAX / sizeof(HParser *))
-			return p_fail;
-		p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *));
-		for (size_t i = 0; i < nsubs; i++) {
-			HParsedToken *base = v->seq->elements[2 * i];
-			HParsedToken *n = v->seq->elements[2 * i + 1];
-
-			if (base->token_type != TT_SINT || base->sint < 0 ||
-			    n->token_type != TT_SINT || n->sint < 0 ||
-			    (uint64_t)n->sint > SIZE_MAX)
-				return p_fail;
-
-			p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint,
-			    p_entry);
-		}
-		p_subs[nsubs] = NULL;
-	}
-	return h_sequence__ma(mm__, (void **)p_subs);
+	return ent->obj;	// The only path through this function is the one through the parser
 }
 
-HParser *
-p_objstm__m(HAllocator *mm__, const Dict *dict)
-{
-	const HParsedToken *v;
-	size_t N;
-
-	v = dictentry(dict, "N");
-	if (v == NULL || v->token_type != TT_SINT || v->sint < 0 ||
-	    (uint64_t)v->sint > SIZE_MAX) {
-		fprintf(stderr, "missing /N on object stream\n");
-		return p_fail;
-	}
-	N = v->sint;
 
-	HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL);
-	HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N);
+const HParsedToken *
+resolve_item(struct Env *aux, const HParsedToken *v, size_t *offset, HParser *p)
+{
+	XREntry *ent = NULL;
+	Ref *r;
 
-	return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL);
-		// XXX leading and trailing ws OK?
+	*offset = 0; // initialize the offset
 
-	// XXX consistency-check against /First, idx, /N
-}
-
-/*
- * This continuation is very similar to kstream, except that it does not
- * rely on /Length to consume the right amount of input. If /Length is
- * not present or indirect, it will operate on the entire rest of the input.
- * This is permissible, other than for general streams, because the XRef data
- * is always self-delimiting.
- *
- * x = (dict ...)
- */
-HParser *
-kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
-{
-	struct Env *aux = env;
-	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
-	Dict *dict = H_CAST(Dict, dict_t);
-	const HParsedToken *v;
-	HParser *bytes_p, *dict_p, *value_p;
-	struct streamspec *spec;
+	/* direct objects pass through */
+	if (v == NULL || v->token_type != TT_Ref)
+		return v;
 
-	/* restrict bytes to Length if present (and not indirect) */
-	v = dictentry(dict, "Length");
-	if (v != NULL && v->token_type == TT_SINT && v->sint >= 0)
-		bytes_p = p_take__m(mm__, v->sint, aux);
-	else
-		bytes_p = p_rest__m(mm__, aux);	// XXX consume the proper amount
+	/* we are looking at an indirect reference */
+	r = v->user;
 
-	/* construct the parser for the stream data */
-	spec = h_alloc(mm__, sizeof(struct streamspec));
-	spec->dict = dict;
-	spec->parser = p_xrefdata__m(mm__, dict);
-	assert (spec->parser != NULL);
+	/* find the xref entry for this reference */
+	ent = lookup_xref(aux, r->nr, r->gen);
+	if (ent == NULL)
+		return NULL;			/* obj not found */
+	if (ent->obj != NULL)
+		return resolve_item(aux, ent->obj, offset, p);
 
-	dict_p  = p_return__m(mm__, dict_t);
-	value_p = h_action__m(mm__, bytes_p, act_ks_value, spec);
+	/* parse the object and memoize */
+	ent->obj = v;				/* break loops */
+	switch (ent->type)
+	{
+	case XR_FREE:
+		return NULL;			/* obj deleted */
+	case XR_INUSE:
+		if (ent->n.gen != r->gen)
+			return NULL;		/* obj nr reused */
+		ent->obj = parse_item(aux, r->nr, r->gen, ent->n.offs, p);
+		*offset = ent->n.offs;
+		break;
+	case XR_OBJSTM:
+		if (r->gen != 0)
+			return NULL;		/* invalid entry! */
+		ent->obj = parse_objstm_item(aux, r->nr, ent->o.stm, ent->o.idx, offset, p);
+		break;
+	}
 
-	return h_sequence__m(mm__, dict_p, value_p, NULL);
+	return resolve_item(aux, ent->obj, offset, p);
 }
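+
+/*
+ * Typical use (a sketch): resolve a dictionary entry that may be an
+ * indirect reference, e.g.
+ *
+ *	size_t offs = 0;
+ *	v = dictentry(dict, "Length");
+ *	v = resolve_item(aux, v, &offs, NULL);
+ *
+ * Direct tokens pass through unchanged; references are looked up in the
+ * xref table and, when 'p' is non-NULL, parsed on demand (a NULL 'p'
+ * resolves only objects that have already been parsed and memoized).
+ */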
 
 
@@ -3333,89 +3562,67 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 
 
 /*
- * This continuation takes the text stream and saves it in the environment for further
- * processing, e.g. writing it out to a file with the same name as the pdf input filename
- * but woth a .psectxt suffix.
+ * This continuation takes the content stream and processes it for text
+ * extraction. It is very similar to kstream in approach: it decodes and
+ * extracts the stream contents.
  * It does not consume the string and returns the token as the output.
  *
  * x = (txtobj ...)
  */
 HParser *
-ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
+kcontentstream(HAllocator *mm__, const HParsedToken *x, void *env)
 {
 
 	struct Env *aux = env;
-#if 0
-	if (x->token_type != TT_TextEntry) {
-		fprintf(
-				stderr,
-				"ktxtstream:: Unexpected token type =%d :: (Expected TT_TextEntry)\n",
-				x->token_type);
-		assert(x->token_type == TT_TextEntry);
-		return NULL;
-	}
-#endif
-	assert (x->token_type == TT_SEQUENCE);
-	int n_tobjs = x->seq->used;
+	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
+	Dict *dict = H_CAST(Dict, dict_t);
+	const HParsedToken *v = NULL;
+	HParser *bytes_p, *dict_p, *value_p;
+	struct streamspec *spec;
+	size_t sz=0, nOffset=0;
 
-	for (int n=0; n<n_tobjs; n++) {
 
-		assert(x->seq->elements[n]->token_type == TT_TextEntry);
-		TextEntry *tste = H_CAST(TextEntry, x->seq->elements[n]);
-		struct textstr *tstr = NULL;
-		/*
-		 *  To save all of the operators along with the text string, we have to walk
-		 *  through all of the tokens and keep a table of pointers to them
-		 *  For now, just keep a pointer to the text string in the environment
-		 *
-		 */
-		switch (tste->type) {
-		case TW_Tj:
-		case TW_Tq:
-		case TW_Tqq:
-			tstr = &tste->tstr;
-			break;
-		case TW_TJ:
-			tstr = &tste->tarray.flattened;
-			break;
-		default:
-			fprintf(stderr, "ktxtstream:: Text token type '%u' ignored\n",
-					tste->type);
-		}
+	/* look for the Length entry -- could be a reference */
+	v = dictentry(dict, "Length");
+	v = resolve_item(aux, v, &nOffset, NULL);		/* resolve indirect references */
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 0) {
+		if (v == NULL)
+			fprintf(stderr, "kcontentstream: stream /Length missing\n");
+		else if (v->token_type != TT_SINT)
+			fprintf(stderr, "kcontentstream: stream /Length not an integer\n");
+		else if (v->sint < 0)
+			fprintf(stderr, "kcontentstream: stream /Length negative\n");
+
+		//h_pprintln(stderr, p);	// XXX debug
+		return p_fail;
+	}
 
-		fprintf(stdout, "ktxtstream: Value = %.*s\n", tstr->nchars, tstr->text);
+	sz = (size_t)v->sint;
 
+	dict_p	= p_return__m(mm__, dict_t);
+	bytes_p = p_take__m(mm__, sz, aux);
 
-		// store the string in the environment
-		// not sure whether we need to actually store the string in malloc'ed area
-		// currently, we are reusing the token memory previously created
-		struct textnode *txtnd = (struct textnode *) malloc(
-				sizeof(struct textnode));
-		txtnd->tstr = tstr;
-		txtnd->next = NULL;
-		if (aux->txthead == NULL)
-			aux->txthead = txtnd;
-		if (aux->txttail == NULL)
-			aux->txttail = txtnd;
-		else {
-			aux->txttail->next = txtnd;
-			aux->txttail       = txtnd;
-		}
-		aux->ntextobjs += 1;
+	spec = h_alloc(mm__, sizeof(struct streamspec));
+	spec->dict = dict;
 
+	v = dictentry(dict, "Type");
+	if (v == NULL)	// XXX -> custom type
+		spec->parser = p_textstream;
+	else if ( (v->token_type == TT_BYTES) && bytes_eq(v->bytes, "ObjStm") )
+		spec->parser = p_objstm__m(mm__, dict);
+	else {
+		fprintf(stderr, "kcontentstream: Not a text or object stream!\n");
+		return p_fail;
 	}
 
-	return p_return__m(mm__, x);
+	value_p = h_action__m(mm__, bytes_p, act_kcontentstream_value, spec);
+
+	return h_sequence__m(mm__, dict_p, value_p, NULL);
+
 }
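+
+/*
+ * kcontentstream is wired up in init_parser() as
+ * h_left(h_bind(stmbeg, kcontentstream, aux), stmend); 'stmbeg' yields the
+ * stream dictionary as the first element of its result, which this
+ * continuation uses to build the data parser.
+ */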
 
 
 
-/*
- * ********************************************************************
- * Start Catalog parsing
- * ********************************************************************
- */
-
 void parse_pagenode(
 		struct Env         *aux,
 		PtNode_S           *pgNode   // node
@@ -3425,10 +3632,15 @@ void parse_pagenode(
 	Dict               *pageD       = pgNode->pn.page;
 	const HParsedToken *contents_t  = NULL; // dictionary token
 	Ref                *contents_r  = NULL;
-//	const HParsedToken *contents    = NULL; // resolved token
-	XREntry            *ent         = NULL;
-	HParseResult       *res         = NULL;
+	const HParsedToken *contents    = NULL; // resolved token
+//	XREntry            *ent         = NULL;
+//	HParseResult       *res         = NULL;
 
+//	const HParsedToken *v = NULL;
+//	const HParsedToken *dict_t;
+//	Dict               *dict_s; // stream
+//	HParser *bytes_p, *dict_p, *value_p;
+	size_t             sz = 0, nOffset = 0;
 
 	// Hold on to the Resources dictionary
 	// This dictionary may be empty
@@ -3445,14 +3657,30 @@ void parse_pagenode(
 		}
 	else if (contents_t->token_type == TT_Ref) {
 		contents_r = H_CAST(Ref, contents_t);
-		ent        = lookup_xref(aux, contents_r->nr, contents_r->gen);
-		if (ent->type == XR_INUSE) {
-			size_t offset = ent->n.offs;
-			fprintf (stdout, "parse_pagenode:: Offset = %ld\n", offset);
-			res = h_parse(p_page, aux->input + offset, aux->sz - offset);
-			fprintf (stdout, "parse_pagenode:: res = %p\n", (void *) res);
-		}
-//		contents   = resolve(aux, contents_t);
+		fprintf(stdout, "parse_pagenode: ref.nr = %ld, ref.gen=%ld\n", contents_r->nr, contents_r->gen);
+//		dict_t = resolve_item(aux, contents_t, &nOffset, p_dictobj);
+//		dict_s = H_CAST(Dict, dict_t);
+//		v = dictentry(dict_s, "Length");
+//		v = resolve(aux, v);   /* resolve indirect references if necessary */
+//		if (v == NULL || v->token_type != TT_SINT || v->sint < 0)
+//			goto fail;
+//		sz = (size_t)v->sint;
+//		fprintf(stdout, "parse_pagenode: sz = %ld\n", sz);
+//		dict_p	= p_return__m(dict_s->arena, dict_t);
+//		bytes_p = p_take__m(dict_s->arena, sz, aux);  // parser for the byte stream
+//		return h_sequence__m(dict_s->arena, dict_p, p_page, NULL);
+
+		contents   = resolve_item(aux, contents_t, &nOffset, p_page);
+		fprintf(stdout, "parse_pagenode: Page node contents = %p\n", (void *)contents);
+		fprintf(stdout, "parse_pagenode: sz = %ld\n", sz);
+
+//		ent = lookup_xref(aux, contents_r->nr, contents_r->gen);
+//		if (ent->type == XR_INUSE) {
+//			size_t offset = ent->n.offs;
+//			fprintf (stdout, "parse_pagenode:: Offset = %ld\n", offset);
+//			res = h_parse(p_page, aux->input + offset, aux->sz - offset);
+//			fprintf (stdout, "parse_pagenode:: res = %p\n", (void *) res);
+//		}
 	}
 	else {
 		fprintf(stderr, "parse_pagenode: Page node is not a reference ... may be an array!\n");
@@ -3464,7 +3692,9 @@ void parse_pagenode(
 //			contents->token_type);
 
 
-	end:
+end:
+
+//fail:
 	return;
 }
 
@@ -3502,6 +3732,7 @@ parse_pagetree(
 	const HParsedToken *item      = NULL;
 	size_t              npages    = 0;
     Ref     *ptRef=NULL, *meRef=NULL;
+    size_t              nOffset   = 0;
 
 
 
@@ -3520,7 +3751,7 @@ parse_pagetree(
 	{
 		node      = &treeNode->kids[i];
 		kidRef    = pgTable->elements[i];
-		kidDict_t = resolve(aux, kidRef);     // page or tree node dictionary token
+		kidDict_t = resolve_item(aux, kidRef, &nOffset, p_dictobj);     // page or tree node dictionary token
 		kidDict   = H_CAST(Dict, kidDict_t);  // page or tree node dictionary
 
 
@@ -3607,8 +3838,9 @@ parse_pagetree(
 
 
 
-	end:
-	exit(0);
+end:
+	return nOffset;
+//	exit(0);
 
 }
 
@@ -3629,6 +3861,7 @@ parse_catalog(struct Env *aux, const HParsedToken *root)
 	const Dict         *ptRoot  = NULL; // page tree root Dictionary
 	const HParsedToken *kids    = NULL;
 	const HParsedToken *item    = NULL;
+	size_t              nOffset = 0;
 
 
 	// initialize the catalog structure
@@ -3640,7 +3873,7 @@ parse_catalog(struct Env *aux, const HParsedToken *root)
 
 
 	// Ensure the reference is to the catalog dictionary
-	dict_t  = resolve(aux, root);     // token
+	dict_t  = resolve_item(aux, root, &nOffset, p_dictobj);     // token
 	catalog = H_CAST(Dict, dict_t);   // catalog dictionary
 	item = dictentry(catalog, "Type");
 	if ( (item == NULL) || (item->token_type != TT_BYTES) ||
@@ -3651,74 +3884,413 @@ parse_catalog(struct Env *aux, const HParsedToken *root)
 	aux->catalog.catalog = dict_t; // catalog dictionary token
 
 
-	// Catalog found -- Now get the root of the page tree associated with the catalog
-	ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary
-	if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) {
-		fprintf(stderr, "parse_catalog: Page Tree not found!\n");
-		goto end;
+	// Catalog found -- Now get the root of the page tree associated with the catalog
+	ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary
+	if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) {
+		fprintf(stderr, "parse_catalog: Page Tree not found!\n");
+		goto end;
+	}
+	aux->catalog.pRoot = ptRef; // indirect reference to the page tree
+
+
+	/* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */
+	dict_t = resolve_item(aux, ptRef, &nOffset, p_dictobj);    // page tree root node
+//	dict_t = resolve(aux, ptRef);    // page tree root node
+	ptRoot = H_CAST(Dict, dict_t);   // page tree root dictionary
+
+	// Count is a required field
+	item = dictentry(ptRoot, "Count");
+	if ( (item == NULL) || (item->token_type != TT_SINT) ) {
+		fprintf(stderr, "parse_catalog: Required page node count missing!\n");
+		goto end;
+	}
+	else {
+		aux->catalog.pgCount = H_CAST_SINT(item);
+	}
+
+	item = dictentry(ptRoot, "Parent");  // root node ==> parent should be NULL
+	if (item != NULL) {
+		fprintf(stderr, "parse_pagetree: Parent of root page tree node is not NULL [p = %p]!\n",
+				(void *)item);
+		goto end;
+	}
+
+
+	// Kids is a required field
+	kids = dictentry(ptRoot, "Kids");  // array of references to page or page tree nodes
+	if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) {
+		fprintf(stderr, "parse_catalog: There are no kids!\n");
+		goto end;
+	}
+
+	// parse_pagetree
+	aux->catalog.pgTree.type   = PG_TREE;
+	aux->catalog.pgTree.parent = NULL;
+	parse_pagetree(aux, &aux->catalog.pgTree, ptRef, kids, 0);
+
+
+
+	end:
+//	exit(0);
+	return success;
+}
+
+/*
+ * ********************************************************************
+ * End Catalog parsing
+ * ********************************************************************
+ */
+
+
+
+
+
+
+
+
+/*
+ * ********************************************************************
+ * Start xref parsing
+ * ********************************************************************
+ */
+
+
+HParser *p_xrefdata__m(HAllocator *, const Dict *);
+
+HParser *
+p_stream_data__m(HAllocator *mm__, const Dict *dict)
+{
+	const HParsedToken *v;
+
+	v = dictentry(dict, "Type");
+	if (v == NULL || v->token_type != TT_BYTES)	// XXX -> custom type
+		return NULL;				/* no /Type field */
+
+	/* interpret known stream types */
+	if (bytes_eq(v->bytes, "XRef"))
+		return p_xrefdata__m(mm__, dict);
+#ifndef NOOBJSTM
+	if (bytes_eq(v->bytes, "ObjStm"))
+		return p_objstm__m(mm__, dict);
+#endif
+	if (bytes_eq(v->bytes, "XObject")) {
+		/*
+		 *  TODO:: external objects can be images, forms, or postscript objects
+		 *  We are not handling them at the moment
+		 */
+		fprintf (stderr, "p_stream_data__m: XObject parsing is not yet supported!\n");
+		return NULL;
+	}
+	return NULL;					/* unrecognized type */
+}
+
+
+HParsedToken *
+act_ks_value(const HParseResult *p, void *u)
+{
+	struct streamspec *spec = u;
+	HBytes bytes = H_CAST_BYTES(p->ast);
+	HParseResult *res;
+
+	/* decode and parse the stream data */
+	res = decode_stream(spec->dict, bytes, spec->parser);
+	if (res == NULL) {
+		HBytes b = {NULL, 0};
+		const HParsedToken *v = dictentry(spec->dict, "Type");
+		if (v != NULL && v->token_type == TT_BYTES) {
+			b.token = v->bytes.token;
+			b.len   = v->bytes.len;
+		}
+		if (b.len > INT_MAX)
+			b.len = INT_MAX;
+		fprintf(stderr, "parse error in stream (%*s)\n",
+		    (int)b.len, b.token);
+		// XXX return the undecoded stream (p->ast)?
+	}
+
+	return H_MAKE(HParseResult, res);
+}
+
+/*
+ * This continuation takes the stream dictionary (as first element of x) and
+ * should return a parser that consumes exactly the bytes that make up the
+ * stream data.
+ *
+ * x = (dict ...)
+ */
+HParser *
+kstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	struct Env *aux = env;
+	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
+	Dict *dict = H_CAST(Dict, dict_t);
+	const HParsedToken *v = NULL;
+	HParser *bytes_p, *dict_p, *value_p;
+	struct streamspec *spec;
+	size_t sz;
+
+	/* look for the Length entry */
+	v = dictentry(dict, "Length");
+	v = resolve(aux, v);		/* resolve indirect references */
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 0)
+		goto fail;
+	sz = (size_t)v->sint;
+
+	//fprintf(stderr, "parsing stream object, length %zu.\n", sz);	// XXX debug
+
+	dict_p	= p_return__m(mm__, dict_t);
+	bytes_p = p_take__m(mm__, sz, aux);
+
+	spec = h_alloc(mm__, sizeof(struct streamspec));
+	spec->dict = dict;
+	spec->parser = p_stream_data__m(mm__, dict);
+	if (spec->parser != NULL)
+		value_p = h_action__m(mm__, bytes_p, act_ks_value, spec);
+	else
+		value_p = bytes_p;
+
+	return h_sequence__m(mm__, dict_p, value_p, NULL);
+
+fail:
+	if (v == NULL)
+		fprintf(stderr, "stream /Length missing\n");
+	else if (v->token_type != TT_SINT)
+		fprintf(stderr, "stream /Length not an integer\n");
+	else if (v->sint < 0)
+		fprintf(stderr, "stream /Length negative\n");
+
+	//h_pprintln(stderr, p);	// XXX debug
+	return p_fail;
+}
+
+HParser *
+p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry)
+{
+	HParser *ret_base, *ret_count, *p_header, *p_entries;
+
+	ret_base  = p_return_uint__m(mm__, base);
+	ret_count = p_return_uint__m(mm__, count);
+	p_header  = h_sequence__m(mm__, ret_base, ret_count, NULL);
+	p_entries = h_repeat_n__m(mm__, p_entry, count);
+
+	return h_sequence__m(mm__, p_header, p_entries, NULL);
+}
+
+HParser *
+p_xrefdata__m(HAllocator *mm__, const Dict *dict)
+{
+	const HParsedToken *v;
+	HParser *p_field[3], *p_entry, **p_subs;
+	size_t W[3];
+	size_t Size, Wn, Wskip;
+
+	/*
+	 * what follows is a horrible bunch of code that builds, from the
+	 * entries /W, /Index, and /Size in the stream dictionary, a parser for
+	 * the cross-reference data itself.
+	 *
+	 * in short, every cross-reference entry consists of (as of PDF 2.0)
+	 * three fields, but it could be more. /W gives the widths (in bytes)
+	 * of these fields. the /Index specifies the division of the data into
+	 * subsections; it is an array of natural numbers that in pairs specify
+	 * the base object number and length of each subsection - analogous to
+	 * the subsection headers in classic xref sections.
+	 *
+	 * when /Index is missing, a default value of [0 Size] is defined,
+	 * where Size is the value of the /Size field. as in normal trailer
+	 * dictionaries, it specifies the total size of the (entire)
+	 * cross-reference table.
+	 *
+	 * when /W states a width of 0 for a field, that field is not present
+	 * in the data and a default value should be used "if there is one".
+	 * most notably, the first field determines the "type" of the entry,
+	 * analogous to the 'n' and 'f' tags in classic xref sections; a width
+	 * of 0 for the first field is specified to mean that every entry is of
+	 * type 1 (= "n"). that type, in particular, specifies a default of 0
+	 * for field 3 (generation). in fact, these are the only defaults
+	 * defined by ISO 32000-1:2008 (PDF 1.7).
+	 *
+	 *   entry type  field no.  default value
+	 *               1 (type)   1
+	 *   1 ("n")     3 (gen.)   0
+	 */
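+	/*
+	 * Example (illustrative): /W [1 2 1] with /Size 8 and no /Index
+	 * describes eight 4-byte entries (1-byte type, 2-byte offset,
+	 * 1-byte generation) covering object numbers 0 through 7.
+	 */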
+
+	/* Size (required) - total size of xref table */
+	v = dictentry(dict, "Size");
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 1)
+		return p_fail;
+	Size = v->sint;
+
+	/* W (required) - field widths for each xref entry */
+	v = dictentry(dict, "W");
+	if (v == NULL || v->token_type != TT_SEQUENCE)
+		return p_fail;
+	if ((Wn = v->seq->used) < 3)
+		return p_fail;
+	Wskip = 0;
+	for (size_t i = 0; i < Wn; i++) {
+		HTokenType tt = v->seq->elements[i]->token_type;
+		int64_t w = v->seq->elements[i]->sint;
+
+		if (tt != TT_SINT || w < 0)
+			return p_fail;
+		if (i < 3) {
+			/* we can't take >64 bits and want to use size_t */
+			if (w > 8 || (uint64_t)w > sizeof(size_t))
+				return p_fail;
+			W[i] = (size_t)w;
+		} else {
+			if ((uint64_t)w > SIZE_MAX - Wskip)
+				return p_fail;	/* overflow */
+			Wskip += w;
+		}
+	}
+	if (Wskip > SIZE_MAX / 8)
+		return p_fail;
+
+	/*
+	 * build the parser for one xref entry.
+	 *
+	 * in summary, the only sensible forms for /W are:
+	 *
+	 *   [t x y] with t,x,y > 0  full general form
+	 *   [0 x y] with x,y > 0    only type-1 ("in use") entries
+	 *   [0 x 0] with x > 0      only type-1 entries, only offsets
+	 *
+	 * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed
+	 * by the spec; as long as all entries are of type 1, the xref data can
+	 * be interpreted without ambiguity.
+	 *
+	 * in fact, every nonsensical form is possible as long as there are 0
+	 * entries.
+	 *
+	 * we realize this mess by just initializing the default parser to
+	 * p_fail and replacing the known cases afterwards.
+	 */
+	for (size_t i = 0; i < 3; i++) {
+		if (W[i] == 0)
+			p_field[i] = p_fail;	/* no known default */
+		else
+			p_field[i] = h_bits__m(mm__, W[i] * 8, false);
 	}
-	aux->catalog.pRoot = ptRef; // indirect reference to the page tree
+	/* known default cases: */
+	if (W[0] == 0)
+		p_field[0] = p_return_1;	/* all type 1 */
+	if (W[2] == 0) {
+		p_field[2] = p_return_0;	/* all generation 0 */
+		#if 0
+		/* XXX
+		 * i've seen a 0-width field 3 used with values of 1 (inuse)
+		 * and 2 (objstm) in field 1, implying "objstm idx 0" for the
+		 * latter case.
+		 */
+		if (W[0] > 0) {
+			/* type field *must* be 1 */
+			p_field[0] = h_attr_bool__m(mm__, p_field[0],
+			    validate_eq_uint, (void *)1);
+		}
+		#endif
+	}
+	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
+		return p_fail;
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
+	p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL);
 
+	/* Index (optional) - subsections [base count ...] */
+	v = dictentry(dict, "Index");
+	if (v == NULL) {
+		/* default: [0 Size] */
+		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
+		p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry);
+		p_subs[1] = NULL;
+	} else if (v->token_type != TT_SEQUENCE) {
+		return p_fail;
+	} else {
+		size_t nsubs = v->seq->used / 2;
 
-	/* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */
-	dict_t = resolve(aux, ptRef);    // page tree root node
-	ptRoot = H_CAST(Dict, dict_t);   // page tree root dictionary
+		/* build a parser for each subsection */
+		if (nsubs >= SIZE_MAX / sizeof(HParser *))
+			return p_fail;
+		p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *));
+		for (size_t i = 0; i < nsubs; i++) {
+			HParsedToken *base = v->seq->elements[2 * i];
+			HParsedToken *n = v->seq->elements[2 * i + 1];
 
-	// Count is a required field
-	item = dictentry(ptRoot, "Count");
-	if ( (item == NULL) || (item->token_type != TT_SINT) ) {
-		fprintf(stderr, "parse_catalog: Required page node count missing!\n");
-		goto end;
-	}
-	else {
-		aux->catalog.pgCount = H_CAST_SINT(item);
-	}
+			if (base->token_type != TT_SINT || base->sint < 0 ||
+			    n->token_type != TT_SINT || n->sint < 0 ||
+			    (uint64_t)n->sint > SIZE_MAX)
+				return p_fail;
 
-	item = dictentry(ptRoot, "Parent");  // root node ==> parent should be NULL
-	if (item != NULL) {
-		fprintf(stderr, "parse_pagetree: Parent of root page tree node is not NULL [p = %p]!\n",
-				(void *)item);
-		goto end;
+			p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint,
+			    p_entry);
+		}
+		p_subs[nsubs] = NULL;
 	}
+	return h_sequence__ma(mm__, (void **)p_subs);
+}
 
+HParser *
+p_objstm__m(HAllocator *mm__, const Dict *dict)
+{
+	const HParsedToken *v;
+	size_t N;
 
-	// Kids is a required field
-	kids = dictentry(ptRoot, "Kids");  // array of references to page or page tree nodes
-	if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) {
-		fprintf(stderr, "parse_catalog: There are no kids!\n");
-		goto end;
+	v = dictentry(dict, "N");
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 0 ||
+	    (uint64_t)v->sint > SIZE_MAX) {
+		fprintf(stderr, "missing /N on object stream\n");
+		return p_fail;
 	}
+	N = v->sint;
 
-	// parse_pagetree
-	aux->catalog.pgTree.type   = PG_TREE;
-	aux->catalog.pgTree.parent = NULL;
-	parse_pagetree(aux, &aux->catalog.pgTree, ptRef, kids, 0);
-
+	HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL);
+	HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N);
 
+	return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL);
+		// XXX leading and trailing ws OK?
 
-	end:
-	exit(0);
-	return success;
+	// XXX consistency-check against /First, idx, /N
 }
 
 /*
- * ********************************************************************
- * End Catalog parsing
- * ********************************************************************
+ * This continuation is very similar to kstream, except that it does not
+ * rely on /Length to consume the right amount of input. If /Length is
+ * not present or indirect, it will operate on the entire rest of the input.
+ * This is permissible, other than for general streams, because the XRef data
+ * is always self-delimiting.
+ *
+ * x = (dict ...)
  */
+HParser *
+kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	struct Env *aux = env;
+	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
+	Dict *dict = H_CAST(Dict, dict_t);
+	const HParsedToken *v;
+	HParser *bytes_p, *dict_p, *value_p;
+	struct streamspec *spec;
 
-/*
- * main program
- */
+	/* restrict bytes to Length if present (and not indirect) */
+	v = dictentry(dict, "Length");
+	if (v != NULL && v->token_type == TT_SINT && v->sint >= 0)
+		bytes_p = p_take__m(mm__, v->sint, aux);
+	else
+		bytes_p = p_rest__m(mm__, aux);	// XXX consume the proper amount
+
+	/* construct the parser for the stream data */
+	spec = h_alloc(mm__, sizeof(struct streamspec));
+	spec->dict = dict;
+	spec->parser = p_xrefdata__m(mm__, dict);
+	assert (spec->parser != NULL);
+
+	dict_p  = p_return__m(mm__, dict_t);
+	value_p = h_action__m(mm__, bytes_p, act_ks_value, spec);
+
+	return h_sequence__m(mm__, dict_p, value_p, NULL);
+}
 
-#include <stdio.h>
-#include <inttypes.h>
-#include <stdlib.h>	/* realloc() */
-#include <fcntl.h>	/* open() */
-#include <unistd.h>	/* lseek() */
-#include <sys/mman.h>	/* mmap() */
 
-const char *infile = NULL;
 
 /*
  * This helper implements the standard backwards parsing strategy to read all
@@ -3728,17 +4300,22 @@ const char *infile = NULL;
- * Allocates and returns an array of HParsedTokens, each containing the result
- * of a successful 'p_xref' parse. Sets the output parameter 'nxrefs' to the
- * number of elements.
- *
- * A return value of NULL indicates an empty result.
+ * Allocates an array of HParsedTokens, each containing the result of a
+ * successful 'p_xref' parse, and stores it in 'aux->xrefs' along with the
+ * element count in 'aux->nxrefs'.
  */
-const HParsedToken **
-parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs)
+const char *infile = NULL;
+
+void
+parse_xrefs(struct Env *aux)
 {
+	const uint8_t *input = aux->input;
+	size_t         sz    = aux->sz;
 	HParseResult *res = NULL;
 	const HParsedToken **xrefs = NULL;	/* empty result */
 	const HParsedToken *tok = NULL;
 	size_t n = 0, nfwd = 0;
 	size_t offset = 0;
+	bool processRoot = true;
+	size_t maxObjNum = 0;
+	Dict *trailer = NULL;
 
 	// XXX try formulating this as a parser using h_seek()
 
@@ -3773,6 +4350,17 @@ parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs)
 			err(1, "realloc");
 		xrefs[n++] = res->ast;
 
+
+		/* process the root */
+		if (processRoot) {
+			// Size is a required field in the trailer dictionary
+			trailer = H_INDEX(Dict, res->ast, 1);
+			maxObjNum = H_CAST_SINT(dictentry(trailer, "Size"));
+
+			processRoot = false;
+		}
+
+
 		/* look up the next offset (to the previous xref section) */
 		tok = dictentry(H_INDEX(Dict, res->ast, 1), "Prev");
 		if (tok == NULL)
@@ -3806,53 +4394,47 @@ parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs)
 	}
 
 end:
-	*nxrefs = n;
-	return xrefs;
+	aux->xrefs = xrefs;
+	aux->nxrefs = n;
+	if (n > maxObjNum) {
+		fprintf(stderr, "%s: Number of xrefs found -%ld- "
+						"Greater than specified /Size -%ld-.\n"
+						"Ignoring objects numberd greater than -%ld-!\n",
+						infile, n, maxObjNum, n);
+		aux->nxrefs = maxObjNum;
+	}
+
+
+	// Process the trailer dictionary
+	if (trailer == NULL)
+		return;		/* no xref section was parsed */
+	const HParsedToken *root = dictentry(trailer, "Root");
+	assert(root != NULL && root->token_type == TT_Ref);
+	parse_catalog(aux, root);
+
+	return;
 }
 
 
 
 /*
- * This utility extracts the text stream from the global environment
- * writes it out to a file with the same name as the pdf input filename
- * but with a .psectxt suffix.
+ * ********************************************************************
+ * End xref parsing
+ * ********************************************************************
  */
-void
-text_extract(const struct Env *aux)
-{
-    fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs);
-	fprintf(stdout, "text_extract:: %s\n", aux->infile);
 
-	int infnlen = strlen(aux->infile);
-	int sfxlen = strlen(".psectxt");
-	int namelen = infnlen + sfxlen + 1;
 
-	char *outfn = (char *) malloc(sizeof(char) * namelen);
-	if (outfn == NULL) {
-		fprintf(stderr, "text_extract:: h_arena_realloc() failed");
-		return;
-	}
-	memcpy(outfn, aux->infile, infnlen);
-	memcpy(&outfn[infnlen], ".psectxt", sfxlen);
-	outfn[namelen-1] = '\0'; // null terminate the string
 
-	// open the file for writing
-	FILE *stream;
-	if (!(stream = fopen(outfn, "w"))) {
-		fprintf(stderr,
-				"text_extract:: Failed to open file '%s' for writing\n", outfn);
-		return;
-	}
-	struct textnode *curr = aux->txthead;
-	for (int i = 0; i < aux->ntextobjs; i++) {
-		fprintf(stdout, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
-		fprintf(stream, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
-		curr = curr->next;
-	}
-	fclose(stream);
-	free(outfn);
-	return;
-}
+
+/*
+ * main program
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <stdlib.h>	/* realloc() */
+#include <fcntl.h>	/* open() */
+#include <unistd.h>	/* lseek() */
+#include <sys/mman.h>	/* mmap() */
+
 
 
 
@@ -3894,12 +4476,17 @@ main(int argc, char *argv[])
 	init_parser(&aux);
 
 	/* parse all cross-reference sections and trailer dictionaries */
-	aux.xrefs = parse_xrefs(input, sz, &aux.nxrefs);
+	parse_xrefs(&aux);
 	// XXX debug
 	//fprintf(stderr, "%s: %zu xref sections parsed\n", infile, aux.nxrefs);
 	//for (size_t i = 0; i < aux.nxrefs; i++)
 	//	h_pprintln(stderr, aux.xrefs[i]);
 
+
+//	/* parse the catalog page tree */
+//	bool success = parse_catalog(&aux);
+//	fprintf(stdout, "main: parse_catalog successfully = %s\n", success ? "true":"false");
+
 	/* run the main parser */
 	res = h_parse(p_pdf, input, sz);
 	if (!res) {
@@ -3921,5 +4508,10 @@ main(int argc, char *argv[])
 	/* print result */
 	h_pprintln(stdout, res->ast);
 
+	/* Save the extracted text */
+	if (aux.ntextobjs > 0) {
+		text_extract(&aux);
+	}
+
 	return 0;
 }
-- 
GitLab