From a1014f81d804955bb38b434865b733271aa3d7a7 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Thu, 30 Mar 2023 16:52:28 +0000
Subject: [PATCH] improve handling of parse errors in xref stream data

Improve on the bugfix in commit a5abf1e2:

- Reinstate the assert for 'res->ast != NULL'. If it fails, there is a bug
  in the parser, not an error in the input file.
- Provide a distinct error message for the case where p_xref fails on a
  cross-reference stream because of invalid data.
- Skip storing only the invalid section; still follow the /Prev entry in
  the stream dictionary to find further sections.
---
 pdf.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/pdf.c b/pdf.c
index afd483c..b377e34 100644
--- a/pdf.c
+++ b/pdf.c
@@ -4984,19 +4984,26 @@ parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs)
 	for (;;) {
 		assert(offset <= sz);
 		res = h_parse(p_xref, input + offset, sz - offset);
-		if (res == NULL || res->ast == NULL || H_INDEX_TOKEN(res->ast, 0) == NULL) {
+		if (res == NULL) {
 			log_message(5, "%s: error parsing xref section at "
 			    "position %zu (%#zx)\n", infile, offset, offset);
 			break;
 		}
+		assert(res->ast != NULL);
 
-		/* save this section in xrefs */
-		if (n >= SIZE_MAX / sizeof(HParsedToken *))
-			errx(2, "parse_xrefs: realloc: size would overflow");
-		xrefs = realloc(xrefs, (n + 1) * sizeof(HParsedToken *));
-		if (xrefs == NULL)
-			err(2, "parse_xrefs");
-		xrefs[n++] = res->ast;
+		if (H_INDEX_TOKEN(res->ast, 0) == NULL) {
+			log_message(5, "%s: error parsing xref stream data at "
+			    "position %zu (%#zx)\n", infile, offset, offset);
+			/* skip this section, but continue following /Prev */
+		} else {
+			/* data is valid, save this section in xrefs */
+			if (n >= SIZE_MAX / sizeof(HParsedToken *))
+				errx(2, "parse_xrefs: realloc: size overflow");
+			xrefs = realloc(xrefs, (n + 1) * sizeof *xrefs);
+			if (xrefs == NULL)
+				err(2, "parse_xrefs");
+			xrefs[n++] = res->ast;
+		}
 
 		/* look up the next offset (to the previous xref section) */
 		tok = dictentry(H_INDEX(Dict, res->ast, 1), "Prev");
-- 
GitLab