From 3a63a399696dccfa788da20621e258afb8bd0dc2 Mon Sep 17 00:00:00 2001
From: "plvines (corpora)" <paul.vines@baesystems.com>
Date: Tue, 18 Feb 2020 19:48:28 +0000
Subject: [PATCH] Added relaxation/logging for stream length malformations
 (short-by-1, short-by-more, missing endstream); amended endobj parsing to
 allow whitespace before it

---
 pdf.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/pdf.c b/pdf.c
index ce648cb..bb77f32 100644
--- a/pdf.c
+++ b/pdf.c
@@ -647,7 +647,19 @@ init_parser(struct Env *aux)
 
 	/* streams */
 	H_RULE(stmbeg,	SEQ(dict, ws, LIT("stream"), OPT(cr), lf));
-	H_RULE(stmend,	SEQ(OPT(eol), LIT("endstream")));
+	H_RULE(stmend, CHX(SEQ(OPT(eol), LIT("endstream")),
+			   VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")),
+				"Stream length 1-too-short (severity=4)"),
+			   VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+									     SEQ(npair, wel, KW("obj")),
+									     KW("xref"),
+									     LIT("endstream")))), LIT("endstream")),
+				"Stream length >1-too-short (severity=5)"),
+			   VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+									 SEQ(npair, wel, KW("obj")),
+									 KW("xref")))),
+				"Missing endstream token (severity=7)")));
+
 	H_RULE(stream,	h_left(h_bind(stmbeg, kstream, aux), stmend));
 		// XXX is whitespace allowed between the eol and "endstream"?
 
@@ -662,9 +674,9 @@ init_parser(struct Env *aux)
 	/* body */
 	H_RULE(indobj,	CHX(stream, obj));
 	H_RULE(objdef,	SEQ(ws, npair, wel, KW("obj"), ws, indobj,
-			    CHX(VIOL(SEQ(KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))),
+			    CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))),
 				     "More than 1 endobj token (severity=1)"),
-				KW("endobj"),
+				SEQ(OPT(ws), OPT(lws), KW("endobj")),
 				VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)"))));
 	H_RULE(body,	h_many(objdef));
 
-- 
GitLab