From c316cc894443e526b4986cf5f778a4f544bae6a8 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Sat, 8 Feb 2020 20:10:49 +0100
Subject: [PATCH] let 'obj' no longer consume leading whitespace

---
 pdf.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/pdf.c b/pdf.c
index 2ceff3c..208cd10 100644
--- a/pdf.c
+++ b/pdf.c
@@ -509,13 +509,14 @@ init_parser(struct Env *aux)
 
 	/* whitespace */
 	H_RULE(comment,	SEQ(percent, line));
-	H_RULE(ws,	IGN(h_many(CHX(wchar, comment))));
-	H_RULE(lws,	IGN(h_many(lwchar)));
+	H_RULE(wel,	IGN(CHX(wchar, comment)));	/* one wchar or one comment */
+	H_RULE(ws,	IGN(h_many(wel)));		/* any number of wel, incl. none */
+	H_RULE(lws,	IGN(h_many(IGN(lwchar))));
 
+	/* tokenization */
+#define MANY_WS(X)	h_many(CHX(wel, X))
 #define TOK(X)	h_middle(ws, X, h_not(rchar))
 #define KW(S)	IGN(TOK(LIT(S)))
-#define TOKD(X)	h_right(ws, X)	/* for tokens that end on delimiters */
-// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok?
 
 	/* misc */
 	H_RULE(nl,	IGN(h_right(lws, eol)));
@@ -524,6 +525,7 @@ init_parser(struct Env *aux)
 	H_RULE(empty,	SEQ(epsilon));
 	H_ARULE(nat,	TOK(h_many1(digit)));
 	H_ARULE(pnat,	TOK(SEQ(pdigit, h_many(digit))));
+	//H_RULE(npair,	SEQ(pnat, wel,ws, nat));
 
 #define OPT(X)	CHX(X, epsilon)
 
@@ -562,9 +564,8 @@ init_parser(struct Env *aux)
 		/* NB: lone backslashes and escaped newlines are ignored */
 	H_ARULE(schars,	h_many(CHX(schar, snest, sesc, eol)));
 	H_RULE(snest_,	SEQ(lparen, schars, rparen));
-	H_RULE(litstr,	h_middle(TOKD(lparen), schars, rparen));
-	H_RULE(hexchr,	h_right(ws, hdigit));
-	H_RULE(hexstr,	h_middle(TOKD(langle), h_many(hexchr), TOKD(rangle)));
+	H_RULE(litstr,	h_middle(lparen, schars, rparen));
+	H_RULE(hexstr,	h_middle(langle, MANY_WS(hdigit), rangle));
 	H_ARULE(string,	CHX(litstr, hexstr));
 	h_bind_indirect(snest, snest_);
 
@@ -572,13 +573,16 @@ init_parser(struct Env *aux)
 	H_RULE(dopen,	LIT("<<"));
 	H_RULE(dclose,	LIT(">>"));
 	H_RULE(obj,	h_indirect());
-	H_RULE(k_v,	SEQ(name, obj));
-	H_ARULE(dict,	h_middle(TOKD(dopen), h_many(k_v), TOKD(dclose)));
-	H_RULE(array,	h_middle(TOKD(lbrack), h_many(obj), TOKD(rbrack)));
+	H_RULE(k_v,	SEQ(name, ws, obj));
+	//H_RULE(k_v,	CHX(SEQ(name, wel,ws, obj),
+	//		    SEQ(name, dobj)));
+	H_ARULE(dict,	h_middle(dopen, MANY_WS(k_v), dclose));
+		// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok?
+	H_RULE(array,	h_middle(lbrack, MANY_WS(obj), rbrack));
 		// XXX validate: dict keys must be unique
 
 	/* streams */
-	H_RULE(stmbeg,	SEQ(dict, KW("stream"), OPT(cr), lf));
+	H_RULE(stmbeg,	SEQ(dict, ws, LIT("stream"), OPT(cr), lf));
 	H_RULE(stmend,	SEQ(OPT(eol), LIT("endstream")));
 	H_RULE(stream,	h_left(h_bind(stmbeg, kstream, aux), stmend));
 		// XXX is whitespace allowed between the eol and "endstream"?
@@ -597,7 +601,7 @@ init_parser(struct Env *aux)
 
 	/* body */
 	H_RULE(indobj,	CHX(stream, obj));
-	H_RULE(objdef,	SEQ(pnat, nat, KW("obj"), indobj, KW("endobj")));
+	H_RULE(objdef,	SEQ(pnat, nat, KW("obj"), ws, indobj, KW("endobj")));
 	H_RULE(body,	h_many(objdef));
 
 	/* for object streams */
@@ -617,7 +621,7 @@ init_parser(struct Env *aux)
 
 	/* cross-reference streams */
 	H_RULE(xstream,	h_bind(stmbeg, kxstream, aux));
-	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), xstream));
+	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), ws, xstream));
 		// XXX skip however much we consumed and check for "endstream endobj"?
 
 	/* trailer */
@@ -626,7 +630,7 @@ init_parser(struct Env *aux)
 			    LIT("%%EOF"), CHX(nl, end)));
 		// XXX should lws be allowed before EOF marker?
 		// NB: lws before xref offset is allowed, cf. p.48 (example 4)
-	H_RULE(xr_td,	SEQ(xrefs, KW("trailer"), dict));
+	H_RULE(xr_td,	SEQ(xrefs, KW("trailer"), ws, dict));
 
 	H_RULE(tail,	SEQ(body, h_optional(xr_td), startxr));
 	H_RULE(pdf,	SEQ(header, h_many1(tail), end));
-- 
GitLab