From c316cc894443e526b4986cf5f778a4f544bae6a8 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Sat, 8 Feb 2020 20:10:49 +0100 Subject: [PATCH] let 'obj' no longer consume leading whitespace --- pdf.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/pdf.c b/pdf.c index 2ceff3c..208cd10 100644 --- a/pdf.c +++ b/pdf.c @@ -509,13 +509,14 @@ init_parser(struct Env *aux) /* whitespace */ H_RULE(comment, SEQ(percent, line)); - H_RULE(ws, IGN(h_many(CHX(wchar, comment)))); - H_RULE(lws, IGN(h_many(lwchar))); + H_RULE(wel, IGN(CHX(wchar, comment))); + H_RULE(ws, IGN(h_many(wel))); + H_RULE(lws, IGN(h_many(IGN(lwchar)))); + /* tokenization */ +#define MANY_WS(X) h_many(CHX(wel, X)) #define TOK(X) h_middle(ws, X, h_not(rchar)) #define KW(S) IGN(TOK(LIT(S))) -#define TOKD(X) h_right(ws, X) /* for tokens that end on delimiters */ -// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok? /* misc */ H_RULE(nl, IGN(h_right(lws, eol))); @@ -524,6 +525,7 @@ init_parser(struct Env *aux) H_RULE(empty, SEQ(epsilon)); H_ARULE(nat, TOK(h_many1(digit))); H_ARULE(pnat, TOK(SEQ(pdigit, h_many(digit)))); + //H_RULE(npair, SEQ(pnat, wel,ws, nat)); #define OPT(X) CHX(X, epsilon) @@ -562,9 +564,8 @@ init_parser(struct Env *aux) /* NB: lone backslashes and escaped newlines are ignored */ H_ARULE(schars, h_many(CHX(schar, snest, sesc, eol))); H_RULE(snest_, SEQ(lparen, schars, rparen)); - H_RULE(litstr, h_middle(TOKD(lparen), schars, rparen)); - H_RULE(hexchr, h_right(ws, hdigit)); - H_RULE(hexstr, h_middle(TOKD(langle), h_many(hexchr), TOKD(rangle))); + H_RULE(litstr, h_middle(lparen, schars, rparen)); + H_RULE(hexstr, h_middle(langle, MANY_WS(hdigit), rangle)); H_ARULE(string, CHX(litstr, hexstr)); h_bind_indirect(snest, snest_); @@ -572,13 +573,16 @@ init_parser(struct Env *aux) H_RULE(dopen, LIT("<<")); H_RULE(dclose, LIT(">>")); H_RULE(obj, h_indirect()); - H_RULE(k_v, SEQ(name, obj)); - H_ARULE(dict, 
h_middle(TOKD(dopen), h_many(k_v), TOKD(dclose))); - H_RULE(array, h_middle(TOKD(lbrack), h_many(obj), TOKD(rbrack))); + H_RULE(k_v, SEQ(name, ws, obj)); + //H_RULE(k_v, CHX(SEQ(name, wel,ws, obj), + // SEQ(name, dobj))); + H_ARULE(dict, h_middle(dopen, MANY_WS(k_v), dclose)); + // XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok? + H_RULE(array, h_middle(lbrack, MANY_WS(obj), rbrack)); // XXX validate: dict keys must be unique /* streams */ - H_RULE(stmbeg, SEQ(dict, KW("stream"), OPT(cr), lf)); + H_RULE(stmbeg, SEQ(dict, ws, LIT("stream"), OPT(cr), lf)); H_RULE(stmend, SEQ(OPT(eol), LIT("endstream"))); H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend)); // XXX is whitespace allowed between the eol and "endstream"? @@ -597,7 +601,7 @@ init_parser(struct Env *aux) /* body */ H_RULE(indobj, CHX(stream, obj)); - H_RULE(objdef, SEQ(pnat, nat, KW("obj"), indobj, KW("endobj"))); + H_RULE(objdef, SEQ(pnat, nat, KW("obj"), ws, indobj, KW("endobj"))); H_RULE(body, h_many(objdef)); /* for object streams */ @@ -617,7 +621,7 @@ init_parser(struct Env *aux) /* cross-reference streams */ H_RULE(xstream, h_bind(stmbeg, kxstream, aux)); - H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), xstream)); + H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), ws, xstream)); // XXX skip however much we consumed and check for "endstream endobj"? /* trailer */ @@ -626,7 +630,7 @@ init_parser(struct Env *aux) LIT("%%EOF"), CHX(nl, end))); // XXX should lws be allowed before EOF marker? // NB: lws before xref offset is allowed, cf. p.48 (example 4) - H_RULE(xr_td, SEQ(xrefs, KW("trailer"), dict)); + H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict)); H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); H_RULE(pdf, SEQ(header, h_many1(tail), end)); -- GitLab