diff --git a/pdf.c b/pdf.c
index e5cf938f4f565637ed5bbba6bb0288dc0cf1babb..a47599f2660287468fd6b162a6a02a537967a6b5 100644
--- a/pdf.c
+++ b/pdf.c
@@ -18,9 +18,9 @@
 #define NOT_IN(STR) h_not_in((const uint8_t *)(STR), sizeof(STR) - 1)

 #ifdef LOG
-#define WARN(P,MSG) h_action(P, act_warn, (char *) MSG)
+#define VIOL(P,MSG) h_action(P, act_viol, (void *)(MSG))
 #else
-#define WARN(P,MSG) P
+#define VIOL(P,MSG) (P)
 #endif


@@ -35,14 +35,6 @@ HParser *p_return_1;

 /* a combinator to parse a given character but return a different value */

-HParsedToken *
-act_warn(const HParseResult *p, void *warnstring)
-{
-	fprintf(stderr, "WARNING: %s\n", (char *) warnstring);
-	return (HParsedToken *) p->ast;
-}
-
-
 HParsedToken *
 act_return_uint(const HParseResult *p, void *u)
 {
@@ -91,7 +83,6 @@ validate_eq_uint(HParseResult *p, void *u)
 	    v->uint == (uint64_t)(uintptr_t)u);
 }

-
 /*
  * auxiliary global data structure needed by the parser
  */
@@ -244,13 +235,40 @@ act_nat(const HParseResult *p, void *u)

 	// XXX check for overflow
 	for (size_t i = 0; i < seq->used; i++)
-		x = x*10 + H_CAST_UINT(seq->elements[i]);
+		x = x*10 + (H_CAST_UINT(seq->elements[i]));

 	return H_MAKE_UINT(x);
 }
 #define act_xroff act_nat
 #define act_xrgen act_nat

+/*
+ * semantic action: report a spec violation on stderr and pass the parse
+ * result through unchanged.
+ *
+ * 'viol' is the NUL-terminated message given to VIOL(). the severity is the
+ * decimal number following the first '=' in the message, as in
+ * "... (severity=4)"; it is reported as 0 if the message carries none.
+ */
+HParsedToken *
+act_viol(const HParseResult *p, void *viol)
+{
+	const char *msg = viol;
+	const char *eq = strchr(msg, '=');
+	unsigned int severity = 0;
+
+	// scan the digits by hand: building hammer parsers on every call would
+	// leak them, and a message without "=<digits>" must not crash us.
+	if (eq != NULL) {
+		for (const char *d = eq + 1; *d >= '0' && *d <= '9'; d++)
+			severity = severity * 10 + (unsigned int)(*d - '0');
+	}
+
+	fprintf(stderr, "VIOLATION[%u]: %s\n", severity, msg);
+
+	return (HParsedToken *) p->ast;
+}
+
 bool
 validate_pnat(HParseResult *p, void *u)
 {
@@ -668,7 +686,12 @@ init_parser(struct Env *aux)
 	// XXX skip however much we consumed and check for "endstream endobj"?

 	/* trailer */
 	H_RULE(startxr, SEQ(nl, KW("startxref"), nl,
+	    lws, nat, nl,
+	    LIT("%%EOF"), OPT(nl)));
+
+	/* used for the backwards search */
+	H_RULE(lasteof, SEQ(nl, KW("startxref"), nl,
 	    lws, nat, nl,
 	    // XXX the real world sometimes omits nl after %%EOF inside the file.
 	    // the next 'tail' would be appended right after the 'F',
@@ -676,17 +699,24 @@ init_parser(struct Env *aux)
 	    // ended without a trailing newline. m)
 	    // this is invalid per spec, because it creates a run-on
 	    // comment, but we should probably accept-and-warn.
-	    LIT("%%EOF"), CHX(h_many(nl), end)));
 	    // XXX should lws be allowed before EOF marker?
 	    // NB: lws before xref offset is allowed, cf. p.48 (example 4)
+	    LIT("%%EOF"),
+	    CHX(VIOL(SEQ(nl, h_many1(nl)), "Multiple newlines after final %%EOF (severity=4)"),
+	        h_many(nl))));
+
 	H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict));

-	H_RULE(hdr_junk, WARN(h_many1(h_butnot(h_ch_range(0, 255), objdef)),
-	    "Junk after header"));
+	H_RULE(hdr_junk, VIOL(h_many1(h_butnot(h_ch_range(0, 255), objdef)),
+	    "Uncommented junk after header (severity=1)"));
 	H_RULE(tail, SEQ(body, h_optional(xr_td), startxr));
 	// XXX the real world likes to omit 'startxr' from all but the
 	// last trailer. we should accept-and-warn in that case.
-	H_RULE(pdf, SEQ(header, OPT(hdr_junk), h_many1(tail), end));
+	// newlines after the final %%EOF are reported but tolerated; either
+	// way nothing else may follow, so 'end' stays mandatory.
+	H_RULE(final_eof_junk, SEQ(OPT(VIOL(h_many1(nl),
+	    "Multiple newlines after final %%EOF (severity=4)")), end));
+	H_RULE(pdf, SEQ(header, OPT(hdr_junk), h_many1(tail), final_eof_junk));

 	/* debug parser to consume as much as possible */
 	H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
@@ -695,7 +725,7 @@ init_parser(struct Env *aux)
 	/* global parser variables */
 	p_pdf = pdf;
 	p_pdfdbg = pdfdbg;
-	p_startxref = startxr;
+	p_startxref = lasteof;	/* lenient variant, used for the backwards search */
 	p_xref = CHX(xr_td, xrstm);
 	p_objdef = objdef;
