diff --git a/pdf.c b/pdf.c index 3ec9e81fdf8c554019b3cc02fc3bb61531a0a34e..fe155c14f2fb0985436f531439af76707f0c498e 100644 --- a/pdf.c +++ b/pdf.c @@ -1,9 +1,11 @@ /* beginnings of a PDF parser in hammer * pesco 2019 * pompolic 2020 + * Paul Vines 2020 */ #include <string.h> /* strncmp(), memset() */ +#include <stdlib.h> /* exit() */ #include <hammer/hammer.h> #include <hammer/glue.h> @@ -18,6 +20,13 @@ #define IN(STR) h_in((const uint8_t *)(STR), sizeof(STR) - 1) #define NOT_IN(STR) h_not_in((const uint8_t *)(STR), sizeof(STR) - 1) +#ifdef LOG +#define VIOL(P,VIOL) h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL) +#else +#define VIOL(P,VIOL) P +#endif + + /* * some helpers @@ -27,6 +36,7 @@ HParser *p_fail; HParser *p_epsilon; HParser *p_return_0; HParser *p_return_1; +uint8_t strictness = 0; /* a combinator to parse a given character but return a different value */ @@ -411,6 +421,31 @@ act_nat(const HParseResult *p, void *u) #define act_xroff act_nat #define act_xrgen act_nat
+HParser *p_violsev;	/* parses the number after the first '=' in a violation message; set in init_parser() */
+/* Action behind VIOL(): report violation message 'viol' (a C string) on stderr
+ * with the h_tell position / 8 as its offset, exit(1) if its severity exceeds a
+ * nonzero 'strictness', else pass on the wrapped parser's result (p->ast[0]). */
+HParsedToken *
+act_viol(const HParseResult *p, void *viol)
+{
+	const char *msg = viol;
+	unsigned long long severity = 99999;	/* fallback: message carries no severity */
+	unsigned long long offset = p->ast->seq->elements[1]->uint / 8;
+	HParseResult *sev = h_parse(p_violsev, (const uint8_t *)msg, strlen(msg));
+	if (sev) {
+		severity = sev->ast->seq->elements[0]->uint;
+		h_parse_result_free(sev);	/* release the throwaway severity parse */
+	} else {
+		fprintf(stderr, "Severity for violation %s not assigned!\n", msg);
+	}
+	fprintf(stderr, "VIOLATION[%llu]@%llu (0x%llx): %s\n", severity, offset, offset, msg);
+	if (strictness && severity > strictness) {
+		exit(1);
+	}
+	/* Just return the parse AST, drop the h_tell */
+	return (HParsedToken *) p->ast->seq->elements[0];
+}
+
 bool validate_pnat(HParseResult *p, void *u) { @@ -856,8 +891,9 @@ init_parser(struct Env *aux) /* dictionaries */ H_RULE(dopen, LIT("<<")); H_RULE(dclose, LIT(">>")); - H_RULE(k_v, CHX(SEQ(name, wel,ws, obj), + SEQ(name, CHX(name,dobj)))); + 
H_RULE(k_v, CHX(CHX(SEQ(name, wel,ws, obj), + SEQ(name, CHX(name,dobj))), + VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)"))); H_ARULE(dict_, h_middle(dopen, MANY_WS(k_v), dclose)); // XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok? // XXX validate: dict keys must be unique @@ -880,8 +916,24 @@ init_parser(struct Env *aux) h_bind_indirect(array, array_); /* streams */ - H_RULE(stmbeg, SEQ(dict, ws, LIT("stream"), OPT(cr), lf)); - H_RULE(stmend, SEQ(OPT(eol), LIT("endstream"))); + H_RULE(stmbeg, SEQ(dict, OPT(ws), LIT("stream"), OPT(cr), lf)); + H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")), + VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"), + VIOL(SEQ(h_many1(wchar), LIT("endstream")), + "No newline before endstream (severity=7)"), + VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"), + VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")), + "Stream length 1-too-short (severity=4)"), + VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), + SEQ(npair, wel, KW("obj")), + KW("xref"), + LIT("endstream")))), LIT("endstream")), + "Stream length >1-too-short (severity=5)"), + VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), + SEQ(npair, wel, KW("obj")), + KW("xref")))), + "Missing endstream token (severity=7)"))); + H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend)); // XXX is whitespace allowed between the eol and "endstream"? 
@@ -890,12 +942,18 @@ init_parser(struct Env *aux) */ /* header */ - H_RULE(version, SEQ(pdigit, IGN(period), pdigit)); + H_RULE(version, SEQ(pdigit, IGN(period), digit)); H_RULE(header, h_middle(LIT("%PDF-"), version, nl)); /* body */ H_RULE(indobj, CHX(stream, obj)); - H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, KW("endobj"))); + H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, + CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))), + "More than 1 endobj token (severity=1)"), + VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))), + "More than 1 >> and endobj token (severity=2)"), + SEQ(OPT(ws), OPT(lws), KW("endobj")), + VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)")))); H_RULE(body, h_many(objdef)); /* for object streams */ @@ -918,26 +976,44 @@ init_parser(struct Env *aux) // XXX skip however much we consumed and check for "endstream endobj"? /* trailer */ - H_RULE(startxr, SEQ(nl, KW("startxref"), nl, + H_RULE(startxr, SEQ(nl, KW("startxref"), nl, + lws, nat, nl, + LIT("%%EOF"), OPT(nl))); + + /* used for the backwards search */ + H_RULE(lasteof, SEQ(nl, KW("startxref"), nl, lws, nat, nl, - LIT("%%EOF"), CHX(nl, end))); // XXX the real world sometimes omits nl after %%EOF inside the file. // the next 'tail' would be appended right after the 'F', // presumably because the previous version of the file // ended without a trailing newline. m) - // this is invalid per spec, because it creates a run-on + // this is invalid per spec, because it creates a run-on // comment, but we should probably accept-and-warn. // XXX should lws be allowed before EOF marker? // NB: lws before xref offset is allowed, cf. 
p.48 (example 4) + LIT("%%EOF"), + CHX(VIOL(SEQ(nl, h_many1(nl), end), + "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"), + SEQ(h_many(nl), end), + VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), + "(offset FROM END) Data after final %%EOF (severity=7)")))); + H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict)); - H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); - // XXX the real world likes to omit 'startxr' from all but the - // last trailer. we should accept-and-warn in that case. - H_RULE(pdf, SEQ(header, h_many1(tail), end)); + H_RULE(hdr_junk, VIOL(h_many1(h_butnot(h_ch_range(0, 255), objdef)), + "Uncommented junk after header (severity=1)")); + H_RULE(tail, SEQ(body, CHX(SEQ(h_optional(xr_td), startxr), + VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)), + OPT(nl), OPT(LIT("%%EOF")), OPT(nl)), + "Improper end of trailer - missing startxref and/or %%EOF (severity=5)")))); + H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"), + VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), + "Data after final %%EOF (severity=7)"), + end)); + H_RULE(pdf, SEQ(header, OPT(hdr_junk), h_many1(tail), final_eof_junk)); /* debug parser to consume as much as possible */ - H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr))); + H_RULE(pdfdbg, SEQ(header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk)))); /* * filters @@ -979,7 +1055,7 @@ init_parser(struct Env *aux) /* global parser variables */ p_pdf = pdf; p_pdfdbg = pdfdbg; - p_startxref = startxr; + p_startxref = lasteof; //startxr; p_xref = CHX(xr_td, xrstm); p_objdef = objdef; p_a85string = a85string; @@ -990,6 +1066,12 @@ init_parser(struct Env *aux) p_return_0 = h_action(epsilon, act_return_uint, (void *)0); p_return_1 = h_action(epsilon, act_return_uint, (void *)1); + /* Parsing of severity messages */ + H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("="))); 
+ H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)), + act_nat, NULL)); + H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num)); + p_violsev = violsev; #if 0 // XXX testing int r; @@ -1563,7 +1645,7 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env) //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug - dict_p = p_return__m(mm__, dict_t); + dict_p = p_return__m(mm__, dict_t); bytes_p = p_take__m(mm__, sz, aux); spec = h_alloc(mm__, sizeof(struct streamspec)); @@ -1635,9 +1717,9 @@ p_xrefdata__m(HAllocator *mm__, const Dict *dict) * for field 3 (generation). in fact, these are the only defaults * defined by ISO 32000-1:2008 (PDF 1.7). * - * entry type field no. default value - * 1 (type) 1 - * 1 ("n") 3 (gen.) 0 + * entry type field no. default value + * 1 (type) 1 + * 1 ("n") 3 (gen.) 0 */ /* Size (required) - total size of xref table */ @@ -1680,7 +1762,7 @@ p_xrefdata__m(HAllocator *mm__, const Dict *dict) * * [t x y] with t,x,y > 0 full general form * [0 x y] with x,y > 0 only type-1 ("in use") entries - * [0 x 0] with x > 0 only type-1 entries, only offsets + * [0 x 0] with x > 0 only type-1 entries, only offsets * * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed * by the spec; as long as all entries are of type 1, the xref data can @@ -1907,10 +1989,22 @@ main(int argc, char *argv[]) int fd; /* command line handling */
-	if (argc != 2) {
-		fprintf(stderr, "usage: %s file\n", argv[0]);
+	if (argc < 2 || argc > 3) {
+		fprintf(stderr, "usage: %s file [strictness]\n", argv[0]);
 		return 1;
 	}
+	if (argc == 3) {
+		/* optional strictness level: act_viol() exits on a violation whose
+		 * severity exceeds it; 0 (the default) never exits on a violation */
+		char *end;
+		unsigned long level = strtoul(argv[2], &end, 10);
+
+		if (end == argv[2] || *end != '\0' || level > 255) {
+			fprintf(stderr, "%s: strictness must be an integer in 0..255\n", argv[0]);
+			return 1;
+		}
+		strictness = (uint8_t)level;	/* range-checked above, no truncation */
+	}
 infile = argv[1]; /* mmap the input file */