From 31cc688591341718e1c1daedcc791bc4d25fc306 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Sun, 24 Nov 2019 20:16:49 +0100 Subject: [PATCH] try to parse xrefs before the rest of the document --- pdf.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/pdf.c b/pdf.c index eff119b..b095f35 100644 --- a/pdf.c +++ b/pdf.c @@ -165,6 +165,8 @@ act_octal(const HParseResult *p, void *u) HParser *p_pdf; HParser *p_pdfdbg; +HParser *p_startxref; +HParser *p_xref; /* continuation for h_bind() */ HParser *kstream(HAllocator *, const HParsedToken *, void *); @@ -317,23 +319,23 @@ init_parser(void) // XXX cross-reference streams /* trailer */ - H_RULE(tdict, SEQ(KW("trailer"), dict, nl)); - H_RULE(startxr, SEQ(KW("startxref"), nl, - lws, xrnat, nl)); - // NB: lws before xref offset is allowed, cf. p.48 (example 4) - H_RULE(eofmark, SEQ(LIT("%%EOF"), CHX(nl, end))); + H_RULE(startxr, SEQ(nl, KW("startxref"), nl, + lws, xrnat, nl, + LIT("%%EOF"), CHX(nl, end))); // XXX should lws be allowed before EOF marker? - H_RULE(txrefs, SEQ(xrefs, tdict)); - H_RULE(trailer, SEQ(h_optional(txrefs), startxr, eofmark)); + // NB: lws before xref offset is allowed, cf. p.48 (example 4) + H_RULE(xr_td, SEQ(xrefs, KW("trailer"), dict)); - H_RULE(tail, SEQ(body, trailer)); + H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); H_RULE(pdf, SEQ(header, h_many1(tail), end)); /* debug parser to consume as much as possible */ - H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(trailer))); + H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr))); p_pdf = pdf; p_pdfdbg = pdfdbg; + p_startxref = startxr; + p_xref = xr_td; } /* @@ -395,10 +397,10 @@ fail: int main(int argc, char *argv[]) { - HParseResult *res; + HParseResult *res = NULL; const char *infile = NULL; const uint8_t *input; - size_t sz; + size_t sz, startxref; int fd; /* command line handling */ @@ -419,8 +421,33 @@ int main(int argc, char *argv[]) if (input == MAP_FAILED) err(1, "mmap"); - /* build and run parser */ + /* build parsers */ init_parser(); + + /* search for the "startxref" section from the back of the file */ + HParser *p = h_left(p_startxref, h_end_p()); + for (size_t i = 0; i < sz; i++) { + res = h_parse(p, input + sz - i, i); + if (res) break; + } + if (res == NULL) { + fprintf(stderr, "%s: startxref not found\n", infile); + return 1; + } + startxref = H_INDEX_UINT(res->ast, 0); + + /* parse cross-references and trailer dictionary */ + res = h_parse(p_xref, input + startxref, sz - startxref); + if (!res) { + fprintf(stderr, "%s: error parsing xref/trailer at " + "position %zu (0x%zx)\n", infile, startxref, startxref); + // continue anyway... + } + // XXX debug + //h_pprint(stderr, res->ast, 0, 2); + //return 0; + + /* run the main parser */ res = h_parse(p_pdf, input, sz); if (!res) { fprintf(stderr, "%s: no parse\n", infile); -- GitLab