From 76bf656776f059f083f6476d51c398530d5dec35 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Fri, 29 Nov 2019 21:03:12 +0100 Subject: [PATCH] parse xref streams without looking at /Length --- pdf.c | 56 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/pdf.c b/pdf.c index 35fcc92..5711efb 100644 --- a/pdf.c +++ b/pdf.c @@ -72,6 +72,15 @@ dictentry(const HCountedArray *dict, const char *key) } +/* + * auxiliary global data structure needed by the parser + */ +struct Env { + const char *input; + size_t sz; +}; + + /* * semantic actions */ @@ -198,6 +207,15 @@ act_octal(const HParseResult *p, void *u) #define act_xrefs h_act_last +HParsedToken * +act_rest(const HParseResult *p, void *env) +{ + struct Env *aux = env; + size_t offset = H_CAST_UINT(p->ast) / 8; + + return H_MAKE_BYTES(aux->input + offset, aux->sz - offset); +} + /* stream semantics (defined further below) */ bool validate_xrstm(HParseResult *, void *); HParsedToken *act_xrstm(const HParseResult *, void *); @@ -212,11 +230,12 @@ HParser *p_pdfdbg; HParser *p_startxref; HParser *p_xref; -/* continuation for h_bind() */ +/* continuations for h_bind() */ HParser *kstream(HAllocator *, const HParsedToken *, void *); +HParser *kxstream(HAllocator *, const HParsedToken *, void *); void -init_parser(const char *input) +init_parser(struct Env *aux) { /* lines */ H_RULE(cr, p_mapch('\r', '\n')); /* semantic value: \n */ @@ -329,7 +348,7 @@ init_parser(const char *input) /* streams */ H_RULE(stmbeg, SEQ(dict, KW("stream"), OPT(cr), lf)); H_RULE(stmend, SEQ(OPT(eol), LIT("endstream"))); - H_RULE(stream, h_left(h_bind(stmbeg, kstream, (void *)input), stmend)); + H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend)); // XXX is whitespace allowed between the eol and "endstream"? H_RULE(obj_, CHX(ref, null, boole, real, intg, name, string, @@ -349,7 +368,7 @@ init_parser(const char *input) H_RULE(objdef, SEQ(pnat, nat, KW("obj"), indobj, KW("endobj"))); H_RULE(body, h_many(objdef)); // XXX object streams - /* cross-reference table */ + /* cross-reference section */ H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf)); // ^ XXX does the real world follow this rule?! cf. loop.pdf H_RULE(xrtyp, CHX(h_ch('n'), h_ch('f'))); @@ -360,7 +379,11 @@ init_parser(const char *input) H_RULE(xrhead, SEQ(xrnat, IGN(sp), xrnat, nl)); H_RULE(xrsub, SEQ(xrhead, h_many(xrent))); H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub))); - H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), stream, KW("endobj"))); + + /* cross-reference streams */ + H_RULE(rest, h_action(h_tell(), act_rest, aux)); + H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), stmbeg, rest)); + // XXX skip however much we consumed and check for "endstream endobj"? /* trailer */ H_RULE(startxr, SEQ(nl, KW("startxref"), nl, @@ -368,7 +391,7 @@ init_parser(const char *input) LIT("%%EOF"), CHX(nl, end))); // XXX should lws be allowed before EOF marker? // NB: lws before xref offset is allowed, cf. p.48 (example 4) - H_RULE(xr_td, CHX(SEQ(xrefs, KW("trailer"), dict), xrstm)); + H_RULE(xr_td, SEQ(xrefs, KW("trailer"), dict)); H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); H_RULE(pdf, SEQ(header, h_many1(tail), end)); @@ -379,10 +402,15 @@ init_parser(const char *input) p_pdf = pdf; p_pdfdbg = pdfdbg; p_startxref = startxr; - p_xref = xr_td; + p_xref = CHX(xr_td, xrstm); } -/* combine current position combined with env=(input,sz) into HBytes */ + +/* + * stream object handling incl. cross-reference streams + */ + +/* combine current position with env=(input,sz) into HBytes */ HParsedToken * act_ks_bytes(const HParseResult *p, void *env) { @@ -405,7 +433,7 @@ act_ks_bytes(const HParseResult *p, void *env) HParser * kstream(HAllocator *mm__, const HParsedToken *x, void *env) { - const char *input = env; + struct Env *aux = env; const HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); const HCountedArray *dict = H_CAST_SEQ(dict_t); const HParsedToken *v = NULL; @@ -422,7 +450,7 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env) /* dummy struct to hold the pair (input,sz) */ HBytes *bytes = h_alloc(mm__, sizeof(HBytes)); - bytes->token = input; + bytes->token = aux->input; bytes->len = sz; HParser *tell = h_tell__m(mm__); @@ -473,7 +501,7 @@ validate_xrstm(HParseResult *p, void *u) * interpret a cross-reference stream and return it in the same form as other * cross-reference sections: * - * p = pnat nat (dict bytes) + * p = (pnat nat (dict ...) bytes) * result = (xrefs dict) */ HParsedToken * @@ -482,7 +510,7 @@ act_xrstm(const HParseResult *p, void *u) HParsedToken *bytes, *dict, *result; dict = H_INDEX_TOKEN(p->ast, 2, 0); - bytes = H_INDEX_TOKEN(p->ast, 2, 1); + bytes = H_INDEX_TOKEN(p->ast, 3); // XXX decode XRefStm @@ -597,6 +625,7 @@ end: int main(int argc, char *argv[]) { + struct Env aux; HParseResult *res = NULL; const HParsedToken **xrefs; const uint8_t *input; @@ -622,7 +651,8 @@ main(int argc, char *argv[]) err(1, "mmap"); /* build parsers */ - init_parser(input); + aux = (struct Env){input, sz}; + init_parser(&aux); /* parse all cross-reference sections and trailer dictionaries */ xrefs = parse_xrefs(input, sz, &nxrefs); -- GitLab