From b318059c8df3616fdee35412814c6712070da4f9 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Tue, 26 Nov 2019 18:18:05 +0100 Subject: [PATCH] recognize cross-reference streams (no decode, yet) --- pdf.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 20 deletions(-) diff --git a/pdf.c b/pdf.c index f6dd414..e15b5b6 100644 --- a/pdf.c +++ b/pdf.c @@ -31,6 +31,26 @@ mapch(uint8_t c, uint8_t v) return h_action(h_ch(c), act_mapch, (void *)(uintptr_t)v); } +/* a helper to look up a value in a dictionary */ +const HParsedToken * +dictentry(const HCountedArray *dict, const char *key) +{ + HParsedToken *ent; + HBytes k; + size_t len; + + len = strlen(key); + for (size_t i = 0; i < dict->used; i++) { + ent = dict->elements[i]; + k = H_INDEX_BYTES(ent, 0); + + if (k.len == len && strncmp(key, k.token, k.len) == 0) + return H_INDEX_TOKEN(ent, 1); + } + + return NULL; +} + /* * semantic actions @@ -158,6 +178,60 @@ act_octal(const HParseResult *p, void *u) #define act_xrefs h_act_last +/* + * validate the /Type field on a cross-reference stream. + * + * p = pnat nat (dict offs offs) + */ +bool +validate_xrstm(HParseResult *p, void *u) +{ + const HCountedArray *tdict = H_FIELD_SEQ(2, 0); + const HParsedToken *v = dictentry(tdict, "Type"); + +#if 0 + if (v == NULL) + fprintf(stderr, "stream dict has not /Type\n"); + else if (v->token_type != TT_BYTES) + fprintf(stderr, "stream /Type is no name object\n"); + else if (v->bytes.len == 4 && strncmp("XRef", v->bytes.token, v->bytes.len) == 0) + return true; + return false; +#endif + + return (v != NULL && v->token_type == TT_BYTES && v->bytes.len == 4 && + strncmp("XRef", v->bytes.token, v->bytes.len) == 0); +} + +/* + * interpret a cross-reference stream and return it in the same form as other + * cross-reference sections: + * + * p = pnat nat (dict offs offs) + * result = (xrefs dict) + */ +HParsedToken * +act_xrstm(const HParseResult *p, void *u) +{ + HParsedToken *xrefs, *offs1, *offs2, *dict, *result; + + dict = H_INDEX_TOKEN(p->ast, 2, 0); + offs1 = H_INDEX_TOKEN(p->ast, 2, 1); + offs2 = H_INDEX_TOKEN(p->ast, 2, 2); + + // XXX decode XRefStm + + xrefs = H_MAKE_SEQN(2); + xrefs->seq->elements[0] = offs1; + xrefs->seq->elements[1] = offs2; + xrefs->seq->used = 2; + result = H_MAKE_SEQN(2); + result->seq->elements[0] = xrefs; + result->seq->elements[1] = dict; + result->seq->used = 2; + return result; +} + /* * input grammar @@ -316,7 +390,7 @@ init_parser(void) H_RULE(xrhead, SEQ(xrnat, IGN(sp), xrnat, nl)); H_RULE(xrsub, SEQ(xrhead, h_many(xrent))); H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub))); - // XXX cross-reference streams + H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), stream, KW("endobj"))); /* trailer */ H_RULE(startxr, SEQ(nl, KW("startxref"), nl, @@ -324,7 +398,7 @@ init_parser(void) LIT("%%EOF"), CHX(nl, end))); // XXX should lws be allowed before EOF marker? // NB: lws before xref offset is allowed, cf. p.48 (example 4) - H_RULE(xr_td, SEQ(xrefs, KW("trailer"), dict)); + H_RULE(xr_td, CHX(SEQ(xrefs, KW("trailer"), dict), xrstm)); H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); H_RULE(pdf, SEQ(header, h_many1(tail), end)); @@ -338,23 +412,10 @@ init_parser(void) p_xref = xr_td; } -const HParsedToken * -dictentry(const HCountedArray *dict, const char *key) +HParsedToken * +act_return(const HParseResult *p, void *u) { - HParsedToken *ent; - HBytes k; - size_t len; - - len = strlen(key); - for (size_t i = 0; i < dict->used; i++) { - ent = dict->elements[i]; - k = H_INDEX_BYTES(ent, 0); - - if (k.len == len && strncmp(key, k.token, k.len) == 0) - return H_INDEX_TOKEN(ent, 1); - } - - return NULL; + return u; } /* @@ -365,7 +426,8 @@ dictentry(const HCountedArray *dict, const char *key) HParser * kstream(HAllocator *mm__, const HParsedToken *x, void *env) { - const HCountedArray *dict = H_INDEX_SEQ(x, 0); + const HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + const HCountedArray *dict = H_CAST_SEQ(dict_t); const HParsedToken *v = NULL; size_t sz; @@ -379,7 +441,9 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env) //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug HParser *tell = h_tell__m(mm__); HParser *skip = h_skip__m(mm__, sz * 8); - return h_sequence__m(mm__, tell, skip, tell, NULL); + HParser *eps = h_epsilon_p__m(mm__); + HParser *ret = h_action__m(mm__, eps, act_return, (void *)dict_t); + return h_sequence__m(mm__, ret, tell, skip, tell, NULL); fail: #if 0 if (v == NULL) -- GitLab