From 0f96144c7e56bfd64c9a1d4c2d1267c74176cb67 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Mon, 2 Dec 2019 21:11:07 +0100
Subject: [PATCH] write the parser for (uncompressed) xref streams - untested

---
 pdf.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 207 insertions(+), 11 deletions(-)

diff --git a/pdf.c b/pdf.c
index 64d0b5a..60feefe 100644
--- a/pdf.c
+++ b/pdf.c
@@ -21,18 +21,23 @@
  * some helpers
  */
 
+HParser *p_fail;
+HParser *p_epsilon;
+HParser *p_return_0;
+HParser *p_return_1;
+
 /* a combinator to parse a given character but return a different value */
 HParsedToken *
-act_mapch(const HParseResult *p, void *u)
+act_return_uint(const HParseResult *p, void *u)
 {
-	return H_MAKE_UINT((uint8_t)(uintptr_t)u);
+	return H_MAKE_UINT((uint64_t)(uintptr_t)u);
 }
 
 HParser *
 p_mapch(uint8_t c, uint8_t v)
 {
-	return h_action(h_ch(c), act_mapch, (void *)(uintptr_t)v);
+	return h_action(h_ch(c), act_return_uint, (void *)(uintptr_t)v);
 }
 
 /* a parser that just returns a given token */
@@ -46,9 +51,7 @@ act_return(const HParseResult *p, void *u)
 HParser *
 p_return__m(HAllocator *mm__, const HParsedToken *tok)
 {
-	HParser *eps = h_epsilon_p__m(mm__);
-
-	return h_action__m(mm__, eps, act_return, (void *)tok);
+	return h_action__m(mm__, p_epsilon, act_return, (void *)tok);
 }
 
 /* a helper to look up a value in a dictionary */
@@ -71,6 +74,14 @@ dictentry(const HCountedArray *dict, const char *key)
 	return NULL;
 }
 
+bool
+validate_eq_uint(HParseResult *p, void *u)
+{
+	const HParsedToken *v = p->ast;
+	return (v != NULL && v->token_type == TT_UINT &&
+	    v->uint == (uint64_t)(uintptr_t)u);
+}
+
 
 /*
  * auxiliary global data structure needed by the parser
@@ -382,7 +393,8 @@ init_parser(struct Env *aux)
 
 	/* cross-reference streams */
 	H_RULE(rest,	h_action(h_tell(), act_rest, aux));
-	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), stmbeg, rest));
+	H_RULE(xstream,	h_bind(SEQ(stmbeg, rest), kxstream, aux));
+	H_AVRULE(xrstm,	SEQ(pnat, nat, KW("obj"), xstream));
 		// XXX skip however much we consumed and check for "endstream endobj"?
 
 	/* trailer */
@@ -399,10 +411,17 @@ init_parser(struct Env *aux)
 
 	/* debug parser to consume as much as possible */
 	H_RULE(pdfdbg,	SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
+
+	/* global parser variables */
 	p_pdf = pdf;
 	p_pdfdbg = pdfdbg;
 	p_startxref = startxr;
 	p_xref = CHX(xr_td, xrstm);
+
+	p_fail = h_nothing_p();
+	p_epsilon = epsilon;
+	p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
+	p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
 }
 
@@ -469,7 +488,7 @@ fail:
 	fprintf(stderr, "stream /Length negative\n");
 #endif
 	//h_pprintln(stderr, p);	// XXX debug
-	return h_nothing_p__m(mm__);
+	return p_fail;
 }
 
 /*
@@ -501,7 +520,7 @@ validate_xrstm(HParseResult *p, void *u)
  * interpret a cross-reference stream and return it in the same form as other
  * cross-reference sections:
  *
- *	p = (pnat nat (dict ...) bytes)
+ *	p = (pnat nat (dict ...) xrefs)
  *	result = (xrefs dict)
  */
 HParsedToken *
@@ -512,8 +531,6 @@ act_xrstm(const HParseResult *p, void *u)
 	dict = H_INDEX_TOKEN(p->ast, 2, 0);
 	bytes = H_INDEX_TOKEN(p->ast, 3);
 
-	// XXX decode XRefStm
-
 	result = H_MAKE_SEQN(2);
 	result->seq->elements[0] = bytes;
 	result->seq->elements[1] = dict;
@@ -521,6 +538,185 @@ act_xrstm(const HParseResult *p, void *u)
 	return result;
 }
 
+HParser *
+p_xrefsub__m(HAllocator *mm__, size_t base, size_t n, HParser *p_entry)
+{
+	return h_repeat_n__m(mm__, p_entry, n);
+}
+
+/* x = ((dict ...) bytes) */
+HParser *
+kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	//struct Env *aux = env;
+	const HParsedToken *v, *dict_t;
+	const HParseResult *res;
+	HCountedArray *dict;
+	HBytes bytes;
+	size_t W[3];
+	size_t Size, Wn, Wskip;
+	const uint8_t *data;
+	size_t sz;
+	HParser *p_field[3], *p_entry, **p_subs, *p_xrefdata;
+
+	dict_t = H_INDEX_TOKEN(x, 0, 0);
+	dict = H_CAST_SEQ(dict_t);
+	bytes = H_INDEX_BYTES(x, 1);
+
+	/*
+	 * what follows is a horrible bunch of code that builds, from the
+	 * entries /W, /Index, and /Size in the stream dictionary, a parser for
+	 * the cross-reference data itself.
+	 *
+	 * in short, every cross-reference entry consists of (as of PDF 1.7)
+	 * three fields, but it could be more. /W gives the widths (in bytes)
+	 * of these fields. the /Index specifies the division of the data into
+	 * subsections; it is an array of natural numbers that in pairs specify
+	 * the base object number and length of each subsection - analogous to
+	 * the subsection headers in classic xref sections.
+	 *
+	 * when /Index is missing, a default value of [0 Size] is defined,
+	 * where Size is the value of the /Size field. as in normal trailer
+	 * dictionaries, it specifies the total size of the (entire)
+	 * cross-reference table.
+	 *
+	 * when /W states a width of 0 for a field, that field is not present
+	 * in the data and a default value should be used "if there is one".
+	 * most notably, the first field determines the "type" of the entry,
+	 * analogous to the 'n' and 'f' tags in classic xref sections; a width
+	 * of 0 for the first field is specified to mean that every entry is of
+	 * type 1 (= "n"). that type, in particular, specifies a default of 0
+	 * for field 3 (generation). in fact, these are the only defaults
+	 * defined by ISO 32000-1:2008 (PDF 1.7).
+	 *
+	 *	entry type	field no.	default value
+	 *			1 (type)	1
+	 *	1 ("n")		3 (gen.)	0
+	 */
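+	/*
+	 * an added illustrative example (values invented here, not taken from
+	 * the spec): a stream dictionary containing
+	 *
+	 *	/Size 10  /W [1 2 1]  /Index [3 2]
+	 *
+	 * describes entries of 1+2+1 = 4 bytes and one subsection of two
+	 * entries for objects 3 and 4. an entry with the bytes 01 02 9a 00
+	 * then decodes as type 1 ("in use"), offset 0x029a = 666,
+	 * generation 0.
+	 */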
+	/* Size (required) - total size of xref table */
+	v = dictentry(dict, "Size");
+	if (v == NULL || v->token_type != TT_SINT)
+		goto fail;
+	if ((Size = v->sint) < 1)
+		goto fail;
+
+	/* W (required) - field widths for each xref entry */
+	v = dictentry(dict, "W");
+	if (v == NULL || v->token_type != TT_SEQUENCE)
+		goto fail;
+	if ((Wn = v->seq->used) < 3)
+		goto fail;
+	Wskip = 0;
+	for (size_t i = 0; i < Wn; i++) {
+		if (v->seq->elements[i]->token_type != TT_SINT ||
+		    v->seq->elements[i]->sint < 0)
+			goto fail;
+		if (i < 3) {
+			if (v->seq->elements[i]->sint > 8)
+				goto fail;	/* can't take >64 bits */
+			W[i] = (size_t)v->seq->elements[i]->sint;
+		} else {
+			if (v->seq->elements[i]->sint > SIZE_MAX - Wskip)
+				goto fail;	/* overflow */
+			Wskip += v->seq->elements[i]->sint;
+		}
+	}
+	if (Wskip > SIZE_MAX / 8)
+		goto fail;
+
+	/*
+	 * build the parser for one xref entry.
+	 *
+	 * in summary, the only sensible forms for /W are:
+	 *
+	 *	[t x y]	with t,x,y > 0	full general form
+	 *	[0 x y]	with x,y > 0	only type-1 ("in use") entries
+	 *	[0 x 0]	with x > 0	only type-1 entries, only offsets
+	 *
+	 * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed
+	 * by the spec; as long as all entries are of type 1, the xref data can
+	 * be interpreted without ambiguity.
+	 *
+	 * in fact, every nonsensical form is possible as long as there are 0
+	 * entries.
+	 *
+	 * we realize this mess by just initializing the default parser to
+	 * p_fail and replacing the known cases afterwards.
+	 */
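+	/*
+	 * illustrative examples (widths invented here): /W [1 4 2] yields
+	 * three h_bits parsers of 8, 32, and 16 bits; /W [0 4 2] replaces
+	 * the missing type field with p_return_1; /W [1 4 0] replaces the
+	 * missing generation field with p_return_0 and additionally requires
+	 * the type field to parse as 1.
+	 */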
+	for (size_t i = 0; i < 3; i++) {
+		if (W[i] == 0)
+			p_field[i] = p_fail;	/* no known default */
+		else
+			p_field[i] = h_bits__m(mm__, W[i] * 8, false);
+	}
+	/* known default cases: */
+	if (W[0] == 0)
+		p_field[0] = p_return_1;	/* all type 1 */
+	if (W[2] == 0) {
+		p_field[2] = p_return_0;	/* all generation 0 */
+		if (W[0] > 0) {
+			/* type field *must* be 1 */
+			p_field[0] = h_attr_bool__m(mm__, p_field[0],
+			    validate_eq_uint, (void *)1);
+		}
+	}
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2],
+	    h_skip__m(mm__, Wskip * 8), NULL);
+
+	/* Index (optional) - subsections [base count ...] */
+	v = dictentry(dict, "Index");
+	if (v == NULL) {
+		/* default: [0 Size] */
+		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
+		p_subs[0] = h_repeat_n__m(mm__, p_entry, Size);
+		p_subs[1] = NULL;
+	} else if (v->token_type != TT_SEQUENCE) {
+		goto fail;
+	} else {
+		size_t nsubs = v->seq->used / 2;
+
+		/* build a parser for each subsection */
+		if (nsubs >= SIZE_MAX / sizeof(HParser *))
+			goto fail;
+		p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *));
+		for (size_t i = 0; i < nsubs; i++) {
+			HParsedToken *base = v->seq->elements[2 * i];
+			HParsedToken *n = v->seq->elements[2 * i + 1];
+
+			if (base->token_type != TT_SINT || base->sint < 0 ||
+			    n->token_type != TT_SINT || n->sint < 0 ||
+			    n->sint > SIZE_MAX)
+				goto fail;
+
+			p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint,
+			    p_entry);
+		}
+		p_subs[nsubs] = NULL;
+	}
+	p_xrefdata = h_sequence__ma(mm__, (void **)p_subs);
+
+	/* Filter (optional) - XXX */
+	v = dictentry(dict, "Filter");
+	if (v != NULL)
+		goto fail;
+	data = bytes.token;
+	sz = bytes.len;
+
+	res = h_parse__m(mm__, p_xrefdata, data, sz);
+	if (res == NULL)
+		goto fail;
+
+	HParser *dict_p = p_return__m(mm__, dict_t);
+	HParser *xref_p = p_return__m(mm__, res->ast);
+	HParser *skip_p = h_skip__m(mm__, bytes.len * 8);
+
+	return h_sequence__m(mm__, dict_p, xref_p, skip_p, NULL);
+fail:
+	return p_fail;
+}
+
+
 /*
  * main program
  */
-- 
GitLab