diff --git a/pdf.c b/pdf.c
index 703cf5f9f18dd5f5d3b0eeead822247ead90c650..f6dd414ffd223250d832fe06e97f258a4c71770c 100644
--- a/pdf.c
+++ b/pdf.c
@@ -338,6 +338,33 @@ init_parser(void)
 	p_xref = xr_td;
 }
 
+/*
+ * Look up 'key' in 'dict', a sequence of entries whose element 0 is the key
+ * (as bytes) and whose element 1 is the associated value.
+ *
+ * Returns the value of the first entry whose key matches 'key' exactly, or
+ * NULL if there is no such entry.
+ */
+const HParsedToken *
+dictentry(const HCountedArray *dict, const char *key)
+{
+	HParsedToken *ent;
+	HBytes k;
+	size_t len;
+
+	len = strlen(key);
+	for (size_t i = 0; i < dict->used; i++) {
+		ent = dict->elements[i];
+		k = H_INDEX_BYTES(ent, 0);
+
+		/* keys are length-counted bytes, so compare with memcmp() */
+		if (k.len == len && memcmp(key, k.token, len) == 0)
+			return H_INDEX_TOKEN(ent, 1);
+	}
+
+	return NULL;
+}
+
 /*
  * This continuation takes the stream dictionary (as first element of x) and
  * should return a parser that consumes exactly the bytes that make up the
@@ -346,21 +373,12 @@ init_parser(void)
 HParser *
 kstream(HAllocator *mm__, const HParsedToken *x, void *env)
 {
-	HCountedArray *dict = H_INDEX_SEQ(x, 0);
-	HParsedToken *ent, *v = NULL;
-	HBytes k;
+	const HCountedArray *dict = H_INDEX_SEQ(x, 0);
+	const HParsedToken *v = NULL;
 	size_t sz;
 
 	/* look for the Length entry */
-	for (size_t i = 0; i < dict->used; i++) {
-		ent = dict->elements[i];
-		k = H_INDEX_BYTES(ent, 0);
-
-		if (k.len == 6 && strncmp("Length", k.token, k.len) == 0) {
-			v = H_INDEX_TOKEN(ent, 1);
-			break;
-		}
-	}
+	v = dictentry(dict, "Length");
 	if (v == NULL || v->token_type != TT_SINT || v->sint < 0)
 		goto fail;
 	sz = (size_t)v->sint;
@@ -385,22 +403,133 @@ fail:
 
 
 /*
- * minimal main program
+ * main program
  */
 
 #include <stdio.h>
 #include <inttypes.h>
 #include <err.h>
+#include <errno.h>
+#include <stdlib.h>	/* realloc() */
 #include <fcntl.h>	/* open() */
 #include <unistd.h>	/* lseek() */
 #include <sys/mman.h>	/* mmap() */
 
-int main(int argc, char *argv[])
+const char *infile = NULL;
+
+/*
+ * This helper implements the standard backwards parsing strategy to read all
+ * cross-reference sections and trailer dictionaries, starting from the
+ * 'startxref' offset found at the very end of the input.
+ *
+ * Allocates and returns an array of HParsedTokens, each containing the result
+ * of a successful 'p_xref' parse. Sets the output parameter 'nxrefs' to the
+ * number of elements.
+ *
+ * Offsets taken from the input ('startxref' and '/Prev') are checked against
+ * 'sz' before use; the first invalid offset ends the search and the sections
+ * collected so far are returned.
+ *
+ * A return value of NULL indicates an empty result.
+ */
+const HParsedToken **
+parse_xrefs(const char *input, size_t sz, size_t *nxrefs)
+{
+	HParseResult *res = NULL;
+	const HParsedToken **xrefs = NULL;	/* empty result */
+	const HParsedToken *tok = NULL;
+	const uint8_t *in = (const uint8_t *)input;	/* byte view */
+	uint64_t start;
+	size_t n = 0;
+	size_t offset = 0;
+
+	// XXX try formulating this as a parser using h_seek()
+
+	/* search for the "startxref" section from the back of the file */
+	HParser *p = h_left(p_startxref, h_end_p());
+	for (size_t i = 0; i < sz; i++) {
+		res = h_parse(p, in + sz - i, i);
+		if (res != NULL)
+			break;
+	}
+	if (res == NULL) {
+		fprintf(stderr, "%s: startxref not found\n", infile);
+		goto end;
+	}
+	start = H_INDEX_UINT(res->ast, 0);
+	if (start >= sz) {
+		fprintf(stderr, "%s: startxref offset %" PRIu64 " is past "
+		    "the end of the input\n", infile, start);
+		goto end;
+	}
+	offset = (size_t)start;
+
+	for (;;) {
+		res = h_parse(p_xref, in + offset, sz - offset);
+		if (res == NULL) {
+			fprintf(stderr, "%s: error parsing xref section at "
+			    "position %zu (0x%zx)\n", infile, offset, offset);
+			break;
+		}
+
+		/* save this section in xrefs */
+		if (n >= SIZE_MAX / sizeof(HParsedToken *))
+			errc(1, EOVERFLOW, "overflow");
+		xrefs = realloc(xrefs, (n + 1) * sizeof(HParsedToken *));
+		if (xrefs == NULL)
+			err(1, "realloc");
+		xrefs[n++] = res->ast;
+
+		/* look up the next offset (to the previous xref section) */
+		tok = dictentry(H_INDEX_SEQ(res->ast, 1), "Prev");
+		if (tok == NULL)
+			break;
+		if (tok->token_type != TT_SINT) {
+			fprintf(stderr, "%s: /Prev not an integer\n", infile);
+			break;
+		}
+
+		/* the new offset must lie within the input */
+		if (tok->sint < 0 || (uint64_t)tok->sint >= sz) {
+			fprintf(stderr, "%s: /Prev pointer of xref section at "
+			    "%zu (0x%zx) points outside the input\n", infile,
+			    offset, offset);
+			break;
+		}
+
+		/*
+		 * validate the new offset. we don't want to get caught in a
+		 * loop. the offsets should strictly decrease, unless the file
+		 * is a "linearized" PDF. in that case there should be exactly
+		 * two xref sections in the reverse order, so we allow the
+		 * first section to point forward.
+		 */
+		if (n > 1 && (size_t)tok->sint >= offset) {
+			fprintf(stderr, "%s: /Prev pointer of xref section at "
+			    "%zu (0x%zx) points forward\n", infile, offset,
+			    offset);
+			break;
+		}
+
+		offset = (size_t)tok->sint;
+	}
+	// XXX debug
+	//fprintf(stderr, "%s: %zu xref sections parsed\n", infile, n);
+	//for (size_t i = 0; i < n; i++)
+	//	h_pprintln(stderr, xrefs[i]);
+
+end:
+	*nxrefs = n;
+	return xrefs;
+}
+
+int
+main(int argc, char *argv[])
 {
 	HParseResult *res = NULL;
-	const char *infile = NULL;
+	const HParsedToken **xrefs;
 	const uint8_t *input;
-	size_t sz, startxref;
+	size_t sz, nxrefs;
 	int fd;
 
 	/* command line handling */
@@ -424,28 +553,8 @@ int main(int argc, char *argv[])
 	/* build parsers */
 	init_parser();
 
-	/* search for the "startxref" section from the back of the file */
-	HParser *p = h_left(p_startxref, h_end_p());
-	for (size_t i = 0; i < sz; i++) {
-		res = h_parse(p, input + sz - i, i);
-		if (res) break;
-	}
-	if (res == NULL) {
-		fprintf(stderr, "%s: startxref not found\n", infile);
-		return 1;
-	}
-	startxref = H_INDEX_UINT(res->ast, 0);
-
-	/* parse cross-references and trailer dictionary */
-	res = h_parse(p_xref, input + startxref, sz - startxref);
-	if (!res) {
-		fprintf(stderr, "%s: error parsing xref/trailer at "
-		    "position %zu (0x%zx)\n", infile, startxref, startxref);
-		// continue anyway...
-	}
-	// XXX debug
-	//h_pprintln(stderr, res->ast);
-	//return 0;
+	/* parse all cross-reference sections and trailer dictionaries */
+	xrefs = parse_xrefs(input, sz, &nxrefs);
 
 	/* run the main parser */
 	res = h_parse(p_pdf, input, sz);