From e1eaaf0ac0400c9eb9dd07fcbaf95227c3cb1c46 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Tue, 3 Dec 2019 20:52:41 +0100
Subject: [PATCH] add zlib decoding

---
Review notes (ignored by git am):

The line structure was restored from a copy whose whitespace had been
flattened. It was checked against the original hunk line counts, but
indentation and the position of blank context lines are reconstructed, so
applying may need whitespace tolerance. The commit and blob ids are those
of the original submission.

Changes relative to the original submission (added lines only; hunk headers
and the diffstat are updated accordingly):
 - bytes_eq: check the length first and use memcmp instead of passing a
   uint8_t pointer to strncmp
 - FlateDecode: reject out-of-range parameters and inputs that do not fit
   z_stream's unsigned int, never pass a NULL message to printf/errx, and
   return NULL after an inflate error instead of a parse of partial data
 - parse_stream: return NULL for a /Filter that is neither a name nor an
   array instead of asserting on untrusted input
 - kxstream: only ever shrink the stream bytes to /Length

 Makefile |   2 +-
 pdf.c    | 207 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 185 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index 7322086..68b09f3 100644
--- a/Makefile
+++ b/Makefile
@@ -18,4 +18,4 @@ test: pdf
 	@true
 
 pdf: pdf.c
-	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $> -lhammer
+	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $> -lhammer -lz
diff --git a/pdf.c b/pdf.c
index c5bb67a..63863af 100644
--- a/pdf.c
+++ b/pdf.c
@@ -54,6 +54,17 @@ p_return__m(HAllocator *mm__, const HParsedToken *tok)
 	return h_action__m(mm__, p_epsilon, act_return, (void *)tok);
 }
 
+/*
+ * a helper to compare an HBytes to a string: true iff 'b' holds exactly the
+ * bytes of the NUL-terminated string 's'.
+ */
+bool
+bytes_eq(HBytes b, const char *s)
+{
+	/* lengths first, so that memcmp never looks beyond either operand */
+	return b.len == strlen(s) && memcmp(s, b.token, b.len) == 0;
+}
+
 /* a helper to look up a value in a dictionary */
 const HParsedToken *
 dictentry(const HCountedArray *dict, const char *key)
@@ -62,12 +73,15 @@ dictentry(const HCountedArray *dict, const char *key)
 	HBytes k;
 	size_t len;
 
+	if (dict == NULL)
+		return NULL;
+
 	len = strlen(key);
 	for (size_t i = 0; i < dict->used; i++) {
 		ent = dict->elements[i];
 		k = H_INDEX_BYTES(ent, 0);
 
-		if (k.len == len && strncmp(key, k.token, k.len) == 0)
+		if (k.len == len && bytes_eq(k, key))
 			return H_INDEX_TOKEN(ent, 1);
 	}
 
@@ -429,6 +443,11 @@ init_parser(struct Env *aux)
  * stream object handling incl. cross-reference streams
  */
 
+#include <inttypes.h>
+#include <limits.h>
+#include <zlib.h>
+#include <err.h>
+
 /* combine current position with env=(input,sz) into HBytes */
 HParsedToken *
 act_ks_bytes(const HParseResult *p, void *env)
@@ -504,23 +523,162 @@ validate_xrstm(HParseResult *p, void *u)
 
 #if 0
 	if (v == NULL)
-		fprintf(stderr, "stream dict has not /Type\n");
+		fprintf(stderr, "stream dict has no /Type\n");
 	else if (v->token_type != TT_BYTES)
 		fprintf(stderr, "stream /Type is no name object\n");
-	else if (v->bytes.len == 4 && strncmp("XRef", v->bytes.token, v->bytes.len) == 0)
+	else if (bytes_eq(v->bytes, "XRef"))
 		return true;
 	return false;
 #endif
 
-	return (v != NULL && v->token_type == TT_BYTES && v->bytes.len == 4 &&
-	    strncmp("XRef", v->bytes.token, v->bytes.len) == 0);
+	return (v != NULL && v->token_type == TT_BYTES &&
+	    bytes_eq(v->bytes, "XRef"));
+}
+
+/* parameters of the predictor function, as found in /DecodeParms */
+struct Predictor {
+	int num;	/* default: 1 (no prediction) */
+	int colors;	/* default: 1 */
+	int bpc;	/* bits per component; default: 8 */
+	int columns;	/* default: 1 */
+};
+
+/*
+ * inflate the zlib stream 'b' and feed the output piecewise to parser 'p',
+ * which the caller must have compiled for incremental parsing. 'parms' is
+ * the /DecodeParms dictionary or NULL.
+ *
+ * returns the parse result or NULL if a parameter is invalid or unsupported,
+ * the compressed data cannot be decoded completely, or the parse fails.
+ * exits if zlib cannot be initialized.
+ */
+HParseResult *
+FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p)
+{
+	size_t const BUFSIZE = 8 * 1024;
+	uint8_t *buf;
+	HSuspendedParser *sp;
+	HParseResult *res;
+	const HParsedToken *v;
+	size_t sz;
+	bool done;
+	bool failed = false;
+	z_stream strm = {0};
+	int ret;
+	struct Predictor pred = {1, 1, 8, 1};
+
+	/* determine the predictor algorithm to use (if any) */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT ||			\
+			    v->sint < 0 || v->sint > INT_MAX)		\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num, "Predictor");
+	SETPARM(pred.colors, "Colors");
+	SETPARM(pred.bpc, "BitsPerComponent");
+	SETPARM(pred.columns, "Columns");
+	#undef SETPARM
+	if (pred.num != 1) {	// XXX
+		fprintf(stderr, "FlateDecode: /Predictor %d unimplemented\n",
+		    pred.num);
+		return NULL;
+	}
+
+	/* z_stream counts its input in an unsigned int */
+	if (b.len > UINT_MAX) {
+		fprintf(stderr, "FlateDecode: stream too large\n");
+		return NULL;
+	}
+
+	// XXX pass our allocator to zlib
+	ret = inflateInit(&strm);
+	if (ret != Z_OK)
+		errx(1, "inflateInit: %s (%d)",
+		    strm.msg != NULL ? strm.msg : zError(ret), ret);
+	buf = h_alloc(mm__, BUFSIZE);
+	sp = h_parse_start__m(mm__, p);
+	assert(sp != NULL);
+
+	done = false;
+	strm.avail_in = b.len;
+	strm.next_in = (unsigned char *)b.token;
+	do {
+		strm.avail_out = BUFSIZE;
+		strm.next_out = buf;
+
+		ret = inflate(&strm, Z_NO_FLUSH);
+		if (ret != Z_STREAM_END && ret != Z_OK) {
+			/* msg may be NULL; never hand that to %s */
+			fprintf(stderr, "inflate: %s (%d)\n",
+			    strm.msg != NULL ? strm.msg : zError(ret), ret);
+			failed = true;
+			break;
+		}
+
+		sz = BUFSIZE - strm.avail_out;
+		done = h_parse_chunk(sp, buf, sz);
+	} while (!done && ret == Z_OK);
+
+	/* finish the parse even after a failure; the result is dropped below */
+	res = h_parse_finish(sp);
+	inflateEnd(&strm);
+	mm__->free(mm__, buf);
+
+	/* do not report a parse of incompletely decoded data as a success */
+	if (failed && res != NULL) {
+		h_parse_result_free(res);
+		res = NULL;
+	}
+	return res;
+}
+
+/*
+ * decode the byte stream 'b' according to metadata in its stream dictionary
+ * 'd' and parse the result with 'p'.
+ *
+ * without a /Filter entry the bytes are parsed as they are. the only filter
+ * supported so far is a single /FlateDecode; any other /Filter value yields
+ * NULL, as does a failed decode or parse.
+ */
+HParseResult *
+parse_stream(HAllocator *mm__, HCountedArray *d, HBytes b, HParser *p)
+{
+	HParseResult *(*filter)(HAllocator *, HCountedArray *, HBytes, HParser *);
+	HCountedArray *parms = NULL;
+	const HParsedToken *v;
+
+	v = dictentry(d, "Filter");
+	if (v == NULL)
+		return h_parse__m(mm__, p, b.token, b.len);
+
+	/* compile to a CF backend to enable incremental parsing */
+	if (h_compile(p, PB_LLk, NULL) == -1)
+		errx(1, "xref data parser: LL(1) compile failed");
+
+	if (v->token_type == TT_SEQUENCE)
+		return NULL;	// XXX filter chains not supported, yet
+	if (v->token_type != TT_BYTES)
+		return NULL;	/* malformed input must not trip an assert */
+	if (bytes_eq(v->bytes, "FlateDecode"))
+		filter = FlateDecode;
+	else
+		return NULL;		/* filter not supported */
+
+	v = dictentry(d, "DecodeParms");
+	if (v && v->token_type == TT_SEQUENCE)
+		parms = v->seq;
+
+	return filter(mm__, parms, b, p);
 }
 
 /*
  * interpret a cross-reference stream and return it in the same form as other
  * cross-reference sections:
  *
- *   p = (pnat nat (dict ...) xrefs)
+ *   p = (pnat nat (dict xrefs))
  *   result = (xrefs dict)
  */
 HParsedToken *
@@ -529,7 +687,7 @@ act_xrstm(const HParseResult *p, void *u)
 	HParsedToken *bytes, *dict, *result;
 
 	dict = H_INDEX_TOKEN(p->ast, 2, 0);
-	bytes = H_INDEX_TOKEN(p->ast, 3);
+	bytes = H_INDEX_TOKEN(p->ast, 2, 1);
 
 	result = H_MAKE_SEQN(2);
 	result->seq->elements[0] = bytes;
@@ -555,8 +713,6 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 	HBytes bytes;
 	size_t W[3];
 	size_t Size, Wn, Wskip;
-	const uint8_t *data;
-	size_t sz;
 	HParser *p_field[3], *p_entry, **p_subs, *p_xrefdata;
 
 	dict_t = H_INDEX_TOKEN(x, 0, 0);
@@ -661,15 +817,16 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 			    validate_eq_uint, (void *)1);
 		}
 	}
-	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2],
-	    h_skip__m(mm__, Wskip * 8), NULL);
+	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
+		goto fail;
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
 
 	/* Index (optional) - subsections [base count ...] */
 	v = dictentry(dict, "Index");
 	if (v == NULL) {
 		/* default: [0 Size] */
 		p_subs = h_alloc(mm__, 2 * sizeof(HParser *));
-		p_subs[0] = h_repeat_n__m(mm__, p_entry, Size);
+		p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry);
 		p_subs[1] = NULL;
 	} else if (v->token_type != TT_SEQUENCE) {
 		goto fail;
@@ -696,20 +853,24 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
 	}
 	p_xrefdata = h_sequence__ma(mm__, (void **)p_subs);
 
-	/* Filter (optional) - XXX */
-	v = dictentry(dict, "Filter");
-	if (v != NULL)
-		goto fail;
-	data = bytes.token;
-	sz = bytes.len;
+	/*
+	 * restrict bytes to Length if present (and not indirect). a Length
+	 * that exceeds what we hold is ignored so we never read past the end.
+	 */
+	v = dictentry(dict, "Length");
+	if (v != NULL && v->token_type == TT_SINT && v->sint >= 0 &&
+	    (uint64_t)v->sint < bytes.len)
+		bytes.len = v->sint;
 
-	res = h_parse__m(mm__, p_xrefdata, data, sz);
+	/* decode and parse the stream data */
+	res = parse_stream(mm__, dict, bytes, p_xrefdata);
 	if (res == NULL)
 		goto fail;
 
 	HParser *dict_p = p_return__m(mm__, dict_t);
 	HParser *xref_p = p_return__m(mm__, res->ast);
 	HParser *skip_p = h_skip__m(mm__, bytes.len * 8);
+	// XXX skip only as much as parse_stream consumed
 	return h_sequence__m(mm__, dict_p, xref_p, skip_p, NULL);
 
 fail:
@@ -722,9 +883,6 @@ fail:
  */
 
 #include <stdio.h>
-#include <inttypes.h>
-#include <err.h>
-#include <errno.h>
 #include <stdlib.h>	/* realloc() */
 #include <fcntl.h>	/* open() */
 #include <unistd.h>	/* lseek() */
@@ -768,7 +926,9 @@ parse_xrefs(const char *input, size_t sz, size_t *nxrefs)
 	offset = H_INDEX_UINT(res->ast, 0);
 
 	for (;;) {
-		res = h_parse(p_xref, input + offset, sz - offset);
+		//res = h_parse(p_xref, input + offset, sz - offset);
+		HParser *p = h_right(h_seek(offset * 8, SEEK_SET), p_xref);	// XXX
+		res = h_parse(p, input, sz);
 		if (res == NULL) {
 			fprintf(stderr, "%s: error parsing xref section at "
 			    "position %zu (0x%zx)\n", infile, offset, offset);
@@ -852,6 +1012,7 @@ main(int argc, char *argv[])
 
 	/* parse all cross-reference sections and trailer dictionaries */
 	xrefs = parse_xrefs(input, sz, &nxrefs);
+	(void)xrefs;	// shut up, gcc
 
 	/* run the main parser */
 	res = h_parse(p_pdf, input, sz);
-- 
GitLab