From 6b54ebfa3261e225ceeb55557879f1fbd5bda222 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Wed, 5 Feb 2020 21:08:24 +0100 Subject: [PATCH] generally parse stream objects (only XRef for now) --- pdf.c | 464 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 294 insertions(+), 170 deletions(-) diff --git a/pdf.c b/pdf.c index 0520696..5795ee8 100644 --- a/pdf.c +++ b/pdf.c @@ -116,7 +116,7 @@ struct Env { /* * custom token types */ -HTokenType TT_XREntry, TT_Ref; +HTokenType TT_XREntry, TT_Ref, TT_HParseResult; typedef struct { enum {XR_FREE, XR_INUSE, XR_OBJSTM} type; @@ -161,6 +161,14 @@ pp_ref(FILE *stream, const HParsedToken *tok, int indent, int delta) fprintf(stream, "[%zu,%zu]", r->nr, r->gen); } +void +pp_parseresult(FILE *stream, const HParsedToken *tok, int indent, int delta) +{ + HParseResult *res = H_CAST(HParseResult, tok); + + h_pprint(stream, res->ast, indent, delta); +} + /* * semantic actions @@ -355,18 +363,54 @@ act_xrstment(const HParseResult *p, void *u) #define act_xrefs h_act_last +/* + * return a cross-reference stream in the same form as xref sections. + * + * p = (pnat nat (dict [HParseResult: xrefs])) + * result = (xrefs dict) + */ HParsedToken * -act_rest(const HParseResult *p, void *env) +act_xrstm(const HParseResult *p, void *u) { - struct Env *aux = env; - size_t offset = H_CAST_UINT(p->ast) / 8; + const HParsedToken *xrefs, *dict; + HParsedToken *tok; + HParseResult *res; - return H_MAKE_BYTES(aux->input + offset, aux->sz - offset); + dict = H_INDEX_TOKEN(p->ast, 2, 0); + res = H_FIELD(HParseResult, 2, 1); // XXX free this + xrefs = res->ast; + + tok = H_MAKE_SEQN(2); + tok->seq->elements[0] = (HParsedToken *)xrefs; + tok->seq->elements[1] = (HParsedToken *)dict; + tok->seq->used = 2; + return tok; } -/* stream semantics (defined further below) */ -bool validate_xrstm(HParseResult *, void *); -HParsedToken *act_xrstm(const HParseResult *, void *); +/* + * validate the /Type field on a cross-reference stream. 
+ * + * p = pnat nat (dict offs offs) + */ +bool +validate_xrstm(HParseResult *p, void *u) +{ + const HCountedArray *tdict = H_FIELD_SEQ(2, 0); + const HParsedToken *v = dictentry(tdict, "Type"); + +#if 0 + if (v == NULL) + fprintf(stderr, "stream dict has no /Type\n"); + else if (v->token_type != TT_BYTES) + fprintf(stderr, "stream /Type is no name object\n"); + else if (bytes_eq(v->bytes, "XRef")) + return true; + return false; +#endif + + return (v != NULL && v->token_type == TT_BYTES && + bytes_eq(v->bytes, "XRef")); +} /* @@ -386,6 +430,7 @@ HParser *kxstream(HAllocator *, const HParsedToken *, void *); void init_parser(struct Env *aux) { + TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult); TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry); TT_Ref = h_allocate_token_new("Ref", NULL, pp_ref); @@ -518,7 +563,10 @@ init_parser(struct Env *aux) /* body */ H_RULE(indobj, CHX(stream, obj)); H_RULE(objdef, SEQ(pnat, nat, KW("obj"), indobj, KW("endobj"))); - H_RULE(body, h_many(objdef)); // XXX object streams + H_RULE(body, h_many(objdef)); + + /* for object streams */ + //H_RULE(osidx, h_many(SEQ(pnat, nat))); /* cross-reference section */ H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf)); @@ -533,8 +581,7 @@ init_parser(struct Env *aux) H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub))); /* cross-reference streams */ - H_RULE(rest, h_action(h_tell(), act_rest, aux)); - H_RULE(xstream, h_bind(SEQ(stmbeg, rest), kxstream, aux)); + H_RULE(xstream, h_bind(stmbeg, kxstream, aux)); H_AVRULE(xrstm, SEQ(pnat, nat, KW("obj"), xstream)); // XXX skip however much we consumed and check for "endstream endobj"? @@ -568,28 +615,9 @@ init_parser(struct Env *aux) /* - * stream object handling incl. cross-reference streams + * lookup and resolution of indirect references */ -#include <limits.h> /* INT_MAX */ -#include <zlib.h> -#include <err.h> - -/* combine current position with env=(input,sz) into HBytes */ -HParsedToken * -act_ks_bytes(const HParseResult *p, void *env) -{ - const HBytes *bs = env; - size_t offset = H_CAST_UINT(p->ast) / 8; - - /* - * NB: we must allocate a new HBytes struct here because the old one is - * allocated only temporarily for the lifetime of the continuation - * below. 
- */ - return H_MAKE_BYTES(bs->token + offset, bs->len); -} - XREntry * lookup_xref(struct Env *aux, size_t nr, size_t gen) { @@ -644,13 +672,43 @@ parse_obj(struct Env *aux, size_t nr, size_t gen, size_t offset) } const HParsedToken * -parse_obj_stm(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) +parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) { - //const HParsedToken *stm; + XREntry *ent; + const HParsedToken *stm; - // XXX find the stream object, decode it, parse the offset at idx, - // and parse the target object at it - return NULL; + /* + * acquire the stream object + */ + + ent = lookup_xref(aux, stm_nr, 0); + if (ent == NULL) + return NULL; /* stream not found */ + + switch (ent->type) + { + case XR_FREE: + return NULL; /* stream deleted */ + case XR_INUSE: + if (ent->n.gen != 0) + return NULL; /* stream replaced */ + if (ent->obj == NULL) + ent->obj = parse_obj(aux, stm_nr, 0, ent->n.offs); + break; + case XR_OBJSTM: + return NULL; /* invalid: nested streams */ + } + + if ((stm = ent->obj) == NULL) { + fprintf(stderr, "%s: error parsing object stream at position " + "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs); + return NULL; + } + + /* + * decode the stream and find the target object in it + */ + return NULL; // XXX } const HParsedToken * @@ -687,84 +745,22 @@ resolve(struct Env *aux, const HParsedToken *v) case XR_OBJSTM: if (r->gen != 0) return NULL; /* invalid entry! */ - ent->obj = parse_obj_stm(aux, r->nr, ent->o.stm, ent->o.idx); + ent->obj = parse_objstm_obj(aux, r->nr, ent->o.stm, ent->o.idx); break; } return resolve(aux, ent->obj); } -/* - * This continuation takes the stream dictionary (as first element of x) and - * should return a parser that consumes exactly the bytes that make up the - * stream data. - */ -HParser * -kstream(HAllocator *mm__, const HParsedToken *x, void *env) -{ - struct Env *aux = env; - const HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); - const HCountedArray *dict = H_CAST_SEQ(dict_t); - const HParsedToken *v = NULL; - size_t sz; - - /* look for the Length entry */ - v = dictentry(dict, "Length"); - v = resolve(aux, v); /* resolve indirect references */ - if (v == NULL || v->token_type != TT_SINT || v->sint < 0) - goto fail; - sz = (size_t)v->sint; - - //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug - - /* dummy struct to hold the pair (input,sz) */ - HBytes *bytes = h_alloc(mm__, sizeof(HBytes)); - bytes->token = aux->input; - bytes->len = sz; - - HParser *tell = h_tell__m(mm__); - HParser *skip = h_skip__m(mm__, sz * 8); - - HParser *bytes_p = h_action__m(mm__, tell, act_ks_bytes, bytes); - HParser *dict_p = p_return__m(mm__, dict_t); - return h_sequence__m(mm__, dict_p, bytes_p, skip, NULL); -fail: -#if 0 - if (v == NULL) - fprintf(stderr, "stream /Length missing\n"); - else if (v -> token_type != TT_SINT) - fprintf(stderr, "stream /Length not an integer\n"); - else if (v < 0) - fprintf(stderr, "stream /Length negative\n"); -#endif - //h_pprintln(stderr, p); // XXX debug - return p_fail; -} /* - * validate the /Type field on a cross-reference stream. - * - * p = pnat nat (dict offs offs) + * stream object handling incl. 
filters and cross-reference streams */ -bool -validate_xrstm(HParseResult *p, void *u) -{ - const HCountedArray *tdict = H_FIELD_SEQ(2, 0); - const HParsedToken *v = dictentry(tdict, "Type"); - -#if 0 - if (v == NULL) - fprintf(stderr, "stream dict has no /Type\n"); - else if (v->token_type != TT_BYTES) - fprintf(stderr, "stream /Type is no name object\n"); - else if (bytes_eq(v->bytes, "XRef")) - return true; - return false; -#endif - return (v != NULL && v->token_type == TT_BYTES && - bytes_eq(v->bytes, "XRef")); -} +#include <limits.h> /* INT_MAX */ +#include <stdlib.h> /* abs() */ +#include <zlib.h> +#include <err.h> struct predictor { /* parameters */ @@ -794,8 +790,6 @@ uint8_t pp_sub(int a, int b, int c) { return a; } uint8_t pp_up(int a, int b, int c) { return b; } uint8_t pp_avg(int a, int b, int c) { return (a + b) / 2; } -#include <stdlib.h> /* abs() */ - uint8_t pp_paeth(int a, int b, int c) { @@ -866,7 +860,7 @@ depred_png(struct predictor *pred, uint8_t *inp, size_t sz) } HParseResult * -FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p) +FlateDecode(HCountedArray *parms, HBytes b, HParser *p) { size_t const BUFSIZE = 8 * 1024; uint8_t *buf; @@ -922,8 +916,9 @@ FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p) return NULL; } pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; - pred.buf = h_alloc(mm__, pred.rowsz); - memset(pred.buf, 0, pred.rowsz); + pred.buf = calloc(1, pred.rowsz); + if (pred.buf == NULL) + err(1, "FlateDecode"); } /* set up zlib */ @@ -931,10 +926,12 @@ FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p) ret = inflateInit(&strm); if (ret != Z_OK) errx(1, "inflateInit: %s (%d)", strm.msg, ret); - buf = h_alloc(mm__, BUFSIZE); + buf = malloc(BUFSIZE); + if (buf == NULL) + err(1, "FlateDecode"); /* initialize target parser */ - sp = h_parse_start__m(mm__, p); + sp = h_parse_start(p); assert(sp != NULL); pred.sp = sp; @@ -958,8 +955,8 @@ FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p) res = h_parse_finish(sp); // XXX always return NULL on error? inflateEnd(&strm); - mm__->free(mm__, pred.buf); - mm__->free(mm__, buf); + free(pred.buf); + free(buf); if (done == -1) return NULL; @@ -967,23 +964,23 @@ FlateDecode(HAllocator *mm__, HCountedArray *parms, HBytes b, HParser *p) } /* - * decode the byte stream 'b' according to metadata in its stream dictionary - * 'd' and parse the result with 'p'. + * decode the bytes in 'b' according to metadata in the stream dictionary 'd' + * and parse the result with 'p'. 
*/ HParseResult * -parse_stream(HAllocator *mm__, HCountedArray *d, HBytes b, HParser *p) +decode_stream(const HCountedArray *d, HBytes b, HParser *p) { - HParseResult *(*filter)(HAllocator *, HCountedArray *, HBytes, HParser *); + HParseResult *(*filter)(HCountedArray *, HBytes, HParser *); HCountedArray *parms = NULL; const HParsedToken *v; v = dictentry(d, "Filter"); if (v == NULL) - return h_parse__m(mm__, p, b.token, b.len); + return h_parse(p, b.token, b.len); /* compile to a CF backend to enable incremental parsing */ if (h_compile(p, PB_LLk, NULL) == -1) - errx(1, "xref data parser: LL(1) compile failed"); + errx(1, "stream data parser: LL(1) compile failed"); if (v->token_type == TT_SEQUENCE) return NULL; // XXX filter chains not supported, yet @@ -997,29 +994,146 @@ parse_stream(HAllocator *mm__, HCountedArray *d, HBytes b, HParser *p) if (v && v->token_type == TT_SEQUENCE) parms = v->seq; - return filter(mm__, parms, b, p); + return filter(parms, b, p); +} + +HParsedToken * +act_rest(const HParseResult *p, void *env) +{ + struct Env *aux = env; + size_t offset = H_CAST_UINT(p->ast) / 8; + + return H_MAKE_BYTES(aux->input + offset, aux->sz - offset); +} + +HParser * +p_rest__m(HAllocator *mm__, struct Env *aux) +{ + return h_action__m(mm__, h_tell__m(mm__), act_rest, aux); +} + +/* combine current position with env=(input,sz) into HBytes */ +HParsedToken * +act_take_bytes(const HParseResult *p, void *env) +{ + const HBytes *bs = env; + size_t offset = H_CAST_UINT(p->ast) / 8; + + /* + * NB: we must allocate a new HBytes struct here because the old one is + * allocated only temporarily for the lifetime of the continuation + * below. + */ + return H_MAKE_BYTES(bs->token + offset, bs->len); +} + +HParser * +p_take__m(HAllocator *mm__, size_t n, struct Env *aux) +{ + HParser *skip, *bytes; + HBytes *bs; + + /* dummy struct to hold the pair (input,n) */ + bs = h_alloc(mm__, sizeof(HBytes)); + bs->token = aux->input; + bs->len = n; + + bytes = h_action__m(mm__, h_tell__m(mm__), act_take_bytes, bs); + skip = h_skip__m(mm__, n * 8); + + return h_left__m(mm__, bytes, skip); +} + +HParser * +p_xrefdata__m(HAllocator *mm__, const HCountedArray *dict); + +HParser * +p_stream_data__m(HAllocator *mm__, const HCountedArray *dict) +{ + const HParsedToken *v; + + v = dictentry(dict, "Type"); + if (v == NULL || v->token_type != TT_BYTES) // XXX -> custom type + return NULL; /* no /Type field */ + + /* interpret known stream types */ + if (bytes_eq(v->bytes, "XRef")) + return p_xrefdata__m(mm__, dict); + // XXX + //if (bytes_eq(v->bytes, "ObjStm")) + // return p_objstm__m(mm__, dict); + + return NULL; /* unrecognized type */ +} + +struct streamspec { + HCountedArray *dict; /* stream dictionary */ + HParser *parser; /* data parser */ +}; + +HParsedToken * +act_ks_value(const HParseResult *p, void *u) +{ + struct streamspec *spec = u; + HBytes bytes = H_CAST_BYTES(p->ast); + HParseResult *res; + + /* decode and parse the stream data */ + res = decode_stream(spec->dict, bytes, spec->parser); + + return H_MAKE(HParseResult, res); } /* - * interpret a cross-reference stream and return it in the same form as other - * cross-reference sections: + * This continuation takes the stream dictionary (as first element of x) and + * should return a parser that consumes exactly the bytes that make up the + * stream data. * - * p = (pnat nat (dict xrefs)) - * result = (xrefs dict) + * x = (dict ...) 
*/ -HParsedToken * -act_xrstm(const HParseResult *p, void *u) +HParser * +kstream(HAllocator *mm__, const HParsedToken *x, void *env) { - HParsedToken *xrefs, *dict, *result; + struct Env *aux = env; + HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + HCountedArray *dict = H_CAST_SEQ(dict_t); + const HParsedToken *v = NULL; + HParser *bytes_p, *dict_p, *value_p; + struct streamspec *spec; + size_t sz; - dict = H_INDEX_TOKEN(p->ast, 2, 0); - xrefs = H_INDEX_TOKEN(p->ast, 2, 1); + /* look for the Length entry */ + v = dictentry(dict, "Length"); + v = resolve(aux, v); /* resolve indirect references */ + if (v == NULL || v->token_type != TT_SINT || v->sint < 0) + goto fail; + sz = (size_t)v->sint; + + //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug + + dict_p = p_return__m(mm__, dict_t); + bytes_p = p_take__m(mm__, sz, aux); + + spec = h_alloc(mm__, sizeof(struct streamspec)); + spec->dict = dict; + spec->parser = p_stream_data__m(mm__, dict); + if (spec->parser != NULL) + value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); + else + value_p = bytes_p; - result = H_MAKE_SEQN(2); - result->seq->elements[0] = xrefs; - result->seq->elements[1] = dict; - result->seq->used = 2; - return result; + return h_sequence__m(mm__, dict_p, value_p, NULL); +fail: +#if 0 + if (v == NULL) + fprintf(stderr, "stream /Length missing\n"); + else if (v -> token_type != TT_SINT) + fprintf(stderr, "stream /Length not an integer\n"); + else if (v < 0) + fprintf(stderr, "stream /Length negative\n"); +#endif + //h_pprintln(stderr, p); // XXX debug + return p_fail; } HParser * @@ -1035,22 +1149,13 @@ p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry) return h_sequence__m(mm__, p_header, p_entries, NULL); } -/* x = ((dict ...) bytes) */ HParser * -kxstream(HAllocator *mm__, const HParsedToken *x, void *env) +p_xrefdata__m(HAllocator *mm__, const HCountedArray *dict) { - //struct Env *aux = env; - const HParsedToken *v, *dict_t; - const HParseResult *res; - HCountedArray *dict; - HBytes bytes; + const HParsedToken *v; + HParser *p_field[3], *p_entry, **p_subs; size_t W[3]; size_t Size, Wn, Wskip; - HParser *p_field[3], *p_entry, **p_subs, *p_xrefdata; - - dict_t = H_INDEX_TOKEN(x, 0, 0); - dict = H_CAST_SEQ(dict_t); - bytes = H_INDEX_BYTES(x, 1); /* * what follows is a horrible bunch of code that builds, from the @@ -1086,35 +1191,35 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) /* Size (required) - total size of xref table */ v = dictentry(dict, "Size"); if (v == NULL || v->token_type != TT_SINT || v->sint < 1) - goto fail; + return p_fail; Size = v->sint; /* W (required) - field widths for each xref entry */ v = dictentry(dict, "W"); if (v == NULL || v->token_type != TT_SEQUENCE) - goto fail; + return p_fail; if ((Wn = v->seq->used) < 3) - goto fail; + return p_fail; Wskip = 0; for (size_t i = 0; i < Wn; i++) { HTokenType tt = v->seq->elements[i]->token_type; int64_t w = v->seq->elements[i]->sint; if (tt != TT_SINT || w < 0) - goto fail; + return p_fail; if (i < 3) { /* we can't take >64 bits and want to use size_t */ if (w > 8 || w > sizeof(size_t)) - goto fail; + return p_fail; W[i] = (size_t)w; } else { if (w > SIZE_MAX - Wskip) - goto fail; /* overflow */ + return p_fail; /* overflow */ Wskip += w; } } if (Wskip > SIZE_MAX / 8) - goto fail; + return p_fail; /* * build the parser for one xref entry. 
@@ -1160,7 +1265,7 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) #endif } if (Wskip > 0) // XXX h_skip does not work with CF, yet - goto fail; + return p_fail; p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL); p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL); @@ -1172,13 +1277,13 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry); p_subs[1] = NULL; } else if (v->token_type != TT_SEQUENCE) { - goto fail; + return p_fail; } else { size_t nsubs = v->seq->used / 2; /* build a parser for each subsection */ if (nsubs >= SIZE_MAX / sizeof(HParser *)) - goto fail; + return p_fail; p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *)); for (size_t i = 0; i < nsubs; i++) { HParsedToken *base = v->seq->elements[2 * i]; @@ -1187,33 +1292,52 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) if (base->token_type != TT_SINT || base->sint < 0 || n->token_type != TT_SINT || n->sint < 0 || n->sint > SIZE_MAX) - goto fail; + return p_fail; p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint, p_entry); } p_subs[nsubs] = NULL; } - p_xrefdata = h_sequence__ma(mm__, (void **)p_subs); + return h_sequence__ma(mm__, (void **)p_subs); +} + +/* + * This continuation is very similar to kstream, except that it does not + * rely on /Length to consume the right amount of input. If /Length is + * not present or indirect, it will operate on the entire rest of the input. + * This is permissible, other than for general streams, because the XRef data + * is always self-delimiting. + * + * x = (dict ...) + */ +HParser * +kxstream(HAllocator *mm__, const HParsedToken *x, void *env) +{ + struct Env *aux = env; + HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + HCountedArray *dict = H_CAST_SEQ(dict_t); + const HParsedToken *v; + HParser *bytes_p, *dict_p, *value_p; + struct streamspec *spec; /* restrict bytes to Length if present (and not indirect) */ v = dictentry(dict, "Length"); if (v != NULL && v->token_type == TT_SINT && v->sint >= 0) - bytes.len = v->sint; + bytes_p = p_take__m(mm__, v->sint, aux); + else + bytes_p = p_rest__m(mm__, aux); // XXX consume the proper amount - /* decode and parse the stream data */ - res = parse_stream(mm__, dict, bytes, p_xrefdata); - if (res == NULL) - goto fail; + /* construct the parser for the stream data */ + spec = h_alloc(mm__, sizeof(struct streamspec)); + spec->dict = dict; + spec->parser = p_xrefdata__m(mm__, dict); + assert (spec->parser != NULL); - HParser *dict_p = p_return__m(mm__, dict_t); - HParser *xref_p = p_return__m(mm__, res->ast); - HParser *skip_p = h_skip__m(mm__, bytes.len * 8); - // XXX skip only as much as parse_stream consumed + dict_p = p_return__m(mm__, dict_t); + value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); - return h_sequence__m(mm__, dict_p, xref_p, skip_p, NULL); -fail: - return p_fail; + return h_sequence__m(mm__, dict_p, value_p, NULL); } -- GitLab
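
Note (illustration only, not part of the diff): downstream code that wants the
decoded stream data now has to unwrap the TT_HParseResult token that
act_ks_value produces as the second element of the (dict value) sequence built
by kstream/kxstream. The sketch below shows one way to do that, mirroring what
act_xrstm and pp_parseresult above already do; the helper name stream_data_ast
is made up for the example and assumes pdf.c's existing includes (hammer glue
macros, custom token types).

/*
 * sketch: given the token for a parsed stream object, i.e. the
 * (dict value) sequence produced by kstream/kxstream, return the AST of
 * the decoded stream data, or NULL if the value was left as raw bytes
 * because no data parser was attached (cf. p_stream_data__m) or the
 * decode failed.
 */
const HParsedToken *
stream_data_ast(const HParsedToken *stm)
{
	const HParsedToken *value = H_INDEX_TOKEN(stm, 1);
	HParseResult *res;

	if (value->token_type != TT_HParseResult)
		return NULL;		/* raw bytes, not parsed */

	res = H_CAST(HParseResult, value);
	return res == NULL ? NULL : res->ast;
}

Once an ObjStm data parser (the p_objstm__m stub mentioned in
p_stream_data__m) exists, parse_objstm_obj() could use the same access
pattern to pull individual objects out of a decoded object stream.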