From fbbe953faea40f1f1e35ccd4c753fba0498cac50 Mon Sep 17 00:00:00 2001
From: "sumit.ray@baesystems.com" <sumit.ray@baesystems.com>
Date: Mon, 28 Jun 2021 23:05:21 -0400
Subject: [PATCH] Working through processing object streams

---
 Makefile |    2 +-
 pdf.c    | 1622 +++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 1108 insertions(+), 516 deletions(-)

diff --git a/Makefile b/Makefile
index 6154e1d..4c7b12c 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -std=c99 -Wall -Werror -DLOG
 # lib@ -> ../hammer/build/opt/src
 HAMMER_INCLUDE = .
 HAMMER_LIB = ./lib
-CFLAGS += -I$(HAMMER_INCLUDE)
+CFLAGS += -I$(HAMMER_INCLUDE) -g -pg # (-pg :: profile using gprof) (-g :: debug info)
 LDFLAGS += -L$(HAMMER_LIB)
 
 SOURCES = pdf.c lzw-lib.c
diff --git a/pdf.c b/pdf.c
index b2508a3..40c6441 100644
--- a/pdf.c
+++ b/pdf.c
@@ -1090,6 +1090,11 @@ act_page(const HParseResult *p, void *u)
 	return (HParsedToken *)p->ast;
 }
 
+HParsedToken *
+act_dictobj(const HParseResult *p, void *u)
+{
+	return (HParsedToken *)p->ast;
+}
 
 /*
@@ -1710,11 +1715,130 @@ act_txtobj(const HParseResult *p, void *u)
 
+/*
+ * This continuation takes the text stream and saves it in the environment for further
+ * processing, e.g. writing it out to a file with the same name as the pdf input filename
+ * but with a .psectxt suffix.
+ * It does not consume the string; it returns a parser that simply yields the token.
+ *
+ * x = (txtobj ...)
+ */
+HParser *
+ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+
+	struct Env *aux = env;
+
+
+	assert (x->token_type == TT_SEQUENCE);
+	int n_tobjs = x->seq->used;
+
+	for (int n=0; n<n_tobjs; n++) {
+
+		assert(x->seq->elements[n]->token_type == TT_TextEntry);
+		TextEntry *tste = H_CAST(TextEntry, x->seq->elements[n]);
+		struct textstr *tstr = NULL;
+		/*
+		 * To save all of the operators along with the text string, we have to walk
+		 * through all of the tokens and keep a table of pointers to them
+		 * For now, just keep a pointer to the text string in the environment
+		 *
+		 */
+		switch (tste->type) {
+		case TW_Tj:
+		case TW_Tq:
+		case TW_Tqq:
+			tstr = &tste->tstr;
+			break;
+		case TW_TJ:
+			tstr = &tste->tarray.flattened;
+			break;
+		default:
+			fprintf(stderr, "ktxtstream:: Text token type '%u' ignored\n",
+				tste->type);
+			continue;	/* tstr is still NULL here; nothing to save */
+		}
+
+		fprintf(stdout, "ktxtstream: Value = %.*s\n", tstr->nchars, tstr->text);
+
+
+		// store the string in the environment
+		// not sure whether we need to actually store the string in malloc'ed area
+		// currently, we are reusing the token memory previously created
+		struct textnode *txtnd = (struct textnode *) malloc(
+				sizeof(struct textnode));
+		txtnd->tstr = tstr;
+		txtnd->next = NULL;
+		if (aux->txthead == NULL)
+			aux->txthead = txtnd;
+		if (aux->txttail == NULL)
+			aux->txttail = txtnd;
+		else {
+			aux->txttail->next = txtnd;
+			aux->txttail = txtnd;
+		}
+		aux->ntextobjs += 1;
+
+	}
+
+	return p_return__m(mm__, x);
+}
+
+
+
+
+
+/*
+ * This utility extracts the text stream from the global environment and
+ * writes it out to a file with the same name as the pdf input filename
+ * but with a .psectxt suffix.
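+ * (e.g. an input of 'doc.pdf' produces 'doc.pdf.psectxt' next to it)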
+ */
+void
+text_extract(const struct Env *aux)
+{
+	fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs);
+	fprintf(stdout, "text_extract:: %s\n", aux->infile);
+
+	int infnlen = strlen(aux->infile);
+	int sfxlen = strlen(".psectxt");
+	int namelen = infnlen + sfxlen + 1;
+
+	char *outfn = (char *) malloc(sizeof(char) * namelen);
+	if (outfn == NULL) {
+		fprintf(stderr, "text_extract:: malloc() failed\n");
+		return;
+	}
+	memcpy(outfn, aux->infile, infnlen);
+	memcpy(&outfn[infnlen], ".psectxt", sfxlen);
+	outfn[namelen-1] = '\0'; // null terminate the string
+
+	// open the file for writing
+	FILE *stream;
+	if (!(stream = fopen(outfn, "w"))) {
+		fprintf(stderr,
+			"text_extract:: Failed to open file '%s' for writing\n", outfn);
+		free(outfn);
+		return;
+	}
+	struct textnode *curr = aux->txthead;
+	for (int i = 0; i < aux->ntextobjs; i++) {
+		fprintf(stdout, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
+		fprintf(stream, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text);
+		curr = curr->next;
+	}
+	fclose(stream);
+	free(outfn);
+	return;
+}
+
+
+
+
+// *********************************************************************
+// DEBUG
 
 // Utility -- Handles simplistic approach to UTF-16
@@ -1740,9 +1864,6 @@ char convert2char(unsigned int b1)
 }
 
-// *********************************************************************
-// DEBUG
-
 HParsedToken *
 act_txtbegin_(const HParseResult *p, void *u)
 {
@@ -1761,6 +1882,8 @@ act_txtend(const HParseResult *p, void *u)
 	return (HParsedToken *)p->ast;
 }
 
+// *********************************************************************
+
 
 /*
  * ********************************************************************
@@ -1769,6 +1892,11 @@
  */
 
+
+
+
+
+
 /*
  * input grammar
  */
 
 HParser *p_textbegin;
 HParser *p_textstream;
 HParser *p_trailer;
 HParser *p_page;
+HParser *p_dictobj;
 
 /* continuations for h_bind() */
@@ -1826,6 +1955,7 @@ init_parser(struct Env *aux)
 	TT_XREntry =	h_allocate_token_new("XREntry", NULL, pp_xrentry);
 	TT_Ref =	h_allocate_token_new("Ref", NULL, pp_ref);
 	TT_Dict =	h_allocate_token_new("Dict", NULL, pp_dict);
+	TT_TextEntry =	h_allocate_token_new("TextEntry", NULL, pp_textentry);
 
 	/* lines */
 	H_RULE(cr,	p_mapch('\r', '\n'));	/* semantic value: \n */
@@ -2176,13 +2306,13 @@ init_parser(struct Env *aux)
 \
 \
 	// Page Tree
-	H_ARULE(contentstream, h_middle(stmbeg, h_many1(h_uint8()), stmend));
+	H_RULE(contentstream, h_left(h_bind(stmbeg, kcontentstream, aux), stmend));
 //	H_ARULE(contentstream, h_middle(stmbeg, h_many(SEQ(h_not(stmend), h_uint8())), stmend));
-	H_ARULE(pgcontents, CHX(array, contentstream));
-	H_ARULE(page, SEQ(ws, npair, wel, KW("obj"), ws, pgcontents,
+//	H_ARULE(pgcontents, CHX(array, contentstream));
+	H_ARULE(page, SEQ(ws, npair, wel, KW("obj"), ws, contentstream,
+	    OPT(ws), OPT(lws), KW("endobj")));
+	H_ARULE(dictobj, SEQ(ws, npair, wel, KW("obj"), ws, CHX(contentstream, dict),
 	    OPT(ws), OPT(lws), KW("endobj")));
-//	H_ARULE(page, CHX(ref, array));
-
-	p_page = page;
 
@@ -2202,6 +2332,8 @@ init_parser(struct Env *aux)
 	/* text parser variables */ \
 	p_textbegin = txtbegin; \
 	p_textstream = txtstream; \
+	p_page = page;
+	p_dictobj = CHX(dictobj, contentstream);
 
 	p_fail = h_nothing_p();
 	p_epsilon = epsilon;
@@ -2325,6 +2457,8 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx)
 	/*
 	 * decode the stream and find the target object in it
 	 */
+	// SR:: ??? This seems wrong ???
+ // -- The only path through this function is the one through the parser return NULL; // XXX } @@ -2596,6 +2730,7 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) res = h_parse_finish(sp); // XXX always return NULL on error? #else + fprintf (stdout, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out); res = h_parse(p, pred.out, pred.nout); free(pred.out); #endif @@ -2608,64 +2743,222 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) return res; } -/* LZW helpers */ -typedef struct -{ - uint8_t *lzw_buf; - size_t total_buf_size; - size_t write_head; - size_t write_tail; - uint8_t write_checksum; - size_t eof_loc; - HBytes *input_stream; - size_t read_head; - size_t read_tail; - uint8_t read_checksum; -} lzwspec; +HParseResult * +FlateDecode2(const Dict *parms, HBytes b, HParser *p) +{ + size_t const BUFSIZE = 8 * 1024; + uint8_t *buf; +#ifdef ITERATIVE // XXX + HSuspendedParser *sp; +#endif + HParseResult *res; + const HParsedToken *v; + size_t sz; + int done; + z_stream strm = {0}; + int ret; + struct predictor pred = {1, 1, 8, 1}; + int (*depredict)(struct predictor *, uint8_t *, size_t); -lzwspec *cur_lzw_spec; + /* set up the predictor (if any) */ + #define SETPARM(VAR,STR) do { \ + v = dictentry(parms, (STR)); \ + if (v != NULL) { \ + if (v->token_type != TT_SINT || v->sint < 0) \ + return NULL; \ + VAR = v->sint; \ + } } while(0) + SETPARM(pred.num, "Predictor"); + SETPARM(pred.colors, "Colors"); + SETPARM(pred.bpc, "BitsPerComponent"); + SETPARM(pred.columns, "Columns"); + #undef SETPARM + if (pred.num == 1) + depredict = depred_none; + else { + if (pred.num >= 10 && pred.num <= 15) + depredict = depred_png; + else if (pred.num == 2) { + /* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */ + if (pred.bpc == 8) { + pred.predfun = pp_sub; /* predict left */ + depredict = depred_png; + } else { + // XXX add general TIFF predictor (bpc != 8) + fprintf(stderr, "FlateDecode: /Predictor %d " + "not supported for /BitsPerComponent %d\n", + pred.num, pred.bpc); + return NULL; + } + } else { + fprintf(stderr, "FlateDecode: /Predictor %d" + " not supported\n", pred.num); + return NULL; + } -/* used by write_lzw_buffer to get more space for decoding if needed */ -void -grow_lzw_buffer(size_t amount) -{ - uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); - if(ret_buf != NULL) - { - cur_lzw_spec->total_buf_size += amount; - cur_lzw_spec->lzw_buf = ret_buf; - } - else - { - fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); - return; + /* allocate row buffer */ + if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) { + fprintf(stderr, "FlateDecode: overflow\n"); + return NULL; + } + pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; + pred.buf = calloc(1, pred.rowsz); + if (pred.buf == NULL) + err(1, "FlateDecode"); } -} -lzwspec * -new_lzw_spec(HBytes *bytes) -{ - size_t const BUFSIZE = sizeof(uint8_t) * 1024; - lzwspec *ret = malloc(sizeof(lzwspec)); - memset(ret, 0, sizeof(lzwspec)); - ret->input_stream = bytes; - ret->lzw_buf = malloc(BUFSIZE); - ret->total_buf_size = BUFSIZE; - return ret; -} + /* set up zlib */ + // XXX pass our allocator to zlib + ret = inflateInit(&strm); + if (ret != Z_OK) + errx(1, "inflateInit: %s (%d)", strm.msg, ret); + buf = malloc(BUFSIZE); + if (buf == NULL) + err(1, "FlateDecode"); -void -delete_lzw_spec(lzwspec *spec) -{ - free(spec->lzw_buf); - free(spec); -} +#ifdef ITERATIVE // XXX + /* initialize target parser */ + sp = h_parse_start(p); 
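+	/* NB: pred.sp hands the suspended parser to the depredictor, so each
+	 * inflated chunk can be fed to it incrementally (iterative mode only) */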
+ assert(sp != NULL); + pred.sp = sp; +#endif -void -bind_lzw_spec(lzwspec *spec) -{ + done = 0; + strm.avail_in = b.len; + strm.next_in = (unsigned char *)b.token; + do { + strm.avail_out = BUFSIZE; + strm.next_out = buf; + + ret = inflate(&strm, Z_NO_FLUSH); + if (ret != Z_STREAM_END && ret != Z_OK) { + fprintf(stderr, "inflate: %s (%d)\n", strm.msg, ret); + break; + } + + sz = BUFSIZE - strm.avail_out; + done = depredict(&pred, buf, sz); + } while (done == 0 && ret == Z_OK); + +#ifdef ITERATIVE // XXX + res = h_parse_finish(sp); + // XXX always return NULL on error? +#else + // decoded stream in pred.out +// FILE *decodef = fopen ("flatecode.out", "w"); +// fprintf (decodef, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out); + fprintf (stdout, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out); + unsigned char *fdec = pred.out; +// char _l; + int i; + for (i=0; i<(pred.nout/2); ++i) + { + convert2char(*fdec); +// _l = convert2char(*fdec); +// fprintf(decodef, " %c-%d ", _l, _l); + fdec ++; + } + res = NULL; + + res = h_parse(p_textbegin, pred.out, pred.nout); + if ((res != NULL) && (res->ast != NULL)) { + /* let's make sure if the stream has text strings */ + const HParsedToken *tstr = H_INDEX_TOKEN(res->ast, 0); + if (bytes_eq(tstr->bytes, "BT")) { + fprintf (stdout, "decode_stream:: Found a text stream\n"); + res = h_parse(p, pred.out, pred.nout); + if (res == NULL) { + fprintf(stderr, "decode_stream::Text String parse failed!!\n"); + } + } + } + res = h_parse(p, pred.out, pred.nout); +// exit(0); + + // TODO:: Refactor across all filters + // Create a byte parser to return the decoded stream + if (res == NULL) { + H_RULE(bytes_p, h_many1(h_token((const uint8_t*)pred.out, (size_t)pred.nout))); + res = h_parse(bytes_p, pred.out, pred.nout); // return the decoded stream + } + + free(pred.out); +#endif + inflateEnd(&strm); + free(pred.buf); + free(buf); + + if (done == -1) + return NULL; + return res; +} + + + + + + + +/* LZW helpers */ + +typedef struct +{ + uint8_t *lzw_buf; + size_t total_buf_size; + size_t write_head; + size_t write_tail; + uint8_t write_checksum; + size_t eof_loc; + + HBytes *input_stream; + size_t read_head; + size_t read_tail; + uint8_t read_checksum; +} lzwspec; + +lzwspec *cur_lzw_spec; + +/* used by write_lzw_buffer to get more space for decoding if needed */ +void +grow_lzw_buffer(size_t amount) +{ + uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); + if(ret_buf != NULL) + { + cur_lzw_spec->total_buf_size += amount; + cur_lzw_spec->lzw_buf = ret_buf; + } + else + { + fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); + return; + } +} + +lzwspec * +new_lzw_spec(HBytes *bytes) +{ + size_t const BUFSIZE = sizeof(uint8_t) * 1024; + lzwspec *ret = malloc(sizeof(lzwspec)); + memset(ret, 0, sizeof(lzwspec)); + ret->input_stream = bytes; + ret->lzw_buf = malloc(BUFSIZE); + ret->total_buf_size = BUFSIZE; + return ret; +} + +void +delete_lzw_spec(lzwspec *spec) +{ + free(spec->lzw_buf); + free(spec); +} + +void +bind_lzw_spec(lzwspec *spec) +{ cur_lzw_spec = spec; } @@ -2988,344 +3281,280 @@ p_take__m(HAllocator *mm__, size_t n, struct Env *aux) return h_left__m(mm__, bytes, skip); } -HParser *p_xrefdata__m(HAllocator *, const Dict *); + +// Parser for object streams HParser *p_objstm__m(HAllocator *, const Dict *); -HParser * -p_stream_data__m(HAllocator *mm__, const Dict *dict) +struct streamspec { + Dict *dict; /* stream dictionary */ + 
 	HParser *parser;	/* data parser */
+};
+
+
+
+/*
+ * ********************************************************************
+ * Start Catalog parsing
+ * ********************************************************************
+ */
+
+/*
+ * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
+ * and parse the result with 'p'.
+ */
+HParseResult *
+decode_contentstream(const Dict *d, HBytes b, HParser *p)
 {
+	HParseResult *(*filter)(const Dict *, HBytes, HParser *);
+	const Dict *parms = NULL;
 	const HParsedToken *v;
+	HParseResult *res = NULL;
 
-	v = dictentry(dict, "Type");
-	if (v == NULL || v->token_type != TT_BYTES)	// XXX -> custom type
-		return NULL;		/* no /Type field */
-
-	/* interpret known stream types */
-	if (bytes_eq(v->bytes, "XRef"))
-		return p_xrefdata__m(mm__, dict);
-#ifndef NOOBJSTM
-	if (bytes_eq(v->bytes, "ObjStm"))
-		return p_objstm__m(mm__, dict);
-#endif
-	if (bytes_eq(v->bytes, "XObject")) {
+	/*
+	 * Check if there is additional information in the dictionary
+	 * that we should use to process the content stream
+	 *
+	 * If the data in the stream is encoded, a filter will be specified in
+	 * the dictionary that must be used to decode the data first
+	 *
+	 * TODO:: Handle arrays of filters (chained) and their decode parameters
+	 */
+	v = dictentry(d, "Filter"); // look for a filter
+
+	if (v != NULL) { // data is encoded
+
+
+		if (v->token_type != TT_BYTES) {
+			// XXX TT_SEQUENCE would be a filter chain; that's not supported, yet.
+			// But it might also be something bogus, in which case we should fail.
+			return NULL;
+		}
+
+		if (bytes_eq(v->bytes, "FlateDecode"))
+			filter = FlateDecode;
+		else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+			filter = ASCIIHexDecode;
+		else if (bytes_eq(v->bytes, "ASCII85Decode"))
+			filter = ASCII85Decode;
+		else if (bytes_eq(v->bytes, "RunLengthDecode"))
+			filter = RunLengthDecode;
+		else if (bytes_eq(v->bytes, "LZWDecode"))
+			filter = LZWDecode;
+		else {	/* filter not supported */
+			fprintf(stderr, "decode_contentstream:: Unsupported Filter [%.*s]\n",
+				(int)v->bytes.len, v->bytes.token);
+			return NULL;	/* Treat the stream as a byte array */
+		}
+		/* Check for parameters for the filter */
+		v = dictentry(d, "DecodeParms");
+		if (v && v->token_type == TT_Dict)
+			parms = v->user;
+
+		res = filter(parms, b, p);
+
+		/* Debug */
+		if (res && res->ast) {
+			fprintf(stdout, "decode_contentstream: parsed token type = %u\n", res->ast->token_type);
+		}
+	} /* The dictionary provided direction for processing the stream */
+
+	/*
+	 * It is possible that we should always process the stream as a content stream,
+	 * but it is not yet certain that this covers all cases.
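+	 * (as written, an unencoded stream falls through to the text-stream
+	 * probe below, and act_kcontentstream_value falls back to returning
+	 * the raw token when decoding fails)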
+	 */
+	else { // content stream is not encoded
 		/*
-		 * TODO:: external objects can be images, forms, or postscript objects
-		 * We are not handling them at the moment
+		 * We know it is a stream and has a length
+		 * Have to find out what kind of content stream it is
+		 * For now, just check for text string (stream)
+		 * Parse the text stream object
+		 * Note: the stream can have text streams embedded in other stream content
+		 * and can also have in-between content
 		 */
-		fprintf (stdout, "p_stream_data__m: XObject parsing is not yet supported!\n");
-		return NULL;
+		res = h_parse(p_textbegin, b.token, b.len);
+		if ((res != NULL) && (res->ast != NULL)) {
+			/* make sure the stream really starts with a text object */
+			const HParsedToken *tstr = H_INDEX_TOKEN(res->ast, 0);
+			if (bytes_eq(tstr->bytes, "BT")) {
+				fprintf (stdout, "decode_contentstream:: Found a text stream\n");
+				res = h_parse(p, b.token, b.len);
+				if (res == NULL) {
+					fprintf(stderr, "decode_contentstream:: Text stream parse failed!\n");
+				}
+			}
+		}
 	}
-	return NULL;		/* unrecognized type */
+
+	/*
+	 * There are other parameters that can be passed in the dictionary
+	 * They are not being handled currently
+	 */
+	const int numOptKeys = 3;
+	char *optionalKeys[3] = { "F", "FDecodeParms", "DL" };
+	for (int i=0; i<numOptKeys; i++) {
+		v = dictentry(d, optionalKeys[i]);
+		if (v) fprintf(stderr, "decode_contentstream:: Unsupported specification [%s]\n", optionalKeys[i]);
+	}
+	return res;
 }
 
-struct streamspec {
-	Dict *dict;		/* stream dictionary */
-	HParser *parser;	/* data parser */
-};
 
 HParsedToken *
-act_ks_value(const HParseResult *p, void *u)
+act_kcontentstream_value(const HParseResult *p, void *u)
 {
 	struct streamspec *spec = u;
 	HBytes bytes = H_CAST_BYTES(p->ast);
 	HParseResult *res;
 
 	/* decode and parse the stream data */
-	res = decode_stream(spec->dict, bytes, spec->parser);
-	if (res == NULL) {
-		HBytes b = {NULL, 0};
-		const HParsedToken *v = dictentry(spec->dict, "Type");
-		if (v != NULL && v->token_type == TT_BYTES) {
-			b.token = v->bytes.token;
-			b.len = v->bytes.len;
-		}
-		if (b.len > INT_MAX)
-			b.len = INT_MAX;
-		fprintf(stderr, "parse error in stream (%*s)\n",
-		    (int)b.len, b.token);
-		// XXX return the undecoded stream (p->ast)?
+	res = decode_contentstream(spec->dict, bytes, spec->parser);
+	if (!res) {
+		res = (HParseResult *)p;	/* fall back to the undecoded token */
 	}
 
 	return H_MAKE(HParseResult, res);
 }
 
-/*
- * This continuation takes the stream dictionary (as first element of x) and
- * should return a parser that consumes exactly the bytes that make up the
- * stream data.
- *
- * x = (dict ...)
- */ -HParser * -kstream(HAllocator *mm__, const HParsedToken *x, void *env) -{ - struct Env *aux = env; - HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); - Dict *dict = H_CAST(Dict, dict_t); - const HParsedToken *v = NULL; - HParser *bytes_p, *dict_p, *value_p; - struct streamspec *spec; - size_t sz; - - /* look for the Length entry */ - v = dictentry(dict, "Length"); - v = resolve(aux, v); /* resolve indirect references */ - if (v == NULL || v->token_type != TT_SINT || v->sint < 0) - goto fail; - sz = (size_t)v->sint; - - //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug - - dict_p = p_return__m(mm__, dict_t); - bytes_p = p_take__m(mm__, sz, aux); - - spec = h_alloc(mm__, sizeof(struct streamspec)); - spec->dict = dict; - spec->parser = p_stream_data__m(mm__, dict); - if (spec->parser != NULL) - value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); - else - value_p = bytes_p; - - return h_sequence__m(mm__, dict_p, value_p, NULL); -fail: #if 0 - if (v == NULL) - fprintf(stderr, "stream /Length missing\n"); - else if (v -> token_type != TT_SINT) - fprintf(stderr, "stream /Length not an integer\n"); - else if (v < 0) - fprintf(stderr, "stream /Length negative\n"); +typedef struct { + enum {XR_FREE, XR_INUSE, XR_OBJSTM} type; + union { + struct { size_t next, ngen; } f; /* free */ + struct { size_t offs, gen; } n; /* inuse */ + struct { size_t stm, idx; } o; /* objstm */ + }; + const HParsedToken *obj; +} XREntry; + +typedef struct { size_t nr, gen; } Ref; #endif - //h_pprintln(stderr, p); // XXX debug - return p_fail; -} -HParser * -p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry) +const HParsedToken * +parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p) { - HParser *ret_base, *ret_count, *p_header, *p_entries; + HParseResult *res; + size_t def_nr, def_gen; - ret_base = p_return_uint__m(mm__, base); - ret_count = p_return_uint__m(mm__, count); - p_header = h_sequence__m(mm__, ret_base, ret_count, NULL); - p_entries = h_repeat_n__m(mm__, p_entry, count); + if (offset >= aux->sz) { + fprintf(stderr, "%s: position %zu (%#zx) for object %zu %zu is " + "out of bounds\n", aux->infile, offset, offset, nr, gen); + return NULL; + } - return h_sequence__m(mm__, p_header, p_entries, NULL); + if (p == NULL) { + fprintf(stderr, "parse_item: Unexpected request to parse object!!\n"); + return NULL; + } + HParser *pItem = h_right(h_seek(offset * 8, SEEK_SET), p); // XXX + res = h_parse(pItem, aux->input, aux->sz); + if (res == NULL) { + fprintf(stderr, "%s: error parsing object %zu %zu at position " + "%zu (%#zx)\n", aux->infile, nr, gen, offset, offset); + return NULL; + } + assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE); + /* res->ast = ((nr gen) obj) */ + + def_nr = H_INDEX_UINT(res->ast, 0, 0); + def_gen = H_INDEX_UINT(res->ast, 0, 1); + if (def_nr != nr || def_gen != gen) { + fprintf(stderr, "%s: object ID mismatch at position %zu " + "(%#zx): sought %zu %zu, found %zu %zu.\n", aux->infile, + offset, offset, nr, gen, def_nr, def_gen); + return NULL; + } + + return H_INDEX_TOKEN(res->ast, 1); } -HParser * -p_xrefdata__m(HAllocator *mm__, const Dict *dict) +const HParsedToken * +parse_objstm_item(struct Env *aux, size_t nr, size_t stm_nr, size_t idx, size_t *offset, HParser *p) { - const HParsedToken *v; - HParser *p_field[3], *p_entry, **p_subs; - size_t W[3]; - size_t Size, Wn, Wskip; + XREntry *ent; + const HParsedToken *stm; + + *offset = 0; // initialize the offset /* - * what follows is a horrible bunch 
of code that builds, from the - * entries /W, /Index, and /Size in the stream dictionary, a parser for - * the cross-reference data itself. - * - * in short, every cross-reference entry consists of (as of PDF 2.0) - * three fields, but it could be more. /W gives the widths (in bytes) - * of these fields. the /Index specifies the division of the data into - * subsections; it is an array of natural numbers that in pairs specify - * the base object number and length of each subsection - analogous to - * the subsection headers in classic xref sections. - * - * when /Index is missing, a default value of [0 Size] is defined, - * where Size is the value of the /Size field. as in normal trailer - * dictionaries, it specifies the total size of the (entire) - * cross-reference table. - * - * when /W states a width of 0 for a field, that field is not present - * in the data and a default value should be used "if there is one". - * most notably, the first field determines the "type" of the entry, - * analogous to the 'n' and 'f' tags in classic xref sections; a width - * of 0 for the first field is specified to mean that every entry is of - * type 1 (= "n"). that type, in particular, specifies a default of 0 - * for field 3 (generation). in fact, these are the only defaults - * defined by ISO 32000-1:2008 (PDF 1.7). - * - * entry type field no. default value - * 1 (type) 1 - * 1 ("n") 3 (gen.) 0 + * acquire the stream object */ - /* Size (required) - total size of xref table */ - v = dictentry(dict, "Size"); - if (v == NULL || v->token_type != TT_SINT || v->sint < 1) - return p_fail; - Size = v->sint; - - /* W (required) - field widths for each xref entry */ - v = dictentry(dict, "W"); - if (v == NULL || v->token_type != TT_SEQUENCE) - return p_fail; - if ((Wn = v->seq->used) < 3) - return p_fail; - Wskip = 0; - for (size_t i = 0; i < Wn; i++) { - HTokenType tt = v->seq->elements[i]->token_type; - int64_t w = v->seq->elements[i]->sint; + ent = lookup_xref(aux, stm_nr, 0); + if (ent == NULL) + return NULL; /* stream not found */ - if (tt != TT_SINT || w < 0) - return p_fail; - if (i < 3) { - /* we can't take >64 bits and want to use size_t */ - if (w > 8 || (uint64_t)w > sizeof(size_t)) - return p_fail; - W[i] = (size_t)w; - } else { - if ((uint64_t)w > SIZE_MAX - Wskip) - return p_fail; /* overflow */ - Wskip += w; + switch (ent->type) + { + case XR_FREE: + return NULL; /* stream deleted */ + case XR_INUSE: + if (ent->n.gen != 0) + return NULL; /* stream replaced */ + if (ent->obj == NULL) { + /* + * decode the stream and find the target object in it + */ + ent->obj = parse_item(aux, stm_nr, 0, ent->n.offs, p); + *offset = ent->n.offs; } + break; + case XR_OBJSTM: + return NULL; /* invalid: nested streams */ } - if (Wskip > SIZE_MAX / 8) - return p_fail; - /* - * build the parser for one xref entry. - * - * in summary, the only sensible forms for /W are: - * - * [t x y] with t,x,y > 0 full general form - * [0 x y] with x,y > 0 only type-1 ("in use") entries - * [0 x 0] with x > 0 only type-1 entries, only offsets - * - * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed - * by the spec; as long as all entries are of type 1, the xref data can - * be interpreted without ambiguity. - * - * in fact, every nonsensical form is possible as long as there are 0 - * entries. - * - * we realize this mess by just initializing the default parser to - * p_fail and and replacing the known cases afterwards. 
- */ - for (size_t i = 0; i < 3; i++) { - if (W[i] == 0) - p_field[i] = p_fail; /* no known default */ - else - p_field[i] = h_bits__m(mm__, W[i] * 8, false); - } - /* known default cases: */ - if (W[0] == 0) - p_field[0] = p_return_1; /* all type 1 */ - if (W[2] == 0) { - p_field[2] = p_return_0; /* all generation 0 */ - #if 0 - /* XXX - * i've seen a 0-width field 3 used with values of 1 (inuse) - * and 2 (objstm) in field 1, implying "objstm idx 0" for the - * latter case. - */ - if (W[0] > 0) { - /* type field *must* be 1 */ - p_field[0] = h_attr_bool__m(mm__, p_field[0], - validate_eq_uint, (void *)1); - } - #endif + if ((stm = ent->obj) == NULL) { + fprintf(stderr, "%s: error parsing object stream at position " + "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs); + return NULL; } - if (Wskip > 0) // XXX h_skip does not work with CF, yet - return p_fail; - p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL); - p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL); - /* Index (optional) - subsections [base count ...] */ - v = dictentry(dict, "Index"); - if (v == NULL) { - /* default: [0 Size] */ - p_subs = h_alloc(mm__, 2 * sizeof(HParser *)); - p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry); - p_subs[1] = NULL; - } else if (v->token_type != TT_SEQUENCE) { - return p_fail; - } else { - size_t nsubs = v->seq->used / 2; - - /* build a parser for each subsection */ - if (nsubs >= SIZE_MAX / sizeof(HParser *)) - return p_fail; - p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *)); - for (size_t i = 0; i < nsubs; i++) { - HParsedToken *base = v->seq->elements[2 * i]; - HParsedToken *n = v->seq->elements[2 * i + 1]; - - if (base->token_type != TT_SINT || base->sint < 0 || - n->token_type != TT_SINT || n->sint < 0 || - (uint64_t)n->sint > SIZE_MAX) - return p_fail; - - p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint, - p_entry); - } - p_subs[nsubs] = NULL; - } - return h_sequence__ma(mm__, (void **)p_subs); + return ent->obj; // The only path through this function is the one through the parser } -HParser * -p_objstm__m(HAllocator *mm__, const Dict *dict) -{ - const HParsedToken *v; - size_t N; - - v = dictentry(dict, "N"); - if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || - (uint64_t)v->sint > SIZE_MAX) { - fprintf(stderr, "missing /N on object stream\n"); - return p_fail; - } - N = v->sint; - HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL); - HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N); +const HParsedToken * +resolve_item(struct Env *aux, const HParsedToken *v, size_t *offset, HParser *p) +{ + XREntry *ent = NULL; + Ref *r; - return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL); - // XXX leading and trailing ws OK? + *offset = 0; // initialize the offset - // XXX consistency-check against /First, idx, /N -} - -/* - * This continuation is very similar to kstream, except that it does not - * rely on /Length to consume the right amount of input. If /Length is - * not present or indirect, it will operate on the entire rest of the input. - * This is permissible, other than for general streams, because the XRef data - * is always self-delimiting. - * - * x = (dict ...) 
- */ -HParser * -kxstream(HAllocator *mm__, const HParsedToken *x, void *env) -{ - struct Env *aux = env; - HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); - Dict *dict = H_CAST(Dict, dict_t); - const HParsedToken *v; - HParser *bytes_p, *dict_p, *value_p; - struct streamspec *spec; + /* direct objects pass through */ + if (v == NULL || v->token_type != TT_Ref) + return v; - /* restrict bytes to Length if present (and not indirect) */ - v = dictentry(dict, "Length"); - if (v != NULL && v->token_type == TT_SINT && v->sint >= 0) - bytes_p = p_take__m(mm__, v->sint, aux); - else - bytes_p = p_rest__m(mm__, aux); // XXX consume the proper amount + /* we are looking at an indirect reference */ + r = v->user; - /* construct the parser for the stream data */ - spec = h_alloc(mm__, sizeof(struct streamspec)); - spec->dict = dict; - spec->parser = p_xrefdata__m(mm__, dict); - assert (spec->parser != NULL); + /* find the xref entry for this reference */ + ent = lookup_xref(aux, r->nr, r->gen); + if (ent == NULL) + return NULL; /* obj not found */ + if (ent->obj != NULL) + return resolve_item(aux, ent->obj, offset, p); - dict_p = p_return__m(mm__, dict_t); - value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); + /* parse the object and memoize */ + ent->obj = v; /* break loops */ + switch (ent->type) + { + case XR_FREE: + return NULL; /* obj deleted */ + case XR_INUSE: + if (ent->n.gen != r->gen) + return NULL; /* obj nr reused */ + ent->obj = parse_item(aux, r->nr, r->gen, ent->n.offs, p); + *offset = ent->n.offs; + break; + case XR_OBJSTM: + if (r->gen != 0) + return NULL; /* invalid entry! */ + ent->obj = parse_objstm_item(aux, r->nr, ent->o.stm, ent->o.idx, offset, p); + break; + } - return h_sequence__m(mm__, dict_p, value_p, NULL); + return resolve_item(aux, ent->obj, offset, p); } @@ -3333,89 +3562,67 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) /* - * This continuation takes the text stream and saves it in the environment for further - * processing, e.g. writing it out to a file with the same name as the pdf input filename - * but woth a .psectxt suffix. + * This continuation takes the content stream and processes it for test extraction. + * It is very similar to kstream in approach. It decodes and extracts the stream contents + * and * It does not consume the string and returns the token as the output. * * x = (txtobj ...) 
  */
 HParser *
-ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
+kcontentstream(HAllocator *mm__, const HParsedToken *x, void *env)
 {
 	struct Env *aux = env;
-#if 0
-	if (x->token_type != TT_TextEntry) {
-		fprintf(
-				stderr,
-				"ktxtstream:: Unexpected token type =%d :: (Expected TT_TextEntry)\n",
-				x->token_type);
-		assert(x->token_type == TT_TextEntry);
-		return NULL;
-	}
-#endif
-	assert (x->token_type == TT_SEQUENCE);
-	int n_tobjs = x->seq->used;
+	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
+	Dict *dict = H_CAST(Dict, dict_t);
+	const HParsedToken *v = NULL;
+	HParser *bytes_p, *dict_p, *value_p;
+	struct streamspec *spec;
+	size_t sz=0, nOffset=0;
 
-	for (int n=0; n<n_tobjs; n++) {
-		assert(x->seq->elements[n]->token_type == TT_TextEntry);
-		TextEntry *tste = H_CAST(TextEntry, x->seq->elements[n]);
-		struct textstr *tstr = NULL;
-		/*
-		 * To save all of the operators along with the text string, we have to walk
-		 * through all of the tokens and keep a table of pointers to them
-		 * For now, just keep a pointer to the text string in the environment
-		 *
-		 */
-		switch (tste->type) {
-		case TW_Tj:
-		case TW_Tq:
-		case TW_Tqq:
-			tstr = &tste->tstr;
-			break;
-		case TW_TJ:
-			tstr = &tste->tarray.flattened;
-			break;
-		default:
-			fprintf(stderr, "ktxtstream:: Text token type '%u' ignored\n",
-					tste->type);
-		}
+	/* look for the Length entry -- could be a reference */
+	v = dictentry(dict, "Length");
+	v = resolve_item(aux, v, &nOffset, NULL);	/* resolve indirect references */
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 0) {
+		if (v == NULL)
+			fprintf(stderr, "kcontentstream: stream /Length missing\n");
+		else if (v->token_type != TT_SINT)
+			fprintf(stderr, "kcontentstream: stream /Length not an integer\n");
+		else if (v->sint < 0)
+			fprintf(stderr, "kcontentstream: stream /Length negative\n");
+
+		//h_pprintln(stderr, p);	// XXX debug
+		return p_fail;
+	}
 
-		fprintf(stdout, "ktxtstream: Value = %.*s\n", tstr->nchars, tstr->text);
+	sz = (size_t)v->sint;
+	dict_p = p_return__m(mm__, dict_t);
+	bytes_p = p_take__m(mm__, sz, aux);
 
-		// store the string in the environment
-		// not sure whether we need to actually store the string in malloc'ed area
-		// currently, we are reusing the token memory previously created
-		struct textnode *txtnd = (struct textnode *) malloc(
-				sizeof(struct textnode));
-		txtnd->tstr = tstr;
-		txtnd->next = NULL;
-		if (aux->txthead == NULL)
-			aux->txthead = txtnd;
-		if (aux->txttail == NULL)
-			aux->txttail = txtnd;
-		else {
-			aux->txttail->next = txtnd;
-			aux->txttail = txtnd;
-		}
-		aux->ntextobjs += 1;
+	spec = h_alloc(mm__, sizeof(struct streamspec));
+	spec->dict = dict;
+	v = dictentry(dict, "Type");
+	if (v == NULL)	// XXX -> custom type
+		spec->parser = p_textstream;
+	else if ( (v->token_type == TT_BYTES) && bytes_eq(v->bytes, "ObjStm") )
+		spec->parser = p_objstm__m(mm__, dict);
+	else {
+		fprintf(stderr, "kcontentstream: Not a text or object stream!\n");
+		return p_fail;
 	}
 
-	return p_return__m(mm__, x);
+	value_p = h_action__m(mm__, bytes_p, act_kcontentstream_value, spec);
+
+	return h_sequence__m(mm__, dict_p, value_p, NULL);
+
 }
 
-/*
- * ********************************************************************
- * Start Catalog parsing
- * ********************************************************************
- */
-
 void parse_pagenode(
 	struct Env *aux,
 	PtNode_S *pgNode // node
@@ -3425,10 +3632,15 @@ void parse_pagenode(
 	Dict *pageD = pgNode->pn.page;
 	const HParsedToken *contents_t = NULL;	// dictionary token
 	Ref *contents_r = NULL;
-//	const
HParsedToken *contents = NULL; // resolved token - XREntry *ent = NULL; - HParseResult *res = NULL; + const HParsedToken *contents = NULL; // resolved token +// XREntry *ent = NULL; +// HParseResult *res = NULL; +// const HParsedToken *v = NULL; +// const HParsedToken *dict_t; +// Dict *dict_s; // stream +// HParser *bytes_p, *dict_p, *value_p; + size_t sz = 0, nOffset = 0; // Hold on to the Resources dictionary // This dictionary may be empty @@ -3445,14 +3657,30 @@ void parse_pagenode( } else if (contents_t->token_type == TT_Ref) { contents_r = H_CAST(Ref, contents_t); - ent = lookup_xref(aux, contents_r->nr, contents_r->gen); - if (ent->type == XR_INUSE) { - size_t offset = ent->n.offs; - fprintf (stdout, "parse_pagenode:: Offset = %ld\n", offset); - res = h_parse(p_page, aux->input + offset, aux->sz - offset); - fprintf (stdout, "parse_pagenode:: res = %p\n", (void *) res); - } -// contents = resolve(aux, contents_t); + fprintf(stdout, "parse_pagenode: ref.nr = %ld, ref.gen=%ld\n", contents_r->nr, contents_r->gen); +// dict_t = resolve_item(aux, contents_t, &nOffset, p_dictobj); +// dict_s = H_CAST(Dict, dict_t); +// v = dictentry(dict_s, "Length"); +// v = resolve(aux, v); /* resolve indirect references if necessary */ +// if (v == NULL || v->token_type != TT_SINT || v->sint < 0) +// goto fail; +// sz = (size_t)v->sint; +// fprintf(stdout, "parse_pagenode: sz = %ld\n", sz); +// dict_p = p_return__m(dict_s->arena, dict_t); +// bytes_p = p_take__m(dict_s->arena, sz, aux); // parser for the byte stream +// return h_sequence__m(dict_s->arena, dict_p, p_page, NULL); + + contents = resolve_item(aux, contents_t, &nOffset, p_page); + fprintf(stdout, "parse_pagenode: Page node contents = %p\n", (void *)contents); + fprintf(stdout, "parse_pagenode: sz = %ld\n", sz); + +// ent = lookup_xref(aux, contents_r->nr, contents_r->gen); +// if (ent->type == XR_INUSE) { +// size_t offset = ent->n.offs; +// fprintf (stdout, "parse_pagenode:: Offset = %ld\n", offset); +// res = h_parse(p_page, aux->input + offset, aux->sz - offset); +// fprintf (stdout, "parse_pagenode:: res = %p\n", (void *) res); +// } } else { fprintf(stderr, "parse_pagenode: Page node is not a reference ... 
may be an array!\n"); @@ -3464,7 +3692,9 @@ void parse_pagenode( // contents->token_type); - end: +end: + +//fail: return; } @@ -3502,6 +3732,7 @@ parse_pagetree( const HParsedToken *item = NULL; size_t npages = 0; Ref *ptRef=NULL, *meRef=NULL; + size_t nOffset = 0; @@ -3520,7 +3751,7 @@ parse_pagetree( { node = &treeNode->kids[i]; kidRef = pgTable->elements[i]; - kidDict_t = resolve(aux, kidRef); // page or tree node dictionary token + kidDict_t = resolve_item(aux, kidRef, &nOffset, p_dictobj); // page or tree node dictionary token kidDict = H_CAST(Dict, kidDict_t); // page or tree node dictionary @@ -3607,8 +3838,9 @@ parse_pagetree( - end: - exit(0); +end: + return nOffset; +// exit(0); } @@ -3629,6 +3861,7 @@ parse_catalog(struct Env *aux, const HParsedToken *root) const Dict *ptRoot = NULL; // page tree root Dictionary const HParsedToken *kids = NULL; const HParsedToken *item = NULL; + size_t nOffset = 0; // initialize the catalog structure @@ -3640,7 +3873,7 @@ parse_catalog(struct Env *aux, const HParsedToken *root) // Ensure the reference is to the catalog dictionary - dict_t = resolve(aux, root); // token + dict_t = resolve_item(aux, root, &nOffset, p_dictobj); // token catalog = H_CAST(Dict, dict_t); // catalog dictionary item = dictentry(catalog, "Type"); if ( (item == NULL) || (item->token_type != TT_BYTES) || @@ -3651,74 +3884,413 @@ parse_catalog(struct Env *aux, const HParsedToken *root) aux->catalog.catalog = dict_t; // catalog dictionary token - // Catalog found -- Now get the root of the page tree associated with the catalog - ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary - if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) { - fprintf(stderr, "parse_catalog: Page Tree not found!\n"); - goto end; + // Catalog found -- Now get the root of the page tree associated with the catalog + ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary + if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) { + fprintf(stderr, "parse_catalog: Page Tree not found!\n"); + goto end; + } + aux->catalog.pRoot = ptRef; // indirect reference to the page tree + + + /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ + dict_t = resolve_item(aux, ptRef, &nOffset, p_dictobj); // page tree root node +// dict_t = resolve(aux, ptRef); // page tree root node + ptRoot = H_CAST(Dict, dict_t); // page tree root dictionary + + // Count is a required field + item = dictentry(ptRoot, "Count"); + if ( (item == NULL) || (item->token_type != TT_SINT) ) { + fprintf(stderr, "parse_catalog: Required page node count missing!\n"); + goto end; + } + else { + aux->catalog.pgCount = H_CAST_SINT(item); + } + + item = dictentry(ptRoot, "Parent"); // root node ==> parent should be NULL + if (item != NULL) { + fprintf(stderr, "parse_pagetree: Parent of root page tree node is not NULL [p = %p]!\n", + (void *)item); + goto end; + } + + + // Kids is a required field + kids = dictentry(ptRoot, "Kids"); // array of references to page or page tree nodes + if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) { + fprintf(stderr, "parse_catalog: There are no kids!\n"); + goto end; + } + + // parse_pagetree + aux->catalog.pgTree.type = PG_TREE; + aux->catalog.pgTree.parent = NULL; + parse_pagetree(aux, &aux->catalog.pgTree, ptRef, kids, 0); + + + + end: +// exit(0); + return success; +} + +/* + * ******************************************************************** + * End Catalog parsing + * 
******************************************************************** + */ + + + + + + + + +/* + * ******************************************************************** + * Start xref parsing + * ******************************************************************** + */ + + +HParser *p_xrefdata__m(HAllocator *, const Dict *); + +HParser * +p_stream_data__m(HAllocator *mm__, const Dict *dict) +{ + const HParsedToken *v; + + v = dictentry(dict, "Type"); + if (v == NULL || v->token_type != TT_BYTES) // XXX -> custom type + return NULL; /* no /Type field */ + + /* interpret known stream types */ + if (bytes_eq(v->bytes, "XRef")) + return p_xrefdata__m(mm__, dict); +#ifndef NOOBJSTM + if (bytes_eq(v->bytes, "ObjStm")) + return p_objstm__m(mm__, dict); +#endif + if (bytes_eq(v->bytes, "XObject")) { + /* + * TODO:: external objects can be images, forms, or postscript objects + * We are not handling them at the moment + */ + fprintf (stderr, "p_stream_data__m: XObject parsing is not yet supported!\n"); + return NULL; + } + return NULL; /* unrecognized type */ +} + + +HParsedToken * +act_ks_value(const HParseResult *p, void *u) +{ + struct streamspec *spec = u; + HBytes bytes = H_CAST_BYTES(p->ast); + HParseResult *res; + + /* decode and parse the stream data */ + res = decode_stream(spec->dict, bytes, spec->parser); + if (res == NULL) { + HBytes b = {NULL, 0}; + const HParsedToken *v = dictentry(spec->dict, "Type"); + if (v != NULL && v->token_type == TT_BYTES) { + b.token = v->bytes.token; + b.len = v->bytes.len; + } + if (b.len > INT_MAX) + b.len = INT_MAX; + fprintf(stderr, "parse error in stream (%*s)\n", + (int)b.len, b.token); + // XXX return the undecoded stream (p->ast)? + } + + return H_MAKE(HParseResult, res); +} + +/* + * This continuation takes the stream dictionary (as first element of x) and + * should return a parser that consumes exactly the bytes that make up the + * stream data. + * + * x = (dict ...) 
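+ *
+ * e.g. given '<< /Length 10 >> stream ... endstream', it returns
+ * h_sequence(return(dict), take(10) wrapped by act_ks_value): the
+ * dictionary token followed by exactly the 10 stream data bytes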
+ */ +HParser * +kstream(HAllocator *mm__, const HParsedToken *x, void *env) +{ + struct Env *aux = env; + HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + Dict *dict = H_CAST(Dict, dict_t); + const HParsedToken *v = NULL; + HParser *bytes_p, *dict_p, *value_p; + struct streamspec *spec; + size_t sz; + + /* look for the Length entry */ + v = dictentry(dict, "Length"); + v = resolve(aux, v); /* resolve indirect references */ + if (v == NULL || v->token_type != TT_SINT || v->sint < 0) + goto fail; + sz = (size_t)v->sint; + + //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug + + dict_p = p_return__m(mm__, dict_t); + bytes_p = p_take__m(mm__, sz, aux); + + spec = h_alloc(mm__, sizeof(struct streamspec)); + spec->dict = dict; + spec->parser = p_stream_data__m(mm__, dict); + if (spec->parser != NULL) + value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); + else + value_p = bytes_p; + + return h_sequence__m(mm__, dict_p, value_p, NULL); + +fail: + if (v == NULL) + fprintf(stderr, "stream /Length missing\n"); + else if (v -> token_type != TT_SINT) + fprintf(stderr, "stream /Length not an integer\n"); + else if (v < 0) + fprintf(stderr, "stream /Length negative\n"); + + //h_pprintln(stderr, p); // XXX debug + return p_fail; +} + +HParser * +p_xrefsub__m(HAllocator *mm__, size_t base, size_t count, HParser *p_entry) +{ + HParser *ret_base, *ret_count, *p_header, *p_entries; + + ret_base = p_return_uint__m(mm__, base); + ret_count = p_return_uint__m(mm__, count); + p_header = h_sequence__m(mm__, ret_base, ret_count, NULL); + p_entries = h_repeat_n__m(mm__, p_entry, count); + + return h_sequence__m(mm__, p_header, p_entries, NULL); +} + +HParser * +p_xrefdata__m(HAllocator *mm__, const Dict *dict) +{ + const HParsedToken *v; + HParser *p_field[3], *p_entry, **p_subs; + size_t W[3]; + size_t Size, Wn, Wskip; + + /* + * what follows is a horrible bunch of code that builds, from the + * entries /W, /Index, and /Size in the stream dictionary, a parser for + * the cross-reference data itself. + * + * in short, every cross-reference entry consists of (as of PDF 2.0) + * three fields, but it could be more. /W gives the widths (in bytes) + * of these fields. the /Index specifies the division of the data into + * subsections; it is an array of natural numbers that in pairs specify + * the base object number and length of each subsection - analogous to + * the subsection headers in classic xref sections. + * + * when /Index is missing, a default value of [0 Size] is defined, + * where Size is the value of the /Size field. as in normal trailer + * dictionaries, it specifies the total size of the (entire) + * cross-reference table. + * + * when /W states a width of 0 for a field, that field is not present + * in the data and a default value should be used "if there is one". + * most notably, the first field determines the "type" of the entry, + * analogous to the 'n' and 'f' tags in classic xref sections; a width + * of 0 for the first field is specified to mean that every entry is of + * type 1 (= "n"). that type, in particular, specifies a default of 0 + * for field 3 (generation). in fact, these are the only defaults + * defined by ISO 32000-1:2008 (PDF 1.7). + * + * entry type field no. default value + * 1 (type) 1 + * 1 ("n") 3 (gen.) 
 0
+	 */
+
+	/* Size (required) - total size of xref table */
+	v = dictentry(dict, "Size");
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 1)
+		return p_fail;
+	Size = v->sint;
+
+	/* W (required) - field widths for each xref entry */
+	v = dictentry(dict, "W");
+	if (v == NULL || v->token_type != TT_SEQUENCE)
+		return p_fail;
+	if ((Wn = v->seq->used) < 3)
+		return p_fail;
+	Wskip = 0;
+	for (size_t i = 0; i < Wn; i++) {
+		HTokenType tt = v->seq->elements[i]->token_type;
+		int64_t w = v->seq->elements[i]->sint;
+
+		if (tt != TT_SINT || w < 0)
+			return p_fail;
+		if (i < 3) {
+			/* we can't take >64 bits and want to use size_t */
+			if (w > 8 || (uint64_t)w > sizeof(size_t))
+				return p_fail;
+			W[i] = (size_t)w;
+		} else {
+			if ((uint64_t)w > SIZE_MAX - Wskip)
+				return p_fail;	/* overflow */
+			Wskip += w;
+		}
+	}
+	if (Wskip > SIZE_MAX / 8)
+		return p_fail;
+
+	/*
+	 * build the parser for one xref entry.
+	 *
+	 * in summary, the only sensible forms for /W are:
+	 *
+	 *   [t x y] with t,x,y > 0    full general form
+	 *   [0 x y] with x,y > 0      only type-1 ("in use") entries
+	 *   [0 x 0] with x > 0        only type-1 entries, only offsets
+	 *
+	 * however, though nonsensical, [t x 0] with t,x > 0 is not disallowed
+	 * by the spec; as long as all entries are of type 1, the xref data can
+	 * be interpreted without ambiguity.
+	 *
+	 * in fact, every nonsensical form is possible as long as there are 0
+	 * entries.
+	 *
+	 * we realize this mess by just initializing the default parser to
+	 * p_fail and replacing the known cases afterwards.
+	 */
+	for (size_t i = 0; i < 3; i++) {
+		if (W[i] == 0)
+			p_field[i] = p_fail;	/* no known default */
+		else
+			p_field[i] = h_bits__m(mm__, W[i] * 8, false);
+	}
+	/* known default cases: */
+	if (W[0] == 0)
+		p_field[0] = p_return_1;	/* all type 1 */
+	if (W[2] == 0) {
+		p_field[2] = p_return_0;	/* all generation 0 */
+		#if 0
+		/* XXX
+		 * i've seen a 0-width field 3 used with values of 1 (inuse)
+		 * and 2 (objstm) in field 1, implying "objstm idx 0" for the
+		 * latter case.
+		 */
+		if (W[0] > 0) {
+			/* type field *must* be 1 */
+			p_field[0] = h_attr_bool__m(mm__, p_field[0],
+			    validate_eq_uint, (void *)1);
+		}
+		#endif
+	}
+	if (Wskip > 0)	// XXX h_skip does not work with CF, yet
+		return p_fail;
+	p_entry = h_sequence__m(mm__, p_field[0], p_field[1], p_field[2], NULL);
+	p_entry = h_action__m(mm__, p_entry, act_xrstment, NULL);
+
+	/* Index (optional) - subsections [base count ...]
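+	 * e.g. /Index [0 3 10 2] denotes two subsections: objects 0-2 and 10-11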
*/ + v = dictentry(dict, "Index"); + if (v == NULL) { + /* default: [0 Size] */ + p_subs = h_alloc(mm__, 2 * sizeof(HParser *)); + p_subs[0] = p_xrefsub__m(mm__, 0, Size, p_entry); + p_subs[1] = NULL; + } else if (v->token_type != TT_SEQUENCE) { + return p_fail; + } else { + size_t nsubs = v->seq->used / 2; - /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ - dict_t = resolve(aux, ptRef); // page tree root node - ptRoot = H_CAST(Dict, dict_t); // page tree root dictionary + /* build a parser for each subsection */ + if (nsubs >= SIZE_MAX / sizeof(HParser *)) + return p_fail; + p_subs = h_alloc(mm__, (nsubs + 1) * sizeof(HParser *)); + for (size_t i = 0; i < nsubs; i++) { + HParsedToken *base = v->seq->elements[2 * i]; + HParsedToken *n = v->seq->elements[2 * i + 1]; - // Count is a required field - item = dictentry(ptRoot, "Count"); - if ( (item == NULL) || (item->token_type != TT_SINT) ) { - fprintf(stderr, "parse_catalog: Required page node count missing!\n"); - goto end; - } - else { - aux->catalog.pgCount = H_CAST_SINT(item); - } + if (base->token_type != TT_SINT || base->sint < 0 || + n->token_type != TT_SINT || n->sint < 0 || + (uint64_t)n->sint > SIZE_MAX) + return p_fail; - item = dictentry(ptRoot, "Parent"); // root node ==> parent should be NULL - if (item != NULL) { - fprintf(stderr, "parse_pagetree: Parent of root page tree node is not NULL [p = %p]!\n", - (void *)item); - goto end; + p_subs[i] = p_xrefsub__m(mm__, base->sint, n->sint, + p_entry); + } + p_subs[nsubs] = NULL; } + return h_sequence__ma(mm__, (void **)p_subs); +} +HParser * +p_objstm__m(HAllocator *mm__, const Dict *dict) +{ + const HParsedToken *v; + size_t N; - // Kids is a required field - kids = dictentry(ptRoot, "Kids"); // array of references to page or page tree nodes - if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) { - fprintf(stderr, "parse_catalog: There are no kids!\n"); - goto end; + v = dictentry(dict, "N"); + if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || + (uint64_t)v->sint > SIZE_MAX) { + fprintf(stderr, "missing /N on object stream\n"); + return p_fail; } + N = v->sint; - // parse_pagetree - aux->catalog.pgTree.type = PG_TREE; - aux->catalog.pgTree.parent = NULL; - parse_pagetree(aux, &aux->catalog.pgTree, ptRef, kids, 0); - + HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL); + HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N); + return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL); + // XXX leading and trailing ws OK? - end: - exit(0); - return success; + // XXX consistency-check against /First, idx, /N } /* - * ******************************************************************** - * End Catalog parsing - * ******************************************************************** + * This continuation is very similar to kstream, except that it does not + * rely on /Length to consume the right amount of input. If /Length is + * not present or indirect, it will operate on the entire rest of the input. + * This is permissible, other than for general streams, because the XRef data + * is always self-delimiting. + * + * x = (dict ...) 
  */
+HParser *
+kxstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	struct Env *aux = env;
+	HParsedToken *dict_t = H_INDEX_TOKEN(x, 0);
+	Dict *dict = H_CAST(Dict, dict_t);
+	const HParsedToken *v;
+	HParser *bytes_p, *dict_p, *value_p;
+	struct streamspec *spec;
+
+	/* restrict bytes to Length if present (and not indirect) */
+	v = dictentry(dict, "Length");
+	if (v != NULL && v->token_type == TT_SINT && v->sint >= 0)
+		bytes_p = p_take__m(mm__, v->sint, aux);
+	else
+		bytes_p = p_rest__m(mm__, aux); // XXX consume the proper amount
+
+	/* construct the parser for the stream data */
+	spec = h_alloc(mm__, sizeof(struct streamspec));
+	spec->dict = dict;
+	spec->parser = p_xrefdata__m(mm__, dict);
+	assert (spec->parser != NULL);
+
+	dict_p = p_return__m(mm__, dict_t);
+	value_p = h_action__m(mm__, bytes_p, act_ks_value, spec);
+
+	return h_sequence__m(mm__, dict_p, value_p, NULL);
+}
 
 
 /*
  * This helper implements the standard backwards parsing strategy to read all
@@ -3728,17 +4300,22 @@ const char *infile = NULL;
 
- * Allocates and returns an array of HParsedTokens, each containing the result
- * of a successful 'p_xref' parse. Sets the output parameter 'nxrefs' to the
- * number of elements.
- *
- * A return value of NULL indicates an empty result.
+ * Allocates an array of HParsedTokens, each containing the result of a
+ * successful 'p_xref' parse, and stores it in 'aux->xrefs' along with the
+ * element count in 'aux->nxrefs'.
  */
-const HParsedToken **
-parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs)
+const char *infile = NULL;
+
+void
+parse_xrefs(struct Env *aux)
 {
+	const uint8_t *input = aux->input;
+	size_t sz = aux->sz;
 	HParseResult *res = NULL;
 	const HParsedToken **xrefs = NULL;	/* empty result */
 	const HParsedToken *tok = NULL;
 	size_t n = 0, nfwd = 0;
 	size_t offset = 0;
+	bool processRoot = true;
+	size_t maxObjNum = 0;
+	Dict *trailer = NULL;
 
 	// XXX try formulating this as a parser using h_seek()
@@ -3773,6 +4350,17 @@
 			err(1, "realloc");
 		xrefs[n++] = res->ast;
 
+
+		/* process the root */
+		if (processRoot) {
+			// Size is a required field in the trailer dictionary
+			trailer = H_INDEX(Dict, res->ast, 1);
+			maxObjNum = H_CAST_SINT(dictentry(trailer, "Size"));
+
+			processRoot = false;
+		}
+
+
 		/* look up the next offset (to the previous xref section) */
 		tok = dictentry(H_INDEX(Dict, res->ast, 1), "Prev");
 		if (tok == NULL)
@@ -3806,53 +4394,47 @@
 	}
 
 end:
-	*nxrefs = n;
-	return xrefs;
+	aux->xrefs = xrefs;
+	aux->nxrefs = n;
+	if (n > maxObjNum) {
+		fprintf(stderr, "%s: Number of xrefs found (%ld) "
+			"greater than specified /Size (%ld).\n"
+			"Ignoring objects numbered greater than %ld!\n",
+			infile, n, maxObjNum, maxObjNum);
+		aux->nxrefs = maxObjNum;
+	}
+
+
+	// Process the trailer dictionary
+	const HParsedToken *root = dictentry(trailer, "Root");
+	assert(root->token_type == TT_Ref);
+	parse_catalog(aux, root);
+
+	return;
 }
 
 
 /*
- * This utility extracts the text stream from the global environment
- * writes it out to a file with the same name as the pdf input filename
- * but with a .psectxt suffix.
+ * ******************************************************************** + * End xref parsing + * ******************************************************************** */ -void -text_extract(const struct Env *aux) -{ - fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs); - fprintf(stdout, "text_extract:: %s\n", aux->infile); - int infnlen = strlen(aux->infile); - int sfxlen = strlen(".psectxt"); - int namelen = infnlen + sfxlen + 1; - char *outfn = (char *) malloc(sizeof(char) * namelen); - if (outfn == NULL) { - fprintf(stderr, "text_extract:: h_arena_realloc() failed"); - return; - } - memcpy(outfn, aux->infile, infnlen); - memcpy(&outfn[infnlen], ".psectxt", sfxlen); - outfn[namelen-1] = '\0'; // null terminate the string - // open the file for writing - FILE *stream; - if (!(stream = fopen(outfn, "w"))) { - fprintf(stderr, - "text_extract:: Failed to open file '%s' for writing\n", outfn); - return; - } - struct textnode *curr = aux->txthead; - for (int i = 0; i < aux->ntextobjs; i++) { - fprintf(stdout, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text); - fprintf(stream, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text); - curr = curr->next; - } - fclose(stream); - free(outfn); - return; -} + +/* + * main program + */ + +#include <stdio.h> +#include <inttypes.h> +#include <stdlib.h> /* realloc() */ +#include <fcntl.h> /* open() */ +#include <unistd.h> /* lseek() */ +#include <sys/mman.h> /* mmap() */ + @@ -3894,12 +4476,17 @@ main(int argc, char *argv[]) init_parser(&aux); /* parse all cross-reference sections and trailer dictionaries */ - aux.xrefs = parse_xrefs(input, sz, &aux.nxrefs); + parse_xrefs(&aux); // XXX debug //fprintf(stderr, "%s: %zu xref sections parsed\n", infile, aux.nxrefs); //for (size_t i = 0; i < aux.nxrefs; i++) // h_pprintln(stderr, aux.xrefs[i]); + +// /* parse the catalog page tree */ +// bool success = parse_catalog(&aux); +// fprintf(stdout, "main: parse_catalog successfully = %s\n", success ? "true":"false"); + /* run the main parser */ res = h_parse(p_pdf, input, sz); if (!res) { @@ -3921,5 +4508,10 @@ main(int argc, char *argv[]) /* print result */ h_pprintln(stdout, res->ast); + /* Save the extracted text */ + if (aux.ntextobjs > 0) { + text_extract(&aux); + } + return 0; } -- GitLab