diff --git a/Makefile b/Makefile index 6154e1d8363343e12153e52a27e9826581afdc3a..19fed2b3a1e0eeec0e8e2a3757ba60d1c2cefbff 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ CFLAGS += -std=c99 -Wall -Werror -DLOG # lib@ -> ../hammer/build/opt/src HAMMER_INCLUDE = . HAMMER_LIB = ./lib -CFLAGS += -I$(HAMMER_INCLUDE) +CFLAGS += -I$(HAMMER_INCLUDE) -g # (-pg :: profile using gprof) (-g :: debug info) LDFLAGS += -L$(HAMMER_LIB) SOURCES = pdf.c lzw-lib.c diff --git a/pdf.c b/pdf.c index fc31efb80fffe83c59869af82fea6e8989cab8a8..50d2e7d48a8ee09b187506fbc9e395c9fa28b25d 100644 --- a/pdf.c +++ b/pdf.c @@ -2,12 +2,14 @@ * pesco 2019,2020 * pompolic 2020 * Paul Vines 2020 + * Kragen Sitaker 2020, 2021 * Sumit Ray 2021 + * */ #include <string.h> /* strncmp(), memset(), memcpy() */ #include <stdlib.h> /* exit() */ - +#include <strings.h> /* bcopy */ #include <hammer/hammer.h> #include <hammer/glue.h> @@ -16,6 +18,7 @@ #define CHX(...) h_choice(__VA_ARGS__, NULL) #define OPT(X) h_ignore(h_optional(X)) #define REP(P,N) h_repeat_n(P, N) + #define IGN(P) h_ignore(P) #define LIT(S) h_literal(S) #define IN(STR) h_in((const uint8_t *)(STR), sizeof(STR) - 1) @@ -110,6 +113,191 @@ validate_notnull(HParseResult *p, void *u) } + +// Forward declaration of Token structures +typedef struct { size_t nr, gen; } Ref; + +typedef HCountedArray Dict; + + + +// Catalog Tree +typedef struct RsrcDict_S { + const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit + const HParsedToken *fonts; // dictionary of fonts used in this page + size_t numFonts; + const HParsedToken *xobj; // xobj used in this page (?? is this <=1??, can page use multiple xobjects??) +// Dict **seenFonts; +// size_t numSeenFonts; +// const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? 
+// size_t numCmapsSeen; +} RsrcDict_T; + +struct PtNode_S; + +typedef struct PtNode_S { + enum {PG_TREE, PG_NODE, XO_NODE} type; + const HParsedToken *parent; // Type = Page tree -- reference + RsrcDict_T *pgRsrc; // resource structure + const HParsedToken *me; // Reference for me + size_t offset; // + union { + struct { + const Dict *dict; // page node dictionary + const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams + } pn; + struct { + struct PtNode_S *kids; // page table + size_t count; // number of kids + size_t leaves; // number of pages in tree + } pt; + struct { + const Dict *dict; // page node dictionary + const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams + } xn; + }; + +} PtNode_T; + +struct XoNode_S; +typedef struct XoNode_S { + char *name; + PtNode_T *node; + struct XoNode_S *next; +} XoNode_T; + + +typedef struct Catalog_S { + const HParsedToken *catalog; // reference + const HParsedToken *pRoot; // reference + PtNode_T pgTree; // page tree + size_t pgCount; // page tree node count + XoNode_T xObjs; // list of XObjects + XoNode_T *xoHead; + XoNode_T *xoTail; + size_t xoCount; // number of xobjects +} Catalog_T; + + +// Forward declaration of text extraction related structures +struct textnode; +struct textstr; +struct TextEntry_S; + +// *********************************************************** +/* + * Text data structures + */ + +typedef struct { + struct PtNode_S *page; + struct TextEntry_S *font; +} TextState_T; + + +struct textnode { + struct textstr *tstr; + struct textnode *next; +}; + + +struct fontref { + const uint8_t *fontname; + uint32_t namelen; + double fontsize; + char *fn; /* null terminated string */ +}; +struct textpos { + double tx; + double ty; +}; +struct textmat { double cell[6]; }; +struct textstr { + uint8_t *text; + uint32_t nchars; + const HParsedToken *tobj; +}; + + +struct textwfmt { /* text with formatting 
specifications */ + double aw; /* word spacing */ + double ac; /* character spacing */ + struct textstr tstr; /* the string */ +}; +struct tarrayelt { + union { + double adj; + struct textstr tstr; + }; + bool isStr; +}; +struct textarray { + struct tarrayelt *elts; + uint32_t nelts; + struct textstr flattened; +}; + + + +/* operator:: + * TS -- Text state : Table 105 + * TP -- Text position : Table 108 + * TW -- Text showing : Table 109 + * */ +typedef struct TextEntry_S { + enum {TS_Tc, TS_Tw, TS_Tz, TS_TL, TS_Tf, TS_Tr, TS_Ts, + TP_Td, TP_TD, TP_Tm, TP_Tstar, + TW_Tj, TW_Tq, TW_Tqq, TW_TJ} type; + TextState_T ts; // text state associated with this string (TBD: other state attributes) + union { + double value; /* many just have a value */ + uint8_t mode; /* text mode */ + struct fontref fref; /* font name reference */ + struct textpos pos; /* text position */ + struct textmat fm; /* font matrix */ + struct textstr tstr; /* the string */ + struct textwfmt twfmt; /* text with formatting -- qq_op */ + struct textarray tarray; /* text contained in an array object */ + }; + const HParsedToken *obj; +} TextEntry; // text object entries + + + +#if 0 +// Haven't used this type yet - maybe OBE +typedef struct { + struct textmat fm; /* font matrix associated with this text object */ + TextEntry **ops; /* operators associated w/string */ + uint8_t *txt; /* the string associated with this object */ +} TextString; +#endif + +// Initial use -- object streams +typedef struct { + Ref oid; + const HParsedToken *obj; +} Objref_T; +typedef struct { + size_t numObjs; + Objref_T *tok; + HArena *arena; +} Objstm; + +typedef struct { + char *name; + char *type; + char *basefont; + char *encoding; + const HParsedToken *descriptor; + const HParsedToken *toUnicode; + const HParsedToken *descendantFonts; +} Fontinfo_T; + + +// *********************************************************** + + /* * auxiliary global data structure needed by the parser */ @@ -120,13 +308,26 @@ struct Env { const 
HParsedToken **xrefs; /* all xref sections of the file */ size_t nxrefs; + + struct textnode *txthead; /* parsed text objects from the file */ + struct textnode *txttail; /* parsed text objects from the file */ + size_t ntextobjs; + + Catalog_T catalog; /* Catalog object and document structure */ + TextState_T tstate; /* current text state */ + }; +Fontinfo_T *lookup_font(TextState_T *state, struct Env *aux); + +// *********************************************************** + + /* * custom token types */ -HTokenType TT_XREntry, TT_Ref, TT_Dict, TT_HParseResult; +HTokenType TT_XREntry, TT_Ref, TT_Dict, TT_HParseResult, TT_TextEntry, TT_Objstm; typedef struct { enum {XR_FREE, XR_INUSE, XR_OBJSTM} type; @@ -138,9 +339,6 @@ typedef struct { const HParsedToken *obj; } XREntry; -typedef struct { size_t nr, gen; } Ref; - -typedef HCountedArray Dict; /* look up a value in a dictionary */ const HParsedToken * @@ -230,7 +428,7 @@ pp_dict(FILE *stream, const HParsedToken *tok, int indent, int delta) if (tok->seq->used > 2) fprintf(stream, "\n%*s}", indent, ""); else - fprintf(stream, " }"); + fprintf(stream, " }\n"); } /* @@ -580,19 +778,15 @@ act_a85string(const HParseResult *p, void *u) size_t chunk_number; size_t required_bytes; size_t out_pos = 0; -// HCountedArray *seq = H_CAST_SEQ(p->ast); HCountedArray *chunks = H_FIELD_SEQ(0); HBytes last_chunk = H_FIELD_BYTES(1); /* Number of 4-byte chunks, minus the potential last partial group and EOD */ -// assert(full->token_type == TT_SEQUENCE); -// assert(partial->->token_type == TT_BYTES); chunk_number = chunks->used; /* Special-case: last chunk before EOD may be 4, 3, 2 or 1 bytes * The latter two happening if the group was parsed from a partial * group consisting less than 5 chars */ -// HBytes *last_chunk = &partial->elements[1]->bytes; required_bytes = (chunk_number * 4 + last_chunk.len); result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes); @@ -608,6 +802,7 @@ act_a85string(const HParseResult *p, void 
*u) result_bytes[out_pos+2], result_bytes[out_pos+3], *((unsigned int *)(chunks->elements[i]->bytes.token))); // XXX DEBUG out_pos += 4; } + if (last_chunk.len) { memcpy(result_bytes + out_pos, last_chunk.token, last_chunk.len); out_pos += last_chunk.len; @@ -806,6 +1001,13 @@ act_xrent(const HParseResult *p, void *u) return H_MAKE(XREntry, xr); } + +HParsedToken * +act_xr_td(const HParseResult *p, void *u) +{ + return (HParsedToken*)p->ast; +} + HParsedToken * act_xrstment(const HParseResult *p, void *u) { @@ -958,469 +1160,2552 @@ act_rldstring(const HParseResult *p, void *u) } +// XXX review +/* + * ******************************************************************** + * Catalog parsing + * ******************************************************************** + */ +HParsedToken * +act_cstream(const HParseResult *p, void *u) +{ +// HCountedArray *contents = H_FIELD_SEQ(0); +// +// fprintf(stdout, "act_contentstream:: stream length = %ld\n", contents->used); + return (HParsedToken *)p->ast; +} + + + +bool +validate_pgcontents(HParseResult *p, void *u) +{ + return false; +} + +HParsedToken * +act_pgcontents(const HParseResult *p, void *u) +{ + return (HParsedToken *)p->ast; +} + +HParsedToken * +act_page(const HParseResult *p, void *u) +{ + return (HParsedToken *)p->ast; +} +HParsedToken * +act_dictobj(const HParseResult *p, void *u) +{ + return (HParsedToken *)p->ast; +} /* - * input grammar + * ******************************************************************** + * Start Text parsing + * ******************************************************************** */ +void pp_fontstate(FILE *stream, const TextState_T *state) +{ + assert(state); + fprintf(stream, "\nFont State: Page = "); + if ( (state->page->type == PG_NODE) || (state->page->type == XO_NODE) ) + pp_ref(stream, state->page->me, 0, 0); + if (state->font) // not all operators need or set this + fprintf(stream, ", Font = %s\n", state->font->fref.fn); + else + fprintf(stream, ", Font not yet specified\n"); 
-HParser *p_pdf; -HParser *p_pdfdbg; -HParser *p_startxref; -HParser *p_xref; -HParser *p_objdef; -HParser *p_a85string; -HParser *p_ahexstream; -HParser *p_rldstring; -HParser *p_ws; -HParser *p_wel; -HParser *p_elemr; -HParser *p_npair; - -/* continuations for h_bind() */ -HParser *kstream(HAllocator *, const HParsedToken *, void *); -HParser *kxstream(HAllocator *, const HParsedToken *, void *); +} +void pp_fontinfo(FILE *stream, const TextState_T *state, const Fontinfo_T *fi) +{ + assert(state && fi); + pp_fontstate(stream, state); + fprintf(stream, "Font Info: "); + if (fi->name) fprintf(stream, "Font= %s", fi->name); + if (fi->type) fprintf(stream, ", Type= %s", fi->type); + if (fi->basefont) fprintf(stream, ", Basefont= %s", fi->basefont); + if (fi->encoding) fprintf(stream, ", Encoding= %s", fi->encoding); + if (fi->descriptor) pp_dict(stream, fi->descriptor, 0, 0); + if (fi->toUnicode) pp_dict(stream, fi->toUnicode, 0, 0); + if (fi->descendantFonts) pp_dict(stream, fi->descendantFonts, 0, 0); + fprintf(stream, "\n"); +} +/* + * Pretty printer for text components of the ast + */ void -init_runlengthdecode_parser(struct Env *aux) +pp_textentry(FILE *stream, const HParsedToken *tok, int indent, int delta) { - H_RULE(rldeod, h_ch(0x80)); - H_ARULE(longlength, h_ch_range(0x81, 0xFF)); - H_ARULE(shortlength, h_ch_range(0x0, 0x7F)); - - H_RULE(shortdata, h_uint8()); - H_RULE(longdata, h_uint8()); + TextEntry *txte = H_CAST(TextEntry, tok); - H_RULE(shortrun, h_length_value(shortlength, shortdata)); - H_ARULE(longrun, SEQ(longlength, longdata)); + switch (txte->type) { + /* + * Always pretty print the text show operators + * + * If TEXT_VERBOSE is set, pretty-print the other operators + */ +#define TEXT_VERBOSE +#ifdef TEXT_VERBOSE + case TS_Tf: + fprintf(stream, "Tf_op: fn=%.*s, fontsize=%3.3f\n", + txte->fref.namelen, txte->fref.fontname, txte->fref.fontsize); + break; + case TP_Td: + fprintf(stream, "Td_op: text position ::tx=%3.3f:ty=%3.3f\n", + txte->pos.tx, 
txte->pos.ty); + break; +#endif + case TW_Tj: + case TW_Tq: + case TW_Tqq: + fprintf(stream, "len=%u, ", txte->tstr.nchars); + fwrite((const void *)txte->tstr.text, (int) txte->tstr.nchars, 1, stream); + break; + case TW_TJ: + fprintf(stream, "len=%u, ", txte->tarray.flattened.nchars); + fwrite((const void *)txte->tarray.flattened.text, + (int) txte->tarray.flattened.nchars, 1, stream); + break; - H_ARULE(rldstring, SEQ(h_many(CHX(shortrun, longrun)), IGN(rldeod))); - p_rldstring = rldstring; + default: + ; + } } + void -init_parser(struct Env *aux) +pp_objstm(FILE *stream, const HParsedToken *tok, int indent, int delta) { - TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult); - TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry); - TT_Ref = h_allocate_token_new("Ref", NULL, pp_ref); - TT_Dict = h_allocate_token_new("Dict", NULL, pp_dict); + Objstm *entry = H_CAST(Objstm, tok); - /* lines */ - H_RULE(cr, p_mapch('\r', '\n')); /* semantic value: \n */ - H_RULE(lf, h_ch('\n')); /* semantic value: \n */ - H_RULE(crlf, h_right(cr, lf)); /* semantic value: \n */ - H_RULE(eol, CHX(crlf, cr, lf)); - H_RULE(end, h_end_p()); +// pp_parseresult(stream, (const HParsedToken *)entry->res, 0, 0); + fprintf(stream, "pp_objstm:: Num Objects = %lu\n", entry->numObjs); - /* character classes */ -#define LWCHARS "\0\t\f " -#define WCHARS LWCHARS "\n\r" -#define DCHARS "()<>[]{}/%" - H_RULE(wchar, IN(WCHARS)); /* white-space */ - H_RULE(lwchar, IN(LWCHARS)); /* "line" whitespace */ - //H_RULE(dchar, IN(DCHARS)); /* delimiter */ - H_RULE(rchar, NOT_IN(WCHARS DCHARS)); /* regular */ - H_RULE(nchar, NOT_IN(WCHARS DCHARS "#")); /* name */ - H_RULE(schar, NOT_IN("()\n\r\\")); /* string literal */ - H_ARULE(digit, h_ch_range('0', '9')); - H_ARULE(pdigit, h_ch_range('1', '9')); - H_ARULE(hlower, h_ch_range('a', 'f')); - H_ARULE(hupper, h_ch_range('A', 'F')); - H_RULE(hdigit, CHX(digit, hlower, hupper)); - H_ARULE(odigit, h_ch_range('0', '7')); + for (int 
i=0; i<entry->numObjs; i++) { + fprintf(stream, "oid: <nr=%lu, gen=%lu>, ", entry->tok[i].oid.nr, entry->tok[i].oid.gen); + fprintf(stream, "Type = %u, ", entry->tok[i].obj->token_type); + if (entry->tok[i].obj->token_type == TT_Dict) { + pp_dict(stream, entry->tok[i].obj, 0, 0); + } + fprintf(stream, "\n"); + } +} - H_RULE(sp, h_ch(' ')); - H_RULE(percent, h_ch('%')); - H_RULE(period, h_ch('.')); - H_RULE(slash, h_ch('/')); - H_RULE(hash, h_ch('#')); - H_RULE(bslash, h_ch('\\')); - H_RULE(lparen, h_ch('(')); - H_RULE(rparen, h_ch(')')); - H_RULE(langle, h_ch('<')); - H_RULE(rangle, h_ch('>')); - H_RULE(lbrack, h_ch('[')); - H_RULE(rbrack, h_ch(']')); - H_RULE(plus, h_ch('+')); - H_RULE(minus, h_ch('-')); +/* + * semantic actions + */ - /* whitespace */ - H_RULE(comment, SEQ(percent, h_many(NOT_IN("\r\n")), CHX(cr,lf,end))); - /* ^ NB: must consume cr/lf for ws to be LL(k) */ - H_RULE(wel, IGN(CHX(wchar, comment))); - H_RULE(ws, IGN(h_many(wel))); - H_RULE(lws, IGN(h_many(IGN(lwchar)))); - /* tokenization */ -#define MANY_WS(X) h_many(CHX(wel, X)) -#define TOK(X) h_middle(ws, X, h_not(rchar)) -#define KW(S) IGN(TOK(LIT(S))) - /* misc */ - H_RULE(nl, IGN(h_right(lws, eol))); - H_RULE(epsilon, h_epsilon_p()); - H_RULE(empty, SEQ(epsilon)); - H_RULE(digits, h_many1(digit)); - H_VARULE(nat, digits); - H_VRULE(pnat, nat); - H_RULE(npair, SEQ(pnat, wel,ws, nat)); +/* + * Simplify the code by casting the choice of integer number and real number to double + */ +bool +validate_tnumb(HParseResult *p, void *u) +{ + assert((p->ast->token_type == TT_SINT) || (p->ast->token_type == TT_DOUBLE)); - /* - * objects - */ - - H_ARULE(ref, SEQ(npair, wel,ws, LIT("R"))); - H_RULE(null, LIT("null")); - H_RULE(boole, CHX(LIT("true"), LIT("false"))); + return true; +} - /* numbers */ - H_ARULE(sign, CHX(minus, IGN(plus))); - H_VRULE(intnn, nat); - H_ARULE(realnn, CHX(SEQ(digits, period, digits), /* 12.3 */ - SEQ(digits, period, empty), /* 123. 
*/ - SEQ(empty, period, digits))); /* .123 */ - // XXX ^ we _could_ move the "123." case into intnn... - H_RULE(numbnn, CHX(realnn, intnn)); - H_RULE(snumb, SEQ(sign, numbnn)); - H_VARULE(numb, CHX(snumb, numbnn)); - /* names */ - H_ARULE(nesc, SEQ(hash, hdigit, hdigit)); - H_ARULE(nstr, h_many(CHX(nchar, nesc))); /* '/' is valid */ - H_RULE(name, h_right(slash, nstr)); +HParsedToken * +act_tnumb(const HParseResult *p, void *u) +{ - /* strings */ - H_RULE(snest, h_indirect()); - H_RULE(bsn, p_mapch('n', 0x0a)); /* LF */ - H_RULE(bsr, p_mapch('r', 0x0d)); /* CR */ - H_RULE(bst, p_mapch('t', 0x09)); /* HT */ - H_RULE(bsb, p_mapch('b', 0x08)); /* BS (backspace) */ - H_RULE(bsf, p_mapch('f', 0x0c)); /* FF */ - H_RULE(escape, CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash)); - H_ARULE(octal, CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1))); - H_RULE(wrap, IGN(eol)); - H_RULE(sesc, h_right(bslash, CHX(escape, octal, wrap, epsilon))); - /* NB: lone backslashes and escaped newlines are ignored */ - H_ARULE(schars, h_many(CHX(schar, snest, sesc, eol))); - H_RULE(snest_, SEQ(lparen, schars, rparen)); - H_RULE(litstr, h_middle(lparen, schars, rparen)); - H_RULE(hexstr, h_middle(langle, MANY_WS(hdigit), rangle)); - H_ARULE(string, CHX(litstr, hexstr)); - h_bind_indirect(snest, snest_); + double value; - H_RULE(array, h_indirect()); - H_RULE(dict, h_indirect()); + if (p->ast->token_type == TT_SINT) value = (double)p->ast->sint; + else value = p->ast->dbl; - /* classify objects by whether they start/end with a delimiter: */ - H_RULE(robj, CHX(ref, null, boole, numb)); /* rchars */ - H_RULE(dobj, CHX(string, array, dict)); /* dchars */ - H_RULE(obj, CHX(robj, name, dobj)); + return H_MAKE_DOUBLE(value); +} - /* dictionaries */ - H_RULE(dopen, LIT("<<")); - H_RULE(dclose, LIT(">>")); - H_RULE(k_v, CHX(CHX(SEQ(name, wel,ws, obj), - SEQ(name, CHX(name,dobj))), - VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)"))); - H_ARULE(dict_, h_middle(dopen, MANY_WS(k_v), 
dclose)); - // XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok? - // XXX validate: dict keys must be unique - h_bind_indirect(dict, dict_); - /* arrays */ - H_RULE(elemd, h_indirect()); /* elements following a delimiter */ - H_RULE(elemr, h_indirect()); /* elements following a regular char */ - H_ARULE(array_, h_middle(lbrack, elemd, rbrack)); - H_RULE(elemd_, CHX(SEQ(ws, dobj, elemd), - SEQ(ws, name, elemr), - SEQ(ws, robj, elemr), - ws)); - H_RULE(elemr_, CHX(SEQ(ws, dobj, elemd), - SEQ(ws, name, elemr), - SEQ(wel,ws, robj, elemr), - ws)); - h_bind_indirect(elemd, elemd_); - h_bind_indirect(elemr, elemr_); - h_bind_indirect(array, array_); - /* streams */ - H_RULE(stmbeg, SEQ(dict, OPT(ws), LIT("stream"), OPT(cr), - CHX(lf, VIOL(epsilon, "No linefeed after 'stream' (severity=7)")))); - H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")), - VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"), - VIOL(SEQ(h_many(wchar), LIT("endstream")), - "No newline before 'endstream' (severity=7)"), - VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"), - VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")), - "Stream length 1-too-short (severity=4)"), - VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), - SEQ(npair, wel, KW("obj")), - KW("xref"), - LIT("endstream")))), LIT("endstream")), - "Stream length >1-too-short (severity=5)"), - VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), - SEQ(npair, wel, KW("obj")), - KW("xref")))), - "Missing endstream token (severity=7)"))); - H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend)); - // XXX is whitespace allowed between the eol and "endstream"? - // peter wyatt says no. 
(2020-03-25) +/* + * Text state operators - Table 105 + * TS_Tc, TS_Tw, TS_Tz, TS_TL, TS_Tf, TS_Tr, TS_Ts + * + * ***************************************************************** + * ***************************************************************** + * + */ +HParsedToken * +act_Tc_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; + const HParsedToken *tval = H_INDEX_TOKEN(p->ast, 0); - /* - * file structure - */ - /* header */ - H_RULE(version, SEQ(pdigit, IGN(period), digit)); - H_RULE(header, h_middle(LIT("%PDF-"), version, nl)); + txte->type = TS_Tc; + txte->obj = NULL; - /* body */ - H_RULE(indobj, CHX(stream, obj)); - H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, - CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))), - "More than 1 endobj token (severity=1)"), - VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))), - "More than 1 >> and endobj token (severity=2)"), - SEQ(OPT(ws), OPT(lws), KW("endobj")), - VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)")))); - H_RULE(body, h_many(objdef)); + assert(tval->token_type == TT_DOUBLE); + txte->value = tval->dbl; - /* cross-reference section */ - H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf)); - // ^ XXX does the real world follow this rule?! cf. 
loop.pdf - H_RULE(xrtyp, CHX(h_ch('n'), h_ch('f'))); - H_ARULE(xroff, REP(digit, 10)); - H_ARULE(xrgen, REP(digit, 5)); - H_ARULE(xrent, SEQ(xroff, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref offset_gen entry (severity=1)"), sp)), - xrgen, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref gen_use entry (severity=1)"), sp)), - xrtyp, IGN(CHX(VIOL(SEQ(wchar, wchar, h_many1(wchar)), "Greater-than-2-byte WS at end of xref entry (severity=1)"), - xreol, - VIOL(SEQ(h_many1(wchar)), "Nonconformant WS at end of xref entry (severity=1)"))))); - H_RULE(xrhead, SEQ(nat, IGN(sp), nat, nl)); - H_RULE(xrsub, SEQ(xrhead, h_many(xrent))); - H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub))); + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; - /* cross-reference streams */ - H_RULE(xstream, h_bind(stmbeg, kxstream, aux)); - H_AVRULE(xrstm, SEQ(ws, npair, wel, KW("obj"), ws, xstream)); - // XXX skip however much we consumed and check for "endstream endobj"? + fprintf(stdout, "act_Tc_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} - /* trailer */ - H_RULE(startxr, SEQ(nl, KW("startxref"), nl, - lws, nat, nl, - LIT("%%EOF"), OPT(nl))); +/* + * Tw operator: word spacing specification + * H_ARULE(Tw_op, SEQ(tnumb, ws, LIT("Tw"))); // 9.3.3 - wordSpace + */ +HParsedToken * +act_Tw_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; - /* used for the backwards search */ - H_RULE(lasteof, SEQ(nl, KW("startxref"), nl, - lws, nat, nl, - // XXX the real world sometimes omits nl after %%EOF inside the file. - // the next 'tail' would be appended right after the 'F', - // presumably because the previous version of the file - // ended without a trailing newline. m) - // this is invalid per spec, because it creates a run-on - // comment, but we should probably accept-and-warn. 
- // XXX should lws be allowed before EOF marker? - // NB: lws before xref offset is allowed, cf. p.48 (example 4) - LIT("%%EOF"), - CHX(VIOL(SEQ(nl, h_many1(nl), end), - "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"), - SEQ(h_many(nl), end), - VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), - "(offset FROM END) Data after final %%EOF (severity=7)")))); - H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict)); + txte->type = TS_Tw; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); - H_RULE(start_junk, VIOL(h_many1(h_butnot(h_ch_range(0, 255), header)), - "Junk bytes before %PDF header (severity=1)")); - H_RULE(hdr_junk, CHX(comment, - VIOL(h_many1(h_butnot(h_ch_range(0, 255), SEQ(npair, wel, KW("obj")))), - "Uncommented junk after header (severity=1)"))); - H_RULE(tail, SEQ(body, CHX(SEQ(h_optional(xr_td), startxr), - VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)), - OPT(nl), OPT(LIT("%%EOF")), OPT(nl)), - "Improper end of trailer - missing startxref and/or %%EOF (severity=5)")))); - H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"), - VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), - "Data after final %%EOF (severity=7)"), - end)); - H_RULE(pdf, SEQ(OPT(start_junk), header, OPT(hdr_junk), h_many1(tail), final_eof_junk)); + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; - /* debug parser to consume as much as possible */ - H_RULE(pdfdbg, SEQ(OPT(start_junk), header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk)))); + fprintf(stdout, "act_Tw_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} - /* - * filters - */ - /* Whitespace can occur between any digit and has to be ignored, */ - H_RULE(aws, IGN(h_many(wchar))); // all white space, include CR & LF, but not comments - #define MANY_AWS(X) h_many(CHX(aws, X)) +/* + * Tz operator: 
horizontal scaling specification + * H_ARULE(Tz_op, SEQ(tnumb, ws, LIT("Tz"))); // 9.3.4 - horizontal scaling + */ +HParsedToken * +act_Tz_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; - /* Ascii85Decode */ - H_RULE(a85eod, SEQ(h_ch('~'), aws, h_ch('>'))); - H_ARULE(a85zero, h_ch('z')); - H_ARULE(a85digit, h_ch_range('!', 'u')); - /* Line whitespace can occur between any digit and has to be ignored, */ - #define MANY_LWS(X) h_many(CHX(lws, X)) - /* This encoding of zero is not allowed */ -// H_RULE(a85fiveexcl, h_repeat_n(SEQ(h_ch('!'), aws), 5)); // seeing this is a violation - // Folded the test for a85fiveexcl into the validation component - H_VARULE(a85fivedigits, h_repeat_n(SEQ(a85digit, aws), 5)); - /* - * Suggestion for violations. VIOL() will report error conditions, but it also relaxes validation. - * A hacky way to avoid that would be to add back the validation manually to the H_RULE backing the VIOL() itself. 
- * - */ + txte->type = TS_Tz; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); - /* H_RULE(a85fivedigits_viol, h_repeat_n(SEQ(h_ch('!'), aws), 5)); - * H_RULE(a85fivedigits_report_error, CHX(a85fivedigits, VIOL(a85fivedigits_viol, "Zero ASCII85Encoded as '!!!!!'"))); // Relaxes validation, will parse - * - * HParser *a85fivedigits_viol_alt = h_attr_bool(h_action(h_repeat_n(SEQ(h_ch('!'), aws), 5), act_a85fivedigits, NULL), validate_a85fivedigits, NULL); - * H_RULE(a85fivedigits_report_error, CHX(a85fivedigits, VIOL(a85fivedigits_viol_alt, "Zero ASCII85Encoded as '!!!!!'"))); // Validation will run and stop the parse - */ + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; -// H_VARULE(a85fivedigits, SEQ(h_and(h_not(a85fiveexcl)), h_repeat_n(SEQ(a85digit, aws), 5))); - // TODO:: will need to pull out error conditions -- a85fiveexcl or 'z' as one of the digits - H_ARULE(a85group, CHX(a85zero, a85fivedigits)); + fprintf(stdout, "act_Tz_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} - H_VARULE(a85partial2group, h_repeat_n(SEQ(a85digit, aws), 2)); - H_VARULE(a85partial3group, h_repeat_n(SEQ(a85digit, aws), 3)); - H_VARULE(a85partial4group, h_repeat_n(SEQ(a85digit, aws), 4)); - H_RULE(a85partialgroup, CHX(a85partial4group, a85partial3group, a85partial2group)); - H_ARULE(a85string, SEQ(h_many(a85group), h_optional(a85partialgroup), IGN(a85eod))); - //p_test = a85group; +/* + * TL operator: leading (line spacing) specification + * H_ARULE(TL_op, SEQ(tnumb, ws, LIT("TL"))); // 9.3.5 - leading + */ +HParsedToken * +act_TL_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; - /* - * Not sure whether comments can be embedded within content streams - * If not, use the rule aws rather than ws - */ - /* - * It seems somewhat unclear. 
ASCII85Decode definitely can't have - * comments, because % can be part of a valid ASCII85Encoded character. - * However, it seems that comments are generally allowed: - * "Any occurrence of the PERCENT SIGN (25h) outside a string or - * inside a content stream (see 7.8.2, "Content streams") introduces - * a comment." ISO32000:2-2017 7.2.4 - */ - // XXX Ask Peter Wyatt + txte->type = TS_TL; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; - /* AsciiHexDecode */ - H_RULE(ahexeod, h_ch('>')); - H_ARULE(hdigitpair, SEQ(aws, hdigit, aws, hdigit)); - H_ARULE(ahextruncated, SEQ(aws, hdigit)); + fprintf(stdout, "act_TL_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} - H_RULE(ahs_end, SEQ(h_optional(ahextruncated), aws, ahexeod)); - H_ARULE(ahexstream, SEQ(h_many(hdigitpair), ahs_end)); - init_runlengthdecode_parser(aux); +/* + * Font name and size specification + * H_ARULE(Tf_op, SEQ(name, ws, nat, ws, KW("Tf"), ws)); // font and size + * + * TODO: Verify that the name is specified in the resource dictionary + */ +HParsedToken * +act_Tf_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; + const HParsedToken *fn_token = H_FIELD_TOKEN(0); + + txte->type = TS_Tf; + txte->obj = NULL; + + txte->fref.fontname = fn_token->bytes.token; + txte->fref.namelen = fn_token->bytes.len; + txte->fref.fn = h_arena_malloc(p->arena, sizeof (char) * (fn_token->bytes.len + 1)); + memcpy(txte->fref.fn, fn_token->bytes.token, fn_token->bytes.len); + txte->fref.fn[fn_token->bytes.len] = '\0'; + HTokenType tokenType = p->ast->seq->elements[1]->token_type; + if (tokenType == TT_UINT) + txte->fref.fontsize = (double) H_FIELD_UINT(1); + else if (tokenType == TT_DOUBLE) + txte->fref.fontsize = (double) H_FIELD_DOUBLE(1); + else + fprintf(stderr, "act_Tf_op: Unexpected token type for fontsize - 
token_type=%u\n", + tokenType); - /* global parser variables */ - p_pdf = pdf; - p_pdfdbg = pdfdbg; - p_startxref = lasteof; //startxr; - p_xref = CHX(xr_td, xrstm); - p_objdef = objdef; - p_a85string = a85string; - p_ahexstream = ahexstream; - p_ws = ws; - p_wel = wel; - p_elemr = h_action(elemr, h_act_flatten, NULL); - p_npair = npair; + // save this font as the current state to be used by subsequent text + const HParsedToken * restok = H_MAKE(TextEntry, txte); + aux->tstate.font = txte; - p_fail = h_nothing_p(); - p_epsilon = epsilon; - p_return_0 = h_action(epsilon, act_return_uint, (void *)0); - p_return_1 = h_action(epsilon, act_return_uint, (void *)1); + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; // recursive :-) defn - /* Parsing of severity messages */ - H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("="))); - H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)), - act_nat, NULL)); - H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num)); - p_violsev = violsev; + // DEBUG + fprintf(stdout, "act_Tf_op: fn=%.*s, fontsize=%3.3f, fontstate=%p, page=", + txte->fref.namelen, txte->fref.fontname, txte->fref.fontsize, (void*)txte); + if (aux->tstate.page->type==PG_NODE) + pp_ref(stdout, aux->tstate.page->me, 0, 0); + fprintf(stdout, "\n"); -#if 0 - // XXX testing - int r; - void errx(int, const char *, ...); - HParser *p = obj; - if ((r = h_compile(p, PB_LALR, NULL)) != 0) - errx(1, "h_compile() failed: %d", r); - errx(0, "OK"); -#endif + return ((HParsedToken *)restok); } + /* - * lookup and resolution of indirect references + * Tr operator: rendering mode + * H_VRULE(tmode, nat); // True if <= 7 + * H_ARULE(Tr_op, SEQ(tmode, ws, LIT("Tr"))); // 9.3.6 - rendering mode + * * - * ** Parameter 'gen' is unused */ +#define TEXTMODE_MAX 7 -XREntry * -lookup_xref(struct Env *aux, size_t nr, size_t gen) +bool +validate_tmode(HParseResult *p, void *u) { - 
HParsedToken *ss; /* xref subsection */ - size_t base, n; + return H_CAST_UINT(p->ast) <= TEXTMODE_MAX; +} - /* for each cross-reference section (i.e. update) */ - for (size_t i = 0; i < aux->nxrefs; i++) { - HCountedArray *subs = H_INDEX_SEQ(aux->xrefs[i], 0); +HParsedToken * +act_Tr_op(const HParseResult *p, void *u) +{ + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; - /* for each cross-reference subsection */ - for (size_t j = 0; j < subs->used; j++) { - ss = subs->elements[j]; - base = H_INDEX_UINT(ss, 0, 0); - n = H_INDEX_UINT(ss, 0, 1); - if (nr >= base && nr - base < n) - return H_INDEX(XREntry, ss, 1, nr - base); - } - } + txte->type = TS_Tr; + txte->obj = NULL; + txte->mode = H_FIELD_UINT(0); - return NULL; + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; + + fprintf(stdout, "act_Tr_op:: %d\n", txte->mode); + return H_MAKE(TextEntry, txte); } -const HParsedToken * -parse_obj(struct Env *aux, size_t nr, size_t gen, size_t offset) + +/* + * Ts operator: rise specification + * H_ARULE(Ts_op, SEQ(tnumb, ws, LIT("Ts"))); // rise + */ +HParsedToken * +act_Ts_op(const HParseResult *p, void *u) { - HParseResult *res; - size_t def_nr, def_gen; + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; - if (offset >= aux->sz) { - fprintf(stderr, "%s: position %zu (%#zx) for object %zu %zu is " - "out of bounds\n", aux->infile, offset, offset, nr, gen); - return NULL; - } - res = h_parse(p_objdef, aux->input + offset, aux->sz - offset); - if (res == NULL) { + txte->type = TS_Ts; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + + // associate the text with the current state + txte->ts.page = aux->tstate.page; + txte->ts.font = aux->tstate.font; + + fprintf(stdout, "act_Ts_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + + +/* + * 9.4.2 - Text positioning operators - Table 108 + * TP_Td, TP_TD, TP_Tm, TP_Tstar + * + * 
*****************************************************************
+ * *****************************************************************
+ *
+ * TP_Td: String position - Translation specification
+ *   H_ARULE(Td_op, SEQ(tnumb, ws, tnumb, ws, LIT("Td"), ws));   // move to next line with offset
+ */
+HParsedToken *
+act_Td_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TP_Td;
+	txte->obj = NULL;
+	txte->pos.tx = H_FIELD_DOUBLE(0);
+	txte->pos.ty = H_FIELD_DOUBLE(1);
+
+	// associate the text with the current state
+	// NOTE: This operator does not require a font
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+	fprintf(stdout, "act_Td_op: text position ::tx=%.3f:ty=%.3f\n",
+	    txte->pos.tx, txte->pos.ty);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+
+
+/*
+ * TP_TD: Offset to next line and set the leading parameter state
+ *   H_ARULE(TD_op, SEQ(tnumb, ws, tnumb, ws, LIT("TD")));   // move to next line with offset and set state
+ */
+HParsedToken *
+act_TD_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TP_TD;
+	txte->obj = NULL;
+	txte->pos.tx = H_FIELD_DOUBLE(0);
+	txte->pos.ty = H_FIELD_DOUBLE(1);
+
+	// associate the text with the current state
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+	fprintf(stdout, "act_TD_op: text position ::tx=%3.3f:ty=%3.3f\n", txte->pos.tx, txte->pos.ty);
+	pp_fontstate(stdout, &txte->ts);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+/*
+ * TP_Tm: Text matrix specification
+ *   H_ARULE(Tm_op, SEQ(REP(SEQ(tnumb, ws), 6), LIT("Tm"), ws));   // set text matrix
+ *
+ * Copies the six operands into fm.cell[] and, like every other operator
+ * action here, tags the entry with the page/font that is current in the
+ * environment, so that a consumer reading txte->ts (act_txtobj takes the
+ * state of the last operator it walks) never sees an unset state.
+ */
+HParsedToken *
+act_Tm_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	struct Env *aux = (struct Env*)u;
+	const HParsedToken *cells;
+
+
+	txte->type = TP_Tm;
+	txte->obj = NULL;
+
+	assert((p->ast->token_type == TT_SEQUENCE) &&
+	    (p->ast->seq->elements[0]->token_type == TT_SEQUENCE) &&
+	    (p->ast->seq->elements[0]->seq->used == 6));
+
+	/* each of the six elements is itself a one-element sequence */
+	cells = p->ast->seq->elements[0];
+	for (int i=0; i<6; i++)
+		txte->fm.cell[i] = cells->seq->elements[i]->seq->elements[0]->dbl;
+
+	// associate the matrix with the current state
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+	fprintf(stdout, "act_Tm_op: text matrix ::\n");
+	for (int i=0; i<3; i++)
+		fprintf(stdout, "%3.3f : %3.3f\n", txte->fm.cell[i*2], txte->fm.cell[i*2+1]);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+/*
+ * TP_Tstar: Move to the next line
+ *   H_ARULE(Tstar_op, SEQ(LIT("T*"), ws));   // move to next line
+ */
+HParsedToken *
+act_Tstar_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	struct Env *aux = (struct Env*)u;
+
+	txte->type = TP_Tstar;
+	txte->obj = NULL;
+	txte->value = 0;
+
+	// associate the text with the current state
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+	fprintf(stdout, "act_Tstar_op: position pointer\n");
+	pp_fontstate(stdout, &txte->ts);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+
+/*
+ * 9.4.3 - Text showing operators - Table 109
+ * TW_Tj, TW_Tq, TW_Tqq, TW_TJ
+ *
+ * *****************************************************************
+ * *****************************************************************
+ *
+ * TW_Tj: Show string
+ *   H_ARULE(Tj_op, SEQ(string, ws, LIT("Tj"), ws));   // show text string
+ */
+HParsedToken *
+act_Tj_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 0);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TW_Tj;
+	txte->obj = NULL;
+
+	/* the entry borrows the bytes of the parsed string token */
+	txte->tstr.text = (uint8_t *)tstr->bytes.token;
+	txte->tstr.nchars = tstr->bytes.len;
+
+	// associate the text with the current state
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+
+	fprintf(stdout, "\nact_Tj_op:: nchars=%u, txt=%.*s\n", txte->tstr.nchars,
+	    txte->tstr.nchars, txte->tstr.text);
+	pp_fontstate(stdout, &txte->ts);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+/*
+ * TW_Tq: Offset to next line then show string
+ *   H_ARULE(TsingleQ_op, 
SEQ(string, ws, LIT(quote), ws));   // Move to next line and show text
+ */
+HParsedToken *
+act_TsingleQ_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 0);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TW_Tq;
+	txte->obj = NULL;
+
+	txte->tstr.text = (uint8_t *)tstr->bytes.token;
+	txte->tstr.nchars = tstr->bytes.len;
+
+	// associate the text with the current font
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+
+	fprintf(stdout, "\nact_TsingleQ_op:: nchars=%u, txt=%.*s\n", txte->tstr.nchars,
+	    txte->tstr.nchars, txte->tstr.text);
+	pp_fontstate(stdout, &txte->ts);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+
+/*
+ * TW_Tqq: Offset to next line then show string, apply formatting specifications
+ *   H_ARULE(TdoubleQ_op, SEQ(tnumb, ws, tnumb, ws, string, ws, LIT(dquote), ws));   // Move to next line and show formatted text
+ *
+ * The two numeric operands and the string are stored in txte->twfmt
+ * (aw, ac, tstr).  Note that the string of a TW_Tqq entry therefore lives
+ * in twfmt.tstr, not in tstr.
+ */
+HParsedToken *
+act_TdoubleQ_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	const HParsedToken *aw = H_INDEX_TOKEN(p->ast, 0);
+	const HParsedToken *ac = H_INDEX_TOKEN(p->ast, 1);
+	const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 2);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TW_Tqq;
+	txte->obj = NULL;
+
+	txte->twfmt.aw = aw->dbl;
+	txte->twfmt.ac = ac->dbl;
+	txte->twfmt.tstr.text = (uint8_t *)tstr->bytes.token;
+	txte->twfmt.tstr.nchars = tstr->bytes.len;
+
+	// associate the text with the current font
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+
+
+	fprintf(stdout, "act_TdoubleQ_op:: aw=%3.3f, ac=%3.3f\n", txte->twfmt.aw, txte->twfmt.ac);
+	/* print the string that was just stored (twfmt.tstr); txte->tstr was never set here */
+	fprintf(stdout, "\nact_TdoubleQ_op:: nchars=%u, txt=%.*s\n",
+	    (unsigned)txte->twfmt.tstr.nchars,
+	    (int)txte->twfmt.tstr.nchars, (const char *)txte->twfmt.tstr.text);
+	pp_fontstate(stdout, &txte->ts);
+
+	return H_MAKE(TextEntry, txte);
+}
+
+/*
+ * TW_TJ: Show array of strings, with potentially re-positioning specifications for each string
+ *   
H_RULE(TArr_elem, SEQ(OPT(SEQ(nanumbs)), string, ws))
+ *   H_ARULE(TJ_op, SEQ(h_many(TArr_elem), LIT("TJ"), ws));   // show one or more text strings
+ *
+ * TODO:: Implement the array parser
+ *
+ * Each array element is recorded in tarray.elts[] either as a numeric
+ * adjustment (isStr == false) or as a string piece (isStr == true); all
+ * string pieces are additionally copied back-to-back into
+ * tarray.flattened.
+ */
+HParsedToken *
+act_TJ_op(const HParseResult *p, void *u)
+{
+	TextEntry *txte = H_ALLOC(TextEntry);
+	const HParsedToken *tarr = H_INDEX_TOKEN(p->ast, 0);
+	struct Env *aux = (struct Env*)u;
+
+
+	txte->type = TW_TJ;
+	txte->obj = NULL;
+
+
+	// associate the text with the current font
+	txte->ts.page = aux->tstate.page;
+	txte->ts.font = aux->tstate.font;
+
+	/*
+	 * Parse each element of the array
+	 * Build up the pointers to each of the string pieces
+	 */
+	txte->tarray.nelts = tarr->seq->used;
+	txte->tarray.elts = h_arena_malloc(p->arena, sizeof(struct tarrayelt) * txte->tarray.nelts);
+	txte->tarray.flattened.nchars = 0;
+
+	for (int i=0; i<txte->tarray.nelts; i++) {
+		const HParsedToken *elt = tarr->seq->elements[i];
+		assert( (elt->token_type == TT_SEQUENCE) && (elt->seq->used == 1) );
+		switch (elt->seq->elements[0]->token_type) {
+		case TT_DOUBLE:
+			txte->tarray.elts[i].adj = elt->seq->elements[0]->dbl;
+			txte->tarray.elts[i].isStr = false;
+			break;
+		case TT_BYTES:
+			txte->tarray.elts[i].tstr.text = (uint8_t *)elt->seq->elements[0]->bytes.token;
+			txte->tarray.elts[i].tstr.nchars = elt->seq->elements[0]->bytes.len;
+			txte->tarray.elts[i].isStr = true;
+			txte->tarray.flattened.nchars += txte->tarray.elts[i].tstr.nchars;
+
+
+			// Debug
+			/* %p requires a void *; page may not be set, so guard the dereference */
+			fprintf(stdout, "act_TJ_op::: Using font= %p - page=", (void *)txte->ts.font);
+			if (txte->ts.page != NULL)
+				pp_ref(stdout, txte->ts.page->me, 0, 0);
+			fprintf(stdout, "\nact_TJ_op:: nchars=%u, txt=%.*s\n", txte->tarray.elts[i].tstr.nchars,
+			    txte->tarray.elts[i].tstr.nchars, txte->tarray.elts[i].tstr.text);
+			break;
+		default:
+			fprintf(stderr, "act_TJ_op:: Unexpected element type :: %d\n", elt->seq->elements[0]->token_type);
+			fflush(stderr);
+			/* keep the flatten loop below well-defined when assert() is compiled out */
+			txte->tarray.elts[i].isStr = false;
+			assert(false);
+			break;
+		}
+	}
+
+	/* hold on to a flattened copy of the string */
+	txte->tarray.flattened.text = h_arena_malloc(p->arena, sizeof(char) * txte->tarray.flattened.nchars);
+	int j = 0;  // current write index into the flattened buffer
+	for (int i=0; i<txte->tarray.nelts; i++) {
+		if (txte->tarray.elts[i].isStr) {
+			memcpy(&txte->tarray.flattened.text[j], txte->tarray.elts[i].tstr.text, txte->tarray.elts[i].tstr.nchars);
+			j += txte->tarray.elts[i].tstr.nchars;
+		}
+	}
+
+	fprintf(stdout, "\nact_TJ_op:: nchars=%u, txt=%.*s\n", txte->tarray.flattened.nchars,
+	    txte->tarray.flattened.nchars, txte->tarray.flattened.text);
+	pp_fontstate(stdout, &txte->ts);
+	return H_MAKE(TextEntry, txte);
+}
+
+
+
+
+/*
+ * Parse the text object delimited by "BT" and "ET"
+ *
+ * Walks the operator sequence (element 1 of the parse result) twice:
+ * first to size the buffer, then to concatenate the strings of all
+ * text-showing operators into one arena-allocated buffer.  A '\n' is
+ * emitted for every operator that moves to a new line (Td, TD, T*, ' and ").
+ * The result is a single TextEntry of type TW_Tj whose obj and tstr.tobj
+ * point back at the operator sequence and whose ts is taken from the last
+ * operator walked (NULL page/font when the object produced no text).
+ *
+ * The string of a TW_Tqq entry is read from twfmt.tstr, which is where
+ * act_TdoubleQ_op stores it.
+ */
+HParsedToken *
+act_txtobj(const HParseResult *p, void *u)
+{
+
+	fprintf(stdout, "act_txtobj:: Here\n");
+
+	assert(p->ast->token_type == TT_SEQUENCE);
+
+	TextEntry *txtobj = H_ALLOC(TextEntry);
+	const HParsedToken *opstream = H_INDEX_TOKEN(p->ast, 1);
+	const HParsedToken *tt_text=NULL;
+	uint8_t *tstr=NULL;
+	size_t textlen=0;
+
+
+	fprintf(stdout, "act_txtobj:: numtokens = %zu\n", opstream->seq->used);
+
+	// Walk through the tokens to determine how much space to allocate
+	// Count the number of characters in the stream
+	for (size_t i = 0; i < opstream->seq->used; i++) {
+		const TextEntry *txte = H_CAST(TextEntry, opstream->seq->elements[i]);
+
+		switch (txte->type) {
+		case TP_Td:
+		case TP_TD:
+		case TP_Tstar:
+			textlen += 1;	/* newline */
+			break;
+
+		case TW_TJ:
+			textlen += txte->tarray.flattened.nchars;
+			break;
+
+		case TW_Tqq:
+			textlen += 1;	/* newline */
+			textlen += txte->twfmt.tstr.nchars;
+			break;
+
+		case TW_Tq:
+			textlen += 1;	/* newline */
+			/* fallthrough */
+		case TW_Tj:
+			textlen += txte->tstr.nchars;
+			break;
+		default:
+			break;	// ignore
+		}
+	}
+	tstr = h_arena_malloc(p->arena, sizeof(uint8_t) * textlen);
+	size_t idx=0;
+	TextState_T *ts = NULL;
+	// Now concatenate the pieces
+	for (size_t i = 0; i < opstream->seq->used; i++) {
+		TextEntry *txte = H_CAST(TextEntry, opstream->seq->elements[i]);
+		ts = &txte->ts;
+
+		// Process the text operators
+		switch (txte->type) {
+		case TP_Td:
+		case TP_TD:
+		case TP_Tstar:
+			tstr[idx] = '\n';
+			idx += 1;
+			break;
+
+		case TW_TJ:
+			memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars);
+			idx += txte->tarray.flattened.nchars;
+			fprintf(stdout, "act_txtobj - array:: len=%u, str=", txte->tarray.flattened.nchars);
+			fwrite((const void *)txte->tarray.flattened.text, txte->tarray.flattened.nchars, 1, stdout);
+			pp_fontstate(stdout, ts);
+			break;
+
+		case TW_Tqq:
+			tstr[idx] = '\n';
+			idx += 1;
+			memcpy(&tstr[idx], txte->twfmt.tstr.text, txte->twfmt.tstr.nchars);
+			idx += txte->twfmt.tstr.nchars;
+			fprintf(stdout, "act_txtobj:: len=%u, str=", (unsigned)txte->twfmt.tstr.nchars);
+			fwrite((const void *)txte->twfmt.tstr.text, txte->twfmt.tstr.nchars, 1, stdout);
+			pp_fontstate(stdout, ts);
+			break;
+
+		case TW_Tq:
+			tstr[idx] = '\n';
+			idx += 1;
+			/* fallthrough */
+		case TW_Tj:
+			memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars);
+			idx += txte->tstr.nchars;
+			fprintf(stdout, "act_txtobj:: len=%u, str=", txte->tstr.nchars);
+			fwrite((const void *)txte->tstr.text, txte->tstr.nchars, 1, stdout);
+			pp_fontstate(stdout, ts);
+			break;
+		default:
+			break;	// ignore
+		}
+	}
+	assert(idx == textlen);
+
+
+	txtobj->type = TW_Tj;
+	txtobj->obj = opstream;
+	txtobj->tstr.text = tstr;
+	txtobj->tstr.nchars = textlen;
+	txtobj->tstr.tobj = opstream;
+	if (textlen && ts != NULL) {
+		txtobj->ts.page = ts->page;
+		txtobj->ts.font = ts->font;
+	}
+	else {
+		txtobj->ts.page = NULL;
+		txtobj->ts.font = NULL;
+	}
+	// pretty print the information
+	tt_text = H_MAKE(TextEntry, txtobj);
+
+	// DEBUG
+	if (textlen) {
+		fprintf(stdout, "act_txtobj:: ");
+		pp_textentry(stdout, tt_text, 0, 0);
+		/* test the pointer itself; its address is never NULL */
+		if (txtobj->ts.page != NULL)
+			pp_fontstate(stdout, &txtobj->ts);
+	}
+	return (HParsedToken *)tt_text;
+}
+
+
+
+
+/*
+ * Continuation for h_bind(): receives the sequence of text objects parsed
+ * from a text stream and records each one in the environment, so that it
+ * can be processed later (e.g. written to a file named after the pdf input
+ * file, with a .psectxt suffix).
+ * The input is not consumed; the sequence itself is handed back through
+ * p_return__m() as the parser's result.
+ *
+ * x = (txtobj ...)
+ */
+HParser *
+ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
+{
+	struct Env *aux = env;
+
+	assert (x->token_type == TT_SEQUENCE);
+	int n_tobjs = x->seq->used;
+	fprintf(stdout, "\n\nktxtstream: Num txtobjs = %d\n", n_tobjs);
+
+	for (int k = 0; k < n_tobjs; k++) {
+		const HParsedToken *tok = x->seq->elements[k];
+
+		assert(tok->token_type == TT_TextEntry);
+		TextEntry *entry = H_CAST(TextEntry, tok);
+		assert(entry->type == TW_Tj);
+		fprintf(stdout, "ktxtstream: Value = %.*s\n", entry->tstr.nchars, entry->tstr.text);
+
+		/* new list node that refers to the entry's string */
+		struct textnode *node = h_alloc(mm__, sizeof(struct textnode));
+		node->tstr = &entry->tstr;
+		node->next = NULL;
+
+		/* append to the environment's list of text objects */
+		if (aux->txthead == NULL)
+			aux->txthead = node;
+		if (aux->txttail != NULL)
+			aux->txttail->next = node;
+		aux->txttail = node;
+		aux->ntextobjs += 1;
+	}
+
+	return p_return__m(mm__, x);
+}
+
+
+
+/*
+ * This utility extracts the text stream from the global environment
+ * writes it out to a file with the same name as the pdf input filename
+ * but with a .psectxt suffix.
+ */
+
+/* Build "<infile><suffix>" in a fresh malloc()ed buffer; NULL if malloc() fails. */
+static char *
+text_outname(const char *infile, const char *suffix)
+{
+	size_t inlen = strlen(infile);
+	size_t sfxlen = strlen(suffix);
+	char *fn = malloc(inlen + sfxlen + 1);
+
+	if (fn == NULL)
+		return NULL;
+	memcpy(fn, infile, inlen);
+	memcpy(fn + inlen, suffix, sfxlen + 1);	/* +1 copies the terminating '\0' */
+	return fn;
+}
+
+/*
+ * Writes two files next to the input file:
+ *   <infile>.psectxt - for every text-showing operator, the font
+ *                      information (or a "Missing Font Info" marker)
+ *                      followed by the string; also echoed to stdout
+ *   <infile>.strtxt  - DEBUG: the concatenated string of each text object
+ *
+ * All buffers and streams are released on every exit path.
+ */
+void
+text_extract(struct Env *aux)
+{
+	char *outfn = NULL;
+	char *outfn2 = NULL;	// DEBUG
+	FILE *stream = NULL;
+	FILE *stream2 = NULL;	// DEBUG
+	struct textnode *curr;
+	Fontinfo_T *ft;		// font token
+
+	fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs);
+	fprintf(stdout, "text_extract:: %s\n", aux->infile);
+
+	outfn = text_outname(aux->infile, ".psectxt");
+	outfn2 = text_outname(aux->infile, ".strtxt");
+	if (outfn == NULL || outfn2 == NULL) {
+		fprintf(stderr, "text_extract:: malloc() failed\n");
+		goto done;
+	}
+
+	// open the files for writing
+	if (!(stream = fopen(outfn, "w"))) {
+		fprintf(stderr,
+		    "text_extract:: Failed to open file '%s' for writing\n", outfn);
+		goto done;
+	}
+	if (!(stream2 = fopen(outfn2, "w"))) {
+		fprintf(stderr,
+		    "text_extract:: Failed to open file '%s' for writing\n", outfn2);
+		goto done;
+	}
+
+	/* the NULL test guards against a list shorter than the recorded count */
+	curr = aux->txthead;
+	for (int i = 0; i < aux->ntextobjs && curr != NULL; i++, curr = curr->next) {
+
+		// DEBUG
+		fwrite((const void *)curr->tstr->text, curr->tstr->nchars, 1, stream2);
+		// DEBUG
+
+		const HParsedToken *tt_text = curr->tstr->tobj;
+		for (size_t j = 0; j < tt_text->seq->used; j++) {
+			TextEntry *txte = H_CAST(TextEntry, tt_text->seq->elements[j]);
+			const void *text = NULL;
+			size_t nchars = 0;
+			int shows = 1;	/* cleared for operators that show no text */
+
+			switch (txte->type) {
+			case TW_Tj:
+			case TW_Tq:
+				text = txte->tstr.text;
+				nchars = txte->tstr.nchars;
+				break;
+			case TW_Tqq:
+				/* act_TdoubleQ_op keeps the string in twfmt */
+				text = txte->twfmt.tstr.text;
+				nchars = txte->twfmt.tstr.nchars;
+				break;
+			case TW_TJ:
+				text = txte->tarray.flattened.text;
+				nchars = txte->tarray.flattened.nchars;
+				break;
+			default:
+				shows = 0;
+				fprintf(stderr, "text_extract:: Text token type '%u' ignored\n",
+				    txte->type);
+				break;
+			}
+			if (!shows)
+				continue;
+
+			ft = lookup_font(&txte->ts, aux);
+			if (ft) {
+				pp_fontinfo(stdout, &txte->ts, ft);
+				pp_fontinfo(stream, &txte->ts, ft);
+			}
+			else {
+				const char *estr = "\nMissing Font Info!!\n";
+				fwrite((const void *)estr, strlen(estr), 1, stdout);
+				fwrite((const void *)estr, strlen(estr), 1, stream);
+			}
+			fwrite(text, nchars, 1, stdout);
+			fwrite(text, nchars, 1, stream);
+		}
+	}
+
+done:
+	if (stream2 != NULL)
+		fclose(stream2);
+	if (stream != NULL)
+		fclose(stream);
+	free(outfn2);
+	free(outfn);
+}
+
+
+
+
+
+
+
+
+
+// *********************************************************************
+// DEBUG
+
+
+/*
+ * Utility -- Handles simplistic approach to UTF-16
+ *
+ * Echoes the code unit to stdout: printable ASCII (0x20..0x7E) is printed
+ * and returned as is, anything else non-zero is printed as a hex escape.
+ * Zero and non-printable values are returned as '?'.
+ */
+char convert2char(unsigned int b1)
+{
+	char val;
+
+	if (b1 == 0)
+	{
+		val = '?';
+	}
+	else if ( (b1 < 0x20) || ( b1 > 0x7E ) )	/* control characters, DEL, non-ASCII */
+	{
+		fprintf(stdout, " 0X%02X ", b1);
+		val = '?';
+	}
+	else
+	{
+		val = b1;
+		fprintf(stdout, "%c", val);
+	}
+	return val;
+}
+
+
+/* DEBUG action: logs that a "BT" was seen and passes the AST through unchanged. */
+HParsedToken *
+act_txtbegin_(const HParseResult *p, void *u)
+{
+	const HParsedToken *tok=p->ast;
+
+	fprintf(stdout, "act_txtbegin:: Here %lx\n", (long unsigned int)tok);
+
+	return (HParsedToken *)tok;
+}
+/* DEBUG action: logs that an "ET" was seen and passes the AST through unchanged. */
+HParsedToken *
+act_txtend(const HParseResult *p, void *u)
+{
+
+	fprintf(stdout, "act_txtend:: Here\n");
+
+	return (HParsedToken *)p->ast;
+}
+
+
+
+/*
+ * Collapses a sequence of TT_UINT tokens (one per byte) into a single
+ * bytes token backed by an arena-allocated buffer.
+ */
+HParsedToken *
+act_bytestream(const HParseResult *p, void *u)
+{
+	size_t n = p->ast->seq->used;
+	fprintf(stdout, "\nact_bytestream: token_type: %u, size: %zu\n", p->ast->token_type, n);
+
+	uint8_t *bytebuf = h_arena_malloc(p->arena, sizeof(uint8_t) * n);
+	for (size_t i=0; i<n; i++) {
+		assert(p->ast->seq->elements[i]->token_type == TT_UINT);
+		bytebuf[i] = p->ast->seq->elements[i]->uint;
+	}
+
+	fprintf(stdout, "act_bytestream: the string: %.*s\n", (int)n, (char *)bytebuf);
+
+#if 0
+	char *buf = malloc(sizeof(char) * n);
+	if (buf) {
+		for (int i=0; i<n; i++) {
+			buf[i] = convert2char(p->ast->seq->elements[i]->uint);
+		}
+		fprintf(stdout, "act_bytestream: the string: %.*s\n", (int)n, buf);
+		free (buf);
+	}
+#endif
+	HParsedToken *bb=H_MAKE_BYTES(bytebuf, n);
+	return bb;
+//	return (HParsedToken*)p->ast;
+}
+
+// *********************************************************************
+
+
+/*
+ * ********************************************************************
+ * End Text parsing
+ * ********************************************************************
+ */
+
+
+
+
+
+
+
+/*
+ * input grammar
+ */
+
+HParser *p_pdf;
+HParser *p_pdfdbg;
+HParser *p_startxref;
+HParser *p_xref;
+HParser *p_objdef;
+HParser *p_a85string;
+HParser *p_ahexstream;
+HParser *p_rldstring;
+HParser *p_ws;
+HParser *p_wel;
+HParser *p_elemr;
+HParser *p_npair;
+
+/*
+ * Parsers for text streams
+ */
+HParser *p_textbegin;
+HParser *p_textstream;
+HParser *p_trailer;
+HParser *p_cstream;
+HParser *p_byteostm;
+HParser *p_bytestream;
+HParser *p_dict;
+
+
+
+
+/* continuations for h_bind() */
+HParser *kstream(HAllocator *, const HParsedToken *, void *);
+HParser *kxstream(HAllocator *, const HParsedToken *, void *);
+HParser *ktxtstream(HAllocator *, const HParsedToken *, void *);
+HParser *kcontentstream(HAllocator *, const HParsedToken *, void *);
+HParser *kbyteostream(HAllocator *, const HParsedToken *, void *);
+
+/*
+ * Builds the RunLengthDecode parser and publishes it as p_rldstring:
+ * any number of runs followed by the 0x80 end-of-data byte.  A length
+ * byte 0x00-0x7F introduces literal data (shortrun), a length byte
+ * 0x81-0xFF is followed by a single data byte (longrun).
+ */
+void
+init_runlengthdecode_parser(struct Env *aux)
+{
+	H_RULE(rldeod, h_ch(0x80));
+	H_ARULE(longlength, h_ch_range(0x81, 0xFF));
+	H_ARULE(shortlength, h_ch_range(0x0, 0x7F));
+
+	H_RULE(shortdata, h_uint8());
+	H_RULE(longdata, h_uint8());
+
+	H_RULE(shortrun, h_length_value(shortlength, shortdata));
+	H_ARULE(longrun, SEQ(longlength, longdata));
+
+	H_ARULE(rldstring, SEQ(h_many(CHX(shortrun, longrun)), IGN(rldeod)));
+
+	p_rldstring = rldstring;
+}
+
+/*
+ * Registers the custom token types and builds the whole PDF grammar,
+ * publishing the entry points through the global p_* parser variables.
+ * aux is handed to the h_bind() continuations and to the text operator
+ * actions as their user data.
+ */
+void
+init_parser(struct Env *aux)
+{
+	TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult);
+	TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry);
+	
TT_Ref = h_allocate_token_new("Ref", NULL, pp_ref);
+	TT_Dict = h_allocate_token_new("Dict", NULL, pp_dict);
+	TT_TextEntry = h_allocate_token_new("TextEntry", NULL, pp_textentry);
+	TT_Objstm = h_allocate_token_new("Objstm", NULL, pp_objstm);
+
+	/* lines */
+	H_RULE(cr, p_mapch('\r', '\n'));	/* semantic value: \n */
+	H_RULE(lf, h_ch('\n'));	/* semantic value: \n */
+	H_RULE(crlf, h_right(cr, lf));	/* semantic value: \n */
+	H_RULE(eol, CHX(crlf, cr, lf));
+	H_RULE(end, h_end_p());
+
+	/* character classes */
+#define LWCHARS "\0\t\f "
+#define WCHARS LWCHARS "\n\r"
+#define DCHARS "()<>[]{}/%"
+	H_RULE(wchar, IN(WCHARS));	/* white-space */
+	H_RULE(lwchar, IN(LWCHARS));	/* "line" whitespace */
+	//H_RULE(dchar, IN(DCHARS));	/* delimiter */
+	H_RULE(rchar, NOT_IN(WCHARS DCHARS));	/* regular */
+	H_RULE(nchar, NOT_IN(WCHARS DCHARS "#"));	/* name */
+	H_RULE(schar, NOT_IN("()\n\r\\"));	/* string literal */
+	H_ARULE(digit, h_ch_range('0', '9'));
+	H_ARULE(pdigit, h_ch_range('1', '9'));
+	H_ARULE(hlower, h_ch_range('a', 'f'));
+	H_ARULE(hupper, h_ch_range('A', 'F'));
+	H_RULE(hdigit, CHX(digit, hlower, hupper));
+	H_ARULE(odigit, h_ch_range('0', '7'));
+
+	H_RULE(sp, h_ch(' '));
+	H_RULE(percent, h_ch('%'));
+	H_RULE(period, h_ch('.'));
+	H_RULE(slash, h_ch('/'));
+	H_RULE(hash, h_ch('#'));
+	H_RULE(bslash, h_ch('\\'));
+	H_RULE(lparen, h_ch('('));
+	H_RULE(rparen, h_ch(')'));
+	H_RULE(langle, h_ch('<'));
+	H_RULE(rangle, h_ch('>'));
+	H_RULE(lbrack, h_ch('['));
+	H_RULE(rbrack, h_ch(']'));
+	H_RULE(plus, h_ch('+'));
+	H_RULE(minus, h_ch('-'));
+
+	/* whitespace */
+	H_RULE(comment, SEQ(percent, h_many(NOT_IN("\r\n")), CHX(cr,lf,end)));
+	/* ^ NB: must consume cr/lf for ws to be LL(k) */
+	H_RULE(wel, IGN(CHX(wchar, comment)));
+	H_RULE(ws, IGN(h_many(wel)));
+	H_RULE(lws, IGN(h_many(IGN(lwchar))));
+
+	/* tokenization */
+#define MANY_WS(X) h_many(CHX(wel, X))
+#define TOK(X) h_middle(ws, X, h_not(rchar))
+#define KW(S) IGN(TOK(LIT(S)))
+
+	/* misc */
+	H_RULE(nl, IGN(h_right(lws, eol)));
+	H_RULE(epsilon, h_epsilon_p());
+	H_RULE(empty, SEQ(epsilon));
+	H_RULE(digits, h_many1(digit));
+	H_VARULE(nat, digits);
+	H_VRULE(pnat, nat);
+	H_RULE(npair, SEQ(pnat, wel,ws, nat));
+
+	/*
+	 * objects
+	 */
+
+	H_ARULE(ref, SEQ(npair, wel,ws, LIT("R")));
+	H_RULE(null, LIT("null"));
+	H_RULE(boole, CHX(LIT("true"), LIT("false")));
+
+	/* numbers */
+	H_ARULE(sign, CHX(minus, IGN(plus)));
+	H_VRULE(intnn, nat);
+	H_ARULE(realnn, CHX(SEQ(digits, period, digits),	/* 12.3 */
+	    SEQ(digits, period, empty),	/* 123. */
+	    SEQ(empty, period, digits)));	/* .123 */
+	// XXX ^ we _could_ move the "123." case into intnn...
+	H_RULE(numbnn, CHX(realnn, intnn));
+	H_RULE(snumb, SEQ(sign, numbnn));
+	H_VARULE(numb, CHX(snumb, numbnn));
+
+	/* names */
+	H_ARULE(nesc, SEQ(hash, hdigit, hdigit));
+	H_ARULE(nstr, h_many(CHX(nchar, nesc)));	/* '/' is valid */
+	H_RULE(name, h_right(slash, nstr));
+
+	/* strings */
+	H_RULE(snest, h_indirect());
+	H_RULE(bsn, p_mapch('n', 0x0a));	/* LF */
+	H_RULE(bsr, p_mapch('r', 0x0d));	/* CR */
+	H_RULE(bst, p_mapch('t', 0x09));	/* HT */
+	H_RULE(bsb, p_mapch('b', 0x08));	/* BS (backspace) */
+	H_RULE(bsf, p_mapch('f', 0x0c));	/* FF */
+	H_RULE(escape, CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash));
+	H_ARULE(octal, CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1)));
+	H_RULE(wrap, IGN(eol));
+	H_RULE(sesc, h_right(bslash, CHX(escape, octal, wrap, epsilon)));
+	/* NB: lone backslashes and escaped newlines are ignored */
+	H_ARULE(schars, h_many(CHX(schar, snest, sesc, eol)));
+	H_RULE(snest_, SEQ(lparen, schars, rparen));
+	H_RULE(litstr, h_middle(lparen, schars, rparen));
+	H_RULE(hexstr, h_middle(langle, MANY_WS(hdigit), rangle));
+	H_ARULE(string, CHX(litstr, hexstr));
+	h_bind_indirect(snest, snest_);
+
+	H_RULE(array, h_indirect());
+	H_RULE(dict, h_indirect());
+
+	/* classify objects by whether they start/end with a delimiter: */
+	H_RULE(robj, CHX(ref, null, boole, numb));	/* rchars */
+	H_RULE(dobj, CHX(string, array, dict));	/* dchars */
+	H_RULE(obj, CHX(robj, name, dobj));
+
+	/* dictionaries */
+	H_RULE(dopen, LIT("<<"));
+	H_RULE(dclose, LIT(">>"));
+	H_RULE(k_v, CHX(CHX(SEQ(name, wel,ws, obj),
+	    SEQ(name, CHX(name,dobj))),
+	    VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)")));
+	H_ARULE(dict_, h_middle(dopen, MANY_WS(k_v), dclose));
+	// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok?
+	// XXX validate: dict keys must be unique
+	h_bind_indirect(dict, dict_);
+
+	/* arrays */
+	H_RULE(elemd, h_indirect());	/* elements following a delimiter */
+	H_RULE(elemr, h_indirect());	/* elements following a regular char */
+	H_ARULE(array_, h_middle(lbrack, elemd, rbrack));
+	H_RULE(elemd_, CHX(SEQ(ws, dobj, elemd),
+	    SEQ(ws, name, elemr),
+	    SEQ(ws, robj, elemr),
+	    ws));
+	H_RULE(elemr_, CHX(SEQ(ws, dobj, elemd),
+	    SEQ(ws, name, elemr),
+	    SEQ(wel,ws, robj, elemr),
+	    ws));
+	h_bind_indirect(elemd, elemd_);
+	h_bind_indirect(elemr, elemr_);
+	h_bind_indirect(array, array_);
+
+	/* streams */
+	H_RULE(stmbeg, SEQ(dict, OPT(ws), LIT("stream"), OPT(cr),
+	    CHX(lf, VIOL(epsilon, "No linefeed after 'stream' (severity=7)"))));
+	H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")),
+	    VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"),
+	    VIOL(SEQ(h_many(wchar), LIT("endstream")),
+	        "No newline before 'endstream' (severity=7)"),
+	    VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"),
+	    VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")),
+	        "Stream length 1-too-short (severity=4)"),
+	    VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+	        SEQ(npair, wel, KW("obj")),
+	        KW("xref"),
+	        LIT("endstream")))), LIT("endstream")),
+	        "Stream length >1-too-short (severity=5)"),
+	    VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+	        SEQ(npair, wel, KW("obj")),
+	        KW("xref")))),
+	        "Missing endstream token (severity=7)")));
+
+	H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend));
+	// XXX is whitespace allowed between the eol and "endstream"?
+	// peter wyatt says no. (2020-03-25)
+
+	/*
+	 * file structure
+	 */
+
+	/* header */
+	H_RULE(version, SEQ(pdigit, IGN(period), digit));
+	H_RULE(header, h_middle(LIT("%PDF-"), version, nl));
+
+	/* body */
+	H_RULE(indobj, CHX(stream, obj));
+	H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj,
+	    CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))),
+	        "More than 1 endobj token (severity=1)"),
+	    VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))),
+	        "More than 1 >> and endobj token (severity=2)"),
+	    SEQ(OPT(ws), OPT(lws), KW("endobj")),
+	    VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)"))));
+	H_RULE(body, h_many(objdef));
+
+	/* cross-reference section */
+	H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf));
+	// ^ XXX does the real world follow this rule?! cf. loop.pdf
+	H_RULE(xrtyp, CHX(h_ch('n'), h_ch('f')));
+	H_ARULE(xroff, REP(digit, 10));
+	H_ARULE(xrgen, REP(digit, 5));
+	H_ARULE(xrent, SEQ(xroff, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref offset_gen entry (severity=1)"), sp)),
+	    xrgen, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref gen_use entry (severity=1)"), sp)),
+	    xrtyp, IGN(CHX(VIOL(SEQ(wchar, wchar, h_many1(wchar)), "Greater-than-2-byte WS at end of xref entry (severity=1)"),
+	        xreol,
+	        VIOL(SEQ(h_many1(wchar)), "Nonconformant WS at end of xref entry (severity=1)")))));
+	H_RULE(xrhead, SEQ(nat, IGN(sp), nat, nl));
+	H_RULE(xrsub, SEQ(xrhead, h_many(xrent)));
+	H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub)));
+
+	/* cross-reference streams */
+	H_RULE(xstream, h_bind(stmbeg, kxstream, aux));
+	H_AVRULE(xrstm, SEQ(ws, npair, wel, KW("obj"), ws, xstream));
+	// XXX skip however much we consumed and check for "endstream endobj"?
+
+
+	/* trailer */
+	H_RULE(startxr, SEQ(nl, KW("startxref"), nl,
+	    lws, nat, nl,
+	    LIT("%%EOF"), OPT(nl)));
+
+	/* used for the backwards search */
+	H_RULE(lasteof, SEQ(nl, KW("startxref"), nl,
+	    lws, nat, nl,
+	    // XXX the real world sometimes omits nl after %%EOF inside the file.
+	    //     the next 'tail' would be appended right after the 'F',
+	    //     presumably because the previous version of the file
+	    //     ended without a trailing newline. m)
+	    //     this is invalid per spec, because it creates a run-on
+	    //     comment, but we should probably accept-and-warn.
+	    // XXX should lws be allowed before EOF marker?
+	    // NB: lws before xref offset is allowed, cf. p.48 (example 4)
+	    LIT("%%EOF"),
+	    CHX(VIOL(SEQ(nl, h_many1(nl), end),
+	        "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"),
+	    SEQ(h_many(nl), end),
+	    VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+	        "(offset FROM END) Data after final %%EOF (severity=7)"))));
+
+	H_ARULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict));
+
+	H_RULE(start_junk, VIOL(h_many1(h_butnot(h_ch_range(0, 255), header)),
+	    "Junk bytes before %PDF header (severity=1)"));
+	H_RULE(hdr_junk, CHX(comment,
+	    VIOL(h_many1(h_butnot(h_ch_range(0, 255), SEQ(npair, wel, KW("obj")))),
+	        "Uncommented junk after header (severity=1)")));
+	H_RULE(tail, SEQ(body, CHX(SEQ(h_optional(xr_td), startxr),
+	    VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)),
+	        OPT(nl), OPT(LIT("%%EOF")), OPT(nl)),
+	        "Improper end of trailer - missing startxref and/or %%EOF (severity=5)"))));
+	H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"),
+	    VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+	        "Data after final %%EOF (severity=7)"),
+	    end));
+	H_RULE(pdf, SEQ(OPT(start_junk), header, OPT(hdr_junk), h_many1(tail), final_eof_junk));
+
+	/* debug parser to consume as much as possible */
+	H_RULE(pdfdbg, SEQ(OPT(start_junk), header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk))));
+
+	/*
+	 * filters
+	 */
+
+	/* Whitespace can occur between any digit and has to be ignored, */
+	H_RULE(aws, IGN(h_many(wchar)));	// all white space, include CR & LF, but not comments
+	#define MANY_AWS(X) h_many(CHX(aws, X))
+
+
+	/* Ascii85Decode */
+	H_RULE(a85eod, SEQ(h_ch('~'), aws, h_ch('>')));
+	H_ARULE(a85zero, h_ch('z'));
+	H_ARULE(a85digit, h_ch_range('!', 'u'));
+
+	/* Line whitespace can occur between any digit and has to be ignored, */
+	#define MANY_LWS(X) h_many(CHX(lws, X))
+	// Folded the test for a85fiveexcl into the validation component
+	H_VARULE(a85fivedigits, h_repeat_n(SEQ(a85digit, aws), 5));
+	// TODO:: will need to pull out error conditions -- a85fiveexcl or 'z' as one of the digits
+	H_ARULE(a85group, CHX(a85zero, a85fivedigits));
+
+	H_VARULE(a85partial2group, h_repeat_n(SEQ(a85digit, aws), 2));
+	H_VARULE(a85partial3group, h_repeat_n(SEQ(a85digit, aws), 3));
+	H_VARULE(a85partial4group, h_repeat_n(SEQ(a85digit, aws), 4));
+	H_RULE(a85partialgroup, CHX(a85partial4group, a85partial3group, a85partial2group));
+
+	H_ARULE(a85string, SEQ(h_many(a85group), h_optional(a85partialgroup), IGN(a85eod)));
+
+	//p_test = a85group;
+
+
+	/*
+	 * Not sure whether comments can be embedded within content streams
+	 * If not, use the rule aws rather than ws
+	 */
+	/*
+	 * It seems somewhat unclear. ASCII85Decode definitely can't have
+	 * comments, because % can be part of a valid ASCII85Encoded character.
+	 * However, it seems that comments are generally allowed:
+	 * "Any occurrence of the PERCENT SIGN (25h) outside a string or
+	 * inside a content stream (see 7.8.2, "Content streams") introduces
+	 * a comment." ISO32000:2-2017 7.2.4
+	 */
+	// XXX Ask Peter Wyatt
+
+
+	/* AsciiHexDecode */
+	H_RULE(ahexeod, h_ch('>'));
+	H_ARULE(hdigitpair, SEQ(aws, hdigit, aws, hdigit));
+	H_ARULE(ahextruncated, SEQ(aws, hdigit));
+
+	H_RULE(ahs_end, SEQ(h_optional(ahextruncated), aws, ahexeod));
+	H_ARULE(ahexstream, SEQ(h_many(hdigitpair), ahs_end));
+
+	init_runlengthdecode_parser(aux);
+
+
+	// ==========================================================================
+	/*
+	 * Text Objects Extraction - embedded in content streams
+	 *
+	 * The operator actions are attached with h_action() directly (rather
+	 * than H_ARULE) so that aux can be passed to them as user data.
+	 */
+	// ==========================================================================
+
+// XXX cleanup: indentation
+	H_RULE(txtbegin, h_indirect());
+	H_RULE(txt_before_junk, IGN(SEQ(h_not(LIT("BT")), CHX(comment, h_uint8()))));
+	H_ARULE(txtbegin_, SEQ(IGN(h_many(txt_before_junk)), LIT("BT"), aws));
+	h_bind_indirect(txtbegin, txtbegin_);
+	H_ARULE(txtend, KW("ET"));
+	/* 9.3 - Text state operators */
+	H_AVRULE(tnumb, numb);
+	HParser *Tc_op = h_action(SEQ(tnumb, aws, LIT("Tc"), aws), act_Tc_op, aux);	/* 9.3.2 - charSpace */
+	HParser *Tw_op = h_action(SEQ(tnumb, aws, LIT("Tw"), aws), act_Tw_op, aux);	/* 9.3.3 - wordSpace */
+	HParser *Tz_op = h_action(SEQ(tnumb, aws, LIT("Tz"), aws), act_Tz_op, aux);	/* 9.3.4 - horizontal scaling */
+	HParser *TL_op = h_action(SEQ(tnumb, aws, LIT("TL"), aws), act_TL_op, aux);	/* 9.3.5 - leading */
+	HParser *Tf_op = h_action(SEQ(name, aws, numbnn, aws, KW("Tf"), aws), act_Tf_op, aux);	/* font and size */
+	/* TODO: must map to an existing font dictionary */
+	H_VRULE(tmode, nat);	/* True if <= 7 */
+	HParser *Tr_op = h_action(SEQ(tmode, aws, LIT("Tr"), aws), act_Tr_op, aux);	/* 9.3.6 - rendering mode */
+	HParser *Ts_op = h_action(SEQ(tnumb, aws, LIT("Ts"), aws), act_Ts_op, aux);	/* rise */
+	H_RULE(textstate_ops, CHX(Tc_op, Tw_op, Tz_op, TL_op, Tf_op, Tr_op, Ts_op));
+
+	/* 9.4.2 - Text positioning operators */
+	HParser *Td_op = h_action(SEQ(tnumb, aws, tnumb, aws, LIT("Td"), aws), act_Td_op, aux);	/* move to next line with offset */
+	HParser *TD_op = h_action(SEQ(tnumb, aws, tnumb, aws, LIT("TD"), aws), act_TD_op, aux);	/* move to next line with offset and set state */
+	HParser *Tm_op = h_action(SEQ(REP(SEQ(tnumb, aws), 6), LIT("Tm"), aws), act_Tm_op, aux);	/* set text matrix */
+	HParser *Tstar_op = h_action(SEQ(LIT("T*"), aws), act_Tstar_op, aux);	/* move to next line */
+	H_RULE(textpos_ops, CHX(Td_op, TD_op, Tm_op, Tstar_op));
+
+	/* 9.4.3 - Text showing operators */
+	H_RULE(quote, h_ch('\''));
+	H_RULE(dquote, h_ch('"'));
+//	H_ARULE(Tj_op, SEQ(string, aws, LIT("Tj"), aws), aux);	/* show text string */
+	HParser *Tj_op = h_action(SEQ(string, aws, LIT("Tj"), aws), act_Tj_op, aux);	/* show text string */
+	HParser *TsingleQ_op = h_action(SEQ(string, aws, quote, aws), act_TsingleQ_op, aux);	/* Move to next line and show text */
+	HParser *TdoubleQ_op = h_action(SEQ(tnumb, aws, tnumb, aws, string, aws, dquote, aws), act_TdoubleQ_op, aux);	/* Move to next line and show formatted text */
+	H_RULE(TArr_elem, SEQ(CHX(tnumb, string), aws));
+	HParser *TJ_op = h_action(SEQ(IGN(lbrack), aws, h_many(TArr_elem), IGN(rbrack), aws, LIT("TJ"), aws), act_TJ_op, aux);	/* show one or more text strings */
+	H_RULE(textshow_ops, CHX(Tj_op, TsingleQ_op, TdoubleQ_op, TJ_op));
+
+	H_RULE(text_inbetween_junk, IGN(SEQ(h_not(txtend), h_uint8())));
+	H_RULE(text_ops, CHX(textstate_ops, textpos_ops, textshow_ops, text_inbetween_junk));
+
+	/* Text object */
+	H_ARULE(txtobj, SEQ(txtbegin, h_many(text_ops), txtend));
+	H_RULE(txtobjs, h_many1(txtobj));
+
+
+	/* text streams */
+	H_RULE(txtstream, h_bind(txtobjs, ktxtstream, aux));
+
+	// Page Tree
+	H_RULE(contentstream, h_left(h_bind(stmbeg, kcontentstream, aux), stmend));
+	H_ARULE(cstream, SEQ(ws, npair, wel, KW("obj"), ws, contentstream,
+	    OPT(ws), OPT(lws), KW("endobj")));
+	H_RULE(byteostream, h_left(h_bind(stmbeg, kbyteostream, aux), stmend));
+	H_RULE(byteostm, SEQ(ws, npair, wel, KW("obj"), ws, byteostream,
+	    OPT(ws), OPT(lws), KW("endobj")));
+
+	// convenient parser to just get a chunk of bytes
+	H_ARULE(bytestream, h_many(h_uint8()));
+
+
+	/* global parser variables */
+	p_pdf = pdf;
+	p_pdfdbg = pdfdbg;
+	p_startxref = lasteof;	//startxr;
+	p_xref = CHX(xr_td, xrstm);
+	p_objdef = objdef;
+	p_a85string = a85string;
+	p_ahexstream = ahexstream;
+	p_ws = ws;
+	p_wel = wel;
+	p_elemr = h_action(elemr, h_act_flatten, NULL);
+	p_npair = npair;
+
+	/* text parser variables */
+	p_textbegin = txtbegin;
+	p_textstream = txtstream;
+	p_cstream = cstream;
+	p_byteostm = byteostm;
+	p_bytestream = bytestream;
+	p_dict = dict;
+
+	p_fail = h_nothing_p();
+	p_epsilon = epsilon;
+	p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
+	p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
+
+	/* Parsing of severity messages */
+	H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("=")));
+	H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)),
+	    act_nat, NULL));
+	H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num));
+	p_violsev = violsev;
+
+#if 0
+	// XXX testing
+	int r;
+	void errx(int, const char *, ...);
+	HParser *p = obj;
+	if ((r = h_compile(p, PB_LALR, NULL)) != 0)
+		errx(1, "h_compile() failed: %d", r);
+	errx(0, "OK");
+#endif
+}
+
+
+/*
+ * lookup and resolution of indirect references
+ *
+ * ** Parameter 'gen' is unused
+ */
+
+XREntry *
+lookup_xref(struct Env *aux, size_t nr, size_t gen)
+{
+	HParsedToken *ss;	/* xref subsection */
+	size_t base, n;
+
+	/* for each cross-reference section (i.e. 
update) */
+	for (size_t i = 0; i < aux->nxrefs; i++) {
+		HCountedArray *subs = H_INDEX_SEQ(aux->xrefs[i], 0);
+
+		/* for each cross-reference subsection */
+		for (size_t j = 0; j < subs->used; j++) {
+			ss = subs->elements[j];
+			base = H_INDEX_UINT(ss, 0, 0);
+			n = H_INDEX_UINT(ss, 0, 1);
+
+			if (nr >= base && nr - base < n)
+				return H_INDEX(XREntry, ss, 1, nr - base);
+			// TODO: Generate a more meaningful error message -- p_viol, error ontology category
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Parse the indirect object definition found at byte 'offset' of the input
+ * and check that the ID it carries is (nr, gen).
+ *
+ * Returns element 1 of the parsed definition (element 0 is the ID pair), or
+ * NULL when the offset is out of bounds, the parse fails, or the ID found
+ * differs from the one sought.
+ */
+const HParsedToken *
+parse_obj(struct Env *aux, size_t nr, size_t gen, size_t offset)
+{
+	HParseResult *res;
+	size_t def_nr, def_gen;
+
+	if (offset >= aux->sz) {
+		fprintf(stderr, "%s: position %zu (%#zx) for object %zu %zu is "
+		    "out of bounds\n", aux->infile, offset, offset, nr, gen);
+		return NULL;
+	}
+
+//	res = h_parse(p_objdef, aux->input + offset, aux->sz - offset);
+	/* seek within the whole input rather than parsing a sub-buffer */
+	HParser *p = h_right(h_seek(offset * 8, SEEK_SET), p_objdef);	// XXX
+	res = h_parse(p, aux->input, aux->sz);	// XXX review
+	if (res == NULL) {
+		fprintf(stderr, "%s: error parsing object %zu %zu at position "
+		    "%zu (%#zx)\n", aux->infile, nr, gen, offset, offset);
+		return NULL;
+	}
+	assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE);
+
+	def_nr = H_INDEX_UINT(res->ast, 0, 0);
+	def_gen = H_INDEX_UINT(res->ast, 0, 1);
+	if (def_nr != nr || def_gen != gen) {
+		fprintf(stderr, "%s: object ID mismatch at position %zu "
+		    "(%#zx): sought %zu %zu, found %zu %zu.\n", aux->infile,
+		    offset, offset, nr, gen, def_nr, def_gen);
+		return NULL;
+	}
+
+
+	// DEBUG
+	fprintf(stdout, "\nparse_obj: Parsed Result:\n");
+	h_pprintln(stdout, res->ast);  // XXX debug
+	fprintf(stdout, "\nparse_obj: Returning:\n");
+	h_pprintln(stdout, H_INDEX_TOKEN(res->ast, 1));  // XXX debug
+	// DEBUG
+
+	return H_INDEX_TOKEN(res->ast, 1);
+}
+
+/*
+ * Fetch the object stored at index 'idx' of object stream 'stm_nr'.
+ *
+ * The stream object is looked up with generation 0, parsed on first use and
+ * memoized in its xref entry.  If element 1 of the stream object is a
+ * TT_HParseResult, the object at 'idx' of the Objstm it wraps is returned;
+ * otherwise element 1 itself is returned.
+ *
+ * Returns NULL when the stream has no xref entry, is free, has a non-zero
+ * generation, is itself stored in an object stream, fails to parse, or when
+ * 'idx' is out of range.  'nr' is not used.
+ */
+const HParsedToken *
+parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx)
+{
+	XREntry *ent;
+	const HParsedToken *stm;
+
+	/*
+	 * acquire the stream object
+	 */
+
+	ent = lookup_xref(aux, stm_nr, 0);
+	if (ent == NULL)
+		return NULL;		/* stream not found */
+
+	switch (ent->type)
+	{
+	case XR_FREE:
+		return NULL;		/* stream deleted */
+	case XR_INUSE:
+		if (ent->n.gen != 0)
+			return NULL;	/* stream replaced */
+		if (ent->obj == NULL) {
+			/* stm_nr is a size_t, so it is printed with %zu */
+			fprintf(stdout, "\nparse_objstm_obj:: Parsing object stream id = %zu, %d, at offset = %zu (%#zx)\n",
+					stm_nr, 0, ent->n.offs, ent->n.offs);  // XXX DEBUG
+			ent->obj = parse_obj(aux, stm_nr, 0, ent->n.offs);
+		}
+		break;
+	case XR_OBJSTM:
+		return NULL;		/* invalid: nested streams */
+	}
+
+	if ((stm = ent->obj) == NULL) {
+		fprintf(stderr, "%s: error parsing object stream at position "
+		    "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs);
+		return NULL;
+	}
+	else {
+		/*
+		 * decode the stream and find the target object in it
+		 */
+		// XXX debug
+		fprintf(stdout, "\nparse_objstm_obj:: token type = %u, \n", ent->obj->token_type);
+		h_pprintln(stdout, stm);  // XXX debug
+		// XXX debug
+		// get the object at index
+
+		stm = H_INDEX_TOKEN(stm, 1);  // the first field is the stream dictionary
+		if (stm->token_type == TT_HParseResult){
+			const HParseResult *res = H_CAST(HParseResult, stm);
+			stm = res->ast;
+			// Now get the index if the index is valid
+			const Objstm *ostm = H_CAST(Objstm, stm);
+			/* idx is unsigned, so only the upper bound needs checking */
+			if (idx < ostm->numObjs) {
+				stm = ostm->tok[idx].obj;
+			}
+			else
+				return NULL;
+		}
+	}
+	fprintf(stdout, "\npparse_objstm_obj:: Returning token of type = %u, \n", stm->token_type);
+	h_pprintln(stdout, stm);  // XXX debug
+	return stm;
+}
+
+/*
+ * Follow indirect references until a direct object is reached.
+ *
+ * Tokens that are not TT_Ref (including NULL) are returned unchanged.  For a
+ * reference, the xref entry is looked up and the object is parsed on first
+ * use; the result is memoized in the entry and resolved again in case it is
+ * itself a reference.
+ *
+ * Returns NULL when the object has no xref entry, is free, has a mismatching
+ * generation, fails to parse, or when its entry only leads back to itself.
+ */
+const HParsedToken *
+resolve(struct Env *aux, const HParsedToken *v)
+{
+	XREntry *ent = NULL;
+	Ref *r;
+
+	/* direct objects pass through */
+	if (v == NULL || v->token_type != TT_Ref) {
+		if (v != NULL)	/* v may be NULL here; only print a real token */
+			fprintf (stdout, "resolve: Returning token of token_type = %u\n", v->token_type);  // XXX DEBUG
+		return v;
+	}
+
+	/* we are looking at an indirect reference */
+	r = v->user;
+
+	/* find the xref entry for this reference */
+	fprintf(stdout, "\nresolve:: Looking up xref = %zu, %zu\n", r->nr, r->gen);  // XXX DEBUG
+	ent = lookup_xref(aux, r->nr, r->gen);
+	if (ent == NULL)
+		return NULL;		/* obj not found */
+	if (ent->obj != NULL) {
+		fprintf(stdout, "\nresolve:: ent->obj->token_type = %u\n", ent->obj->token_type);  // XXX DEBUG
+		/*
+		 * The entry may hold the loop-breaking placeholder set below
+		 * (still being parsed, or left behind by one of the early
+		 * returns).  lookup_xref() only looks at the object number, so
+		 * a memoized reference with the same number would lead
+		 * straight back to this entry and recurse without end.
+		 * NOTE: only this direct self-reference is caught; longer
+		 * cycles (A -> B -> A) are not.
+		 */
+		if (ent->obj->token_type == TT_Ref) {
+			const Ref *seen = ent->obj->user;
+
+			if (seen->nr == r->nr)
+				return NULL;	/* reference cycle */
+		}
+		return resolve(aux, ent->obj);
+	}
+
+	/* parse the object and memoize */
+	ent->obj = v;			/* break loops */
+	switch (ent->type)
+	{
+	case XR_FREE:
+		return NULL;		/* obj deleted */
+	case XR_INUSE:
+		if (ent->n.gen != r->gen)
+			return NULL;	/* obj nr reused */
+		fprintf(stdout, "resolve:: parse object at offset = %zu (%#zx)\n", ent->n.offs, ent->n.offs);
+		ent->obj = parse_obj(aux, r->nr, r->gen, ent->n.offs);
+		break;
+	case XR_OBJSTM:
+		if (r->gen != 0)
+			return NULL;	/* invalid entry! */
+		fprintf(stdout, "resolve:: parse object stream - oid = %zu, stm_oid = %lu, stm_idx = %lu\n",
+				r->nr, ent->o.stm, ent->o.idx);  // XXX DEBUG
+		ent->obj = parse_objstm_obj(aux, r->nr, ent->o.stm, ent->o.idx);
+		break;
+	}
+
+	/* the parse may have failed, in which case ent->obj is NULL */
+	if (ent->obj != NULL)
+		fprintf (stdout, "resolve: Recursive call to resolve - token_type = %u\n", ent->obj->token_type);  // XXX DEBUG
+	return resolve(aux, ent->obj);
+}
+
+
+/*
+ * stream object handling incl.
filters and cross-reference streams
+ */
+
+#include <limits.h>	/* INT_MAX */
+#include <zlib.h>
+#include <err.h>
+
+struct predictor {
+	/* parameters */
+	int num;	/* default: 1 (no prediction) */
+	int colors;	/* default: 1 */
+	int bpc;	/* bits per component; default: 8 */
+	int columns;	/* default: 1 */
+
+	int rowsz;	/* bytes per row = ceil(colors * bpc * columns / 8) */
+
+	/* state */
+	HSuspendedParser *sp;
+	uint8_t (*predfun)(int, int, int);
+	uint8_t *buf;	/* previous row of input */
+	uint8_t  c;	/* byte 'c' (upper left) */
+	int x;		/* current position */
+
+#ifndef ITERATIVE	// XXX
+	uint8_t *out;
+	size_t nout;
+#endif
+};
+
+/*
+ * Pass-through "predictor": appends 'sz' bytes from 'inp' to pred->out
+ * (or feeds them to the suspended parser when ITERATIVE is defined).
+ * Returns false in the non-ITERATIVE build.
+ */
+int
+depred_none(struct predictor *pred, uint8_t *inp, size_t sz)
+{
+#ifdef ITERATIVE	// XXX
+	return h_parse_chunk(pred->sp, inp, sz);
+#else
+	uint8_t *grown;
+
+	/*
+	 * Nothing to append.  Returning early also avoids realloc(p, 0),
+	 * which may free p and return NULL without that being an error.
+	 */
+	if (sz == 0)
+		return false;
+
+	grown = realloc(pred->out, pred->nout + sz);
+	if (grown == NULL)
+		err(1, "depred_none");
+	pred->out = grown;
+	memcpy(pred->out + pred->nout, inp, sz);
+	pred->nout += sz;
+	return false;
+#endif
+}
+
+/* per-byte predictions from the bytes to the left (a), above (b) and above-left (c) */
+uint8_t pp_none(int a, int b, int c)	{ return 0; }
+uint8_t pp_sub(int a, int b, int c)	{ return a; }
+uint8_t pp_up(int a, int b, int c)	{ return b; }
+uint8_t pp_avg(int a, int b, int c)	{ return (a + b) / 2; }
+
+uint8_t
+pp_paeth(int a, int b, int c)
+{
+	int p	= a + b - c;
+	int pa	= abs(p - a);
+	int pb	= abs(p - b);
+	int pc	= abs(p - c);
+
+	if (pa <= pb && pa <= pc) return a;
+	if (pb <= pc) return b;
+	return c;
+}
+
+/*
+ * Undo row-wise prediction on 'sz' bytes from 'inp'.  Each completed row of
+ * pred->rowsz bytes is appended to pred->out (or fed to the suspended parser
+ * when ITERATIVE is defined).  Row state is kept in 'pred' across calls.
+ *
+ * Returns -1 on an unknown row tag, otherwise the value of 'done'.
+ * Requires pred->rowsz > 0 and a pred->buf of that size.
+ */
+int
+depred_png(struct predictor *pred, uint8_t *inp, size_t sz)
+{
+	/* NB:
+	 * at this point, the specific value of pred->num no longer matters.
+	 * the PNG predictor tags each row with the function used for that row
+	 * and decoding always follows the tag.
+	 */
+	static uint8_t (*predfuns[])(int, int, int) =
+	    {pp_none, pp_sub, pp_up, pp_avg, pp_paeth};
+
+	bool done = false;
+	int bpp;
+
+	bpp = (pred->colors * pred->bpc + 7) / 8;	/* bytes per pixel */
+	assert (bpp > 0);
+
+	for (size_t i=0; i < sz && !done; i++) {
+		int x = pred->x;
+		int a = x<bpp ? 0 : pred->buf[x-bpp];	/* left */
+		int b = pred->buf[x];			/* up */
+		int c = pred->c;			/* up left */
+
+		if (pred->predfun == NULL) {	/* we are before a new row */
+			/* select predictor function */
+			if (inp[i] > 4) {
+				fprintf(stderr, "unknown PNG predictor %d\n",
+				    (int)inp[i]);
+				return -1;
+			}
+			pred->predfun = predfuns[inp[i]];
+
+			/* consume the tag */
+			if (++i == sz)
+				break;
+		}
+
+		/* undo the prediction and save the decoded value */
+		pred->buf[x] = inp[i] + pred->predfun(a, b, c);
+
+		/* advance to the right */
+		pred->c = b;
+		pred->x = ++x;
+
+		/* when row complete, pass it to parser and start a new row */
+		if (x == pred->rowsz) {
+#ifdef ITERATIVE	// XXX
+			done = h_parse_chunk(pred->sp, pred->buf, pred->rowsz);
+#else
+			/* keep pred->out valid until the new buffer is known good */
+			uint8_t *grown = realloc(pred->out, pred->nout + pred->rowsz);
+			if (grown == NULL)
+				err(1, "depred_png");
+			pred->out = grown;
+			memcpy(pred->out + pred->nout, pred->buf, pred->rowsz);
+			pred->nout += pred->rowsz;
+#endif
+			pred->c = pred->x = 0;
+			if (pred->num != 2)	/* support for 8-bpc TIFF */
+				pred->predfun = NULL;
+		}
+	}
+
+	return done;
+}
+
+/*
+ * Inflate the bytes in 'b', undo the predictor described by 'parms'
+ * (/Predictor, /Colors, /BitsPerComponent, /Columns) and parse the result
+ * with 'p'.
+ *
+ * Returns NULL when a parameter is not a non-negative integer that fits an
+ * int, when the predictor is unsupported or its parameters are unusable,
+ * or when depredicting reports -1; otherwise the result of the parse.
+ * Allocation and zlib initialisation failures terminate the program.
+ */
+HParseResult *
+FlateDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	size_t const BUFSIZE = 8 * 1024;
+	uint8_t *buf;
+#ifdef ITERATIVE	// XXX
+	HSuspendedParser *sp;
+#endif
+	HParseResult *res;
+	const HParsedToken *v;
+	size_t sz;
+	int done;
+	z_stream strm = {0};
+	int ret;
+	struct predictor pred = {1, 1, 8, 1};
+	int (*depredict)(struct predictor *, uint8_t *, size_t);
+
+	/* set up the predictor (if any) */
+	/* values above INT_MAX are rejected rather than truncated into an int */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT || v->sint < 0 ||	\
+			    v->sint > INT_MAX)				\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num,	"Predictor");
+	SETPARM(pred.colors,	"Colors");
+	SETPARM(pred.bpc,	"BitsPerComponent");
+	SETPARM(pred.columns,	"Columns");
+	#undef SETPARM
+	if (pred.num == 1)
+		depredict = depred_none;
+	else {
+		/*
+		 * SETPARM accepts 0, but a zero here would divide by zero in
+		 * the overflow check below or yield an empty row buffer that
+		 * depred_png() indexes.
+		 */
+		if (pred.colors < 1 || pred.bpc < 1 || pred.columns < 1) {
+			fprintf(stderr, "FlateDecode: invalid predictor "
+			    "parameters\n");
+			return NULL;
+		}
+
+		if (pred.num >= 10 && pred.num <= 15)
+			depredict = depred_png;
+		else if (pred.num == 2) {
+			/* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */
+			if (pred.bpc == 8) {
+				pred.predfun = pp_sub;	/* predict left */
+				depredict = depred_png;
+			} else {
+				// XXX add general TIFF predictor (bpc != 8)
+				fprintf(stderr, "FlateDecode: /Predictor %d "
+				    "not supported for /BitsPerComponent %d\n",
+				    pred.num, pred.bpc);
+				return NULL;
+			}
+		} else {
+			fprintf(stderr, "FlateDecode: /Predictor %d"
+			    " not supported\n", pred.num);
+			return NULL;
+		}
+
+		/* allocate row buffer */
+		if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) {
+			fprintf(stderr, "FlateDecode: overflow\n");
+			return NULL;
+		}
+		pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8;
+		pred.buf = calloc(1, pred.rowsz);
+		if (pred.buf == NULL)
+			err(1, "FlateDecode");
+	}
+
+	/* set up zlib */
+	// XXX pass our allocator to zlib
+	ret = inflateInit(&strm);
+	if (ret != Z_OK)
+		errx(1, "inflateInit: %s (%d)", strm.msg, ret);
+	buf = malloc(BUFSIZE);
+	if (buf == NULL)
+		err(1, "FlateDecode");
+
+#ifdef ITERATIVE	// XXX
+	/* initialize target parser */
+	sp = h_parse_start(p);
+	assert(sp != NULL);
+	pred.sp = sp;
+#endif
+
+	done = 0;
+	strm.avail_in = b.len;
+	strm.next_in = (unsigned char *)b.token;
+	do {
+		strm.avail_out = BUFSIZE;
+		strm.next_out = buf;
+
+		ret = inflate(&strm, Z_NO_FLUSH);
+		if (ret != Z_STREAM_END && ret != Z_OK) {
+			fprintf(stderr, "inflate: %s (%d)\n", strm.msg, ret);
+			break;
+		}
+
+		sz = BUFSIZE - strm.avail_out;
+		done = depredict(&pred, buf, sz);
+	} while (done == 0 && ret == Z_OK);
+
+#ifdef ITERATIVE	// XXX
+	res = h_parse_finish(sp);
+		// XXX always return NULL on error?
+#else
+
+	// DEBUG -- will not always work depending on the font encoding used
+//	fprintf (stdout, "FlateDecode:: Inflated string (%lu):\n%.*s\n", pred.nout, (int)pred.nout, pred.out);
+	// XXX DEBUG
+
+	res = h_parse(p, pred.out, pred.nout);
+	free(pred.out);
+#endif
+	inflateEnd(&strm);
+	free(pred.buf);
+	free(buf);
+
+	if (done == -1)
+		return NULL;
+	return res;
+}
+
+
+#if 0
+	// decoded stream in pred.out
+//	FILE *decodef = fopen ("flatecode.out", "w");
+//	fprintf (decodef, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out);
+	fprintf (stdout, "FlateDecode:: Inflated string (%lu)\n:%.*s\n", pred.nout, (int)pred.nout, pred.out);
+	unsigned char *fdec = pred.out;
+//	char _l;
+	int i;
+	for (i=0; i<(pred.nout/2); ++i)
+	{
+		convert2char(*fdec);
+//		_l = convert2char(*fdec);
+//		fprintf(decodef, " %c-%d ", _l, _l);
+		fdec ++;
+	}
+	res = NULL;
+
+
+#endif
+
+
+
+
+/* LZW helpers */
+
+typedef struct
+{
+	uint8_t *lzw_buf;
+	size_t total_buf_size;
+	size_t write_head;
+	size_t write_tail;
+	uint8_t write_checksum;
+	size_t eof_loc;
+
+	HBytes *input_stream;
+	size_t read_head;
+	size_t read_tail;
+	uint8_t read_checksum;
+} lzwspec;
+
+lzwspec *cur_lzw_spec;
+
+/*
+ * used by write_lzw_buffer to get more space for decoding if needed
+ *
+ * Terminates the program when the buffer cannot be grown: the function has
+ * no way to report failure, and carrying on would let the caller write past
+ * the end of the old buffer.
+ */
+void
+grow_lzw_buffer(size_t amount)
+{
+	uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t));
+	if (ret_buf == NULL)
+		err(1, "LZWDecode: realloc");
+	cur_lzw_spec->total_buf_size += amount;
+	cur_lzw_spec->lzw_buf = ret_buf;
+}
+
+/*
+ * Allocate a zeroed lzwspec that reads from 'bytes' and owns a fresh
+ * 1024-byte output buffer.  Release it with delete_lzw_spec().
+ * Allocation failure terminates the program.
+ */
+lzwspec *
+new_lzw_spec(HBytes *bytes)
+{
+	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+	lzwspec *ret = calloc(1, sizeof *ret);
+	if (ret == NULL)
+		err(1, "LZWDecode");
+	ret->input_stream = bytes;
+	ret->lzw_buf = malloc(BUFSIZE);
+	if (ret->lzw_buf == NULL)
+		err(1, "LZWDecode");
+	ret->total_buf_size = BUFSIZE;
+	return ret;
+}
+
+/* Free 'spec' together with its output buffer. */
+void
+delete_lzw_spec(lzwspec *spec)
+{
+	free(spec->lzw_buf);
+	free(spec);
+}
+
+/* Make 'spec' the one used by write_lzw_buffer() and read_lzw_buffer(). */
+void
+bind_lzw_spec(lzwspec *spec)
+{
+	cur_lzw_spec = spec;
+}
+
+
+#include "lzw-lib.h"
+
+/* Buffer writer function for the lzw-ab implementation, with a fixed signature.
+ * Although the type is defined as int, it is expected to write one byte at a time.
+ * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */
+
+void
+write_lzw_buffer(int value)
+{
+	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+
+	if(!cur_lzw_spec->lzw_buf)
+	{
+		fprintf(stderr, "LZWDecode: lzw_buf is null!");
+		assert(cur_lzw_spec->lzw_buf != NULL);
+	}
+
+	/*
+	 * Make room before storing, so that neither branch below can write
+	 * past the end of the buffer.
+	 */
+	if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size)
+		grow_lzw_buffer(BUFSIZE);
+
+	if (value == EOF) {
+		/* remember where the marker went; it is not checksummed */
+		cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value;
+		cur_lzw_spec->eof_loc = cur_lzw_spec->write_head;
+		cur_lzw_spec->write_head++;
+		return;
+	}
+
+	/* We can get away with this cast due to writing single bytes. */
+	/* Unlike lzw-ab's own writer, write_head is never reset: it keeps
+	 * counting the bytes stored so far. */
+	cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value;
+
+	cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value;
+}
+
+
+/* Fixed signature function for reading bytes. Modifies cur_lzw_spec.
Set cur_lzw_spec
+ * with bind_lzw_spec() */
+int read_lzw_buffer(void)
+{
+	uint8_t byte_read;
+	int ret_value;
+
+	/* Input data is already waiting in the buffer */
+	if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail)
+		cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len;
+
+	if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail)
+	{
+		byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++];
+		cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read;
+		ret_value = byte_read;
+	}
+	else
+		ret_value = EOF;
+
+	return ret_value;
+}
+
+
+/*
+ * LZW-decompress the bytes in 'b', undo the predictor described by 'parms'
+ * (/Predictor, /Colors, /BitsPerComponent, /Columns) and parse the result
+ * with 'p'.
+ *
+ * Returns NULL when a parameter is not a non-negative integer that fits an
+ * int, when the predictor is unsupported or its parameters are unusable,
+ * or when decompression or depredicting fails; otherwise the result of the
+ * parse.  Allocation failure terminates the program.
+ */
+HParseResult *
+LZWDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	struct predictor pred = {1, 1, 8, 1};
+	int (*depredict)(struct predictor *, uint8_t *, size_t);
+	HParseResult *res = NULL;
+	size_t nbytes;
+	int done;
+	int ret;
+	const HParsedToken *v;
+
+	/* set up the predictor (if any) */
+	/* values above INT_MAX are rejected rather than truncated into an int */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT || v->sint < 0 ||	\
+			    v->sint > INT_MAX)				\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num,	"Predictor");
+	SETPARM(pred.colors,	"Colors");
+	SETPARM(pred.bpc,	"BitsPerComponent");
+	SETPARM(pred.columns,	"Columns");
+	#undef SETPARM
+	if (pred.num == 1)
+		depredict = depred_none;
+	else {
+		/*
+		 * SETPARM accepts 0, but a zero here would divide by zero in
+		 * the overflow check below or yield an empty row buffer.
+		 */
+		if (pred.colors < 1 || pred.bpc < 1 || pred.columns < 1) {
+			fprintf(stderr, "LZWDecode: invalid predictor "
+			    "parameters\n");
+			return NULL;
+		}
+
+		if (pred.num >= 10 && pred.num <= 15)
+			depredict = depred_png;
+		else if (pred.num == 2) {
+			/* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */
+			if (pred.bpc == 8) {
+				pred.predfun = pp_sub;	/* predict left */
+				depredict = depred_png;
+			} else {
+				// XXX add general TIFF predictor (bpc != 8)
+				fprintf(stderr, "LZWDecode: /Predictor %d "
+				    "not supported for /BitsPerComponent %d\n",
+				    pred.num, pred.bpc);
+				return NULL;
+			}
+		} else {
+			fprintf(stderr, "LZWDecode: /Predictor %d"
+			    " not supported\n", pred.num);
+			return NULL;
+		}
+
+		/* allocate row buffer */
+		if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) {
+			fprintf(stderr, "LZWDecode: overflow\n");
+			return NULL;
+		}
+		pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8;
+		pred.buf = calloc(1, pred.rowsz);
+		if (pred.buf == NULL)
+			err(1, "LZWDecode");
+	}
+
+	lzwspec *lzw_spec = new_lzw_spec(&b);
+	bind_lzw_spec(lzw_spec);
+
+	/* malformed input is reported to the caller instead of aborting */
+	ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer);
+	if (ret) {
+		fprintf(stderr, "lzw_decompress: error (%d)\n", ret);
+		goto out;
+	}
+
+	/*
+	 * The last byte written is left out -- presumably the EOF marker
+	 * stored by write_lzw_buffer(); TODO confirm against lzw-lib.
+	 * Guard the subtraction in case nothing was written at all.
+	 */
+	nbytes = lzw_spec->write_head > 0 ? lzw_spec->write_head - 1 : 0;
+	done = depredict(&pred, lzw_spec->lzw_buf, nbytes);
+	if (done != 0)	// XXX ITERATIVE
+		goto out;
+
+	// SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse
+	res = h_parse(p, pred.out, pred.nout);
+
+out:
+	free(pred.out);
+	free(pred.buf);		/* NULL when no predictor is in use */
+
+	bind_lzw_spec(NULL);
+	delete_lzw_spec(lzw_spec);
+
+	return res;
+}
+
+/*
+ * Decode run-length encoded data in 'b' with p_rldstring and parse the
+ * decoded bytes with 'p'.  'parms' is not used.
+ * Returns NULL if either parse fails.
+ */
+HParseResult *
+RunLengthDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_rldstring, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in RunLengthDecode filter\n");
+		return NULL;
+	}
+
+	assert(res->ast && res->ast->token_type == TT_BYTES);
+	// SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse
+	res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+	return res;
+}
+
+/*
+ * Decodes ASCII hexadecimal data into binary data.
+ * parms should be empty, because the filter has no parameters
+ */
+HParseResult *
+ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *decoded, *parsed;
+
+	// XXX debug
+	fprintf(stdout, "ASCIIHexDecode:: bytes=[%.*s]\n", (int)b.len, b.token);
+
+	/* first pass: hex digits -> raw bytes */
+	decoded = h_parse(p_ahexstream, b.token, b.len);
+	if (decoded == NULL) {
+		fprintf(stderr, "parse error in ASCIIHexDecode filter\n");
+		return NULL;
+	}
+	assert(decoded->ast && decoded->ast->token_type == TT_BYTES);
+
+	const HBytes raw = decoded->ast->bytes;
+	fprintf(stdout, "ASCIIHexDecode::string = [%.*s]\n",
+	    (int)raw.len, (char*)raw.token);
+
+	/* second pass: run the caller's parser over the decoded bytes */
+	// SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse
+	parsed = h_parse(p, raw.token, raw.len);
+
+	/* if that parser rejects the data, hand back the decoded stream */
+	return parsed != NULL ? parsed : decoded;
+}
+
+/*
+ * ASCII base-85 counterpart of ASCIIHexDecode: decodes 'b' with p_a85string
+ * and parses the resulting bytes with 'p'.
+ * The filter takes no parameters, so parms should be empty.
+ */
+HParseResult*
+ASCII85Decode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *decoded, *parsed;
+
+	// XXX debug
+	fprintf(stdout, "ASCII85Decode:: bytes=[%.*s]\n", (int)b.len, b.token);
+
+	/* first pass: base-85 text -> raw bytes */
+	decoded = h_parse(p_a85string, b.token, b.len);
+	if (decoded == NULL) {
+		fprintf(stderr, "parse error in ASCII85Decode filter\n");
+		return NULL;
+	}
+	assert(decoded->ast && decoded->ast->token_type == TT_BYTES);
+
+	/* second pass: run the caller's parser over the decoded bytes */
+	// SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse
+	const HBytes raw = decoded->ast->bytes;
+	parsed = h_parse(p, raw.token, raw.len);
+
+	/* if that parser rejects the data, hand back the decoded stream */
+	return parsed != NULL ? parsed : decoded;
+}
+
+/*
+ * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
+ * and parse the result with 'p'.
+ */
+HParseResult *
+decode_stream(const Dict *d, HBytes b, HParser *p)
+{
+	HParseResult *(*filter)(const Dict *, HBytes, HParser *);
+	const Dict *parms = NULL;
+	const HParsedToken *v;
+
+	/* no /Filter entry: the data is parsed as it stands */
+	v = dictentry(d, "Filter");
+	if (v == NULL)
+		return h_parse(p, b.token, b.len);
+
+#ifdef ITERATIVE	// XXX
+	/* compile to a CF backend to enable incremental parsing */
+	if (h_compile(p, PB_LLk, NULL) == -1)
+		errx(1, "stream data parser: LL(1) compile failed");
+#endif
+
+	if (v->token_type != TT_BYTES) {
+		// XXX TT_SEQUENCE would be a filter chain; that’s not supported, yet.
+		// But it might also be something bogus, in which case we should fail.
+		return NULL;
+	}
+
+	if (bytes_eq(v->bytes, "FlateDecode"))
+		filter = FlateDecode;
+	else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+		filter = ASCIIHexDecode;
+	else if (bytes_eq(v->bytes, "ASCII85Decode"))
+		filter = ASCII85Decode;
+	else if (bytes_eq(v->bytes, "RunLengthDecode"))
+		filter = RunLengthDecode;
+	else if (bytes_eq(v->bytes, "LZWDecode"))
+		filter = LZWDecode;
+	else
+		return NULL;		/* filter not supported */
+
+	/* parms stays NULL unless /DecodeParms is a dictionary */
+	v = dictentry(d, "DecodeParms");
+	if (v && v->token_type == TT_Dict)
+		parms = v->user;
+
+	return filter(parms, b, p);
+}
+
+/*
+ * Action for h_tell(): turns the reported bit position into an HBytes token
+ * covering the input from that byte to the end (env is the struct Env).
+ */
+HParsedToken *
+act_rest(const HParseResult *p, void *env)
+{
+	struct Env *aux = env;
+	size_t offset = H_CAST_UINT(p->ast) / 8;
+
+	return H_MAKE_BYTES(aux->input + offset, aux->sz - offset);
+}
+
+/* Parser yielding the rest of the input as bytes; consumes nothing itself. */
+HParser *
+p_rest__m(HAllocator *mm__, struct Env *aux)
+{
+	return h_action__m(mm__, h_tell__m(mm__), act_rest, aux);
+}
+
+/* combine current position with env=(input,sz) into HBytes */
+HParsedToken *
+act_take_bytes(const HParseResult *p, void *env)
+{
+	const HBytes *bs = env;
+	size_t offset = H_CAST_UINT(p->ast) / 8;
+
+	/*
+	 * NB: we must allocate a new HBytes struct here because the old one is
+	 * allocated only temporarily for the lifetime of the continuation
+	 * below.
+	 */
+	// DEBUG
+	/*
+	 * Offset the byte pointer before converting it for %p (arithmetic on
+	 * void * is not standard C); bs->len is a size_t, hence %zu.
+	 */
+	fprintf (stdout, "act_take_bytes: Current position (bytes)= %p, len=%zu\n",
+			(const void *)(bs->token + offset), bs->len);
+	return H_MAKE_BYTES(bs->token + offset, bs->len);
+}
+
+/*
+ * Parser that yields the next 'n' bytes of the input as an HBytes token and
+ * skips over them.
+ */
+HParser *
+p_take__m(HAllocator *mm__, size_t n, struct Env *aux)
+{
+	HParser *skip, *bytes;
+	HBytes *bs;
+
+	/* dummy struct to hold the pair (input,n) */
+	bs = h_alloc(mm__, sizeof(HBytes));
+	bs->token = aux->input;
+	bs->len = n;
+
+	bytes = h_action__m(mm__, h_tell__m(mm__), act_take_bytes, bs);
+	skip  = h_skip__m(mm__, n * 8);
+
+	return h_left__m(mm__, bytes, skip);
+}
+
+
+// Parser for object streams
+HParser *p_objstm__m(HAllocator *, const Dict *);
+
+// Action for stream continuation
+HParsedToken *act_ks_value(const HParseResult *p, void *u);
+struct streamspec {
+	Dict *dict;		/* stream dictionary */
+	HParser *parser;	/* data parser */
+};
+
+
+
+/*
+ * ********************************************************************
+ * Start Catalog parsing
+ * ********************************************************************
+ */
+/*
+ * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
+ * and parse the result with 'p'.
+ *
+ * Without a /Filter entry the bytes are parsed directly.  Returns NULL when
+ * the filter is not a single name, is not supported, or the filter or parse
+ * fails.
+ */
+HParseResult *
+decode_contentstream(const Dict *d, HBytes b, HParser *p)
+{
+	HParseResult *(*filter)(const Dict *, HBytes, HParser *);
+	const Dict *parms = NULL;
+	const HParsedToken *v;
+	HParseResult *res = NULL;
+
+
+	/*
+	 * Check if there is additional information in the dictionary
+	 * that we should use to process the content stream
+	 *
+	 * If the data in the stream is encoded, a filter will be specified in
+	 * the dictionary that must be used to decode the data first
+	 *
+	 * TODO:: Handle arrays of filters (chained) and their decode parameters
+	 */
+	v = dictentry(d, "Filter"); // look for a filter
+
+	if (v != NULL) { // data is encoded
+
+
+		if (v->token_type != TT_BYTES) {
+			// XXX TT_SEQUENCE would be a filter chain; that’s not supported, yet.
+			// But it might also be something bogus, in which case we should fail.
+			return NULL;
+		}
+
+		if (bytes_eq(v->bytes, "FlateDecode"))
+			filter = FlateDecode;
+		else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+			filter = ASCIIHexDecode;
+		else if (bytes_eq(v->bytes, "ASCII85Decode"))
+			filter = ASCII85Decode;
+		else if (bytes_eq(v->bytes, "RunLengthDecode"))
+			filter = RunLengthDecode;
+		else if (bytes_eq(v->bytes, "LZWDecode"))
+			filter = LZWDecode;
+		else {		/* filter not supported */
+			fprintf(stderr, "decode_contentstream:: Unsupported Filter [%.*s]\n",
+					(int)v->bytes.len, v->bytes.token);
+			return NULL; /* Treat the stream as a byte array */
+		}
+		/* Check for parameters for the filter */
+		v = dictentry(d, "DecodeParms");
+		if (v && v->token_type == TT_Dict)
+			parms = v->user;
+
+		res = filter(parms, b, p);
+
+		/* Debug */
+		/* the ast is checked as well before it is dereferenced */
+		if (res && res->ast){
+			fprintf(stdout, "decode_contentstream: parsed token type is = %u\n", res->ast->token_type);
+		}
+	} /* The dictionary provided direction for processing the stream */
+
+	/*
+	 * It is possible that we should always process the stream as a content stream
+	 * But not yet sure that covers all case.
+	 */
+	else { // content stream is not encoded
+		res = h_parse(p, b.token, b.len);
+		if (res == NULL) { // Probably does not need to be flagged
+			fprintf(stderr, "decode_contentstream::Text String parse failed!!\n");
+		}
+	}
+
+	/*
+	 * There are other parameters that can be passed in the dictionary
+	 * They are not being handled currently
+	 */
+//	const int numOptKeys = 3;
+//	char *optionalKeys[3] = { "F", "FDecodeParms", "DL" };
+//	for (int i=0; i<numOptKeys; i++) {
+//		v = dictentry(d, optionalKeys[i]);
+//		if (v) fprintf(stderr, "decode_contentstream:: Unsupported Specifications [%s\n]", optionalKeys[i]);
+//	}
+	return res;
+}
+
+
+/*
+ * Continuation action for content streams: decodes and parses the stream
+ * bytes in p->ast using the dictionary and parser held in the streamspec
+ * 'u'.  Yields the decoded AST, or the undecoded bytes token when decoding
+ * returns NULL.
+ */
+HParsedToken *
+act_kcontentstream_value(const HParseResult *p, void *u)
+{
+	struct streamspec *spec = u;
+	HBytes bytes = H_CAST_BYTES(p->ast);
+	HParseResult *res;
+
+	/* decode and parse the stream data */
+	res = decode_contentstream(spec->dict, bytes, spec->parser);
+//	if (!res) {
+//		res = (HParseResult *)p;
+//	}
+
+	if (res) return (HParsedToken *)res->ast;
+	else return (HParsedToken *)p->ast;
+}
+
+
+
+
+
+const HParsedToken *
+parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p)
+{
+	HParseResult *res;
+	size_t def_nr, def_gen;
+
+	if (offset >= aux->sz) {
+		fprintf(stderr, "%s: position %zu (%#zx) for object %lu %lu is "
+		    "out of bounds\n", aux->infile, offset, offset, nr, gen);
+		return NULL;
+	}
+
+	if (p == NULL) {
+		fprintf(stderr, "parse_item: Attempt to use a NULL parser!\n");
+		return NULL;
+	}
+	fprintf(stdout, "\nparse_item:: Parsing reference = %lu, %lu, at offset = %zu (%#zx)\n",
+			nr, gen, offset, offset);
+	HParser *pItem = h_right(h_seek(offset * 8, SEEK_SET), p);
+	res = h_parse(pItem, aux->input, aux->sz);
+	if (res == NULL) {
 		fprintf(stderr, "%s: error parsing object %zu %zu at position "
 		    "%zu (%#zx)\n", aux->infile, nr, gen, offset, offset);
 		return NULL;
 	}
 	assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE);
-	/* res->ast = ((nr gen) obj) */
+
+
+//	size_t ntok
=res->ast->seq->used; +// assert(ntok==2 || ntok==3); def_nr = H_INDEX_UINT(res->ast, 0, 0); def_gen = H_INDEX_UINT(res->ast, 0, 1); if (def_nr != nr || def_gen != gen) { fprintf(stderr, "%s: object ID mismatch at position %zu " - "(%#zx): sought %zu %zu, found %zu %zu.\n", aux->infile, - offset, offset, nr, gen, def_nr, def_gen); + "(%#zx): sought %zu %zu, found %zu %zu.\n", aux->infile, + offset, offset, nr, gen, def_nr, def_gen); return NULL; } @@ -1428,10 +3713,12 @@ parse_obj(struct Env *aux, size_t nr, size_t gen, size_t offset) } const HParsedToken * -parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) +parse_objstm_item(struct Env *aux, size_t nr, size_t stm_nr, size_t idx, size_t *offset, HParser *p) { XREntry *ent; - const HParsedToken *stm; + const HParsedToken *stm = NULL; + + *offset = 0; // initialize the offset /* * acquire the stream object @@ -1448,8 +3735,15 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) case XR_INUSE: if (ent->n.gen != 0) return NULL; /* stream replaced */ - if (ent->obj == NULL) - ent->obj = parse_obj(aux, stm_nr, 0, ent->n.offs); + if (ent->obj == NULL) { + /* + * decode the stream and find the target object in it + */ + fprintf(stdout, "\nparse_objstm_item:: Parsing object stream id = %lu, %d, at offset = %zu (%#zx)\n", + stm_nr, 0, ent->n.offs, ent->n.offs); + ent->obj = parse_item(aux, stm_nr, 0, ent->n.offs, p); + *offset = ent->n.offs; + } break; case XR_OBJSTM: return NULL; /* invalid: nested streams */ @@ -1460,32 +3754,61 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs); return NULL; } + else { + /* + * decode the stream and find the target object in it + */ + // XXX debug + fprintf(stdout, "\nparse_objstm_item:: Type of object looked up = %u at offset = %zu (%#zx)\n", + stm->token_type, ent->n.offs, ent->n.offs); + h_pprintln(stdout, ent->obj); + // XXX debug + // get the object at index + + 
stm = H_INDEX_TOKEN(stm, 1); // the first field is the stream dictionary + if (stm->token_type == TT_HParseResult){ + const HParseResult *res = H_CAST(HParseResult, stm); + stm = res->ast; + // Now get the index if the index is valid + const Objstm *ostm = H_CAST(Objstm, stm); + if ( (idx>=0) && (idx < ostm->numObjs)) { + stm = ostm->tok[idx].obj; + } + else + return NULL; + } + } - /* - * decode the stream and find the target object in it - */ - return NULL; // XXX + fprintf(stdout, "\nparse_objstm_item:: Returning token of type = %u, \n", stm->token_type); + h_pprintln(stdout, stm); // XXX debug + return stm; } + const HParsedToken * -resolve(struct Env *aux, const HParsedToken *v) +resolve_item(struct Env *aux, const HParsedToken *v, size_t *offset, HParser *p) { XREntry *ent = NULL; Ref *r; + /* direct objects pass through */ if (v == NULL || v->token_type != TT_Ref) return v; /* we are looking at an indirect reference */ - r = v->user; + *offset = 0; // initialize the offset + r = v->user; /* find the xref entry for this reference */ + fprintf(stdout, "\nresolve_item:: Looking up xref = %lu, %lu\n", r->nr, r->gen); ent = lookup_xref(aux, r->nr, r->gen); if (ent == NULL) - return NULL; /* obj not found */ - if (ent->obj != NULL) - return resolve(aux, ent->obj); + return NULL; /* obj not found -- xref error */ + if (ent->obj != NULL) { + fprintf(stdout, "\nresolve_item:: ent->obj->token_type = %u\n", ent->obj->token_type); + return resolve_item(aux, ent->obj, offset, p); + } /* parse the object and memoize */ ent->obj = v; /* break loops */ @@ -1496,536 +3819,1120 @@ resolve(struct Env *aux, const HParsedToken *v) case XR_INUSE: if (ent->n.gen != r->gen) return NULL; /* obj nr reused */ - ent->obj = parse_obj(aux, r->nr, r->gen, ent->n.offs); + fprintf(stdout, "resolve_item:: parse object at offset = %lu\n", ent->n.offs); + ent->obj = parse_item(aux, r->nr, r->gen, ent->n.offs, p); + *offset = ent->n.offs; break; case XR_OBJSTM: if (r->gen != 0) return NULL; /* 
invalid entry! */ - ent->obj = parse_objstm_obj(aux, r->nr, ent->o.stm, ent->o.idx); + fprintf(stdout, "resolve_item:: parse object stream - oid = %lu, stm_oid = %lu, stm_idx = %lu\n", + r->nr, ent->o.stm, ent->o.idx); + ent->obj = parse_objstm_item(aux, r->nr, ent->o.stm, ent->o.idx, offset, p); break; } - return resolve(aux, ent->obj); + // DEBUG + if (ent->obj) + fprintf (stdout, "resolve_item: Recursive call to resolve - token_type = %u at offset = %zu (%#zx)\n", + ent->obj->token_type, *offset, *offset); + return resolve_item(aux, ent->obj, offset, p); } -/* - * stream object handling incl. filters and cross-reference streams - */ - -#include <limits.h> /* INT_MAX */ -#include <stdlib.h> /* abs() */ -#include <zlib.h> -#include <err.h> -struct predictor { - /* parameters */ - int num; /* default: 1 (no prediction) */ - int colors; /* default: 1 */ - int bpc; /* bits per component; default: 8 */ - int columns; /* default: 1 */ +bool +is_parent(Dict *dict, const HParsedToken *expected) +{ + const HParsedToken *item; + Ref *ref, *pRef; + bool res=false; - int rowsz; /* bytes per row = ceil(colors * bpc * columns / 8) */ + item = dictentry(dict, "Parent"); + if (item == NULL) { + res = item == expected; + } + else if (item->token_type == TT_Ref) { + ref = H_CAST(Ref, item); + if (expected == NULL) { + fprintf(stderr, "is_parent: Inconsistent parent field=<%zu, %zu>, expected = NULL!\n", + ref->nr, ref->gen); + } + else { + assert(expected->token_type == TT_Ref); + pRef = (Ref *)expected->user; + res = (ref->nr == pRef->nr) && (ref->gen == pRef->gen); + } + } + else { + fprintf(stderr, "is_parent: Unexpected token type = %u!\n", item->token_type); + } - /* state */ - HSuspendedParser *sp; - uint8_t (*predfun)(int, int, int); - uint8_t *buf; /* previous row of input */ - uint8_t c; /* byte 'c' (upper left) */ - int x; /* current position */ -#ifndef ITERATIVE // XXX - uint8_t *out; - size_t nout; -#endif -}; + return res; +} -int -depred_none(struct predictor *pred, 
uint8_t *inp, size_t sz) +bool +has_value(Dict *dict, char *fn, char *value) { -#ifdef ITERATIVE // XXX - return h_parse_chunk(pred->sp, inp, sz); -#else - pred->out = realloc(pred->out, pred->nout + sz); - assert(pred->out != NULL); - memcpy(pred->out + pred->nout, inp, sz); - pred->nout += sz; - return false; -#endif + const HParsedToken *item; + bool res=false; + + item = dictentry(dict, fn); + if ( (item != NULL) && (item->token_type == TT_BYTES) && + (bytes_eq(item->bytes, value)) ) { + res = true; + } + + + // DEBUG + if (item == NULL) { + fprintf(stderr, "has_value: No such field (%s) in dictionary!\n", fn); + } + else if (item->token_type != TT_BYTES) { + fprintf(stderr, "has_value: Field:%s has token type %u for value!\n", + fn, item->token_type); + } + // DEBUG + + + return res; } -uint8_t pp_none(int a, int b, int c) { return 0; } -uint8_t pp_sub(int a, int b, int c) { return a; } -uint8_t pp_up(int a, int b, int c) { return b; } -uint8_t pp_avg(int a, int b, int c) { return (a + b) / 2; } -uint8_t -pp_paeth(int a, int b, int c) + +Dict * +get_fontdict(const HParsedToken *obj, struct Env* aux) { - int p = a + b - c; - int pa = abs(p - a); - int pb = abs(p - b); - int pc = abs(p - c); + const HParsedToken *item; + Ref *ref; + Dict *dict = NULL; + Objstm *stm = NULL; + + + assert(obj->token_type == TT_Ref); + ref = H_CAST(Ref, obj); + fprintf(stdout, "\n\nget_fontdict: Ref = [%lu, %lu]\n\n", ref->nr, ref->gen); + item = resolve(aux, obj); + if ( (item) && (item->token_type == TT_Dict) ) { + dict = H_CAST(Dict, item); + if (! 
has_value(dict, "Type", "Font")) + dict = NULL; + } + else if ( (item) && (item->token_type == TT_Objstm) ) { + stm = H_CAST(Objstm, item); + for (int i=0; i<stm->numObjs; i++) { + if ( (stm->tok[i].oid.nr == ref->nr) && + (stm->tok[i].oid.gen == ref->gen) ) { + if (stm->tok[i].obj->token_type != TT_Dict) { + fprintf(stdout, "\nget_fontdict:Expected Dictionary, Got a token of type=%u\n", + stm->tok[i].obj->token_type); + dict = NULL; + } + else { + dict = H_CAST(Dict, stm->tok[i].obj); + if (! has_value(dict, "Type", "Font")) + dict = NULL; + } + break; + } + } + } - if (pa <= pb && pa <= pc) return a; - if (pb <= pc) return b; - return c; + + return dict; } -int -depred_png(struct predictor *pred, uint8_t *inp, size_t sz) +const HParsedToken * +get_dictoftype( + const HParsedToken *obj, + const HParsedToken *pRefT, + char *value, + struct Env *aux) { - /* NB: - * at this point, the specific value of pred->num no longer matters. - * the PNG predictor tags each row with the function used for that row - * and decoding always follows the tag. 
- */ - static uint8_t (*predfuns[])(int, int, int) = - {pp_none, pp_sub, pp_up, pp_avg, pp_paeth}; - - bool done = false; - int bpp; + Dict *dict = NULL; + const HParsedToken *tok; + Objstm *stm = NULL; + + if (obj->token_type == TT_Dict) { + dict = H_CAST(Dict, obj); + if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) + tok = obj; + else + dict = NULL; + } + else if (obj->token_type == TT_Objstm) { + stm = H_CAST(Objstm, obj); + for (int i=0; i<stm->numObjs; i++) { + h_pprintln(stdout, stm->tok[i].obj); + size_t ioff = 0; + const HParsedToken *sitem = resolve_item(aux, stm->tok[i].obj, &ioff, p_objdef); + if ((sitem) && (sitem->token_type == TT_Dict)) { + dict = H_CAST(Dict, sitem); + if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) { + tok = sitem; + break; + } + else + dict = NULL; + } + } + } + else { + fprintf(stdout, "get_dictoftype: token type not yet handled: %u\n", + obj->token_type); + fprintf(stdout, "get_dictoftype: Possibly needed for CMAPS\n"); + h_pprintln(stdout, obj); + } - bpp = (pred->colors * pred->bpc + 7) / 8; /* bytes per pixel */ - assert (bpp > 0); + if (dict == NULL) + tok = NULL; - for (size_t i=0; i < sz && !done; i++) { - int x = pred->x; - int a = x<bpp ? 
0 : pred->buf[x-bpp]; /* left */ - int b = pred->buf[x]; /* up */ - int c = pred->c; /* up left */ + // DEBUG + if (pRefT) { + fprintf(stdout, "\nget_dictoftype: Parent = "); + pp_ref(stdout, pRefT, 0, 0); + } + else + fprintf(stdout, "get_dictoftype: Parent = NULL"); + if (tok) { + fprintf(stdout, "\nget_dictoftype: Type = %s\n", value); + pp_dict(stdout, tok, 0, 0); + } + else { + fprintf(stdout, "\nget_dictoftype: Null dictionary of Type = %s\n", value); + } + return tok; +} - if (pred->predfun == NULL) { /* we are before a new row */ - /* select predictor function */ - if (inp[i] > 4) { - fprintf(stderr, "unknown PNG predictor %d\n", - (int)inp[i]); - return -1; - } - pred->predfun = predfuns[inp[i]]; - /* consume the tag */ - if (++i == sz) - break; - } +/* + * This continuation takes the content stream, decodes it if necessary and returns + * the byte stream for concatenation with other byte streams priot to test extraction. + * It is very similar to kstream in approach. + */ +HParser * +kbyteostream(HAllocator *mm__, const HParsedToken *x, void *env) +{ - /* undo the prediction and save the decoded value */ - pred->buf[x] = inp[i] + pred->predfun(a, b, c); + struct Env *aux = env; + HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + Dict *dict = H_CAST(Dict, dict_t); + const HParsedToken *v = NULL; + HParser *bytes_p, *dict_p, *value_p; + struct streamspec *spec; + size_t sz=0, nOffset=0; - /* advance to the right */ - pred->c = b; - pred->x = ++x; + fprintf(stdout, "kbyteostream: dictionary\n"); + pp_dict(stdout, dict_t, 5, 0); - /* when row complete, pass it to parser and start a new row */ - if (x == pred->rowsz) { -#ifdef ITERATIVE // XXX - done = h_parse_chunk(pred->sp, pred->buf, pred->rowsz); -#else - pred->out = realloc(pred->out, pred->nout + pred->rowsz); - assert(pred->out != NULL); - memcpy(pred->out + pred->nout, pred->buf, pred->rowsz); - pred->nout += pred->rowsz; -#endif - pred->c = pred->x = 0; - if (pred->num != 2) /* support for 8-bpc TIFF */ - 
pred->predfun = NULL; - } + /* look for the Length entry -- could be a reference */ + v = dictentry(dict, "Length"); + v = resolve_item(aux, v, &nOffset, p_objdef); /* resolve indirect references */ + if (v == NULL || v->token_type != TT_SINT || v->sint < 0) { + if (v == NULL) + fprintf(stderr, "kbyteostream: stream /Length missing\n"); + else if (v -> token_type != TT_SINT) + fprintf(stderr, "kbyteostream: stream /Length not an integer\n"); + else if (v < 0) + fprintf(stderr, "kbyteostream: stream /Length negative\n"); + + //h_pprintln(stderr, p); // XXX debug + return p_fail; } - return done; + sz = (size_t)v->sint; + + dict_p = p_return__m(mm__, dict_t); + bytes_p = p_take__m(mm__, sz, aux); + + spec = h_alloc(mm__, sizeof(struct streamspec)); + spec->dict = dict; + spec->parser = p_bytestream; + + value_p = h_action__m(mm__, bytes_p, act_ks_value, spec); + + return h_sequence__m(mm__, dict_p, value_p, NULL); + } -HParseResult * -FlateDecode(const Dict *parms, HBytes b, HParser *p) + + + + +/* + * This continuation takes the content stream and processes it for test extraction. + * It is very similar to kstream in approach. It decodes and extracts the stream contents + * and + * It does not consume the string and returns the token as the output. + * + * x = (txtobj ...) 
+ */ +HParser * +kcontentstream(HAllocator *mm__, const HParsedToken *x, void *env) { - size_t const BUFSIZE = 8 * 1024; - uint8_t *buf; -#ifdef ITERATIVE // XXX - HSuspendedParser *sp; -#endif - HParseResult *res; - const HParsedToken *v; - size_t sz; - int done; - z_stream strm = {0}; - int ret; - struct predictor pred = {1, 1, 8, 1}; - int (*depredict)(struct predictor *, uint8_t *, size_t); - /* set up the predictor (if any) */ - #define SETPARM(VAR,STR) do { \ - v = dictentry(parms, (STR)); \ - if (v != NULL) { \ - if (v->token_type != TT_SINT || v->sint < 0) \ - return NULL; \ - VAR = v->sint; \ - } } while(0) - SETPARM(pred.num, "Predictor"); - SETPARM(pred.colors, "Colors"); - SETPARM(pred.bpc, "BitsPerComponent"); - SETPARM(pred.columns, "Columns"); - #undef SETPARM - if (pred.num == 1) - depredict = depred_none; + struct Env *aux = env; + HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); + Dict *dict = H_CAST(Dict, dict_t); + const HParsedToken *v = NULL; + HParser *bytes_p, *dict_p, *value_p; + struct streamspec *spec; + size_t sz=0, nOffset=0; + + fprintf(stdout, "kcontentstream: dictionary\n"); + pp_dict(stdout, dict_t, 5, 0); + + /* look for the Length entry -- could be a reference */ + v = dictentry(dict, "Length"); + v = resolve_item(aux, v, &nOffset, p_objdef); /* resolve indirect references */ + if (v == NULL || v->token_type != TT_SINT || v->sint < 0) { + if (v == NULL) + fprintf(stderr, "kcontentstream: stream /Length missing\n"); + else if (v -> token_type != TT_SINT) + fprintf(stderr, "kcontentstream: stream /Length not an integer\n"); + else if (v < 0) + fprintf(stderr, "kcontentstream: stream /Length negative\n"); + + //h_pprintln(stderr, p); // XXX debug + return p_fail; + } + + sz = (size_t)v->sint; + + dict_p = p_return__m(mm__, dict_t); + bytes_p = p_take__m(mm__, sz, aux); + + spec = h_alloc(mm__, sizeof(struct streamspec)); + spec->dict = dict; + + v = dictentry(dict, "Type"); + if (v == NULL) // XXX -> custom type + spec->parser = 
p_textstream; + else if ( (v->token_type == TT_BYTES) && bytes_eq(v->bytes, "ObjStm") ) + spec->parser = p_objstm__m(mm__, dict); else { - if (pred.num >= 10 && pred.num <= 15) - depredict = depred_png; - else if (pred.num == 2) { - /* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */ - if (pred.bpc == 8) { - pred.predfun = pp_sub; /* predict left */ - depredict = depred_png; - } else { - // XXX add general TIFF predictor (bpc != 8) - fprintf(stderr, "FlateDecode: /Predictor %d " - "not supported for /BitsPerComponent %d\n", - pred.num, pred.bpc); - return NULL; + fprintf(stdout, "kcontentstream: Not a text or object stream!\n"); + return p_fail; + } + + value_p = h_action__m(mm__, bytes_p, act_kcontentstream_value, spec); + + return h_sequence__m(mm__, dict_p, value_p, NULL); + +} + +const +HParsedToken *create_strmdict(HArena *arena, size_t len) +{ + uint8_t *buf = NULL; + const HParsedToken *tok = NULL; + const HParseResult *res = NULL; + const int bufSz = 48; // supports more than gigabytes of stream length + + buf = (uint8_t*) h_arena_malloc(arena, bufSz); + assert(buf); + sprintf((char*)buf, "<< /Length %ld >>", len); + res = h_parse(p_dict, buf, strlen((char*)buf)); + assert (res && res->ast); + tok = res->ast; + return tok; +} + +Fontinfo_T * +getFontinfo(const Dict *fontdict, char *name, struct Env *aux) +{ + + assert (fontdict); + Fontinfo_T *fontinfo = h_arena_malloc(fontdict->arena, sizeof(Fontinfo_T)); + fontinfo->name = NULL; + fontinfo->type = NULL; + fontinfo->basefont = NULL; + fontinfo->encoding = NULL; + fontinfo->descriptor = NULL; + fontinfo->toUnicode = NULL; + fontinfo->descendantFonts = NULL; + + assert(fontinfo); + const HParsedToken *item = dictentry(fontdict, "Name"); + if (item) { + assert(item->token_type == TT_BYTES); + if (bytes_eq(item->bytes, name)) { + fontinfo->name = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); + memcpy(fontinfo->name, (char *)item->bytes.token, item->bytes.len); + 
fontinfo->name[item->bytes.len] = '\0'; + fprintf(stdout, "getFontinfo: Subtype = %s\n", fontinfo->type); + } + } + item = dictentry(fontdict, "Subtype"); + if (item) { + assert (item->token_type == TT_BYTES); + fontinfo->type = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); + memcpy(fontinfo->type, (char *)item->bytes.token, item->bytes.len); + fontinfo->type[item->bytes.len] = '\0'; + fprintf(stdout, "getFontinfo: Subtype = %s\n", fontinfo->type); + } + item = dictentry(fontdict, "BaseFont"); + if (item) { + assert (item->token_type == TT_BYTES); + fontinfo->basefont = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); + memcpy(fontinfo->basefont, (char *)item->bytes.token, item->bytes.len); + fontinfo->basefont[item->bytes.len] = '\0'; + fprintf(stdout, "getFontinfo: Basefont = %s\n", fontinfo->basefont); + } + size_t offset; // This is available if needed + item = dictentry(fontdict, "Encoding"); + if (item) { + // dereference it if it is a reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // TODO: Failure ==> xref error -- Figure out how to handle + if (item->token_type == TT_BYTES) { + fontinfo->encoding = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); + memcpy(fontinfo->encoding, (char *)item->bytes.token, item->bytes.len); + fontinfo->encoding[item->bytes.len] = '\0'; + fprintf(stdout, "getFontinfo: encoding = %s at offset %zu (%#zx)\n", + fontinfo->encoding, offset, offset); + } + else if (item->token_type == TT_Dict) + { + pp_dict(stdout, item, 0, 0); + + const Dict *encodingDict = H_CAST(Dict, item); + item = dictentry(encodingDict, "BaseEncoding"); + if (item) { + fontinfo->encoding = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); + memcpy(fontinfo->encoding, (char *)item->bytes.token, item->bytes.len); + fontinfo->encoding[item->bytes.len] = '\0'; + fprintf(stdout, "getFontinfo: encoding = %s\n", fontinfo->encoding); + } + } + else { + 
fprintf(stdout, "\nUnexpected token type in parsing font -Encoding- attribute -" + "token_type = %u\n", item->token_type); } - } else { - fprintf(stderr, "FlateDecode: /Predictor %d" - " not supported\n", pred.num); - return NULL; } + } - /* allocate row buffer */ - if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) { - fprintf(stderr, "FlateDecode: overflow\n"); - return NULL; + item = dictentry(fontdict, "FontDescriptor"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // TODO: Failure ==> xref error -- Figure out how to handle + fprintf(stdout, "getFontinfo: FontDescriptor item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "FontDescriptor", aux); + if (item) { + fontinfo->descriptor = item; + fprintf(stdout, "getFontinfo: FontDescriptor at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } } - pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; - pred.buf = calloc(1, pred.rowsz); - if (pred.buf == NULL) - err(1, "FlateDecode"); } - /* set up zlib */ - // XXX pass our allocator to zlib - ret = inflateInit(&strm); - if (ret != Z_OK) - errx(1, "inflateInit: %s (%d)", strm.msg, ret); - buf = malloc(BUFSIZE); - if (buf == NULL) - err(1, "FlateDecode"); + item = dictentry(fontdict, "ToUnicode"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // TODO: Failure ==> xref error -- Figure out how to handle + fprintf(stdout, "getFontinfo: toUnicode item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "ToUnicode", aux); + if (item) { + fontinfo->toUnicode = item; + fprintf(stdout, "getFontinfo: toUnicode at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } + } + } -#ifdef ITERATIVE // XXX - /* initialize target parser */ - sp = h_parse_start(p); - assert(sp != NULL); - pred.sp = sp; 
-#endif + item = dictentry(fontdict, "DescendantFonts"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // TODO: Failure ==> xref error -- Figure out how to handle + fprintf(stdout, "getFontinfo: descendantFonts item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "DescendantFonts", aux); + if (item) { + fontinfo->descendantFonts = item; + fprintf(stdout, "getFontinfo: descendantFonts at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } + } + else { // xref error + goto end; + } + } - done = 0; - strm.avail_in = b.len; - strm.next_in = (unsigned char *)b.token; - do { - strm.avail_out = BUFSIZE; - strm.next_out = buf; + end: + return fontinfo; +} - ret = inflate(&strm, Z_NO_FLUSH); - if (ret != Z_STREAM_END && ret != Z_OK) { - fprintf(stderr, "inflate: %s (%d)\n", strm.msg, ret); - break; +Fontinfo_T * +lookup_font(TextState_T *state, struct Env *aux) +{ + const HParsedToken *item = NULL; + Dict *fontlist; // font list dictionary in page + Fontinfo_T *fontinfo = NULL; + + PtNode_T *page = state->page; + assert ( (page->type == PG_NODE) || (page->type == XO_NODE) ); + struct TextEntry_S *fentry = state->font; + if (fentry) { + assert ( fentry->type == TS_Tf); + struct fontref *fr = &fentry->fref; + char *fn = fr->fn; + Dict *dict = NULL; + if (page->pgRsrc && (page->pgRsrc->numFonts > 0)) { + fontlist = H_CAST(Dict, page->pgRsrc->fonts); + assert(page->pgRsrc->numFonts == fontlist->used); + for (int i=0; i< page->pgRsrc->numFonts; i++) { + item = dictentry(fontlist, fn); // look for the font name in the dictionary */ + if (item) { + dict = get_fontdict(item, aux); + if (dict) { + fontinfo = getFontinfo(dict, fr->fn, aux); + fprintf(stdout, "\n\nlookup_font: fontinfo = %p\n\n", (void*)fontinfo); + } + } + + } + } + else { // inherit + // DEBUG + fprintf(stdout, "\n\nlookup_font: Font Resource not found for FontState:\n"); + 
pp_fontstate(stdout, state); + fprintf(stdout, "\n\nlookup_font: Inheritance not yet supported!\n\n"); } + } + return (fontinfo); +} - sz = BUFSIZE - strm.avail_out; - done = depredict(&pred, buf, sz); - } while (done == 0 && ret == Z_OK); +void parse_fonts(const HParsedToken *dict_t, RsrcDict_T *pgRsrc, struct Env *aux) +{ + Dict *fontdict = H_CAST(Dict, dict_t); + const HParsedToken *item = NULL; + size_t offset=0; + + item = dictentry(fontdict, "Font"); + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // TODO: Failure ==> xref error -- Figure out how to handle + assert(item->token_type == TT_Dict); + Dict *fontlist = H_CAST(Dict, item); + fprintf(stdout, "parse_fonts: Num fonts used in page = %lu \n", fontlist->used); + pp_dict(stdout, item, 0, 0); + if (pgRsrc->fonts) { + fprintf(stderr, "\n\nparse_fonts: Attempt to add fonts -- Supported??\n\n"); + } + else { + pgRsrc->fonts = item; + pgRsrc->numFonts = fontlist->used; + fprintf(stdout, "\n\nparse_fonts: Number of fonts used = %lu\n\n", pgRsrc->numFonts); + } + } +} -#ifdef ITERATIVE // XXX - res = h_parse_finish(sp); - // XXX always return NULL on error? 
-#else - res = h_parse(p, pred.out, pred.nout); - free(pred.out); -#endif - inflateEnd(&strm); - free(pred.buf); - free(buf); +void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux); - if (done == -1) - return NULL; - return res; + +XoNode_T* +create_XoNode(HArena *arena, Catalog_T *catalog) +{ + XoNode_T *node = h_arena_malloc(arena, sizeof(XoNode_T)); + node->name = NULL; + node->node = NULL; + node->next = NULL; + if (catalog->xoHead == NULL) + catalog->xoHead = node; + if (catalog->xoTail == NULL) + catalog->xoTail = node; + else { + catalog->xoTail->next = node; + catalog->xoTail = node; + } + catalog->xoCount += 1; + + return node; } -/* LZW helpers */ -typedef struct +// XObject resources can be recursively specified +void parse_xobject( + const HParsedToken *dict_t, + PtNode_T *parent, + RsrcDict_T *pgRsrc, + struct Env *aux) { - uint8_t *lzw_buf; - size_t total_buf_size; - size_t write_head; - size_t write_tail; - uint8_t write_checksum; - size_t eof_loc; - HBytes *input_stream; - size_t read_head; - size_t read_tail; - uint8_t read_checksum; -} lzwspec; + Dict *xodict = H_CAST(Dict, dict_t); + const HParsedToken *item = NULL; // generic token + const HParsedToken *tok = NULL; // resolved token + const Dict *xobj_d = NULL; // dictionary associated with reference token + const HParsedToken *xobj_t = NULL; // xobject resource token + size_t offset = 0; + + + item = dictentry(xodict, "XObject"); + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { // test for no XObject key (TODO: Failure ==> xref error) + assert(item->token_type == TT_Dict); + Dict *xolist = H_CAST(Dict, item); + + // DEBUG + fprintf(stdout, "\nparse_xobject: Node for Parent = "); + if (parent->me) pp_ref(stdout, parent->me, 0, 0); + fprintf(stdout, "\nOld XO Count = %lu, Num xobjects used in page = %lu \n", + aux->catalog.xoCount, xolist->used); + pp_dict(stdout, item, 0, 0); + // DEBUG + + aux->catalog.xoCount += xolist->used; + // 
work on each element of the dictionary + for (int i=0; i<xolist->used; i++) { + XoNode_T *xobj_r = create_XoNode(xodict->arena, &aux->catalog); + HBytes k = H_INDEX_BYTES(xolist->elements[i], 0); + xobj_r->name = h_arena_malloc(xodict->arena, k.len+1); + + memcpy(xobj_r->name, k.token, k.len); + xobj_r->name[k.len] = '\0'; + const HParsedToken *ref = H_INDEX_TOKEN(xolist->elements[i], 1); + assert(ref->token_type == TT_Ref); + xobj_r->node = h_arena_malloc(xodict->arena, sizeof(PtNode_T)); + xobj_r->node->type = XO_NODE; + xobj_r->node->parent = parent->me; + xobj_r->node->pgRsrc = pgRsrc; + xobj_r->node->me = ref; + xobj_r->node->offset = 0; // TODO: get the offset to the stream + tok = resolve_item(aux, ref, &xobj_r->node->offset, p_objdef); + + if (tok == NULL) continue; + + // DEBUG + fprintf(stdout, "\nparse_xobject: XObject Reference = : "); + pp_ref(stdout, ref, 0, 0); + fprintf(stdout, "\n"); + h_pprintln(stdout, tok); + // DEBUG + + // tok can be an image dictionary -- which we are ignoring + if (tok->token_type == TT_SEQUENCE) + xobj_t = H_INDEX_TOKEN(tok, 0); // expecting an XObject dictionary token + if (xobj_t == NULL) continue; + + xobj_t = get_dictoftype(xobj_t, NULL, "XObject", aux); // test it + if (xobj_t == NULL) continue; + + xobj_d = H_CAST(Dict, xobj_t); + item = dictentry(xobj_d, "Subtype"); + if (item == NULL || item->token_type != TT_BYTES) + continue; // no "Subtype" field + + /* + * TODO:: external objects can be images, forms, or postscript objects + * We are only handling forms at the moment + */ + if (bytes_eq(item->bytes, "Form")) { + fprintf(stdout, "\n\nparse_xobject:: Parsing Form XObject\n"); + const HParsedToken *xoRsrc_t; + const HParsedToken *rsrcdict_t; + size_t offset = 0; + + xoRsrc_t = dictentry(xobj_d, "Resources"); + if (xoRsrc_t) { + fprintf(stdout, "\n\nparse_xobject: Found resources in node\n"); + rsrcdict_t = resolve_item(aux, xoRsrc_t, &offset, p_objdef); + // DEBUG + fprintf(stdout, "\nparse_xobject: Resource token 
type = %u\n",rsrcdict_t->token_type); + h_pprintln(stdout, rsrcdict_t); + + parse_rsrcdict(xodict->arena, rsrcdict_t, xobj_r->node, aux); + + // set the text state to this xobject + // parse the text stream, which is field 2 of the sequence + aux->tstate.page = xobj_r->node; + xobj_t = H_INDEX_TOKEN(tok, 1); // expecting an HParseResult token + const HParseResult *res = H_CAST(HParseResult, xobj_t); + // DEBUG + fprintf(stdout, "\nparse_xobject: Byte Stream = : "); + h_pprintln(stdout, res->ast); + + HBytes stm = H_CAST_BYTES(res->ast); + res = h_parse(p_textstream, stm.token, stm.len); + + if (res) { // text found in stream + // DEBUG + fprintf(stdout, "\nparse_xobject: Parsing text : "); + h_pprintln(stdout, res->ast); + +// xobj_r->node->xn.dict = 0; + xobj_r->node->xn.textStream = res->ast; + } + } + } + } + } +} +void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux) +{ + RsrcDict_T *rsrc = NULL; + + + // Process the dictionary + if ( (dict_t->token_type == TT_Dict) || (dict_t->token_type == TT_Objstm) ) { + rsrc = h_arena_malloc(arena, sizeof(RsrcDict_T)); + rsrc->resources = dict_t; + rsrc->fonts = NULL; + rsrc->numFonts = 0; + rsrc->xobj = NULL; +// rsrc->seenCmaps = NULL; +// rsrc->numCmapsSeen = 0; + } + else { + fprintf (stderr, "\nparse_rsrcdict: What token type is this? 
- %u\n", + dict_t->token_type); + } -lzwspec *cur_lzw_spec; + // Resource is a simple dictionary + if (dict_t->token_type == TT_Dict) { + // DEBUG + fprintf(stdout, "\nparse_rsrcdict: Simple dictionary:\n"); + h_pprintln(stdout, dict_t); -/* used by write_lzw_buffer to get more space for decoding if needed */ -void -grow_lzw_buffer(size_t amount) -{ - uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); - if(ret_buf != NULL) - { - cur_lzw_spec->total_buf_size += amount; - cur_lzw_spec->lzw_buf = ret_buf; + parse_fonts(dict_t, rsrc, aux); + parse_xobject(dict_t, pgNode, rsrc, aux); } - else - { - fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); - return; + else if (dict_t->token_type == TT_Objstm) { + const Objstm *strmc = H_CAST(Objstm, dict_t); + fprintf(stdout, "\nparse_rsrcdict: stream object -numObjs = %lu\n", strmc->numObjs); + h_pprintln(stdout, dict_t); + + for (int i=0; i<strmc->numObjs; i++) { + fprintf(stdout, "\nparse_rsrcdict: oid = [%zu, %zu]\n", + strmc->tok[i].oid.nr, strmc->tok[i].oid.gen); + if (strmc->tok[i].obj->token_type == TT_Dict) { + parse_fonts(strmc->tok[i].obj, rsrc, aux); + parse_xobject(strmc->tok[i].obj, pgNode, rsrc, aux); + } + } } + + pgNode->pgRsrc = rsrc; + return; + } -lzwspec * -new_lzw_spec(HBytes *bytes) +void pp_pgrsrc(FILE *stream, const RsrcDict_T *pgRsrc) { - size_t const BUFSIZE = sizeof(uint8_t) * 1024; - lzwspec *ret = malloc(sizeof(lzwspec)); - memset(ret, 0, sizeof(lzwspec)); - ret->input_stream = bytes; - ret->lzw_buf = malloc(BUFSIZE); - ret->total_buf_size = BUFSIZE; - return ret; + if (pgRsrc) { + fprintf(stream, "\npp_pgrsrc: Num fonts used in this page = %lu\n", pgRsrc->numFonts); + fprintf(stream, "pp_pgrsrc: Resources\n"); + if (pgRsrc->resources) h_pprintln(stream, pgRsrc->resources); + fprintf(stream, "pp_pgrsrc: Fonts\n"); + if (pgRsrc->fonts) h_pprintln(stream, pgRsrc->fonts); + fprintf(stream, "pp_pgrsrc: XObjects\n"); + if (pgRsrc->xobj) 
h_pprintln(stream, pgRsrc->xobj); + } } -void -delete_lzw_spec(lzwspec *spec) +void pp_ptnode(FILE *stream, const PtNode_T *node) { - free(spec->lzw_buf); - free(spec); + fprintf(stream, "\nPage Tree Node Info:\n"); + fprintf(stream, "pp_ptnode: parent = "); + if (node->parent) h_pprintln(stream, node->parent); + fprintf(stream, "\npp_ptnode: me = "); + if (node->me) h_pprintln(stream, node->me); + if (node->pgRsrc) pp_pgrsrc(stream, node->pgRsrc); } -void -bind_lzw_spec(lzwspec *spec) +void parse_pagenode ( + struct Env *aux, + PtNode_T *myNode, + const HParsedToken *myRef, // my page node reference + const Dict *myDict, // my page node specification + const HParsedToken *parent, + HArena *arena + ) { - cur_lzw_spec = spec; -} + const HParsedToken *item = NULL; + const HParsedToken *contents_t = NULL; // dictionary token + Ref *contents_r = NULL; + const HParsedToken *contents = NULL; // resolved token + const HParsedToken *entry = NULL; + const HParsedToken *rsrcdict_t = NULL; + size_t nOffset = 0; -#include "lzw-lib.h" + // DEBUG + fprintf(stdout, "\nparse_pagenode: parsing Page Node = "); + pp_ref(stdout, myRef, 0, 0); -/* Buffer writer function for the lzw-ab implementation, with a fixed signature. - * Although the type is defined as int, it is expected to write one byte at a time. - * Modifies cur_lzw_spec. 
Set up the lzw spec to use with bind_lzw_spec() */ -void -write_lzw_buffer(int value) -{ - size_t const BUFSIZE = sizeof(uint8_t) * 1024; + // set some global state variables + aux->tstate.page = myNode; + myNode->type = PG_NODE; + myNode->me = myRef; + myNode->pn.dict = myDict; - if(!cur_lzw_spec->lzw_buf) - { - fprintf(stderr, "LZWDecode: lzw_buf is null!"); - assert(cur_lzw_spec->lzw_buf != NULL); + + + item = dictentry(myDict, "Parent"); + assert(item->token_type == TT_Ref); + if ( !( ( ((Ref*)item->user)->nr == ((Ref*)parent->user)->nr ) && + ( ((Ref*)item->user)->gen == ((Ref*)parent->user)->gen ) ) ) { + fprintf(stderr, "parse_pagenode: Inconsistent parent pointer [p = %p]!\n", + (void *)item); + // should this just be a warning? + goto end; + } + myNode->parent = item; + + // Hold on to the Resources dictionary + // This dictionary may be empty + // If there is no dictionary ==> inherit resources from parent + myNode->pgRsrc = NULL; + item = dictentry(myDict, "Resources"); + if (item) { + fprintf(stdout, "\n\nparse_pagenode: Found resources in node\n"); + rsrcdict_t = resolve(aux, item); + fprintf(stdout, "\nparse_pagenode: Resource token type = %u\n",rsrcdict_t->token_type); + parse_rsrcdict(arena, rsrcdict_t, myNode, aux); + pp_ptnode(stdout, myNode); } - assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size); - if (value == EOF) { - cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value; - cur_lzw_spec->eof_loc = cur_lzw_spec->write_head; - cur_lzw_spec->write_head++; - return; - } + // Process the contents stream or array + contents_t = dictentry(myDict, "Contents"); + if (contents_t == NULL) { + fprintf(stderr, "parse_pagenode: Page node without contents!\n"); + goto end; + } + else if (contents_t->token_type == TT_Ref) { + contents_r = H_CAST(Ref, contents_t); + fprintf(stdout, "parse_pagenode: ref.nr = %ld, ref.gen=%ld\n", contents_r->nr, contents_r->gen); - /* We can get away with this cast due to writing single bytes. 
*/ - cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value; + contents = resolve_item(aux, contents_t, &myNode->offset, p_cstream); + if (!contents) { // TODO: Failure ==> xref error -- Figure out how to handle + goto end; + } + if (contents->token_type == TT_Objstm) { // Resources for the page node + parse_rsrcdict(arena, contents, myNode, aux); + pp_ptnode(stdout, myNode); + } + else { + // DEBUG + fprintf(stdout, "\n\nparse_pagenode: What is token 0 anyway?\n"); + HParsedToken *tok0 = H_INDEX_TOKEN(contents, 0); + h_pprintln(stdout, tok0); - /* If you looked at lzw-ab's code, the write head is reset here - * This function uses write_head as the offset of the last written item */ - if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size) - { - grow_lzw_buffer(BUFSIZE); - } + HParsedToken *res_strm = H_INDEX_TOKEN(contents, 1); + if (res_strm->token_type == TT_SEQUENCE) { // this seems like a big assumption + myNode->pn.textStream = res_strm; - cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value; -} + fprintf(stdout, "parse_pagenode: Page node contents = %p\n", (void *)contents); + } + else + myNode->pn.textStream = NULL; + } + } + else if (contents_t->token_type == TT_SEQUENCE) { + size_t numelts = contents_t->seq->used; + size_t bufsz = 0; + HBytes bstrm; + const HParsedToken **pieces = h_arena_malloc(arena, sizeof(HBytes*) * numelts); + for (int i=0; i<numelts; i++) { + entry = H_INDEX_TOKEN(contents_t, i); + contents_r = H_CAST(Ref, entry); + fprintf(stdout, "\n\nparse_pagenode: objstream contents: strm obj#:%d, oid=<%zu, %zu>\n", + i+1, contents_r->nr, contents_r->gen); + contents = resolve_item(aux, entry, &nOffset, p_byteostm); + if (!contents) { // TODO: Failure ==> xref error -- Figure out how to handle + goto end; + } + HParsedToken *res_strm = H_INDEX_TOKEN(contents, 1); + fprintf(stdout, "\nparse_pagenode: Field 2 type = %u\n", res_strm->token_type); + HParseResult *bstrm_r = H_CAST(HParseResult, 
res_strm); + pieces[i] = bstrm_r->ast; + bstrm = H_CAST_BYTES(bstrm_r->ast); + bufsz += bstrm.len; + fprintf(stdout, "\n\nparse_pagenode: the extracted byte stream:\n%.*s, lensofar = %lu\n", + (int)bstrm.len, (char*)bstrm.token, bufsz); + fprintf(stdout, "\n\nparse_pagenode: Done parsing strm obj# = %d\n\n", i+1); -/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. Set cur_lzw_spec - * with bind_lzw_spec() */ -int read_lzw_buffer(void) -{ - uint8_t byte_read; - int ret_value; + } + uint8_t *whole = h_arena_malloc(arena, sizeof(uint8_t) * bufsz); + size_t offset=0; + for (int i=0; i<numelts; i++) { + bstrm = H_CAST_BYTES(pieces[i]); + + memcpy(&whole[offset], bstrm.token, bstrm.len); + offset+=bstrm.len; + fprintf(stdout, "\n**** index=%d, offset=%lu\n", i, offset); + } + assert(offset == bufsz); + fprintf(stdout, "\n\nparse_pagenode: the extracted byte stream array:\n%.*s, bufsz = %lu\n", + (int)bufsz, (char*)whole, bufsz); + HParseResult *tstrm=h_parse(p_textstream, whole, bufsz); + if (tstrm) { + fprintf(stdout, "\n\nparse_pagenode: textstream token_type = %u\n\n", tstrm->ast->token_type); + myNode->pn.textStream = tstrm->ast; + } + } + else { + fprintf(stdout, "parse_pagenode: Unexpected page node contents token type = %u\n", contents_t->token_type); + goto end; + } - /* Input data is already waiting in the buffer */ - if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail) - cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len; +// fprintf(stdout, "parse_pagenode:: Contents token type = %d\n", +// contents->token_type); - if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail) - { - byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++]; - cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read; - ret_value = byte_read; - } - else - ret_value = EOF; - return ret_value; +end: + +//fail: + return; } -HParseResult * -LZWDecode(const Dict *parms, HBytes b, HParser *p) -{ - struct predictor pred = {1, 1, 8, 
1}; - int (*depredict)(struct predictor *, uint8_t *, size_t); - HParseResult *res; - int done; - int ret; - const HParsedToken *v; - /* set up the predictor (if any) */ - #define SETPARM(VAR,STR) do { \ - v = dictentry(parms, (STR)); \ - if (v != NULL) { \ - if (v->token_type != TT_SINT || v->sint < 0) \ - return NULL; \ - VAR = v->sint; \ - } } while(0) - SETPARM(pred.num, "Predictor"); - SETPARM(pred.colors, "Colors"); - SETPARM(pred.bpc, "BitsPerComponent"); - SETPARM(pred.columns, "Columns"); - #undef SETPARM - if (pred.num == 1) - depredict = depred_none; - else { - if (pred.num >= 10 && pred.num <= 15) - depredict = depred_png; - else if (pred.num == 2) { - /* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */ - if (pred.bpc == 8) { - pred.predfun = pp_sub; /* predict left */ - depredict = depred_png; - } else { - // XXX add general TIFF predictor (bpc != 8) - fprintf(stderr, "LZWDecode: /Predictor %d " - "not supported for /BitsPerComponent %d\n", - pred.num, pred.bpc); - return NULL; - } - } else { - fprintf(stderr, "LZWDecode: /Predictor %d" - " not supported\n", pred.num); - return NULL; - } +/* + * This helper implements the standard backwards parsing strategy to read + * the trailer dictionaries found at the very end of the input. + * + * It then follows the catalog dictionary to enumerate the pages in the pdf file + * identifying text streams and contents streams, saving the information to support + * text extraction in the environment structure. + * + * + * A return value of false indicates some parsing error. 
+*/ +// need to maintain information about pages +void +parse_pagetree( + struct Env *aux, + PtNode_T *myNode, + const HParsedToken *myRef, // my page tree node reference + const Dict *myDict, // my page tree specification + const HParsedToken *pRefT, // parent reference token + size_t curr // number of pages seen so far + ) +{ - /* allocate row buffer */ - if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) { - fprintf(stderr, "LZWDecode: overflow\n"); - return NULL; - } - pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; - pred.buf = calloc(1, pred.rowsz); - if (pred.buf == NULL) - err(1, "LZWDecode"); + const HParsedToken *item = NULL; + const HParsedToken *kids = NULL; + PtNode_T *kid = NULL; + const HParsedToken *kidRef = NULL; // page tree or page node reference + const HParsedToken *kidDict_t = NULL; + const HParsedToken *pageDict_t = NULL; + const HParsedToken *treeDict_t = NULL; + const Dict *kidDict = NULL; + const HParsedToken *rsrcdict_t = NULL; + + + fprintf(stdout, "\nparse_pagetree: parsing Page Tree Node = "); + pp_ref(stdout, myRef, 0, 0); + + + myNode->type = PG_TREE; + + item = dictentry(myDict, "Parent"); // if root node ==> parent should be NULL + myNode->parent = item; + + // Count is a required field except for the root + item = dictentry(myDict, "Count"); + if ( (item == NULL) || (item->token_type != TT_SINT) ) { + fprintf(stderr, "parse_pagetree: Required page node count missing!\n"); + goto end; // This should just be a warning + } + else { + myNode->pt.leaves = H_CAST_SINT(item); + if (aux->catalog.pgCount == 0) + aux->catalog.pgCount = myNode->pt.leaves; } - lzwspec *lzw_spec = new_lzw_spec(&b); - bind_lzw_spec(lzw_spec); - ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer); - if (ret) { - fprintf(stderr, "lzw_decompress: error (%d)\n", ret); - assert(!"LZWDecode: failed to decompress\n"); + + + + // Kids is a required field + kids = dictentry(myDict, "Kids"); // array of references to page or page tree nodes + if 
( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) { + fprintf(stderr, "parse_pagetree: This tree node has no pages!\n"); + goto end; // Nothing more to do here } - done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1); - assert(!done); // XXX ITERATIVE - res = h_parse(p, pred.out, pred.nout); - free(pred.out); - bind_lzw_spec(NULL); - delete_lzw_spec(lzw_spec); - return res; -} -HParseResult * -RunLengthDecode(const Dict *parms, HBytes b, HParser *p) -{ - HParseResult *res; + // get the kids (pgTable) + HCountedArray *pgTable = H_CAST_SEQ(kids); + size_t pgtSz = pgTable->used; + myNode->pt.kids = (PtNode_T*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_T)); + myNode->pt.count = pgtSz; - res = h_parse(p_rldstring, b.token, b.len); - if(!res) + // Process the kids + for (int i=0; i<pgtSz; i++) { - fprintf(stderr, "parse error in RunLengthDecode filter\n"); - return NULL; - } + kid = &myNode->pt.kids[i]; + kid->parent = myRef; + kidRef = pgTable->elements[i]; + kidDict_t = resolve(aux, kidRef); // page or tree node dictionary or object stream token + + if (kidDict_t) { + // Look for a tree node + treeDict_t = get_dictoftype(kidDict_t, myRef, "Pages", aux); + if (treeDict_t) { + kidDict = H_CAST(Dict, treeDict_t); + parse_pagetree(aux, kid, kidRef, kidDict, myRef, curr); + } + // Look for a page node + pageDict_t = get_dictoftype(kidDict_t, myRef, "Page", aux); + if (pageDict_t) { + kidDict = H_CAST(Dict, pageDict_t); + if (++curr > aux->catalog.pgCount) { + fprintf(stderr, "parse_pagetree: More kids then specified leaves!\n"); + // TODO:: probably just a warning is enough here -- run the VIOL parser? 
+ } + parse_pagenode(aux, kid, kidRef, kidDict, myRef, pgTable->arena); + } - assert(res->ast && res->ast->token_type == TT_BYTES); - res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); + // Look for Resources dictionary + myNode->pgRsrc = NULL; + item = dictentry(myDict, "Resources"); + if (item) { + fprintf(stdout, "\n\nparse_pagetree: Found resources in node\n"); + size_t offset = 0; + rsrcdict_t = resolve_item(aux, item, &offset, p_objdef); + if (!rsrcdict_t) { // TODO: Failure ==> xref error -- Figure out how to handle + goto end; + } + fprintf(stdout, "\nparse_pagetree: Resource token type = %u\n",rsrcdict_t->token_type); + parse_rsrcdict(pgTable->arena, rsrcdict_t, myNode, aux); + pp_ptnode(stdout, myNode); + } - return res; + } + else { + Ref *ref = (Ref *)kidRef->user; + fprintf(stderr, "parse_pagetree: Reference <%zu, %zu> not found -- Deleted?!\n", + ref->nr, ref->gen); + } + + } // end loop + + + +end: + return; } + + + /* - * Decodes ASCII hexadecimal data into binary data. 
- * parms should be empty, because the filter has no parameters + * This helper starts the process of elaborating the page tree + * starting with the trailer dictionary */ -HParseResult * -ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p) +void +parse_catalog(struct Env *aux, const HParsedToken *root) { - HParseResult *f_res, *res; + const HParsedToken *dict_t = NULL; + const Dict *catalog = NULL; + const HParsedToken *ptRef = NULL; // page tree reference + const Dict *ptRoot = NULL; // page tree root Dictionary + + + // initialize the catalog structure + aux->catalog.catalog = NULL; + aux->catalog.pRoot = NULL; + aux->catalog.pgCount = 0; + // Initialize the xobject structure + aux->catalog.xObjs.name = NULL; + aux->catalog.xObjs.node = NULL; + aux->catalog.xObjs.next = NULL; + aux->catalog.xoHead = NULL; + aux->catalog.xoTail = NULL; + aux->catalog.xoCount = 0; + + // DEBUG + fprintf(stdout, "\nparse_catalog: parsing Catalog = "); + if (root->token_type == TT_Ref) + pp_ref(stdout, root, 0, 0); + else if (root->token_type == TT_Dict) + pp_dict(stdout, root, 0, 0); + + + + // Ensure the reference is to the catalog dictionary + size_t offset = 0; + dict_t = resolve_item(aux, root, &offset, p_objdef); + if (!dict_t) { // TODO: Failure ==> xref error -- Figure out how to handle + goto end; + } - // XXX debug - fprintf(stdout, "ASCIIHexDecode:: bytes=[%.*s]\n", (int)b.len, b.token); + aux->catalog.catalog = get_dictoftype(dict_t, NULL, "Catalog", aux); // catalog dictionary token + if (aux->catalog.catalog) { // Caution:: relying on the short-circuiting behavior here + catalog = H_CAST(Dict, aux->catalog.catalog); - f_res = h_parse(p_ahexstream, b.token, b.len); - if(!f_res) - { - fprintf(stderr, "parse error in ASCIIHexDecode filter\n"); - return NULL; - } - assert(f_res->ast && f_res->ast->token_type == TT_BYTES); - fprintf(stdout, "ASCIIHexDecode::string = [%.*s]\n", - (int)f_res->ast->bytes.len, (char*)f_res->ast->bytes.token); - res = h_parse(p, 
f_res->ast->bytes.token, f_res->ast->bytes.len); + // Catalog found -- Now get the root of the page tree associated with the catalog + ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary + if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) { + fprintf(stderr, "parse_catalog: Page Tree not found!\n"); + goto end; + } + aux->catalog.pRoot = ptRef; // indirect reference to the page tree - if (res == NULL) - res = f_res; // return the undecoded stream - return res; + /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ + dict_t = resolve_item(aux, ptRef, &offset, p_objdef); // page tree root node + if (!dict_t) { // TODO: Failure ==> xref error -- Figure out how to handle + goto end; + } + dict_t = get_dictoftype(dict_t, NULL, "Pages", aux); // page tree root dictionary (parent is NULL) + ptRoot = H_CAST(Dict, dict_t); + + if (ptRoot == NULL) { + fprintf(stderr, "parse_catalog: No page table!\n"); + goto end; // Nothing more to do here + } + // parse_pagetree + parse_pagetree(aux, &aux->catalog.pgTree, ptRef, ptRoot, NULL, 0); + } + else { // looks like the field "Type:Catalog" is a hint, not a requirement for a valid pdf + fprintf (stdout, "\n\nThe Catalog is missing!!"); + goto end; + + } + + end: + return; } /* - * Decodes ASCII base-85 encoded data and produces binary data. 
- * parms should be empty, because the filter has no parameters + * ******************************************************************** + * End Catalog parsing + * ******************************************************************** */ -HParseResult* -ASCII85Decode(const Dict *parms, HBytes b, HParser *p) -{ - HParseResult *f_res, *res; - // XXX debug - fprintf(stdout, "ASCII85Decode:: bytes=[%.*s]\n", (int)b.len, b.token); - f_res = h_parse(p_a85string, b.token, b.len); - if(!f_res) - { - fprintf(stderr, "parse error in ASCII85Decode filter\n"); - return NULL; - } - assert(f_res->ast && f_res->ast->token_type == TT_BYTES); - res = h_parse(p, f_res->ast->bytes.token, f_res->ast->bytes.len); - if (res == NULL) - res = f_res; // return the undecoded stream - return res; -} + + +/* + * ******************************************************************** + * Start xref parsing + * ******************************************************************** + */ /* * decode the bytes in 'b' according to metadata in the stream dictionary 'd' * and parse the result with 'p'. 
@@ -2063,8 +4970,11 @@ decode_stream(const Dict *d, HBytes b, HParser *p) filter = RunLengthDecode; else if (bytes_eq(v->bytes, "LZWDecode")) filter = LZWDecode; - else - return NULL; /* filter not supported */ + else { /* filter not supported */ + fprintf(stderr, "decode_stream:: Unsupported Filter [%.*s\n]", + (int)v->bytes.len, v->bytes.token); + return NULL; /* Treat the stream as a byte array */ + } v = dictentry(d, "DecodeParms"); if (v && v->token_type == TT_Dict) @@ -2073,87 +4983,52 @@ decode_stream(const Dict *d, HBytes b, HParser *p) return filter(parms, b, p); } -HParsedToken * -act_rest(const HParseResult *p, void *env) -{ - struct Env *aux = env; - size_t offset = H_CAST_UINT(p->ast) / 8; - - return H_MAKE_BYTES(aux->input + offset, aux->sz - offset); -} - -HParser * -p_rest__m(HAllocator *mm__, struct Env *aux) -{ - return h_action__m(mm__, h_tell__m(mm__), act_rest, aux); -} - -/* combine current position with env=(input,sz) into HBytes */ -HParsedToken * -act_take_bytes(const HParseResult *p, void *env) -{ - const HBytes *bs = env; - size_t offset = H_CAST_UINT(p->ast) / 8; - - /* - * NB: we must allocate a new HBytes struct here because the old one is - * allocated only temporarily for the lifetime of the continuation - * below. 
- */ - return H_MAKE_BYTES(bs->token + offset, bs->len); -} - -HParser * -p_take__m(HAllocator *mm__, size_t n, struct Env *aux) -{ - HParser *skip, *bytes; - HBytes *bs; - - /* dummy struct to hold the pair (input,n) */ - bs = h_alloc(mm__, sizeof(HBytes)); - bs->token = aux->input; - bs->len = n; - - bytes = h_action__m(mm__, h_tell__m(mm__), act_take_bytes, bs); - skip = h_skip__m(mm__, n * 8); - - return h_left__m(mm__, bytes, skip); -} HParser *p_xrefdata__m(HAllocator *, const Dict *); -HParser *p_objstm__m(HAllocator *, const Dict *); -HParser *p_raw_test__m(HAllocator *, const Dict *); -/* - * Look into the dictionary associated with the stream to see if there is data - * needed to interpret the stream - */ + HParser * -p_stream_data__m(HAllocator *mm__, const Dict *dict) +p_stream_data__m(HAllocator *mm__, const Dict *dict, struct Env *aux) { const HParsedToken *v; v = dictentry(dict, "Type"); if (v == NULL || v->token_type != TT_BYTES) // XXX -> custom type - //return p_raw_test__m(mm__, dict); /* no /Type field */ - return NULL; /* no /Type field */ + return NULL; /* no /Type field */ /* interpret known stream types */ if (bytes_eq(v->bytes, "XRef")) return p_xrefdata__m(mm__, dict); -#ifndef NOOBJSTM - if (bytes_eq(v->bytes, "ObjStm")) + + if (bytes_eq(v->bytes, "ObjStm")) { + fprintf(stdout, "\np_stream_data__m:: Parsing object stream\n"); return p_objstm__m(mm__, dict); -#endif + } + if (bytes_eq(v->bytes, "XObject")) { + /* + * external objects can be images, forms, or postscript objects + * Forms and postscript objects can be handled as bytestreams + * Additional XObject Forms processing will be handled during page traversal + * Explicitly avoid parsing Image objects to improve speed -- send back NULL + */ + v = dictentry(dict, "Subtype"); + if (bytes_eq(v->bytes, "Form")) { + fprintf(stdout, "\n\np_stream_data_m:: Found Form XObject\n"); + fprintf(stdout, "p_stream_data_m:: Current XObject count = %lu\n", aux->catalog.xoCount); +// 
parse_xobject(mm__, dict, aux); + return p_bytestream; +#if 0 + if (bytes_eq(v->bytes, "Image")) { + fprintf(stdout, "\n\np_stream_data_m:: Found XObject - Image\n"); + return p_fail; + } +#endif + } + } return NULL; /* unrecognized type */ } -struct streamspec { - Dict *dict; /* stream dictionary */ - HParser *parser; /* data parser */ -}; - HParsedToken * act_ks_value(const HParseResult *p, void *u) { @@ -2163,19 +5038,20 @@ act_ks_value(const HParseResult *p, void *u) /* decode and parse the stream data */ res = decode_stream(spec->dict, bytes, spec->parser); - // XXX: test a85_integration branch's version if (res == NULL) { HBytes b = {NULL, 0}; const HParsedToken *v = dictentry(spec->dict, "Type"); - if (v != NULL && v->token_type == TT_BYTES) - b = v->bytes; + if (v != NULL && v->token_type == TT_BYTES) { + b.token = v->bytes.token; + b.len = v->bytes.len; + } if (b.len > INT_MAX) b.len = INT_MAX; fprintf(stderr, "parse error in stream (%*s)\n", (int)b.len, b.token); // XXX return the undecoded stream (p->ast)? 
} - + fprintf(stdout, "\n\nact_ks_value\n\n"); return H_MAKE(HParseResult, res); } @@ -2189,6 +5065,12 @@ act_ks_value(const HParseResult *p, void *u) HParser * kstream(HAllocator *mm__, const HParsedToken *x, void *env) { + // DEBUG + fprintf (stdout, "\n\nkstream:"); + h_pprintln(stdout, x); + // DEBUG + + struct Env *aux = env; HParsedToken *dict_t = H_INDEX_TOKEN(x, 0); Dict *dict = H_CAST(Dict, dict_t); @@ -2204,9 +5086,10 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env) goto fail; sz = (size_t)v->sint; + //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug dict_p = p_return__m(mm__, dict_t); - bytes_p = p_take__m(mm__, sz, aux); // parser for the byte stream + bytes_p = p_take__m(mm__, sz, aux); spec = h_alloc(mm__, sizeof(struct streamspec)); spec->dict = dict; @@ -2223,15 +5106,15 @@ kstream(HAllocator *mm__, const HParsedToken *x, void *env) value_p = bytes_p; return h_sequence__m(mm__, dict_p, value_p, NULL); + fail: -#if 0 if (v == NULL) fprintf(stderr, "stream /Length missing\n"); else if (v -> token_type != TT_SINT) fprintf(stderr, "stream /Length not an integer\n"); else if (v < 0) fprintf(stderr, "stream /Length negative\n"); -#endif + //h_pprintln(stderr, p); // XXX debug return p_fail; } @@ -2402,6 +5285,39 @@ p_xrefdata__m(HAllocator *mm__, const Dict *dict) return h_sequence__ma(mm__, (void **)p_subs); } + + +HParsedToken * +act_ostm(const HParseResult *p, void *u) +{ + assert(((HParsedToken *)u)->token_type == TT_SINT); + size_t N = ((HParsedToken *)u)->sint; + + Objstm *ostrm = H_ALLOC(Objstm); + ostrm->numObjs = N; + ostrm->tok = h_arena_malloc(p->arena, N* sizeof(Objref_T)); + ostrm->arena = p->arena; + + for (int i=0; i<ostrm->numObjs; i++) { + const HParsedToken *num = H_FIELD_TOKEN(0, 2*i); + assert(num->token_type == TT_UINT); + ostrm->tok[i].oid.nr = H_CAST_UINT(num); + ostrm->tok[i].oid.gen = 0; + ostrm->tok[i].obj = H_FIELD_TOKEN(1, i); + } + +// const HCountedArray *indices = H_FIELD_SEQ(0); +// 
const HCountedArray *ostrm = H_FIELD_SEQ(1); + + const HParsedToken *tok = H_MAKE(Objstm, ostrm); + + // DEBUG + fprintf (stdout, "act_ostm:: Object Stream Details:\n"); + pp_objstm(stdout, tok, 0, 0); + + return (HParsedToken *)tok; +} + HParser * p_objstm__m(HAllocator *mm__, const Dict *dict) { @@ -2411,7 +5327,7 @@ p_objstm__m(HAllocator *mm__, const Dict *dict) v = dictentry(dict, "N"); if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || (uint64_t)v->sint > SIZE_MAX) { - fprintf(stderr, "missing /N on object stream\n"); + fprintf(stderr, "p_objstm__m: missing /N on object stream\n"); return p_fail; } N = v->sint; @@ -2419,7 +5335,10 @@ p_objstm__m(HAllocator *mm__, const Dict *dict) HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL); HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N); - return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL); + HParser *p_ostm = h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL); + HParser *ostm_p = h_action__m(mm__, p_ostm, act_ostm, (void *)v); + + return ostm_p; // XXX leading and trailing ws OK? // XXX consistency-check against /First, idx, /N @@ -2466,9 +5385,6 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) /* construct the parser for the stream data */ spec = h_alloc(mm__, sizeof(struct streamspec)); spec->dict = dict; - // TODO: Seems the assumption is that this form of content stream is - // strictly used for xrefs. Is that true? Ask Peter Wyatt - // Also, do we have an instance of a pdf file that uses this feature? 
spec->parser = p_xrefdata__m(mm__, dict); assert (spec->parser != NULL); @@ -2479,18 +5395,6 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) } -/* - * main program - */ - -#include <stdio.h> -#include <inttypes.h> -#include <stdlib.h> /* realloc() */ -#include <fcntl.h> /* open() */ -#include <unistd.h> /* lseek() */ -#include <sys/mman.h> /* mmap() */ - -const char *infile = NULL; /* * This helper implements the standard backwards parsing strategy to read all @@ -2500,17 +5404,20 @@ const char *infile = NULL; * Allocates and returns an array of HParsedTokens, each containing the result * of a successful 'p_xref' parse. Sets the output parameter 'nxrefs' to the * number of elements. - * - * A return value of NULL indicates an empty result. */ const HParsedToken ** parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs) { + const uint8_t *input = aux->input; + size_t sz = aux->sz; HParseResult *res = NULL; const HParsedToken **xrefs = NULL; /* empty result */ const HParsedToken *tok = NULL; size_t n = 0, nfwd = 0; size_t offset = 0; + bool processRoot = true; + size_t maxObjNum = 0; + Dict *trailer = NULL; // XXX try formulating this as a parser using h_seek() @@ -2545,9 +5452,19 @@ parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs) err(1, "realloc"); xrefs[n++] = res->ast; + + /* process the root */ + if (processRoot) { + // Size is a required field in the trailer dictionary + trailer = H_INDEX(Dict, res->ast, 1); + maxObjNum = H_CAST_SINT(dictentry(trailer, "Size")); + + processRoot = false; + } + + /* look up the next offset (to the previous xref section) */ tok = dictentry(H_INDEX(Dict, res->ast, 1), "Prev"); - if (tok == NULL) break; if (tok->token_type != TT_SINT) { @@ -2579,10 +5496,50 @@ parse_xrefs(const uint8_t *input, size_t sz, size_t *nxrefs) } end: - *nxrefs = n; - return xrefs; + aux->xrefs = xrefs; + aux->nxrefs = n; + if (n > maxObjNum) { + fprintf(stderr, "%s: Number of xrefs found -%ld- " + "Greater than specified 
/Size -%ld-.\n" + "Ignoring objects numberd greater than -%ld-!\n", + infile, n, maxObjNum, n); + aux->nxrefs = maxObjNum; + } + + + // Process the trailer dictionary + if (trailer) { // trailer==NULL or n==0 ==> xrefs were not parsed correctly + const HParsedToken *root = dictentry(trailer, "Root"); + assert(root->token_type == TT_Ref); + parse_catalog(aux, root); + } + return; } + + +/* + * ******************************************************************** + * End xref parsing + * ******************************************************************** + */ + + + + +/* + * main program + */ + +#include <stdio.h> +#include <inttypes.h> +#include <fcntl.h> /* open() */ +#include <unistd.h> /* lseek() */ +#include <sys/mman.h> /* mmap() */ + + + + int main(int argc, char *argv[]) { @@ -2621,7 +5578,10 @@ main(int argc, char *argv[]) init_parser(&aux); /* parse all cross-reference sections and trailer dictionaries */ - aux.xrefs = parse_xrefs(input, sz, &aux.nxrefs); + parse_xrefs(&aux); + + fprintf(stdout, "\n\nmain:: Done parsing xrefs and page tree. Starting main parser.\n\n"); + /* run the main parser */ res = h_parse(p_pdf, input, sz); @@ -2644,5 +5604,10 @@ main(int argc, char *argv[]) /* print result */ h_pprintln(stdout, res->ast); + /* Save the extracted text */ + if (aux.ntextobjs > 0) { + text_extract(&aux); + } + return 0; }