diff --git a/pdf.c b/pdf.c index e096822e8718a4a63e7227e540d86d88a9c1838e..b2508a3735d3bfd073e2cd55623faa08af109cef 100644 --- a/pdf.c +++ b/pdf.c @@ -2,6 +2,7 @@ * pesco 2019,2020 * pompolic 2020 * Paul Vines 2020 + * Kragen Sitaker 2020, 2021 * Sumit Ray 2021 * */ @@ -111,6 +112,43 @@ validate_notnull(HParseResult *p, void *u) } + +// Forward declaration of Token structures +typedef struct { size_t nr, gen; } Ref; + +typedef HCountedArray Dict; + + + +// Catalog Tree -- for now, just make it a table +struct PtNode_T; +typedef struct PtNode_T { + enum {PG_TREE, PG_NODE} type; + const HParsedToken *parent; // Type = Page tree -- reference + union { + struct { + const HParsedToken *pageRef; + Dict *page; // page node dictionary + const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit + const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams + } pn; + struct PtNode_T *kids; // page table + }; + +} PtNode_S; + +typedef struct { + const HParsedToken *catalog; // reference + const HParsedToken *pRoot; // reference + PtNode_S pgTree; + size_t pgCount; +} Catalog_S; + + +// Forward declaration of text extraction related structures +struct textnode; +struct textstr; + /* * auxiliary global data structure needed by the parser */ @@ -121,13 +159,97 @@ struct Env { const HParsedToken **xrefs; /* all xref sections of the file */ size_t nxrefs; + + struct textnode *txthead; /* parsed text objects from the file */ + struct textnode *txttail; /* parsed text objects from the file */ + size_t ntextobjs; + + Catalog_S catalog; /* Catalog object and document structure */ }; + +// *********************************************************** +/* + * Text data structures + */ +struct textnode { + struct textstr *tstr; + struct textnode *next; +}; + + +struct fontref { + const char *fontname; + uint32_t namelen; + uint32_t fontsize; +}; +struct textpos { + double tx; + double ty; +}; +struct textmat { double cell[6]; }; +struct textstr { + char *text; + uint32_t nchars; +}; +struct textwfmt { /* text with formatting specifications */ + double aw; /* word spacing */ + double ac; /* character spacing */ + struct textstr tstr; /* the string */ +}; +struct tarrayelt { + union { + double adj; + struct textstr tstr; + }; + bool isStr; +}; +struct textarray { + struct tarrayelt *elts; + uint32_t nelts; + struct textstr flattened; +}; + + + +/* operator:: + * TS -- Text state : Table 105 + * TP -- Text position : Table 108 + * TW -- Test showing : Table 109 + * */ +typedef struct { + enum {TS_Tc, TS_Tw, TS_Tz, TS_TL, TS_Tf, TS_Tr, TS_Ts, + TP_Td, TP_TD, TP_Tm, TP_Tstar, + TW_Tj, TW_Tq, TW_Tqq, TW_TJ} type; + union { + double value; /* many just have a value */ + uint8_t mode; /* text mode */ + struct fontref fref; /* font name reference */ + struct textpos pos; /* text position */ + struct textmat fm; /* font matrix */ + struct textstr tstr; /* the string */ + struct textwfmt twfmt; /* text with formatting -- qq_op */ + struct textarray tarray; /* text contained in an array object */ + }; + const HParsedToken *obj; +} TextEntry; // text object entries + +// Haven't used this type yet - maybe OBE +typedef struct { + struct textmat fm; /* font matrix associated with this text object */ + TextEntry **ops; /* operators associated w/string */ + char *txt; /* the string associated with this object */ +} TextString; + +// *********************************************************** + + + /* * custom token types */ -HTokenType TT_XREntry, 
TT_Ref, TT_Dict, TT_HParseResult; +HTokenType TT_XREntry, TT_Ref, TT_Dict, TT_HParseResult, TT_TextEntry; typedef struct { enum {XR_FREE, XR_INUSE, XR_OBJSTM} type; @@ -139,9 +261,6 @@ typedef struct { const HParsedToken *obj; } XREntry; -typedef struct { size_t nr, gen; } Ref; - -typedef HCountedArray Dict; /* look up a value in a dictionary */ const HParsedToken * @@ -546,7 +665,7 @@ act_a85partialgroup(const HParseResult *p, void *u) assert(bytes_used > 0); size_t shift = 4 - bytes_used; - bytes = h_arena_malloc(p->arena, bytes_used); + bytes = h_arena_malloc(p->arena, bytes_used); for (int i=0; i<bytes_used; i++) { bytes[i] = bytes_helper[shift + i]; } @@ -850,90 +969,806 @@ validate_xrstm(HParseResult *p, void *u) const Dict *tdict = H_FIELD(Dict, 1, 0); const HParsedToken *v = dictentry(tdict, "Type"); - if (!xrefs) - { - return false; - } + if (!xrefs) + { + return false; + } + +#if 0 + if (v == NULL) + fprintf(stderr, "stream dict has no /Type\n"); + else if (v->token_type != TT_BYTES) + fprintf(stderr, "stream /Type is no name object\n"); + else if (bytes_eq(v->bytes, "XRef")) + return true; + return false; +#endif // XXX this block can be removed + + return (v != NULL && v->token_type == TT_BYTES && + bytes_eq(v->bytes, "XRef")); +} + +HParsedToken * +act_dict_(const HParseResult *p, void *env) +{ + Dict *dict = H_CAST_SEQ(p->ast); + + return H_MAKE(Dict, dict); +} + +#define act_array_ h_act_flatten + +HParsedToken * +act_shortlength(const HParseResult *p, void *u) +{ + uint8_t length = H_CAST_UINT(p->ast); + /* Length can range from 0-127, corresponding to the range 1-128, inclusive */ + uint8_t finallength = length+1; + + return H_MAKE_UINT(finallength); +} + +HParsedToken * +act_longlength(const HParseResult *p, void *u) +{ + uint8_t length = H_CAST_UINT(p->ast); + uint8_t finallength = 257-length; + + return H_MAKE_UINT(finallength); +} + +HParsedToken * +act_longrun(const HParseResult *p, void *u) +{ + HParsedToken **elements = h_seq_elements(p->ast); + HParsedToken *res = H_MAKE_SEQ(); + + uint8_t length = H_CAST_UINT(elements[0]); + uint8_t data = H_CAST_UINT(elements[1]); + + for (size_t len = 0; len < length; ++len) + { + h_seq_snoc(res, H_MAKE_UINT(data)); + } + + return res; +} + +HParsedToken * +act_rldstring(const HParseResult *p, void *u) +{ + const HParsedToken *flattened = h_seq_flatten(p->arena, p->ast); + HCountedArray *flattened_seq = H_CAST_SEQ(flattened); + size_t bytes_required; + uint8_t *result_bytes; + + bytes_required = flattened_seq->used - 1; + result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * bytes_required); + + for (size_t i = 0; i < flattened_seq->used-1; ++i) + { + result_bytes[i] = H_CAST_UINT(flattened_seq->elements[i]); + } + + return H_MAKE_BYTES(result_bytes, bytes_required); +} + + + +/* + * ******************************************************************** + * Catalog parsing + * ******************************************************************** + */ +HParsedToken * +act_contentstream(const HParseResult *p, void *u) +{ +// HCountedArray *contents = H_FIELD_SEQ(0); +// +// fprintf(stdout, "act_contentstream:: stream length = %ld\n", contents->used); + return (HParsedToken *)p->ast; +} + + + +bool +validate_pgcontents(HParseResult *p, void *u) +{ + return false; +} + +HParsedToken * +act_pgcontents(const HParseResult *p, void *u) +{ + return (HParsedToken *)p->ast; +} + + +HParsedToken * +act_page(const HParseResult *p, void *u) +{ + return (HParsedToken *)p->ast; +} + + + +/* + * 
******************************************************************** + * Start Text parsing + * ******************************************************************** + */ + +/* + * Pretty printer for text components of the ast + */ +void +pp_textentry(FILE *stream, const HParsedToken *tok, int indent, int delta) +{ + TextEntry *txte = H_CAST(TextEntry, tok); + + switch (txte->type) { + /* + * Always pretty print the text show operators + * + * If TEXT_VERBOSE is set, pretty-print the other operators + */ +#define TEXT_VERBOSE +#ifdef TEXT_VERBOSE + case TS_Tf: + fprintf(stream, "Tf_op: fn=%.*s, fontsize=%d\n", + txte->fref.namelen, txte->fref.fontname, txte->fref.fontsize); + break; + case TP_Td: + fprintf(stream, "Td_op: text position ::tx=%3.3f:ty=%3.3f\n", + txte->pos.tx, txte->pos.ty); + break; +#endif + case TW_Tj: + fprintf(stream, "%.*s\n", txte->tstr.nchars, txte->tstr.text); + break; + case TW_Tq: + fprintf(stream, "%.*s\n", txte->tstr.nchars, txte->tstr.text); + break; + case TW_Tqq: + fprintf(stream, "%.*s\n", txte->tstr.nchars, txte->tstr.text); + break; + case TW_TJ: + fprintf(stream, "%.*s\n", txte->tarray.flattened.nchars, + txte->tarray.flattened.text); + break; + + + default: + ; + } +} + +/* + * semantic actions + */ + + + +/* + * Simplify the code by casting the choice of integer number and real number to double + */ +bool +validate_tnumb(HParseResult *p, void *u) +{ + assert((p->ast->token_type == TT_SINT) || (p->ast->token_type == TT_DOUBLE)); + + return true; +} + + +HParsedToken * +act_tnumb(const HParseResult *p, void *u) +{ + + double value; + + if (p->ast->token_type == TT_SINT) value = (double)p->ast->sint; + else value = p->ast->dbl; + + return H_MAKE_DOUBLE(value); +} + + + + +/* + * Text state operators - Table 105 + * TS_Tc, TS_Tw, TS_Tz, TS_TL, TS_Tf, TS_Tr, TS_Ts + * + * ***************************************************************** + * ***************************************************************** + * + */ +HParsedToken * +act_Tc_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tc_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *tval = H_INDEX_TOKEN(p->ast, 0); + + + txte->type = TS_Tc; + txte->obj = NULL; + + assert(tval->token_type == TT_DOUBLE); + txte->value = tval->dbl; + + fprintf(stdout, "act_Tc_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + +/* + * Tw operator: word spacing specification + * H_ARULE(Tw_op, SEQ(tnumb, ws, LIT("Tw"))); // 9.3.3 - wordSpace + */ +HParsedToken * +act_Tw_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tw_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TS_Tw; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + + fprintf(stdout, "act_Tw_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + + +/* + * Tz operator: horizintal scaling specification + * H_ARULE(Tz_op, SEQ(tnumb, ws, LIT("Tz"))); // 9.3.4 - horizontal scaling + */ +HParsedToken * +act_Tz_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tz_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TS_Tz; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + + fprintf(stdout, "act_Tz_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + + +/* + * TL operator: leading (line spacing) specification + * H_ARULE(TL_op, SEQ(tnumb, ws, LIT("TL"))); // 9.3.5 - leading + */ +HParsedToken * +act_TL_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_TL_op:: Here\n"); + + 
TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TS_TL; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + + fprintf(stdout, "act_TL_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + +/* + * Font name and size specification + * H_ARULE(Tf_op, SEQ(name, ws, nat, ws, KW("Tf"), ws)); // font and size + * + * TODO: Verify that the name is specified in the resource dictionary + */ +HParsedToken * +act_Tf_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tf_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *fn_token = H_FIELD_TOKEN(0); + + txte->type = TS_Tf; + txte->obj = NULL; + + txte->fref.fontname = (char *)fn_token->bytes.token; + txte->fref.namelen = fn_token->bytes.len; + txte->fref.fontsize = H_FIELD_UINT(1); + + fprintf(stdout, "act_Tf_op: fn=%.*s, fontsize=%d\n", + txte->fref.namelen, txte->fref.fontname, txte->fref.fontsize); + + return H_MAKE(TextEntry, txte); +} + + + +/* + * Tr operator: rendering mode + * H_VRULE(tmode, nat); // True if <= 7 + * H_ARULE(Tr_op, SEQ(tmode, ws, LIT("Tr"))); // 9.3.6 - rendering mode + * + * + */ +#define TEXTMODE_MAX 7 + +bool +validate_tmode(HParseResult *p, void *u) +{ + return H_CAST_UINT(p->ast) <= TEXTMODE_MAX; +} + +HParsedToken * +act_Tr_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tr_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TS_Tr; + txte->obj = NULL; + txte->mode = H_FIELD_UINT(0); + + fprintf(stdout, "act_Tr_op:: %d\n", txte->mode); + return H_MAKE(TextEntry, txte); +} + + +/* + * Ts operator: rise specification + * H_ARULE(Ts_op, SEQ(tnumb, ws, LIT("Ts"))); // rise + */ +HParsedToken * +act_Ts_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Ts_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TS_Ts; + txte->obj = NULL; + txte->value = H_FIELD_DOUBLE(0); + + fprintf(stdout, "act_Ts_op:: %3.3f\n", txte->value); + return H_MAKE(TextEntry, txte); +} + + + +/* + * 9.4.2 - Text positioning operators - Table 108 + * TP_Td, TP_TD, TP_Tm, TP_Tstar + * + * ***************************************************************** + * ***************************************************************** + * + * TP_Td: String position - Translation specification + * H_ARULE(Td_op, SEQ(tnumb, ws, tnumb, ws, LIT("Td"), ws)); // move to next line with offset + */ +HParsedToken * +act_Td_op(const HParseResult *p, void *u) +{ + fprintf(stdout, "act_Td_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TP_Td; + txte->obj = NULL; + txte->pos.tx = H_FIELD_DOUBLE(0); + txte->pos.ty = H_FIELD_DOUBLE(1); + + fprintf(stdout, "act_Td_op: text position ::tx=%.3f:ty=%.3f\n", + txte->pos.tx, txte->pos.ty); + + return H_MAKE(TextEntry, txte); +} + + + + +/* + * TP_TD: Offset to next line and set the leading parameter state + * H_ARULE(TD_op, SEQ(tnumb, ws, tnumb, ws, LIT("TD"))); // move to next line with offset and set state + */ +HParsedToken * +act_TD_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_TD_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TP_TD; + txte->obj = NULL; + txte->pos.tx = H_FIELD_DOUBLE(0); + txte->pos.ty = H_FIELD_DOUBLE(1); + + fprintf(stdout, "act_TD_op: text position ::tx=%3.3f:ty=%3.3f\n", txte->pos.tx, txte->pos.ty); + + return H_MAKE(TextEntry, txte); +} + + +/* + * TP_Tm: Text matrix specification + * H_ARULE(Tm_op, SEQ(REP(SEQ(tnumb, ws), 6), LIT("Tm"), ws)); // set text matrix + */ +HParsedToken * +act_Tm_op(const 
HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tm_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + + txte->type = TP_Tm; + txte->obj = NULL; + + assert((p->ast->token_type == TT_SEQUENCE) && + (p->ast->seq->elements[0]->token_type == TT_SEQUENCE) && + (p->ast->seq->elements[0]->seq->used == 6)); + for (int i=0; i<6; i++) + + txte->fm.cell[i] = p->ast->seq->elements[0]->seq->elements[i]->seq->elements[0]->dbl; + + fprintf(stdout, "act_Tm_op: text matrix ::\n"); + for (int i=0; i<3; i++) + fprintf(stdout, "%3.3f : %3.3f\n", txte->fm.cell[i*2], txte->fm.cell[i*2+1]); + + return H_MAKE(TextEntry, txte); +} + + +/* + * TP_Tstar: Move to the next line + * H_ARULE(Tstar_op, SEQ(LIT("T*"), ws)); // move to next line + */ +HParsedToken * +act_Tstar_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tstar_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + + txte->type = TP_Tstar; + txte->obj = NULL; + txte->value = 0; + + fprintf(stdout, "act_Tstar_op: position pointer\n"); + + return H_MAKE(TextEntry, txte); +} + + + +/* + * 9.4.3 - Text showing operators - Table 109 + * TW_Tj, TW_Tq, TW_Tqq, TW_TJ + * + * ***************************************************************** + * ***************************************************************** + * + * TW_Tj: Show string + * H_ARULE(Tj_op, SEQ(string, ws, LIT("Tj"), ws)); // show text string + */ +HParsedToken * +act_Tj_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_Tj_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 0); + + + txte->type = TW_Tj; + txte->obj = NULL; + + txte->tstr.text = (char *)tstr->bytes.token; + txte->tstr.nchars = tstr->bytes.len; + + fprintf(stdout, "act_Tj_op:: %.*s\n", txte->tstr.nchars, txte->tstr.text); + return H_MAKE(TextEntry, txte); +} + + +/* + * TW_Tq: Offset to next line then show string + * H_ARULE(TsingleQ_op, SEQ(string, ws, LIT(quote), ws)); // Move to next line and show text + */ +HParsedToken * +act_TsingleQ_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_TsingleQ_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 0); + + + txte->type = TW_Tq; + txte->obj = NULL; + + txte->tstr.text = (char *)tstr->bytes.token; + txte->tstr.nchars = tstr->bytes.len; + + fprintf(stdout, "act_TsingleQ_op:: %.*s\n", txte->tstr.nchars, txte->tstr.text); + return H_MAKE(TextEntry, txte); +} + + +/* + * TW_Tqq: Offset to next line then show string, apply formatting specifications + * H_ARULE(TdoubleQ_op, SEQ(tnumb, ws, tnumb, ws, string, ws, LIT(dquote), ws)); // Move to next line and show formatted text + * + */ +HParsedToken * +act_TdoubleQ_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_TdoubleQ_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *aw = H_INDEX_TOKEN(p->ast, 0); + const HParsedToken *ac = H_INDEX_TOKEN(p->ast, 1); + const HParsedToken *tstr = H_INDEX_TOKEN(p->ast, 2); + + + txte->type = TW_Tqq; + txte->obj = NULL; + + txte->twfmt.aw = aw->dbl; + txte->twfmt.ac = ac->dbl; + txte->twfmt.tstr.text = (char *)tstr->bytes.token; + txte->twfmt.tstr.nchars = tstr->bytes.len; + + fprintf(stdout, "act_TdoubleQ_op:: aw=%3.3f, ac=%3.3f\n", txte->twfmt.aw, txte->twfmt.ac); + fprintf(stdout, "act_TdoubleQ_op:: %.*s\n", txte->twfmt.tstr.nchars, txte->twfmt.tstr.text); + + return H_MAKE(TextEntry, txte); +} + +/* + * TW_TJ: Show array of strings, with potentially re-positioning specifications for each string + 
* H_RULE(TArr_elem, SEQ(OPT(SEQ(nanumbs)), string, ws)) + * H_ARULE(TJ_op, SEQ(h_many(TArr_elem), LIT("TJ"), ws)); // show one or more text strings + * + * TODO:: Implement the array parser + */ +HParsedToken * +act_TJ_op(const HParseResult *p, void *u) +{ + + fprintf(stdout, "act_TJ_op:: Here\n"); + + TextEntry *txte = H_ALLOC(TextEntry); + const HParsedToken *tarr = H_INDEX_TOKEN(p->ast, 0); + + txte->type = TW_TJ; + txte->obj = NULL; + + + /* + * Parse each element of the array + * Build up the pointers to each of the string pieces + */ + txte->tarray.nelts = tarr->seq->used; + txte->tarray.elts = h_arena_malloc(p->arena, sizeof(struct tarrayelt) * txte->tarray.nelts); + txte->tarray.flattened.nchars = 0; + + for (int i=0; i<txte->tarray.nelts; i++) { + const HParsedToken *elt = tarr->seq->elements[i]; + assert( (elt->token_type == TT_SEQUENCE) && (elt->seq->used == 1) ); + switch (elt->seq->elements[0]->token_type) { + case TT_DOUBLE: + txte->tarray.elts[i].adj = elt->seq->elements[0]->dbl; + txte->tarray.elts[i].isStr = false; + break; + case TT_BYTES: + txte->tarray.elts[i].tstr.text = (char *)elt->seq->elements[0]->bytes.token; + txte->tarray.elts[i].tstr.nchars = elt->seq->elements[0]->bytes.len; + txte->tarray.elts[i].isStr = true; + txte->tarray.flattened.nchars += txte->tarray.elts[i].tstr.nchars; + // Debug +// fprintf(stdout, "act_TJ_op:Cumulative=%d/0x%x bytes, Additional:%d bytes\n", +// txte->tarray.flattened.nchars, txte->tarray.flattened.nchars, txte->tarray.elts[i].tstr.nchars); + break; + default: + fprintf(stderr, "act_TJ_op:: Unexpected element type :: %d\n", elt->seq->elements[0]->token_type); + fflush(stderr); + assert(false); + } + } + + /* hold on to a flattened copy of the string */ + txte->tarray.flattened.text = h_arena_malloc(p->arena, sizeof(char) * txte->tarray.flattened.nchars); + int j = 0; // current index + for (int i=0; i<txte->tarray.nelts; i++) { + if (txte->tarray.elts[i].isStr) { + // Debug +// fprintf(stdout, "act_TJ_op:Start=%p-%d/0x%xbytes, Writing to:%p-%dbytes\n", +// (void *)txte->tarray.flattened.text, txte->tarray.flattened.nchars, txte->tarray.flattened.nchars, +// (void *)&txte->tarray.flattened.text[j], txte->tarray.elts[i].tstr.nchars); +// fprintf(stdout, "act_TJ_op: %.*s\n", txte->tarray.elts[i].tstr.nchars, txte->tarray.elts[i].tstr.text); + memcpy(&txte->tarray.flattened.text[j], txte->tarray.elts[i].tstr.text, txte->tarray.elts[i].tstr.nchars); + j += txte->tarray.elts[i].tstr.nchars; + } + } + + fprintf(stdout, "act_TJ_op:: %.*s\n", txte->tarray.flattened.nchars, txte->tarray.flattened.text); + return H_MAKE(TextEntry, txte); +} + + + + +/* + * Parse the text object delimited by "BT" and "ET" + */ +HParsedToken * +act_txtobj(const HParseResult *p, void *u) +{ + fprintf(stdout, "act_txtobj:: Here\n"); + + assert(p->ast->token_type == TT_SEQUENCE); + + TextEntry *txtobj = H_ALLOC(TextEntry); + const HParsedToken *opstream = H_INDEX_TOKEN(p->ast, p->ast->seq->used-1); + const HParsedToken *tt_text=NULL; + char *tstr=NULL; + int textlen=0; + + + fprintf(stdout, "act_txtobj:: numtokens = %lu\n", opstream->seq->used); + + // Walk through the tokens to determine how much space to allocate + // Count the number of characters in the stream + // Concatenate the text into the allocated space + for (int i =0; i < opstream->seq->used; i++) { + + TextEntry *txte = H_CAST(TextEntry, opstream->seq->elements[i]); + // Process the text showing operators + switch (txte->type) { + case TW_TJ: + textlen += txte->tarray.flattened.nchars; + break; + + 
case TW_Tj: + case TW_Tq: + case TW_Tqq: + textlen += txte->tstr.nchars; + break; + default: + ; // ignore + } + } + tstr = h_arena_malloc(p->arena, sizeof(uint8_t) * textlen); + int idx=0; + // Now concatenate the pieces + for (int i =0; i < opstream->seq->used; i++) { + TextEntry *txte = H_CAST(TextEntry, opstream->seq->elements[i]); + + // Process the text showing operators + // Process the text showing operators + switch (txte->type) { + case TW_TJ: + memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars); + idx += txte->tarray.flattened.nchars; + break; + + case TW_Tj: + case TW_Tq: + case TW_Tqq: + memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars); + idx += txte->tstr.nchars; + break; + default: + ; // ignore + } + } + assert(idx == textlen); + + txtobj->type = TW_TJ; + txtobj->obj = NULL; + txtobj->tarray.flattened.text = tstr; + txtobj->tarray.flattened.nchars = textlen; + // pretty print the information + tt_text = H_MAKE(TextEntry, txtobj); + pp_textentry(stdout, tt_text, 0, 0); + + return (HParsedToken *)tt_text; +} + + + + -#if 0 - if (v == NULL) - fprintf(stderr, "stream dict has no /Type\n"); - else if (v->token_type != TT_BYTES) - fprintf(stderr, "stream /Type is no name object\n"); - else if (bytes_eq(v->bytes, "XRef")) - return true; - return false; -#endif // XXX this block can be removed - return (v != NULL && v->token_type == TT_BYTES && - bytes_eq(v->bytes, "XRef")); -} -HParsedToken * -act_dict_(const HParseResult *p, void *env) -{ - Dict *dict = H_CAST_SEQ(p->ast); - return H_MAKE(Dict, dict); -} -#define act_array_ h_act_flatten -HParsedToken * -act_shortlength(const HParseResult *p, void *u) + +// Utility -- Handles simplistic approach to UTF-16 +char convert2char(unsigned int b1) { - uint8_t length = H_CAST_UINT(p->ast); - /* Length can range from 0-127, corresponding to the range 1-128, inclusive */ - uint8_t finallength = length+1; + char val; - return H_MAKE_UINT(finallength); + if (b1 == 0) + { + val = '?'; + } + else if ( (b1 < 20) || ( b1 > 127 ) ) + { + fprintf(stdout, " 0X%02X ", b1); + val = '?'; + } + else + { + val = b1; + fprintf(stdout, "%c", val); + } + return val; } -HParsedToken * -act_longlength(const HParseResult *p, void *u) -{ - uint8_t length = H_CAST_UINT(p->ast); - uint8_t finallength = 257-length; - return H_MAKE_UINT(finallength); -} +// ********************************************************************* +// DEBUG HParsedToken * -act_longrun(const HParseResult *p, void *u) +act_txtbegin_(const HParseResult *p, void *u) { - HParsedToken **elements = h_seq_elements(p->ast); - HParsedToken *res = H_MAKE_SEQ(); - - uint8_t length = H_CAST_UINT(elements[0]); - uint8_t data = H_CAST_UINT(elements[1]); + const HParsedToken *tok=p->ast; - for (size_t len = 0; len < length; ++len) - { - h_seq_snoc(res, H_MAKE_UINT(data)); - } + fprintf(stdout, "act_txtbegin:: Here %lx\n", (long unsigned int)tok); - return res; + return (HParsedToken *)tok; } - HParsedToken * -act_rldstring(const HParseResult *p, void *u) +act_txtend(const HParseResult *p, void *u) { - const HParsedToken *flattened = h_seq_flatten(p->arena, p->ast); - HCountedArray *flattened_seq = H_CAST_SEQ(flattened); - size_t bytes_required; - uint8_t *result_bytes; - - bytes_required = flattened_seq->used - 1; - result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * bytes_required); - for (size_t i = 0; i < flattened_seq->used-1; ++i) - { - result_bytes[i] = H_CAST_UINT(flattened_seq->elements[i]); - } + fprintf(stdout, "act_txtend:: Here\n"); - return 
H_MAKE_BYTES(result_bytes, bytes_required); + return (HParsedToken *)p->ast; } + +/* + * ******************************************************************** + * End Text parsing + * ******************************************************************** + */ + + /* * input grammar */ @@ -951,9 +1786,20 @@ HParser *p_wel; HParser *p_elemr; HParser *p_npair; +/* + * Parsers for text streams + */ +HParser *p_textbegin; +HParser *p_textstream; +HParser *p_trailer; +HParser *p_page; + + /* continuations for h_bind() */ HParser *kstream(HAllocator *, const HParsedToken *, void *); HParser *kxstream(HAllocator *, const HParsedToken *, void *); +HParser *ktxtstream(HAllocator *, const HParsedToken *, void *); +HParser *kcontentstream(HAllocator *, const HParsedToken *, void *); void init_runlengthdecode_parser(struct Env *aux) @@ -977,9 +1823,9 @@ void init_parser(struct Env *aux) { TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult); - TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry); - TT_Ref = h_allocate_token_new("Ref", NULL, pp_ref); - TT_Dict = h_allocate_token_new("Dict", NULL, pp_dict); + TT_XREntry = h_allocate_token_new("XREntry", NULL, pp_xrentry); + TT_Ref = h_allocate_token_new("Ref", NULL, pp_ref); + TT_Dict = h_allocate_token_new("Dict", NULL, pp_dict); /* lines */ H_RULE(cr, p_mapch('\r', '\n')); /* semantic value: \n */ @@ -1041,6 +1887,13 @@ init_parser(struct Env *aux) H_VRULE(pnat, nat); H_RULE(npair, SEQ(pnat, wel,ws, nat)); + /* Whitespace can occur between any digit and has to be ignored, */ + /* Comments are not allowed inside streams, and % character should cause + * a parse error. */ + H_RULE(aws, IGN(h_many(wchar))); // all white space, include CR & LF, but not comments + #define MANY_AWS(X) h_many(CHX(aws, X)) + + /* * objects */ @@ -1225,16 +2078,12 @@ init_parser(struct Env *aux) /* debug parser to consume as much as possible */ H_RULE(pdfdbg, SEQ(OPT(start_junk), header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk)))); + + + /* * filters */ - /* Whitespace can occur between any digit and has to be ignored, */ - /* Comments are not allowed inside streams, and % character should cause - * a parse error. */ - H_RULE(aws, IGN(h_many(wchar))); // all white space, include CR & LF, but not comments - #define MANY_AWS(X) h_many(CHX(aws, X)) - - /* Ascii85Decode */ H_RULE(a85eod, SEQ(h_ch('~'), aws, h_ch('>'))); @@ -1246,7 +2095,7 @@ init_parser(struct Env *aux) * a parse error. 
*/ #define MANY_LWS(X) h_many(CHX(lws, X)) - /* This encoding of zero is not allowed */ + /* Encoding of zero is not allowed */ // Folded the test for a85fiveexcl into the validation component H_VARULE(a85fivedigits, h_repeat_n(SEQ(a85digit, aws), 5)); // TODO:: will need to pull out error conditions -- a85fiveexcl or 'z' as one of the digits @@ -1270,6 +2119,73 @@ init_parser(struct Env *aux) init_runlengthdecode_parser(aux); + // ========================================================================== + /* + * Text Objects Extraction - embedded in content streams + * + */ + // ========================================================================== + /* \ + * Text Objects Extraction - embedded in content streams \ + */ \ + \ + H_RULE(txtbegin, h_indirect()); \ + H_RULE(txt_before_junk, IGN(SEQ(h_not(LIT("BT")), CHX(comment, h_uint8())))); \ + H_ARULE(txtbegin_, SEQ(IGN(h_many(txt_before_junk)), LIT("BT"), aws)); \ + h_bind_indirect(txtbegin, txtbegin_); \ + H_ARULE(txtend, KW("ET")); \ + /* 9.3 - Text state operators */ \ + H_AVRULE(tnumb, numb); \ + H_ARULE(Tc_op, SEQ(tnumb, aws, LIT("Tc"), aws)); /* 9.3.2 - charSpace */ \ + H_ARULE(Tw_op, SEQ(tnumb, aws, LIT("Tw"), aws)); /* 9.3.3 - wordSpace */ \ + H_ARULE(Tz_op, SEQ(tnumb, aws, LIT("Tz"), aws)); /* 9.3.4 - horizontal scaling */ \ + H_ARULE(TL_op, SEQ(tnumb, aws, LIT("TL"), aws)); /* 9.3.5 - leading */ \ + H_ARULE(Tf_op, SEQ(name, aws, nat, aws, KW("Tf"), aws)); /* font and size */ \ + /* TDO: must map to an existing font dictionary */ \ + H_VRULE(tmode, nat); /* True if <= 7 */ \ + H_ARULE(Tr_op, SEQ(tmode, aws, LIT("Tr"), aws)); /* 9.3.6 - rendering mode */ \ + H_ARULE(Ts_op, SEQ(tnumb, aws, LIT("Ts"), aws)); /* rise */ \ + H_RULE(textstate_ops, CHX(Tc_op, Tw_op, Tz_op, TL_op, Tf_op, Tr_op, Ts_op)); \ + \ + /* 9.4.2 - Text positioning operators */ \ + H_ARULE(Td_op, SEQ(tnumb, aws, tnumb, aws, LIT("Td"), aws)); /* move to next line with offset */ \ + H_ARULE(TD_op, SEQ(tnumb, aws, tnumb, aws, LIT("TD"), aws)); /* move to next line with offset and set state */ \ + H_ARULE(Tm_op, SEQ(REP(SEQ(tnumb, aws), 6), LIT("Tm"), aws)); /* set text matrix */ \ + H_ARULE(Tstar_op, SEQ(LIT("T*"), aws)); /* move to next line */ \ + H_RULE(textpos_ops, CHX(Td_op, TD_op, Tm_op, Tstar_op)); \ + \ + /* 9.4.3 - Text showing operators */ \ + H_RULE(quote, h_ch('\'')); \ + H_RULE(dquote, h_ch('"')); \ + H_ARULE(Tj_op, SEQ(string, aws, LIT("Tj"), aws)); /* show text string */ \ + H_ARULE(TsingleQ_op, SEQ(string, aws, quote, aws)); /* Move to next line and show text */ \ + H_ARULE(TdoubleQ_op, SEQ(tnumb, aws, tnumb, aws, string, aws, dquote, aws)); /* Move to next line and show formatted text */ \ + H_RULE(TArr_elem, SEQ(CHX(tnumb, string), aws)); \ + H_ARULE(TJ_op, SEQ(IGN(lbrack), aws, h_many(TArr_elem), IGN(rbrack), aws, LIT("TJ"), aws)); /* show one or more text strings */ \ + H_RULE(textshow_ops, CHX(Tj_op, TsingleQ_op, TdoubleQ_op, TJ_op)); \ + \ + H_RULE(text_inbetween_junk, IGN(SEQ(h_not(txtend), h_uint8()))); \ + \ + H_RULE(text_ops, CHX(textstate_ops, textpos_ops, textshow_ops, text_inbetween_junk)); \ + \ + /* Text object */ \ + H_ARULE(txtobj, SEQ(txtbegin, h_many(text_ops), txtend)); \ + \ + /* text streams */ \ + H_RULE(txtstream, h_bind(h_many1(txtobj), ktxtstream, aux)); \ + \ + \ + // Page Tree + H_ARULE(contentstream, h_middle(stmbeg, h_many1(h_uint8()), stmend)); +// H_ARULE(contentstream, h_middle(stmbeg, h_many(SEQ(h_not(stmend), h_uint8())), stmend)); + H_ARULE(pgcontents, CHX(array, contentstream)); + H_ARULE(page, SEQ(ws, npair, 
wel, KW("obj"), ws, pgcontents, + OPT(ws), OPT(lws), KW("endobj"))); +// H_ARULE(page, CHX(ref, array)); + p_page = page; + + + + /* global parser variables */ p_pdf = pdf; p_pdfdbg = pdfdbg; @@ -1283,6 +2199,10 @@ init_parser(struct Env *aux) p_elemr = h_action(elemr, h_act_flatten, NULL); p_npair = npair; + /* text parser variables */ + p_textbegin = txtbegin; + p_textstream = txtstream; + p_fail = h_nothing_p(); p_epsilon = epsilon; p_return_0 = h_action(epsilon, act_return_uint, (void *)0); @@ -1455,7 +2375,6 @@ resolve(struct Env *aux, const HParsedToken *v) */ #include <limits.h> /* INT_MAX */ -#include <stdlib.h> /* abs() */ #include <zlib.h> #include <err.h> @@ -1883,6 +2802,7 @@ LZWDecode(const Dict *parms, HBytes b, HParser *p) done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1); assert(!done); // XXX ITERATIVE + // SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse res = h_parse(p, pred.out, pred.nout); free(pred.out); @@ -1905,6 +2825,7 @@ RunLengthDecode(const Dict *parms, HBytes b, HParser *p) } assert(res->ast && res->ast->token_type == TT_BYTES); + // SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); return res; @@ -1932,6 +2853,7 @@ ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p) assert(f_res->ast && f_res->ast->token_type == TT_BYTES); fprintf(stdout, "ASCIIHexDecode::string = [%.*s]\n", (int)f_res->ast->bytes.len, (char*)f_res->ast->bytes.token); + // SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse res = h_parse(p, f_res->ast->bytes.token, f_res->ast->bytes.len); if (res == NULL) @@ -1957,6 +2879,7 @@ ASCII85Decode(const Dict *parms, HBytes b, HParser *p) } assert(f_res->ast && f_res->ast->token_type == TT_BYTES); + // SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse res = h_parse(p, f_res->ast->bytes.token, f_res->ast->bytes.len); if (res == NULL) @@ -2002,8 +2925,11 @@ decode_stream(const Dict *d, HBytes b, HParser *p) filter = RunLengthDecode; else if (bytes_eq(v->bytes, "LZWDecode")) filter = LZWDecode; - else - return NULL; /* filter not supported */ + else { /* filter not supported */ + fprintf(stderr, "decode_stream:: Unsupported Filter [%.*s]\n", + (int)v->bytes.len, v->bytes.token); + return NULL; /* Treat the stream as a byte array */ + } v = dictentry(d, "DecodeParms"); if (v && v->token_type == TT_Dict) @@ -2039,6 +2965,9 @@ act_take_bytes(const HParseResult *p, void *env) * allocated only temporarily for the lifetime of the continuation * below. 
*/ + // DEBUG + fprintf (stdout, "act_take_bytes: Current position (bytes)= %p, len=%ld\n", + (void *)bs->token + offset, bs->len); return H_MAKE_BYTES(bs->token + offset, bs->len); } @@ -2078,7 +3007,14 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict) if (bytes_eq(v->bytes, "ObjStm")) return p_objstm__m(mm__, dict); #endif - + if (bytes_eq(v->bytes, "XObject")) { + /* + * TODO:: external objects can be images, forms, or postscript objects + * We are not handling them at the moment + */ + fprintf (stdout, "p_stream_data__m: XObject parsing is not yet supported!\n"); + return NULL; + } return NULL; /* unrecognized type */ } @@ -2099,8 +3035,10 @@ act_ks_value(const HParseResult *p, void *u) if (res == NULL) { HBytes b = {NULL, 0}; const HParsedToken *v = dictentry(spec->dict, "Type"); - if (v != NULL && v->token_type == TT_BYTES) - b = v->bytes; + if (v != NULL && v->token_type == TT_BYTES) { + b.token = v->bytes.token; + b.len = v->bytes.len; + } if (b.len > INT_MAX) b.len = INT_MAX; fprintf(stderr, "parse error in stream (%*s)\n", @@ -2391,6 +3329,384 @@ kxstream(HAllocator *mm__, const HParsedToken *x, void *env) } + + + +/* + * This continuation takes the text stream and saves it in the environment for further + * processing, e.g. writing it out to a file with the same name as the pdf input filename + * but woth a .psectxt suffix. + * It does not consume the string and returns the token as the output. + * + * x = (txtobj ...) + */ +HParser * +ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env) +{ + + struct Env *aux = env; +#if 0 + if (x->token_type != TT_TextEntry) { + fprintf( + stderr, + "ktxtstream:: Unexpected token type =%d :: (Expected TT_TextEntry)\n", + x->token_type); + assert(x->token_type == TT_TextEntry); + return NULL; + } +#endif + assert (x->token_type == TT_SEQUENCE); + int n_tobjs = x->seq->used; + + for (int n=0; n<n_tobjs; n++) { + + assert(x->seq->elements[n]->token_type == TT_TextEntry); + TextEntry *tste = H_CAST(TextEntry, x->seq->elements[n]); + struct textstr *tstr = NULL; + /* + * To save all of the operators along with the text string, we have to walk + * through all of the tokens and keep a table of pointers to them + * For now, just keep a pointer to the text string in the environment + * + */ + switch (tste->type) { + case TW_Tj: + case TW_Tq: + case TW_Tqq: + tstr = &tste->tstr; + break; + case TW_TJ: + tstr = &tste->tarray.flattened; + break; + default: + fprintf(stderr, "ktxtstream:: Text token type '%u' ignored\n", + tste->type); + } + + fprintf(stdout, "ktxtstream: Value = %.*s\n", tstr->nchars, tstr->text); + + + // store the string in the environment + // not sure whether we need to actually store the string in malloc'ed area + // currently, we are reusing the token memory previously created + struct textnode *txtnd = (struct textnode *) malloc( + sizeof(struct textnode)); + txtnd->tstr = tstr; + txtnd->next = NULL; + if (aux->txthead == NULL) + aux->txthead = txtnd; + if (aux->txttail == NULL) + aux->txttail = txtnd; + else { + aux->txttail->next = txtnd; + aux->txttail = txtnd; + } + aux->ntextobjs += 1; + + } + + return p_return__m(mm__, x); +} + + + +/* + * ******************************************************************** + * Start Catalog parsing + * ******************************************************************** + */ + +void parse_pagenode( + struct Env *aux, + PtNode_S *pgNode // node + ) +{ + + Dict *pageD = pgNode->pn.page; + const HParsedToken *contents_t = NULL; // dictionary token + Ref *contents_r = NULL; +// 
const HParsedToken *contents = NULL; // resolved token + XREntry *ent = NULL; + HParseResult *res = NULL; + + + // Hold on to the Resources dictionary + // This dictionary may be empty + // If there is no dictionary ==> inherit resources from parent + // TODO:: Capture a list of fonts associated with this page + pgNode->pn.resources = dictentry(pageD, "Resources"); + + + // Process the contents stream or array (this will need a parser) + contents_t = dictentry(pageD, "Contents"); + if (contents_t == NULL) { + fprintf(stderr, "parse_pagenode: Page node without contents!\n"); + goto end; + } + else if (contents_t->token_type == TT_Ref) { + contents_r = H_CAST(Ref, contents_t); + ent = lookup_xref(aux, contents_r->nr, contents_r->gen); + if (ent->type == XR_INUSE) { + size_t offset = ent->n.offs; + fprintf (stdout, "parse_pagenode:: Offset = %ld\n", offset); + res = h_parse(p_page, aux->input + offset, aux->sz - offset); + fprintf (stdout, "parse_pagenode:: res = %p\n", (void *) res); + } +// contents = resolve(aux, contents_t); + } + else { + fprintf(stderr, "parse_pagenode: Page contents is not a reference ... may be an array!\n"); + // TODO:: Haven't handled this case yet + goto end; + } + +// fprintf(stdout, "parse_pagenode:: Contents token type = %d\n", +// contents->token_type); + + + end: + return; +} + + + +/* + * This helper recursively elaborates the page tree, following the Kids arrays + * from the page tree root down to the individual page nodes. + * + * It identifies text streams and contents streams, saving the information to + * support text extraction in the environment structure. +*/ +// need to maintain information about pages +size_t +parse_pagetree( + struct Env *aux, + PtNode_S *treeNode, + const HParsedToken *ptRef_t, // my page tree node reference + const HParsedToken *kids, // my kids + size_t curr + ) +{ + + PtNode_S *node = NULL; + const HParsedToken *kidRef = NULL; // page tree or page node reference + const HParsedToken *kidDict_t = NULL; + Dict *kidDict = NULL; + const HParsedToken *nType = NULL; // node type + const HParsedToken *meRef_t = NULL; // this page tree node + const HParsedToken *grandKids = NULL; + const HParsedToken *item = NULL; + size_t npages = 0; + Ref *ptRef=NULL, *meRef=NULL; + + + + // get the kids (pgTable) + HCountedArray *pgTable = H_CAST_SEQ(kids); + size_t pgtSz = pgTable->used; + if (curr + pgtSz > aux->catalog.pgCount) { + fprintf(stderr, "parse_pagetree: More kids than specified leaves!\n"); + // TODO:: probably just a warning is enough here -- run the VIOL parser? 
+ } + treeNode->kids = (PtNode_S*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_S)); + + + // Process the kids + for (int i=0; i<pgtSz; i++) + { + node = &treeNode->kids[i]; + kidRef = pgTable->elements[i]; + kidDict_t = resolve(aux, kidRef); // page or tree node dictionary token + kidDict = H_CAST(Dict, kidDict_t); // page or tree node dictionary + + + // check if this is a page node or tree node + // if tree node, call parse_pagetree recursively + // otherwise, process the page + nType = dictentry(kidDict, "Type"); + if ( (nType == NULL) || (nType->token_type != TT_BYTES) ) { + fprintf(stderr, "parse_pagetree: Not a page or page tree node!\n"); + // TODO:: Call VIOL + assert(nType->token_type == TT_BYTES); + } + + // Parent is a required field, for every page & page tree node except root + // this will be passed in to the child + meRef_t = dictentry(kidDict, "Parent"); + if ( (meRef_t == NULL) || (meRef_t->token_type != TT_Ref) ) { + fprintf(stderr, "parse_pagetree: Invalid parent tree node!\n"); + // TODO:: Call VIOL + } + else { + meRef = H_CAST(Ref, meRef_t); + ptRef = H_CAST(Ref, ptRef_t); + if ( (meRef->nr != ptRef->nr) || (meRef->gen != ptRef->gen) ) { + fprintf(stderr, "parse_pagetree: I am not the parent of my child! " + "[kids parent = (%ld, %ld), me = (%ld, %ld)]\n", + meRef->nr, meRef->gen, ptRef->nr, ptRef->gen); + // TODO -- Use VIOL + goto end; + } + } + + // the child node is a page tree node + if (bytes_eq(nType->bytes, "Pages")) { // tree node + /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ + grandKids = dictentry(kidDict, "Kids"); + if (grandKids == NULL) { + fprintf(stderr, "parse_pagetree: page tree node with no grand kids!\n"); + // TODO:: Violation ...this node should not be NULL + goto end; + } + else if (grandKids->token_type != TT_SEQUENCE) { + fprintf(stderr, "parse_pagetree: Token type error!! type = %d\n", grandKids->token_type); + // TODO:: Violation ... + goto end; + } + + + + // Count is a required field + item = dictentry(kidDict, "Count"); + if ( (item == NULL) || (item->token_type != TT_SINT) ) { + fprintf(stderr, "parse_pagetree: Required page node count missing!\n"); + goto end; + } + // verify the count + npages = H_CAST_SINT(item); + if (npages+curr > aux->catalog.pgCount) { + fprintf(stderr, "parse_pagetree: page count greater than anticipated leaves:: " + "computed = %ld, max expected = %ld\n", npages+curr, aux->catalog.pgCount); + // TODO:: probably just a warning is enough here -- run the VIOL parser? 
+ curr += npages; + + + // parse_pagetree + node->type = PG_TREE; + node->parent = meRef_t; + parse_pagetree(aux, node, kidRef, grandKids, curr); + + } + + + // process a page node + else if (bytes_eq(nType->bytes, "Page")) { // page node + node->type = PG_NODE; + node->parent = meRef_t; + node->pn.pageRef = kidRef; + node->pn.page = kidDict; + + parse_pagenode(aux, node); + } + } // end loop + + + + end: + exit(0); + +} + + + + +/* + * This helper starts the process of elaborating the page tree, + * starting from the catalog dictionary referenced by the trailer's Root entry + */ +bool +parse_catalog(struct Env *aux, const HParsedToken *root) +{ + bool success = false; + const HParsedToken *dict_t = NULL; + const Dict *catalog = NULL; + const HParsedToken *ptRef = NULL; // page tree reference + const Dict *ptRoot = NULL; // page tree root Dictionary + const HParsedToken *kids = NULL; + const HParsedToken *item = NULL; + + + // initialize the catalog structure + aux->catalog.catalog = NULL; + aux->catalog.pRoot = NULL; + aux->catalog.pgCount = 0; + + + + + // Ensure the reference is to the catalog dictionary + dict_t = resolve(aux, root); // token + catalog = H_CAST(Dict, dict_t); // catalog dictionary + item = dictentry(catalog, "Type"); + if ( (item == NULL) || (item->token_type != TT_BYTES) || + (! bytes_eq(item->bytes, "Catalog")) ) { + fprintf(stderr, "parse_catalog: Catalog not found!\n"); + goto end; + } + aux->catalog.catalog = dict_t; // catalog dictionary token + + + // Catalog found -- Now get the root of the page tree associated with the catalog + ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary + if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) { + fprintf(stderr, "parse_catalog: Page Tree not found!\n"); + goto end; + } + aux->catalog.pRoot = ptRef; // indirect reference to the page tree + + + /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ + dict_t = resolve(aux, ptRef); // page tree root node + ptRoot = H_CAST(Dict, dict_t); // page tree root dictionary + + // Count is a required field + item = dictentry(ptRoot, "Count"); + if ( (item == NULL) || (item->token_type != TT_SINT) ) { + fprintf(stderr, "parse_catalog: Required page node count missing!\n"); + goto end; + } + else { + aux->catalog.pgCount = H_CAST_SINT(item); + } + + item = dictentry(ptRoot, "Parent"); // root node ==> parent should be NULL + if (item != NULL) { + fprintf(stderr, "parse_catalog: Parent of root page tree node is not NULL [p = %p]!\n", + (void *)item); + goto end; + } + + + // Kids is a required field + kids = dictentry(ptRoot, "Kids"); // array of references to page or page tree nodes + if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) { + fprintf(stderr, "parse_catalog: There are no kids!\n"); + goto end; + } + + // parse_pagetree + aux->catalog.pgTree.type = PG_TREE; + aux->catalog.pgTree.parent = NULL; + parse_pagetree(aux, &aux->catalog.pgTree, ptRef, kids, 0); + + + + end: + exit(0); + return success; +} + +/* + * ******************************************************************** + * End Catalog parsing + * ******************************************************************** + */ + /* * main program */ @@ -2494,6 +3810,52 @@ end: return xrefs; } + + +/* + * This utility extracts the text stream from the global environment and + * writes it out to a file with the same name as the pdf input filename + * but with a .psectxt suffix. 
+ */ +void +text_extract(const struct Env *aux) +{ + fprintf(stdout, "text_extract:: num text objects = %ld\n", aux->ntextobjs); + fprintf(stdout, "text_extract:: %s\n", aux->infile); + + int infnlen = strlen(aux->infile); + int sfxlen = strlen(".psectxt"); + int namelen = infnlen + sfxlen + 1; + + char *outfn = (char *) malloc(sizeof(char) * namelen); + if (outfn == NULL) { + fprintf(stderr, "text_extract:: malloc() failed\n"); + return; + } + memcpy(outfn, aux->infile, infnlen); + memcpy(&outfn[infnlen], ".psectxt", sfxlen); + outfn[namelen-1] = '\0'; // null terminate the string + + // open the file for writing + FILE *stream; + if (!(stream = fopen(outfn, "w"))) { + fprintf(stderr, + "text_extract:: Failed to open file '%s' for writing\n", outfn); + return; + } + struct textnode *curr = aux->txthead; + for (int i = 0; i < aux->ntextobjs; i++) { + fprintf(stdout, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text); + fprintf(stream, "%.*s\n", (int) curr->tstr->nchars, curr->tstr->text); + curr = curr->next; + } + fclose(stream); + free(outfn); + return; +} + + + int main(int argc, char *argv[]) {