diff --git a/pdf.c b/pdf.c index 59ed8b077837d7427c4d07fef5ccd3e0d460324c..c7d99574018a16d7a53491479e120787080407ab 100644 --- a/pdf.c +++ b/pdf.c @@ -255,78 +255,30 @@ typedef HCountedArray Dict; -// Catalog Tree -typedef struct RsrcDict_S { - const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit - const HParsedToken *fonts; // dictonary of fonts used in this page - size_t numFonts; - const HParsedToken *xobj; // xobj used in this page (?? is this <=1??, can page use multiple xobjects??) -// Dict **seenFonts; -// size_t numSeenFonts; -// const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? -// size_t numCmapsSeen; -} RsrcDict_T; - -struct PtNode_S; - -typedef struct PtNode_S { - enum {PG_TREE, PG_NODE, XO_NODE} type; - const HParsedToken *parent; // Type = Page tree -- reference - RsrcDict_T *pgRsrc; // resource structure - const HParsedToken *me; // Reference for me - size_t offset; // - union { - struct { - const Dict *dict; // page node dictionary - const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams - } pn; - struct { - struct PtNode_S *kids; // page table - size_t count; // number of kids - size_t leaves; // number of pages in tree - } pt; - struct { - const Dict *dict; // page node dictionary - const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams - } xn; - }; - -} PtNode_T; - -struct XoNode_S; -typedef struct XoNode_S { - char *name; - PtNode_T *node; - struct XoNode_S *next; -} XoNode_T; - - -typedef struct Catalog_S { - const HParsedToken *catalog; // reference - const HParsedToken *pRoot; // reference - PtNode_T pgTree; // page tree - size_t pgCount; // page tree node count - XoNode_T xObjs; // list of XObjects - XoNode_T *xoHead; - XoNode_T *xoTail; - size_t xoCount; // number of xobjects -} Catalog_T; // Forward declaration of text extraction related structures struct textnode; struct textstr; struct TextEntry_S; +struct PtNode_S; // *********************************************************** /* * Text data structures */ +typedef struct RsrcDict_S { + const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit + const HParsedToken *fonts; // dictonary of fonts used in this page + size_t numFonts; + const HParsedToken *xobj; // xobj used in this page (?? is this <=1??, can page use multiple xobjects??) +// Dict **seenFonts; +// size_t numSeenFonts; +// const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? +// size_t numCmapsSeen; +} RsrcDict_T; + -typedef struct { - struct PtNode_S *page; - struct TextEntry_S *font; -} TextState_T; struct textnode { @@ -345,7 +297,9 @@ struct textpos { double tx; double ty; }; + struct textmat { double cell[6]; }; + struct textstr { uint8_t *text; uint32_t nchars; @@ -354,8 +308,8 @@ struct textstr { struct textwfmt { /* text with formatting specifications */ - double aw; /* word spacing */ - double ac; /* character spacing */ + double aw; /* word spacing */ + double ac; /* character spacing */ struct textstr tstr; /* the string */ }; struct tarrayelt { @@ -371,6 +325,17 @@ struct textarray { struct textstr flattened; }; +typedef struct { + struct PtNode_S *node; + struct TextEntry_S *font; // font name reference + double char_spacing; // width adjustment for characters + double word_spacing; // width adjustment for the space character + double line_spacing; // line adjustment + double horiz_scaling; // line adjustment + double font_size; // font size + struct textpos curr_pos; // text position on page +} TextState_T; + /* operator:: @@ -382,7 +347,8 @@ typedef struct TextEntry_S { enum {TS_Tc, TS_Tw, TS_Tz, TS_TL, TS_Tf, TS_Tr, TS_Ts, TP_Td, TP_TD, TP_Tm, TP_Tstar, TW_Tj, TW_Tq, TW_Tqq, TW_TJ} type; - TextState_T ts; // text state associated with this string (TBD: other state attributes) +// TextState_T ts; // text state associated with this string (TBD: other state attributes) + struct PtNode_S *node; // page or XObject node the string is associated with union { double value; /* many just have a value */ uint8_t mode; /* text mode */ @@ -397,15 +363,64 @@ typedef struct TextEntry_S { } TextEntry; // text object entries +// Page Node +typedef struct PtNode_S { + enum {PG_TREE, PG_NODE, XO_NODE} type; + struct PtNode_S *parent_n; // reference to the parent node struct + const HParsedToken *parent_t; // Type = Page tree -- reference + const HParsedToken *me_t; // Reference for me (*(Ref*)me->user ==> Ref structure) + unsigned int nodeNum; // number in my parent's kids array + RsrcDict_T *pgRsrc; // resource structure + struct textpos mediaBox; // layout page dimensions + TextState_T ts; // text state associated with this node + size_t offset; // This may not be useful -- often within encoded streams + struct { + unsigned int width; // page width described by MediaBox + unsigned int height; // page height described by MediaBox + } pagesz; + union { + struct { + const Dict *dict; // page node dictionary + unsigned int page_num; // number based on pgCount in the Catalog + const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams + } pn; + struct { + struct PtNode_S *kids; // page tree array + unsigned int numPages; // number of kids + } pt; + struct { + const Dict *dict; // page node dictionary + struct textpos curr_pos; // text position on page + const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams + } xn; // content within an xobject + }; + +} PtNode_T; + +struct XoNode_S; +typedef struct XoNode_S { + char *name; + PtNode_T *node; + struct XoNode_S *next; +} XoNode_T; + + +typedef struct Catalog_S { + const HParsedToken *catalog; // reference + const HParsedToken *pRoot; // reference + PtNode_T pgTree; // page tree + unsigned int lastPage; // incremented on visit of a page node + unsigned int lastNode; // incremented on visit of any PtNode_T + + // TODO:: Determine if this should be moved to the PageTree + // XoNode is one type of PageNode + XoNode_T xObjs; // list of XObjects + XoNode_T *xoHead; + XoNode_T *xoTail; + unsigned int xoCount; // number of xobjects -- not sure we use this +} Catalog_T; + -#if 0 -// Haven't used this type yet - maybe OBE -typedef struct { - struct textmat fm; /* font matrix associated with this text object */ - TextEntry **ops; /* operators associated w/string */ - uint8_t *txt; /* the string associated with this object */ -} TextString; -#endif // Initial use -- object streams typedef struct { @@ -447,12 +462,12 @@ struct Env { struct textnode *txttail; /* parsed text objects from the file */ size_t ntextobjs; - Catalog_T catalog; /* Catalog object and document structure */ - TextState_T tstate; /* current text state */ + Catalog_T catalog; /* Catalog object and document structure */ + PtNode_T *curr_node; /* current text state */ unsigned int paren_nest_depth; /* String nesting depth */ unsigned int array_nest_depth; /* Array nesting depth */ - unsigned int dict_nest_depth; /* Dictionary nesting depth */ + unsigned int dict_nest_depth; /* Dictionary nesting depth */ }; @@ -1477,9 +1492,9 @@ void pp_fontstate(FILE *stream, const TextState_T *state) { assert(state); fprintf(stream, "\nFont State: Page = "); - if ( (state->page) && - ( (state->page->type == PG_NODE) || (state->page->type == XO_NODE) ) ) - pp_ref(stream, state->page->me, 0, 0); + if ( (state->node) && + ( (state->node->type == PG_NODE) || (state->node->type == XO_NODE) ) ) + pp_ref(stream, state->node->me_t, 0, 0); else fprintf(stream, ", Text not associated with a page or xobject\n"); if (state->font) // not all operators need or set this @@ -1621,8 +1636,7 @@ act_Tc_op(const HParseResult *p, void *u) txte->value = tval->dbl; // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Tc_op:: %3.3f\n", txte->value); return H_MAKE(TextEntry, txte); @@ -1645,8 +1659,7 @@ act_Tw_op(const HParseResult *p, void *u) txte->value = H_FIELD_DOUBLE(0); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Tw_op:: %3.3f\n", txte->value); return H_MAKE(TextEntry, txte); @@ -1655,7 +1668,7 @@ act_Tw_op(const HParseResult *p, void *u) /* - * Tz operator: horizintal scaling specification + * Tz operator: horizontal scaling specification * H_ARULE(Tz_op, SEQ(tnumb, ws, LIT("Tz"))); // 9.3.4 - horizontal scaling */ HParsedToken * @@ -1670,8 +1683,7 @@ act_Tz_op(const HParseResult *p, void *u) txte->value = H_FIELD_DOUBLE(0); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Tz_op:: %3.3f\n", txte->value); return H_MAKE(TextEntry, txte); @@ -1695,8 +1707,7 @@ act_TL_op(const HParseResult *p, void *u) txte->value = H_FIELD_DOUBLE(0); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_TL_op:: %3.3f\n", txte->value); return H_MAKE(TextEntry, txte); @@ -1712,8 +1723,8 @@ act_TL_op(const HParseResult *p, void *u) HParsedToken * act_Tf_op(const HParseResult *p, void *u) { - TextEntry *txte = H_ALLOC(TextEntry); - struct Env *aux = (struct Env*)u; + TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; const HParsedToken *fn_token = H_FIELD_TOKEN(0); txte->type = TS_Tf; @@ -1729,27 +1740,12 @@ act_Tf_op(const HParseResult *p, void *u) txte->fref.fontsize = (double) H_FIELD_UINT(1); else if (tokenType == TT_DOUBLE) txte->fref.fontsize = (double) H_FIELD_DOUBLE(1); - //else - // fprintf(stderr, "act_Tf_op: Unexpected token type for fontsize - token_type=%u\n", - // tokenType); + // associate the text with the current state + txte->node = aux->curr_node; - // save this foont as the current state to be used by subsequent text - const HParsedToken * restok = H_MAKE(TextEntry, txte); - aux->tstate.font = txte; - // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; // recursive :-) defn - - // DEBUG - //fprintf(stdout, "act_Tf_op: fn=%.*s, fontsize=%3.3f, fontstate=%p, page=", - // txte->fref.namelen, txte->fref.fontname, txte->fref.fontsize, (void*)txte); - //if (aux->tstate.page->type==PG_NODE) - // pp_ref(stdout, aux->tstate.page->me, 0, 0); - //fprintf(stdout, "\n"); - - return ((HParsedToken *)restok); + return H_MAKE(TextEntry, txte); } @@ -1781,8 +1777,7 @@ act_Tr_op(const HParseResult *p, void *u) txte->mode = H_FIELD_UINT(0); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Tr_op:: %d\n", txte->mode); return H_MAKE(TextEntry, txte); @@ -1805,8 +1800,7 @@ act_Ts_op(const HParseResult *p, void *u) txte->value = H_FIELD_DOUBLE(0); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Ts_op:: %3.3f\n", txte->value); return H_MAKE(TextEntry, txte); @@ -1838,8 +1832,7 @@ act_Td_op(const HParseResult *p, void *u) // associate the text with the current state // NOTE: This operator does not require a font - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Td_op: text position ::tx=%.3f:ty=%.3f\n", // txte->pos.tx, txte->pos.ty); @@ -1867,8 +1860,7 @@ act_TD_op(const HParseResult *p, void *u) txte->pos.ty = H_FIELD_DOUBLE(1); // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_TD_op: text position ::tx=%3.3f:ty=%3.3f\n", txte->pos.tx, txte->pos.ty); //pp_fontstate(stdout, &txte->ts); @@ -1885,6 +1877,7 @@ HParsedToken * act_Tm_op(const HParseResult *p, void *u) { TextEntry *txte = H_ALLOC(TextEntry); + struct Env *aux = (struct Env*)u; txte->type = TP_Tm; @@ -1897,6 +1890,9 @@ act_Tm_op(const HParseResult *p, void *u) txte->fm.cell[i] = p->ast->seq->elements[0]->seq->elements[i]->seq->elements[0]->dbl; + // associate the text with the current state + txte->node = aux->curr_node; + //fprintf(stdout, "act_Tm_op: text matrix ::\n"); //for (int i=0; i<3; i++) // fprintf(stdout, "%3.3f : %3.3f\n", txte->fm.cell[i*2], txte->fm.cell[i*2+1]); @@ -1920,8 +1916,7 @@ act_Tstar_op(const HParseResult *p, void *u) txte->value = 0; // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "act_Tstar_op: position pointer\n"); //pp_fontstate(stdout, &txte->ts); @@ -1956,8 +1951,7 @@ act_Tj_op(const HParseResult *p, void *u) txte->tstr.nchars = tstr->bytes.len; // associate the text with the current state - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "\nact_Tj_op:: nchars=%u, txt=%.*s\n", txte->tstr.nchars, @@ -1987,8 +1981,7 @@ act_TsingleQ_op(const HParseResult *p, void *u) txte->tstr.nchars = tstr->bytes.len; // associate the text wth the current font - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; //fprintf(stdout, "\nact_TsingleQ_op:: nchars=%u, txt=%.*s\n", txte->tstr.nchars, @@ -2023,8 +2016,7 @@ act_TdoubleQ_op(const HParseResult *p, void *u) txte->twfmt.tstr.nchars = tstr->bytes.len; // associate the text wth the current font - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; @@ -2056,8 +2048,7 @@ act_TJ_op(const HParseResult *p, void *u) // associate the text wth the current font - txte->ts.page = aux->tstate.page; - txte->ts.font = aux->tstate.font; + txte->node = aux->curr_node; /* * Parse each element of the array @@ -2081,14 +2072,6 @@ act_TJ_op(const HParseResult *p, void *u) txte->tarray.elts[i].isStr = true; txte->tarray.flattened.nchars += txte->tarray.elts[i].tstr.nchars; - - // Debug -// fprintf(stdout, "act_TJ_op:Cumulative=%d/0x%x bytes, Additional:%d bytes\n", -// txte->tarray.flattened.nchars, txte->tarray.flattened.nchars, txte->tarray.elts[i].tstr.nchars); - //fprintf(stdout, "act_TJ_op::: Using font= %p - page=", txte->ts.font); - //pp_ref(stdout, txte->ts.page->me, 0, 0); - //fprintf(stdout, "\nact_TJ_op:: nchars=%u, txt=%.*s\n", txte->tarray.elts[i].tstr.nchars, - // txte->tarray.elts[i].tstr.nchars, txte->tarray.elts[i].tstr.text); break; default: log_message(SEV_DONTCARE, "act_TJ_op:: Unexpected element type :: %d\n", elt->seq->elements[0]->token_type); @@ -2123,24 +2106,57 @@ act_TJ_op(const HParseResult *p, void *u) /* * Parse the text object delimited by "BT" and "ET" + * Text Matrix and Text Line Matrix are reinitialized at each invocation + * + * TODO:: Maintain line number to vertically (horizontally) sort the text */ HParsedToken * act_txtobj(const HParseResult *p, void *u) { - - //fprintf(stdout, "act_txtobj:: Here\n"); - - assert(p->ast->token_type == TT_SEQUENCE); - + // we will need to handle the case when the text string is longer than 8192 characters + struct Env *aux = (struct Env*)u; TextEntry *txtobj = H_ALLOC(TextEntry); TextEntry *txte = NULL; const HParsedToken *opstream = H_INDEX_TOKEN(p->ast, 1); const HParsedToken *tt_text=NULL; uint8_t *tstr=NULL; int textlen=0; + struct textmat tm; + PtNode_T *node = aux->curr_node; + double cs = node->ts.char_spacing; + double ws = node->ts.word_spacing; + double ls = node->ts.line_spacing; + double *px=&tm.cell[4]; + double *py=&tm.cell[5]; + + // initialize the text matrix + tm.cell[0] = 1; + tm.cell[1] = 0; + tm.cell[2] = 0; + tm.cell[3] = 1; + tm.cell[4] = 0; + tm.cell[5] = 0; + + { // debug + if (node->type == PG_NODE) { + fprintf(stdout, "/nPage Num = %d, ", node->pn.page_num); + } + else if (node->type == XO_NODE) { + fprintf(stdout, "Is a XO_NODE:\n"); + h_pprintln(stdout, node->me_t); + } + fprintf(stdout, "Starting Position: (x, y) = (%f, %f), Font = %s\n", + node->ts.curr_pos.tx, node->ts.curr_pos.ty, + (node->ts.font)?node->ts.font->fref.fn:"null"); + } + // if the current page/XObject is not set, return a NULL token + if (aux->curr_node == NULL) { + return ((HParsedToken *) NULL); + } + + // TODO:: Handle non-horizontal text - //fprintf(stdout, "act_txtobj:: numtokens = %lu\n", opstream->seq->used); // Walk through the tokens to determine how much space to allocate // Count the number of characters in the stream @@ -2148,75 +2164,202 @@ act_txtobj(const HParseResult *p, void *u) for (int i =0; i < opstream->seq->used; i++) { txte = H_CAST(TextEntry, opstream->seq->elements[i]); - // Process the text showing operators + + // make sure we are working on the same node as the current node + assert(txte && (txte->node == node)); + switch (txte->type) { - case TP_Td: - case TP_TD: + // text state operators + case TS_Tc: + node->ts.char_spacing = txte->value; + break; + + case TS_Tw: + node->ts.word_spacing = txte->value; + break; + + case TS_Tz: + node->ts.horiz_scaling = txte->value; + break; + + case TS_TL: + node->ts.line_spacing = txte->value; + break; + + case TS_Tf: + node->ts.font = txte; + node->ts.font_size = txte->fref.fontsize; + break; + + + // text positioning and showing operators + case TP_TD: + node->ts.line_spacing = txte->pos.ty; + case TP_Td: + if ( (*px == 0.0) && (*py == 0.0) ) { // initialize + *px = txte->pos.tx; + *py = txte->pos.ty; + // check to see if we are starting a new line + if ( (node->ts.curr_pos.ty != 0.0) && + (node->ts.curr_pos.ty != *py) ) { + textlen += 1; // add a newline + } + } else { + if (txte->pos.ty != 0.0) { + //we are not rendering -- we just know it is not in the same line if y not equal + textlen += 1; // add a newline + *py -= txte->pos.ty; // should this be a +=?? + } + if (txte->pos.tx) { // handle x -- when should we add a space + // TODO:: handle x -- not sure .. for now, ignore + *px += txte->pos.tx; + } + } + break; case TP_Tstar: - textlen += 1; + *py -= node->ts.line_spacing; + textlen += 1; break; + case TW_Tqq: - textlen += 1; + node->ts.word_spacing = txte->twfmt.aw; + node->ts.char_spacing = txte->twfmt.ac; + case TW_Tq: + *py -= node->ts.line_spacing; + textlen += 1; + case TW_Tj: + textlen += txte->tstr.nchars; + *px += txte->tstr.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description + break; + case TW_TJ: textlen += txte->tarray.flattened.nchars; + *px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description break; - case TW_Tq: - textlen += 1; - case TW_Tj: - textlen += txte->tstr.nchars; - break; - break; default: ; // ignore } } + + // Are we within the page bounds? If not, generate a warning + if ( (*px < 0) || (*px > aux->curr_node->mediaBox.tx) ) { + fprintf (stdout, "Final position of the text string is outside media box bounds.\n" + "Media Box-page width=%f, String end position-@width=%f\n", + aux->curr_node->mediaBox.tx, *px); + } + if ( (*py < 0) || (*py > aux->curr_node->mediaBox.ty) ) { + fprintf (stdout, "Final position of the text string is outside media box bounds.\n" + "Media Box-page height=%f, String end position-@height=%f\n", + aux->curr_node->mediaBox.tx, *px); + } + + // reset text state + *px = *py = 0.0; + node->ts.char_spacing = cs; + node->ts.word_spacing = ws; + node->ts.line_spacing = ls; + tstr = h_arena_malloc(p->arena, sizeof(uint8_t) * textlen); int idx=0; TextState_T *ts; // Now concatenate the pieces for (int i =0; i < opstream->seq->used; i++) { txte = H_CAST(TextEntry, opstream->seq->elements[i]); - ts = &txte->ts; + ts = &txte->node->ts; - // Process the text operators + { // debug + fprintf(stdout, "Position: (x, y) = (%f, %f), Font = %s\n", *px, *py, + ts->font->fref.fn); + } + + // Process the text operators switch (txte->type) { - case TP_Td: - case TP_TD: - case TP_Tstar: - tstr[idx] = '\n'; - idx += 1; + // text state operators + case TS_Tc: + node->ts.char_spacing = txte->value; + break; + + case TS_Tw: + node->ts.word_spacing = txte->value; + break; + + case TS_Tz: + node->ts.horiz_scaling = txte->value; + break; + + case TS_TL: + node->ts.line_spacing = txte->value; + break; + + case TS_Tf: + ts->font = txte; + node->ts.font_size = txte->fref.fontsize; + break; + + + // text positioning and showing operators + case TP_TD: + case TP_Td: + if ( (*px == 0.0) && (*py == 0.0) ) { // initialize + *px = txte->pos.tx; + *py = txte->pos.ty; + if ( (node->ts.curr_pos.ty != 0.0) && + (node->ts.curr_pos.ty != *py) ) { + tstr[idx] = '\n'; idx += 1; + } + } else { + if (txte->pos.ty != 0.0) { + tstr[idx] = '\n'; idx += 1; + *py -= txte->pos.ty; // should this be a +=?? + } + if (txte->pos.tx) { // handle x -- when should we add a space + // TODO:: handle x -- not sure .. for now, ignore + *px += txte->pos.tx; + } + } + break; + + case TP_Tstar: + tstr[idx] = '\n'; idx += 1; + *py -= node->ts.line_spacing; break; - case TW_Tqq: - tstr[idx] = '\n'; - idx += 1; - case TW_TJ: - memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars); - idx += txte->tarray.flattened.nchars; - //fprintf(stdout, "act_txtobj - array:: len=%u, str=", txte->tarray.flattened.nchars); - //fwrite((const void *)txte->tarray.flattened.text, (int) txte->tarray.flattened.nchars, 1, stdout); - //pp_fontstate(stdout, ts); - break; - case TW_Tq: - tstr[idx] = '\n'; - idx += 1; + case TW_Tqq: + node->ts.word_spacing = txte->twfmt.aw; + node->ts.char_spacing = txte->twfmt.ac; + case TW_Tq: + *py -= node->ts.line_spacing; + tstr[idx] = '\n'; idx += 1; + case TW_Tj: + memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars); + idx += txte->tstr.nchars; + *px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description + break; + + + case TW_TJ: + memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars); + idx += txte->tarray.flattened.nchars; + *px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description + break; - case TW_Tj: - memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars); - idx += txte->tstr.nchars; - //fprintf(stdout, "act_txtobj:: len=%u, str=", txte->tstr.nchars); - //fwrite((const void *)txte->tstr.text, (int) txte->tstr.nchars, 1, stdout); - //pp_fontstate(stdout, ts); - break; default: ; // ignore } } assert(idx == textlen); + // update the position on the page + node->ts.curr_pos.tx = *px; + node->ts.curr_pos.ty = *py; + + { // debug + fprintf(stdout, "Ending Position: (x, y) = (%f, %f), Font = %s\n", *px, *py, + node->ts.font->fref.fn); + } txtobj->type = TW_Tj; txtobj->obj = opstream; @@ -2224,25 +2367,14 @@ act_txtobj(const HParseResult *p, void *u) txtobj->tstr.nchars = textlen; txtobj->tstr.tobj = opstream; if (textlen) { - txtobj->ts.page = ts->page; - txtobj->ts.font = ts->font; + txtobj->node = ts->node; } else { - txtobj->ts.page = NULL; - txtobj->ts.font = NULL; + txtobj->node = NULL; } // pretty print the information tt_text = H_MAKE(TextEntry, txtobj); - // DEBUG - /* - if (textlen) { - fprintf(stdout, "act_txtobj:: "); - pp_textentry(stdout, tt_text, 0, 0); - if (&txtobj->ts.page) - pp_fontstate(stdout, &txtobj->ts); - } - */ return (HParsedToken *)tt_text; } @@ -2250,7 +2382,7 @@ act_txtobj(const HParseResult *p, void *u) /* * This continuation takes the text stream and saves it in the environment for further * processing, e.g. writing it out to a file with the same name as the pdf input filename - * but woth a .psectxt suffix. + * but with a .psectxt suffix. * It does not consume the string and returns the parser as the output. * * x = (txtobj ...) @@ -2415,10 +2547,10 @@ text_extract(struct Env *aux) break; } if (tstr) { - ft = lookup_font(&txte->ts, aux); + ft = lookup_font(&txte->node->ts, aux); if (ft) { //pp_fontinfo(stdout, &txte->ts, ft); - pp_fontinfo(stream, &txte->ts, ft); + pp_fontinfo(stream, &txte->node->ts, ft); } else { char *estr = "\nMissing Font Info!!\n"; @@ -2915,9 +3047,11 @@ init_parser(struct Env *aux) H_RULE(txtbegin, h_indirect()); H_RULE(txt_before_junk, IGN(SEQ(h_not(LIT("BT")), CHX(comment, h_uint8())))); - H_ARULE(txtbegin_, SEQ(IGN(h_many(txt_before_junk)), LIT("BT"), aws)); + HParser *txtbegin_ = h_action(SEQ(IGN(h_many(txt_before_junk)), LIT("BT"), aws), act_txtbegin_, aux); /* Text Begin */ +// H_ARULE(txtbegin_, SEQ(IGN(h_many(txt_before_junk)), LIT("BT"), aws)); h_bind_indirect(txtbegin, txtbegin_); - H_ARULE(txtend, KW("ET")); + HParser *txtend = h_action(KW("ET"), act_txtend, aux); +// H_ARULE(txtend, KW("ET")); /* 9.3 - Text state operators */ H_AVRULE(tnumb, numb); HParser *Tc_op = h_action(SEQ(tnumb, aws, LIT("Tc"), aws), act_Tc_op, aux); /* 9.3.2 - charSpace */ @@ -2953,7 +3087,7 @@ init_parser(struct Env *aux) H_RULE(text_ops, CHX(textstate_ops, textpos_ops, textshow_ops, text_inbetween_junk)); /* Text object */ - H_ARULE(txtobj, SEQ(txtbegin, h_many(text_ops), txtend)); + HParser *txtobj = h_action(SEQ(txtbegin, h_many(text_ops), txtend), act_txtobj, aux); H_RULE(txtobjs, h_many1(txtobj)); @@ -4164,7 +4298,7 @@ get_fontdict(const HParsedToken *obj, struct Env* aux) const HParsedToken * get_dictoftype( const HParsedToken *obj, - const HParsedToken *pRefT, + const HParsedToken *parent_t, char *value, struct Env *aux) { @@ -4174,7 +4308,7 @@ get_dictoftype( if (obj->token_type == TT_Dict) { dict = H_CAST(Dict, obj); - if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) + if (is_parent(dict, parent_t) && has_value(dict, "Type", value)) tok = obj; else dict = NULL; @@ -4187,7 +4321,7 @@ get_dictoftype( const HParsedToken *sitem = resolve_item(aux, stm->tok[i].obj, &ioff, p_objdef); if ((sitem) && (sitem->token_type == TT_Dict)) { dict = H_CAST(Dict, sitem); - if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) { + if (is_parent(dict, parent_t) && has_value(dict, "Type", value)) { tok = sitem; break; } @@ -4196,34 +4330,10 @@ get_dictoftype( } } } - /* - else { - fprintf(stdout, "get_dictoftype: token type not yet handled: %u\n", - obj->token_type); - fprintf(stdout, "get_dictoftype: Possibly needed for CMAPS\n"); - h_pprintln(stdout, obj); - } - */ if (dict == NULL) tok = NULL; - // DEBUG - /* - if (pRefT) { - fprintf(stdout, "\nget_dictoftype: Parent = "); - pp_ref(stdout, pRefT, 0, 0); - } - else - fprintf(stdout, "get_dictoftype: Parent = NULL"); - if (tok) { - fprintf(stdout, "\nget_dictoftype: Type = %s\n", value); - pp_dict(stdout, tok, 0, 0); - } - else { - fprintf(stdout, "\nget_dictoftype: Null dictionary of Type = %s\n", value); - } - */ return tok; } @@ -4344,6 +4454,8 @@ kcontentstream(HAllocator *mm__, const HParsedToken *x, void *env) } +#if 0 +// unused function const HParsedToken *create_strmdict(HArena *arena, size_t len) { @@ -4360,6 +4472,7 @@ HParsedToken *create_strmdict(HArena *arena, size_t len) tok = res->ast; return tok; } +#endif Fontinfo_T * getFontinfo(const Dict *fontdict, char *name, struct Env *aux) @@ -4504,7 +4617,7 @@ lookup_font(TextState_T *state, struct Env *aux) Dict *fontlist; // font list dictionary in page Fontinfo_T *fontinfo = NULL; - PtNode_T *page = state->page; + PtNode_T *page = state->node; assert ( (page->type == PG_NODE) || (page->type == XO_NODE) ); struct TextEntry_S *fentry = state->font; if (fentry) { @@ -4584,7 +4697,7 @@ bool parse_fonts(const HParsedToken *dict_t, RsrcDict_T *pgRsrc, struct Env *aux return processed; } -void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux); +RsrcDict_T *parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux); XoNode_T* @@ -4638,14 +4751,15 @@ void parse_xobject( Dict *xolist = H_CAST(Dict, item); - // DEBUG - fprintf(stdout, "\nparse_xobject: Node for Parent = "); - if (parent->me) pp_ref(stdout, parent->me, 0, 0); - fprintf(stdout, "\nOld XO Count = %lu, Num xobjects used in page = %lu \n", - aux->catalog.xoCount, xolist->used); - pp_dict(stdout, item, 0, 0); - // DEBUG - + { + // DEBUG + fprintf(stdout, "\nparse_xobject: Node for Parent = "); + if (parent->me_t) pp_ref(stdout, parent->me_t, 0, 0); + fprintf(stdout, "\nOld XO Count = %u, Num xobjects used in page = %lu \n", + aux->catalog.xoCount, xolist->used); + pp_dict(stdout, item, 0, 0); + // DEBUG + } aux->catalog.xoCount += xolist->used; // work on each element of the dictionary for (int i=0; i<xolist->used; i++) { @@ -4658,11 +4772,12 @@ void parse_xobject( const HParsedToken *ref = H_INDEX_TOKEN(xolist->elements[i], 1); assert(ref->token_type == TT_Ref); xobj_r->node = h_arena_malloc(xodict->arena, sizeof(PtNode_T)); - xobj_r->node->type = XO_NODE; - xobj_r->node->parent = parent->me; - xobj_r->node->pgRsrc = pgRsrc; - xobj_r->node->me = ref; - xobj_r->node->offset = 0; // TODO: get the offset to the stream + xobj_r->node->type = XO_NODE; + xobj_r->node->parent_n = parent; + xobj_r->node->parent_t = parent->me_t; + xobj_r->node->pgRsrc = pgRsrc; + xobj_r->node->me_t = ref; + xobj_r->node->offset = 0; // TODO: get the offset to the stream tok = resolve_item(aux, ref, &xobj_r->node->offset, p_objdef); if (tok == NULL) continue; @@ -4711,12 +4826,15 @@ void parse_xobject( xobj_r->node->xn.textStream = NULL; continue; } + { // DEBUG + fprintf(stdout, "parse_xobject: **! XObject redefining resources for the page\n"); + } - parse_rsrcdict(xodict->arena, rsrcdict_t, xobj_r->node, aux); + xobj_r->node->pgRsrc = parse_rsrcdict(xodict->arena, rsrcdict_t, xobj_r->node, aux); // set the text state to this xobject // parse the text stream, which is field 2 of the sequence - aux->tstate.page = xobj_r->node; + aux->curr_node = xobj_r->node; xobj_t = H_INDEX_TOKEN(tok, 1); // expecting an HParseResult token const HParseResult *res = H_CAST(HParseResult, xobj_t); // DEBUG @@ -4750,9 +4868,9 @@ void parse_xobject( } -void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux) +RsrcDict_T *parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux) { - RsrcDict_T *rsrc = NULL; + RsrcDict_T *rsrc = NULL; // Process the dictionary @@ -4796,11 +4914,11 @@ void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, } } - pgNode->pgRsrc = rsrc; - return; + return (rsrc); } + void pp_pgrsrc(FILE *stream, const RsrcDict_T *pgRsrc) { if (pgRsrc) { @@ -4818,9 +4936,10 @@ void pp_ptnode(FILE *stream, const PtNode_T *node) { fprintf(stream, "\nPage Tree Node Info:\n"); fprintf(stream, "pp_ptnode: parent = "); - if (node->parent) h_pprintln(stream, node->parent); - fprintf(stream, "\npp_ptnode: me = "); - if (node->me) h_pprintln(stream, node->me); + fprintf(stream, "pp_ptnode: parent_ref = %p\n", node->parent_n); + if (node->parent_t) h_pprintln(stream, node->parent_t); + fprintf(stream, "\npp_ptnode: me_ref = "); + if (node->me_t) h_pprintln(stream, node->me_t); if (node->pgRsrc) pp_pgrsrc(stream, node->pgRsrc); } @@ -4829,58 +4948,102 @@ void parse_pagenode ( PtNode_T *myNode, const HParsedToken *myRef, // my page node reference const Dict *myDict, // my page node specification - const HParsedToken *parent, + const HParsedToken *parent_t, // parent reference token + PtNode_T *parent_n, // my parent node HArena *arena ) { - const HParsedToken *item = NULL; const HParsedToken *contents_t = NULL; // dictionary token - //Ref *contents_r = NULL; const HParsedToken *contents = NULL; // resolved token const HParsedToken *entry = NULL; - const HParsedToken *rsrcdict_t = NULL; - size_t nOffset = 0; - - // DEBUG - //fprintf(stdout, "\nparse_pagenode: parsing Page Node = "); - //pp_ref(stdout, myRef, 0, 0); - - - // set some global state variables - aux->tstate.page = myNode; - myNode->type = PG_NODE; - myNode->me = myRef; - myNode->pn.dict = myDict; + size_t nOffset = 0; + + + // Initialize the page tree node + myNode->type = PG_NODE; + myNode->parent_n = parent_n; // NULL for root + myNode->parent_t = parent_t; + myNode->me_t = myRef; + aux->catalog.lastNode++; // keep track of the node count + aux->catalog.lastPage++; // keep track of the page count + aux->curr_node = myNode; + myNode->ts.node = myNode; // set the current page to me + myNode->ts.font = NULL; + myNode->ts.char_spacing = 0.0; + myNode->ts.word_spacing = 0.0; + myNode->ts.line_spacing = 0.0; + // locate the starting point at top left of a 8.5x11" paper + myNode->ts.curr_pos.tx = 0.0; // 8.5" = 612 points + myNode->ts.curr_pos.ty = 0.0; // 11" = 792 points + myNode->pn.dict = myDict; + myNode->nodeNum = aux->catalog.lastNode; + myNode->pn.page_num = aux->catalog.lastPage; + // initialize by inheriting -- override with local + myNode->pgRsrc = parent_n->pgRsrc; + myNode->mediaBox.tx = parent_n->mediaBox.tx; + myNode->mediaBox.ty = parent_n->mediaBox.ty; + + + + const HParsedToken *item = NULL; + item = dictentry(myDict, "Parent"); + { // Debug + fprintf(stdout, "PageNode:: Parent (from caller) ="); + h_pprintln(stdout, parent_t); + fprintf(stdout, "PageNode:: Parent (from node) ="); + h_pprintln(stdout, item); + fprintf(stdout, "PageNode:: Me ="); + h_pprintln(stdout, myRef); + } + bool matched = false; + if (item && parent_t) { // neither item nor parent_t should be NULL + assert(item->token_type == TT_Ref); + if ( ( ((Ref*)item->user)->nr == ((Ref*)parent_t->user)->nr ) && + ( ((Ref*)item->user)->gen == ((Ref*)parent_t->user)->gen ) ) { + matched = true; + } + } + if (! matched) { + fprintf(stdout, "parse_pagenode: Inconsistent or corrupt parent key!\n"); + fprintf(stdout, "Parent (from caller) =\n"); + h_pprintln(stdout, parent_t); + fprintf(stdout, "Parent (from node) =\n"); + h_pprintln(stdout, item); + + // just continue for now + goto end; + } - item = dictentry(myDict, "Parent"); - assert(item->token_type == TT_Ref); - if ( !( ( ((Ref*)item->user)->nr == ((Ref*)parent->user)->nr ) && - ( ((Ref*)item->user)->gen == ((Ref*)parent->user)->gen ) ) ) { - //fprintf(stderr, "parse_pagenode: Inconsistent parent pointer [p = %p]!\n", - // (void *)item); - // should this just be a warning? - goto end; - } - myNode->parent = item; + // Hold on to the mediaBox (page dimensions)-- required + item = dictentry(myDict, "MediaBox"); // inheritable if not NULL + if (item) { + item = resolve(aux, item); + if (item->token_type == TT_SEQUENCE) { + int llx, lly, urx, ury; // typical but can be any diagonal + llx = H_INDEX_SINT(item, 0); + lly = H_INDEX_SINT(item, 1); + urx = H_INDEX_SINT(item, 2); + ury = H_INDEX_SINT(item, 3); + myNode->mediaBox.tx = abs(llx - urx); + myNode->mediaBox.ty = abs(lly - ury);; + } + } - // Hold on to the Resources dictionary - // This dictionary may be empty - // If there is no dictionary ==> inherit resources from parent - myNode->pgRsrc = NULL; + // Hold on to the Resources dictionary -- required + const HParsedToken *rsrcDictT; item = dictentry(myDict, "Resources"); if (item) { - //fprintf(stdout, "\n\nparse_pagenode: Found resources in node\n"); - rsrcdict_t = resolve(aux, item); - //fprintf(stdout, "\nparse_pagenode: Resource token type = %u\n",rsrcdict_t->token_type); - parse_rsrcdict(arena, rsrcdict_t, myNode, aux); - //pp_ptnode(stdout, myNode); + rsrcDictT = resolve(aux, item); + myNode->pgRsrc = parse_rsrcdict(myDict->arena, rsrcDictT, myNode, aux); } - // Process the contents stream or array + + + // Process the contents stream or array -- optional contents_t = dictentry(myDict, "Contents"); if (contents_t == NULL) { //fprintf(stderr, "parse_pagenode: Page node without contents!\n"); @@ -4895,7 +5058,11 @@ void parse_pagenode ( goto end; } if (contents->token_type == TT_Objstm) { // Resources for the page node - parse_rsrcdict(arena, contents, myNode, aux); + { // DEBUG + fprintf(stdout, "parse_pagenode: **! Contents redefining resources for the page\n"); + } + + myNode->pgRsrc = parse_rsrcdict(arena, contents, myNode, aux); //pp_ptnode(stdout, myNode); } /* Indirect reference to an array, which may in turn have indirect references */ @@ -4979,13 +5146,6 @@ void parse_pagenode ( myNode->pn.textStream = tstrm->ast; } } - else { - //fprintf(stdout, "parse_pagenode: Unexpected page node contents token type = %u\n", contents_t->token_type); - goto end; - } - -// fprintf(stdout, "parse_pagenode:: Contents token type = %d\n", -// contents->token_type); end: @@ -5004,78 +5164,150 @@ end: * identifying text streams and contents streams, saving the information to support * text extraction in the environment structure. * + * Parent reference and parent node are both NULL for the page tree * * A return value of false indicates some parsing error. */ -// need to maintain information about pages void parse_pagetree( struct Env *aux, - PtNode_T *myNode, + PtNode_T *myNode, // my page tree node const HParsedToken *myRef, // my page tree node reference const Dict *myDict, // my page tree specification - const HParsedToken *pRefT, // parent reference token - size_t curr // number of pages seen so far + const HParsedToken *parent_t, // my parent reference token + PtNode_T *parent_n // my parent node ) { - - const HParsedToken *item = NULL; - const HParsedToken *kids = NULL; - PtNode_T *kid = NULL; - const HParsedToken *kidRef = NULL; // page tree or page node reference - const HParsedToken *kidDict_t = NULL; - const HParsedToken *pageDict_t = NULL; - const HParsedToken *treeDict_t = NULL; - const Dict *kidDict = NULL; - const HParsedToken *rsrcdict_t = NULL; - - - //fprintf(stdout, "\nparse_pagetree: parsing Page Tree Node = "); - //pp_ref(stdout, myRef, 0, 0); - - + // Initialize the page tree node myNode->type = PG_TREE; + myNode->parent_n = parent_n; // NULL for root + myNode->parent_t = parent_t; + myNode->me_t = myRef; + aux->catalog.lastNode++; // keep track of the node count + myNode->nodeNum = aux->catalog.lastNode; + if (parent_n) { // inheritable if in a tree node + myNode->pgRsrc = parent_n->pgRsrc; + myNode->mediaBox.tx = parent_n->mediaBox.tx; + myNode->mediaBox.ty = parent_n->mediaBox.ty; + } + else { + myNode->pgRsrc = NULL; + myNode->mediaBox.tx = 0; + myNode->mediaBox.ty = 0; + } + myNode->offset = 0; - item = dictentry(myDict, "Parent"); // if root node ==> parent should be NULL - myNode->parent = item; - // Count is a required field except for the root + // make sure that the parent node matches + const HParsedToken *item = NULL; + item = dictentry(myDict, "Parent"); // if root node ==> parent should be NULL + { // Debug + h_pprintln(stdout, NULL); // will this generate an exception?? + fprintf(stdout, "PageTree:: Parent (from caller) ="); + h_pprintln(stdout, parent_t); + fprintf(stdout, "PageTree:: Parent (from node) ="); + h_pprintln(stdout, item); + } + bool matched = false; + if (item && parent_t) { + if ( ( ((Ref*)item->user)->nr == ((Ref*)parent_t->user)->nr ) && + ( ((Ref*)item->user)->gen == ((Ref*)parent_t->user)->gen ) ) { + matched = true; + } + } + else { // either the parent is the page tree root or the node does not have a valid Parent key + if (item == parent_t) {// if item is NULL, we expect parent_t to also be NULL (==> root) + matched = true; + } + } + if (! matched) { + fprintf(stdout, "parse_pagetree: Inconsistent or corrupt parent key!\n"); + fprintf(stdout, "Parent (from caller) =\n"); + h_pprintln(stdout, parent_t); + fprintf(stdout, "Parent (from node) =\n"); + h_pprintln(stdout, item); + + // just continue for now + goto end; + } + + + // Count is a required field item = dictentry(myDict, "Count"); - if ( (item == NULL) || (item->token_type != TT_SINT) ) { - //fprintf(stderr, "parse_pagetree: Required page node count missing!\n"); - goto end; // This should just be a warning + bool parsed = false; + if (item) { + item = resolve(aux, item); // anything can be a ref + if ( item && (item->token_type == TT_SINT) ) parsed = true; } - else { - myNode->pt.leaves = H_CAST_SINT(item); - if (aux->catalog.pgCount == 0) - aux->catalog.pgCount = myNode->pt.leaves; + if ( ! parsed ) { + fprintf(stdout, "parse_pagetree: Required page node count missing or corrupt!\n"); + goto end; // Should just be a warning? } + myNode->pt.numPages = H_CAST_SINT(item); + + + // page tree node may have resources and media box that are inheritable + item = dictentry(myDict, "MediaBox"); // inheritable if not NULL + if (item) { + item = resolve(aux, item); + if (item->token_type == TT_SEQUENCE) { + int llx, lly, urx, ury; // typical but can be any diagonal + llx = H_INDEX_SINT(item, 0); + lly = H_INDEX_SINT(item, 1); + urx = H_INDEX_SINT(item, 2); + ury = H_INDEX_SINT(item, 3); + myNode->mediaBox.tx = abs(llx - urx); + myNode->mediaBox.ty = abs(lly - ury);; + } + } + + const HParsedToken *rsrcDictT = NULL; + item = dictentry(myDict, "Resources"); // inheritable if not NULL + if (item) { + rsrcDictT = resolve(aux, item); + myNode->pgRsrc = parse_rsrcdict(myDict->arena, rsrcDictT, myNode, aux); + } - // Kids is a required field - kids = dictentry(myDict, "Kids"); // array of references to page or page tree nodes - if ( (kids == NULL) || (kids->token_type != TT_SEQUENCE) ) { - //fprintf(stderr, "parse_pagetree: This tree node has no pages!\n"); - goto end; // Nothing more to do here - } + // Kids array is a required field -- process the kids (pgTable) + const HParsedToken *kids = NULL; + parsed = false; + kids = dictentry(myDict, "Kids"); // array of references to page or page tree nodes + if (kids) { + kids = resolve(aux, kids); + if (kids->token_type == TT_SEQUENCE) parsed = true; + } + if ( ! parsed ) { + fprintf(stdout, "parse_pagetree: This tree node has no pages!\n"); + goto end; // Nothing more to do here + } + PtNode_T *kid = NULL; + const HParsedToken *kidRef = NULL; // page tree or page node reference + const HParsedToken *kidDict_t = NULL; + const HParsedToken *pageDict_t = NULL; + const HParsedToken *treeDict_t = NULL; + const Dict *kidDict = NULL; - // get the kids (pgTable) HCountedArray *pgTable = H_CAST_SEQ(kids); size_t pgtSz = pgTable->used; + if (pgtSz != myNode->pt.numPages) { + fprintf(stdout, "parse_pagetree: Size mismatch:: " + "Kids array size = %lu, Page Count in node = %u\n", + pgtSz, myNode->pt.numPages); + // continue processing the pages + } myNode->pt.kids = (PtNode_T*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_T)); - myNode->pt.count = pgtSz; // Process the kids for (int i=0; i<pgtSz; i++) { kid = &myNode->pt.kids[i]; - kid->parent = myRef; kidRef = pgTable->elements[i]; kidDict_t = resolve(aux, kidRef); // page or tree node dictionary or object stream token @@ -5084,43 +5316,15 @@ parse_pagetree( treeDict_t = get_dictoftype(kidDict_t, myRef, "Pages", aux); if (treeDict_t) { kidDict = H_CAST(Dict, treeDict_t); - parse_pagetree(aux, kid, kidRef, kidDict, myRef, curr); + parse_pagetree(aux, kid, kidRef, kidDict, myRef, myNode); } // Look for a page node pageDict_t = get_dictoftype(kidDict_t, myRef, "Page", aux); if (pageDict_t) { kidDict = H_CAST(Dict, pageDict_t); - if (++curr > aux->catalog.pgCount) { - //fprintf(stderr, "parse_pagetree: More kids then specified leaves!\n"); - // TODO:: probably just a warning is enough here -- run the VIOL parser? - } - parse_pagenode(aux, kid, kidRef, kidDict, myRef, pgTable->arena); + parse_pagenode(aux, kid, kidRef, kidDict, myRef, myNode, pgTable->arena); } - - // Look for Resources dictionary - myNode->pgRsrc = NULL; - item = dictentry(myDict, "Resources"); - if (item) { - //fprintf(stdout, "\n\nparse_pagetree: Found resources in node\n"); - size_t offset = 0; - rsrcdict_t = resolve_item(aux, item, &offset, p_objdef); - if (!rsrcdict_t) { // TODO: Failure ==> xref error -- Figure out how to handle - goto end; - } - //fprintf(stdout, "\nparse_pagetree: Resource token type = %u\n",rsrcdict_t->token_type); - parse_rsrcdict(pgTable->arena, rsrcdict_t, myNode, aux); - //pp_ptnode(stdout, myNode); - } - } - /* - else { - Ref *ref = (Ref *)kidRef->user; - fprintf(stderr, "parse_pagetree: Reference <%zu, %zu> not found -- Deleted?!\n", - ref->nr, ref->gen); - } - */ - } // end loop @@ -5146,9 +5350,10 @@ parse_catalog(struct Env *aux, const HParsedToken *root) // initialize the catalog structure - aux->catalog.catalog = NULL; - aux->catalog.pRoot = NULL; - aux->catalog.pgCount = 0; + aux->catalog.catalog = NULL; + aux->catalog.pRoot = NULL; + aux->catalog.lastPage = 0; + aux->catalog.lastNode = 0; // Initialize the xobject structure aux->catalog.xObjs.name = NULL; aux->catalog.xObjs.node = NULL; @@ -5157,15 +5362,6 @@ parse_catalog(struct Env *aux, const HParsedToken *root) aux->catalog.xoTail = NULL; aux->catalog.xoCount = 0; - // DEBUG - /* - fprintf(stdout, "\nparse_catalog: parsing Catalog = "); - if (root->token_type == TT_Ref) - pp_ref(stdout, root, 0, 0); - else if (root->token_type == TT_Dict) - pp_dict(stdout, root, 0, 0); - */ - // Ensure the reference is to the catalog dictionary size_t offset = 0; @@ -5173,6 +5369,7 @@ parse_catalog(struct Env *aux, const HParsedToken *root) if (!dict_t) { // TODO: Failure ==> xref error -- Figure out how to handle goto end; } + fprintf(stdout, "Trailer at offset = %lu\n", offset); // TODO:: Remove me aux->catalog.catalog = get_dictoftype(dict_t, NULL, "Catalog", aux); // catalog dictionary token if (aux->catalog.catalog) { // Caution:: relying on the short-circuiting behavior here @@ -5182,29 +5379,27 @@ parse_catalog(struct Env *aux, const HParsedToken *root) // Catalog found -- Now get the root of the page tree associated with the catalog ptRef = dictentry(catalog, "Pages"); // indirect reference to a dictionary if ( (ptRef == NULL) || (ptRef->token_type != TT_Ref) ) { - //fprintf(stderr, "parse_catalog: Page Tree not found!\n"); goto end; } aux->catalog.pRoot = ptRef; // indirect reference to the page tree /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ - dict_t = resolve_item(aux, ptRef, &offset, p_objdef); // page tree root node - if (!dict_t) { // TODO: Failure ==> xref error -- Figure out how to handle + dict_t = resolve_item(aux, ptRef, &offset, p_objdef); // page tree root node + if (!dict_t) { goto end; } + // make sure the retrieved token is a dictionary of /Type "Pages" dict_t = get_dictoftype(dict_t, NULL, "Pages", aux); // page tree root dictionary (parent is NULL) if (!dict_t) { - //fprintf(stderr, "parse_catalog: No page table!\n"); goto end; // Nothing more to do here } ptRoot = H_CAST(Dict, dict_t); // parse_pagetree - parse_pagetree(aux, &aux->catalog.pgTree, ptRef, ptRoot, NULL, 0); + parse_pagetree(aux, &aux->catalog.pgTree, ptRef, ptRoot, NULL, NULL); } else { // looks like the field "Type:Catalog" is a hint, not a requirement for a valid pdf - //fprintf (stdout, "\n\nThe Catalog is missing!!"); goto end; }