diff --git a/pdf.c b/pdf.c index 098475a5024ab9d535b444f23aa01646d6c9c9ba..df1bc4ffc3eefc6203e71345e01683c37ab3e9e3 100644 --- a/pdf.c +++ b/pdf.c @@ -126,12 +126,15 @@ typedef struct RsrcDict_S { const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit const HParsedToken *fonts; // dictonary of fonts used in this page size_t numFonts; + const HParsedToken *xobj; // xobj used in this page (?? is this <=1??, can page use multiple xobjects??) // Dict **seenFonts; // size_t numSeenFonts; - const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? - size_t numCmapsSeen; +// const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? +// size_t numCmapsSeen; } RsrcDict_T; + struct PtNode_S; + typedef struct PtNode_S { enum {PG_TREE, PG_NODE, XO_NODE} type; const HParsedToken *parent; // Type = Page tree -- reference @@ -148,19 +151,23 @@ typedef struct PtNode_S { size_t count; // number of kids size_t leaves; // number of pages in tree } pt; - struct { - struct PtNode_S *xobjs; // table of xobjects - size_t count; // number of xobject streams - } xn; }; } PtNode_T; +typedef struct XoNode_S { + char *name; + PtNode_T *node; +} XoNode_T; + + typedef struct Catalog_S { const HParsedToken *catalog; // reference const HParsedToken *pRoot; // reference - PtNode_T pgTree; - size_t pgCount; + PtNode_T pgTree; // page tree + size_t pgCount; // page tree node count + XoNode_T *xObjs; // table of XObjects + size_t xoCount; // number of xobjects } Catalog_T; @@ -274,9 +281,9 @@ typedef struct { char *type; char *basefont; char *encoding; - Dict *descriptor; - Dict *toUnicode; - Dict *descendantFonts; + const HParsedToken *descriptor; + const HParsedToken *toUnicode; + const HParsedToken *descendantFonts; } Fontinfo_T; @@ -395,7 +402,7 @@ pp_dict(FILE *stream, const HParsedToken *tok, int indent, int delta) { const HCountedArray *k_v; - if (tok->seq->used == 0) { + if (tok->seq->used == 0) { fprintf(stream, "{ }"); return; } @@ -1181,7 +1188,7 @@ void pp_fontstate(FILE *stream, const TextState_T *state) { assert(state); fprintf(stream, "\nFont State: Page = "); - if (state->page->type == PG_NODE) + if ( (state->page->type == PG_NODE) || (state->page->type == XO_NODE) ) pp_ref(stream, state->page->me, 0, 0); fprintf(stream, ", Font = %s\n", state->font->fref.fn); @@ -1195,6 +1202,9 @@ void pp_fontinfo(FILE *stream, const TextState_T *state, const Fontinfo_T *fi) if (fi->type) fprintf(stream, ", Type= %s", fi->type); if (fi->basefont) fprintf(stream, ", Basefont= %s", fi->basefont); if (fi->encoding) fprintf(stream, ", Encoding= %s", fi->encoding); + if (fi->descriptor) pp_dict(stream, fi->descriptor, 0, 0); + if (fi->toUnicode) pp_dict(stream, fi->toUnicode, 0, 0); + if (fi->descendantFonts) pp_dict(stream, fi->descendantFonts, 0, 0); fprintf(stream, "\n"); } @@ -1248,7 +1258,7 @@ pp_objstm(FILE *stream, const HParsedToken *tok, int indent, int delta) Objstm *entry = H_CAST(Objstm, tok); // pp_parseresult(stream, (const HParsedToken *)entry->res, 0, 0); - fprintf(stream, "Num Objects = %lu\n", entry->numObjs); + fprintf(stream, "pp_objstm:: Num Objects = %lu\n", entry->numObjs); for (int i=0; i<entry->numObjs; i++) { fprintf(stream, "oid: <nr=%lu, gen=%lu>, ", entry->tok[i].oid.nr, entry->tok[i].oid.gen); @@ -2042,9 +2052,16 @@ text_extract(struct Env *aux) } if (tstr) { ft = lookup_font(&tstr->ts, aux); - pp_fontinfo(stdout, &tstr->ts, ft); + if (ft) { + pp_fontinfo(stdout, &tstr->ts, ft); + pp_fontinfo(stream, &tstr->ts, ft); + } + else { + char *estr = "\nMissing Font Info!!\n"; + fwrite((const void *)estr, strlen(estr), 1, stdout); + fwrite((const void *)estr, strlen(estr), 1, stream); + } fwrite((const void *)tstr->text, (int) tstr->nchars, 1, stdout); - pp_fontinfo(stream, &tstr->ts, ft); fwrite((const void *)tstr->text, (int) tstr->nchars, 1, stream); } } @@ -2691,8 +2708,10 @@ parse_obj(struct Env *aux, size_t nr, size_t gen, size_t offset) // DEBUG -// fprintf(stdout, "\nparse_obj: Parsed Result:\n"); -// h_pprintln(stdout, res->ast); // XXX debug + fprintf(stdout, "\nparse_obj: Parsed Result:\n"); + h_pprintln(stdout, res->ast); // XXX debug + fprintf(stdout, "\nparse_obj: Returning:\n"); + h_pprintln(stdout, H_INDEX_TOKEN(res->ast, 1)); // XXX debug // DEBUG return H_INDEX_TOKEN(res->ast, 1); @@ -2719,8 +2738,11 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) case XR_INUSE: if (ent->n.gen != 0) return NULL; /* stream replaced */ - if (ent->obj == NULL) + if (ent->obj == NULL) { + fprintf(stdout, "\nparse_objstm_obj:: Parsing object stream id = %lu, %d, at offset = %zu (%#zx)\n", + stm_nr, 0, ent->n.offs, ent->n.offs); ent->obj = parse_obj(aux, stm_nr, 0, ent->n.offs); + } break; case XR_OBJSTM: return NULL; /* invalid: nested streams */ @@ -2732,7 +2754,8 @@ parse_objstm_obj(struct Env *aux, size_t nr, size_t stm_nr, size_t idx) return NULL; } else { - fprintf(stdout, "parse_objstm_obj:: token type = %u, \n", ent->obj->token_type); + fprintf(stdout, "\nparse_objstm_obj:: token type = %u, \n", ent->obj->token_type); + h_pprintln(stdout, stm); // XXX debug if (stm->token_type != TT_Dict) { const HCountedArray *tarr = H_CAST_SEQ(stm); fprintf (stdout, "\nparse_objstm_obj: tarr->elements[1]->token_type = %u\n", @@ -2762,18 +2785,23 @@ resolve(struct Env *aux, const HParsedToken *v) Ref *r; /* direct objects pass through */ - if (v == NULL || v->token_type != TT_Ref) + if (v == NULL || v->token_type != TT_Ref) { + fprintf (stdout, "resolve: Returning token of token_type = %u\n", v->token_type); return v; + } /* we are looking at an indirect reference */ r = v->user; /* find the xref entry for this reference */ + fprintf(stdout, "\nresolve:: Looking up xref = %lu, %lu\n", r->nr, r->gen); ent = lookup_xref(aux, r->nr, r->gen); if (ent == NULL) return NULL; /* obj not found */ - if (ent->obj != NULL) + if (ent->obj != NULL) { + fprintf(stdout, "\nresolve:: ent->obj->token_type = %u\n", ent->obj->token_type); return resolve(aux, ent->obj); + } /* parse the object and memoize */ ent->obj = v; /* break loops */ @@ -2784,15 +2812,19 @@ resolve(struct Env *aux, const HParsedToken *v) case XR_INUSE: if (ent->n.gen != r->gen) return NULL; /* obj nr reused */ + fprintf(stdout, "resolve:: parse object at offset = %zu (%#zx)\n", ent->n.offs, ent->n.offs); ent->obj = parse_obj(aux, r->nr, r->gen, ent->n.offs); break; case XR_OBJSTM: if (r->gen != 0) return NULL; /* invalid entry! */ + fprintf(stdout, "resolve:: parse object stream - oid = %lu, stm_oid = %lu, stm_idx = %lu\n", + r->nr, ent->o.stm, ent->o.idx); ent->obj = parse_objstm_obj(aux, r->nr, ent->o.stm, ent->o.idx); break; } + fprintf (stdout, "resolve: Recursive call to resolve - token_type = %u\n", ent->obj->token_type); return resolve(aux, ent->obj); } @@ -3546,15 +3578,17 @@ parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p) size_t def_nr, def_gen; if (offset >= aux->sz) { - fprintf(stderr, "%s: position %zu (%#zx) for object %zu %zu is " + fprintf(stderr, "%s: position %zu (%#zx) for object %lu %lu is " "out of bounds\n", aux->infile, offset, offset, nr, gen); return NULL; } if (p == NULL) { - fprintf(stderr, "parse_item: Unexpected request to parse object!!\n"); + fprintf(stderr, "parse_item: Attempt to use a NULL parser!\n"); return NULL; } + fprintf(stdout, "\nparse_item:: Parsing reference = %lu, %lu, at offset = %zu (%#zx)\n", + nr, gen, offset, offset); HParser *pItem = h_right(h_seek(offset * 8, SEEK_SET), p); res = h_parse(pItem, aux->input, aux->sz); if (res == NULL) { @@ -3562,18 +3596,12 @@ parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p) "%zu (%#zx)\n", aux->infile, nr, gen, offset, offset); return NULL; } + assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE); - // DEBUG - fprintf(stdout, "\nparse_item: Parsed Result:\n"); - h_pprintln(stdout, res->ast); // XXX debug -// const HParsedToken* res_t = h_make(res->arena, TT_HParseResult, res); -// pp_parseresult(stdout, res_t, 5, 0); - // DEBUG - assert(res->ast != NULL && res->ast->token_type == TT_SEQUENCE); - size_t ntok =res->ast->seq->used; - assert(ntok==2 || ntok==3); +// size_t ntok =res->ast->seq->used; +// assert(ntok==2 || ntok==3); def_nr = H_INDEX_UINT(res->ast, 0, 0); def_gen = H_INDEX_UINT(res->ast, 0, 1); if (def_nr != nr || def_gen != gen) { @@ -3583,8 +3611,14 @@ parse_item(struct Env *aux, size_t nr, size_t gen, size_t offset, HParser *p) return NULL; } - HParsedToken *res_f2 = H_INDEX_TOKEN(res->ast, 1); - return res_f2; + // DEBUG + fprintf(stdout, "\nparse_item: Parsed Result:\n"); + h_pprintln(stdout, res->ast); // XXX debug + fprintf(stdout, "\nparse_item: Returning:\n"); + h_pprintln(stdout, H_INDEX_TOKEN(res->ast, 1)); // XXX debug + // DEBUG + + return H_INDEX_TOKEN(res->ast, 1); } const HParsedToken * @@ -3614,6 +3648,8 @@ parse_objstm_item(struct Env *aux, size_t nr, size_t stm_nr, size_t idx, size_t /* * decode the stream and find the target object in it */ + fprintf(stdout, "\nparse_objstm_item:: Parsing object stream id = %lu, %d, at offset = %zu (%#zx)\n", + stm_nr, 0, ent->n.offs, ent->n.offs); ent->obj = parse_item(aux, stm_nr, 0, ent->n.offs, p); *offset = ent->n.offs; } @@ -3627,8 +3663,16 @@ parse_objstm_item(struct Env *aux, size_t nr, size_t stm_nr, size_t idx, size_t "%zu (%#zx)\n", aux->infile, ent->n.offs, ent->n.offs); return NULL; } - - return ent->obj; // The only path through this function is the one through the parser + else { + if ((stm->token_type == TT_SEQUENCE) && (stm->seq->used >=2) ) { + const HParsedToken *res_t = H_INDEX_TOKEN(stm, 1); // expecting an HParseResult token + const HParseResult *res = H_CAST(HParseResult, res_t); + stm = res->ast; + } + } + fprintf(stdout, "\nparse_objstm_item:: token type = %u, \n", ent->obj->token_type); + h_pprintln(stdout, stm); // XXX debug + return stm; } @@ -3638,21 +3682,24 @@ resolve_item(struct Env *aux, const HParsedToken *v, size_t *offset, HParser *p) XREntry *ent = NULL; Ref *r; - *offset = 0; // initialize the offset /* direct objects pass through */ if (v == NULL || v->token_type != TT_Ref) return v; /* we are looking at an indirect reference */ - r = v->user; + *offset = 0; // initialize the offset + r = v->user; /* find the xref entry for this reference */ + fprintf(stdout, "\nresolve_item:: Looking up xref = %lu, %lu\n", r->nr, r->gen); ent = lookup_xref(aux, r->nr, r->gen); if (ent == NULL) return NULL; /* obj not found */ - if (ent->obj != NULL) + if (ent->obj != NULL) { + fprintf(stdout, "\nresolve_item:: ent->obj->token_type = %u\n", ent->obj->token_type); return resolve_item(aux, ent->obj, offset, p); + } /* parse the object and memoize */ ent->obj = v; /* break loops */ @@ -3663,16 +3710,21 @@ resolve_item(struct Env *aux, const HParsedToken *v, size_t *offset, HParser *p) case XR_INUSE: if (ent->n.gen != r->gen) return NULL; /* obj nr reused */ + fprintf(stdout, "resolve_item:: parse object at offset = %lu\n", ent->n.offs); ent->obj = parse_item(aux, r->nr, r->gen, ent->n.offs, p); *offset = ent->n.offs; break; case XR_OBJSTM: if (r->gen != 0) return NULL; /* invalid entry! */ + fprintf(stdout, "resolve_item:: parse object stream - oid = %lu, stm_oid = %lu, stm_idx = %lu\n", + r->nr, ent->o.stm, ent->o.idx); ent->obj = parse_objstm_item(aux, r->nr, ent->o.stm, ent->o.idx, offset, p); break; } + fprintf (stdout, "resolve_item: Recursive call to resolve - token_type = %u at offset = %zu (%#zx)\n", + ent->obj->token_type, *offset, *offset); return resolve_item(aux, ent->obj, offset, p); } @@ -3772,15 +3824,18 @@ get_fontdict(const HParsedToken *obj, struct Env* aux) return dict; } -Dict * +const HParsedToken * get_dictoftype(const HParsedToken *obj, const HParsedToken *pRefT, char *value) { Dict *dict = NULL; + const HParsedToken *tok; Objstm *stm = NULL; if (obj->token_type == TT_Dict) { dict = H_CAST(Dict, obj); - if (! (is_parent(dict, pRefT) && has_value(dict, "Type", value)) ) + if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) + tok = obj; + else dict = NULL; } else if (obj->token_type == TT_Objstm) { @@ -3788,24 +3843,38 @@ get_dictoftype(const HParsedToken *obj, const HParsedToken *pRefT, char *value) for (int i=0; i<stm->numObjs; i++) { if (stm->tok[i].obj->token_type == TT_Dict) { dict = H_CAST(Dict, stm->tok[i].obj); -// ref = &stm->tok[i].oid; - if (! (is_parent(dict, pRefT) && has_value(dict, "Type", value))) + if (is_parent(dict, pRefT) && has_value(dict, "Type", value)) { + tok = stm->tok[i].obj; + break; + } + else dict = NULL; } - } } else { - fprintf(stderr, "get_dict: token type not yet handled: %u\n", + fprintf(stderr, "get_dictoftype: token type not yet handled: %u\n", obj->token_type); } + if (dict == NULL) + tok = NULL; + // DEBUG - if (dict && stm) { - const HParsedToken *dt = h_make(stm->arena, TT_Dict, dict); - pp_dict(stderr, dt, 10, 5); + if (pRefT) { + fprintf(stdout, "\nget_dictoftype: Parent = "); + pp_ref(stdout, pRefT, 0, 0); } - return dict; + else + fprintf(stdout, "get_dictoftype: Parent = NULL"); + if (tok) { + fprintf(stdout, "\nget_dictoftype: Type = %s\n", value); + pp_dict(stdout, tok, 0, 0); + } + else { + fprintf(stdout, "\nget_dictoftype: Null dictionary of Type = %s\n", value); + } + return tok; } @@ -3946,8 +4015,8 @@ Fontinfo_T * getFontinfo(const Dict *fontdict, char *name, struct Env *aux) { - // will need to save a pointer to the arena to avoid malloc - Fontinfo_T *fontinfo = (Fontinfo_T*)malloc(sizeof(Fontinfo_T)); + assert (fontdict); + Fontinfo_T *fontinfo = h_arena_malloc(fontdict->arena, sizeof(Fontinfo_T)); fontinfo->name = NULL; fontinfo->type = NULL; fontinfo->basefont = NULL; @@ -3961,7 +4030,7 @@ getFontinfo(const Dict *fontdict, char *name, struct Env *aux) if (item) { assert(item->token_type == TT_BYTES); if (bytes_eq(item->bytes, name)) { - fontinfo->name = (char *)malloc(sizeof(char)*(item->bytes.len+1)); + fontinfo->name = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); memcpy(fontinfo->name, (char *)item->bytes.token, item->bytes.len); fontinfo->name[item->bytes.len] = '\0'; fprintf(stdout, "getFontinfo: Subtype = %s\n", fontinfo->type); @@ -3970,7 +4039,7 @@ getFontinfo(const Dict *fontdict, char *name, struct Env *aux) item = dictentry(fontdict, "Subtype"); if (item) { assert (item->token_type == TT_BYTES); - fontinfo->type = (char *)malloc(sizeof(char)*(item->bytes.len+1)); + fontinfo->type = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); memcpy(fontinfo->type, (char *)item->bytes.token, item->bytes.len); fontinfo->type[item->bytes.len] = '\0'; fprintf(stdout, "getFontinfo: Subtype = %s\n", fontinfo->type); @@ -3978,20 +4047,23 @@ getFontinfo(const Dict *fontdict, char *name, struct Env *aux) item = dictentry(fontdict, "BaseFont"); if (item) { assert (item->token_type == TT_BYTES); - fontinfo->basefont = (char *)malloc(sizeof(char)*(item->bytes.len+1)); + fontinfo->basefont = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); memcpy(fontinfo->basefont, (char *)item->bytes.token, item->bytes.len); fontinfo->basefont[item->bytes.len] = '\0'; fprintf(stdout, "getFontinfo: Basefont = %s\n", fontinfo->basefont); } + size_t offset; // This is available if needed item = dictentry(fontdict, "Encoding"); if (item) { // dereference it if it is a reference - item = resolve(aux, item); + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); if (item->token_type == TT_BYTES) { - fontinfo->encoding = (char *)malloc(sizeof(char)*(item->bytes.len+1)); + fontinfo->encoding = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); memcpy(fontinfo->encoding, (char *)item->bytes.token, item->bytes.len); fontinfo->encoding[item->bytes.len] = '\0'; - fprintf(stdout, "getFontinfo: encoding = %s\n", fontinfo->encoding); + fprintf(stdout, "getFontinfo: encoding = %s at offset %zu (%#zx)\n", + fontinfo->encoding, offset, offset); } else if (item->token_type == TT_Dict) { @@ -4000,14 +4072,71 @@ getFontinfo(const Dict *fontdict, char *name, struct Env *aux) const Dict *encodingDict = H_CAST(Dict, item); item = dictentry(encodingDict, "BaseEncoding"); assert(item->token_type==TT_BYTES); - fontinfo->encoding = (char *)malloc(sizeof(char)*(item->bytes.len+1)); + fontinfo->encoding = h_arena_malloc(fontdict->arena, sizeof(char)*(item->bytes.len+1)); memcpy(fontinfo->encoding, (char *)item->bytes.token, item->bytes.len); fontinfo->encoding[item->bytes.len] = '\0'; fprintf(stdout, "getFontinfo: encoding = %s\n", fontinfo->encoding); } - else + else { + fprintf(stderr, "\nUnexpected token type in parsing font -Encoding- attribute -" + "token_type = %u\n", item->token_type); assert(0); + } + } + + item = dictentry(fontdict, "FontDescriptor"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { + fprintf(stdout, "getFontinfo: FontDescriptor item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "FontDescriptor"); + if (item) { + fontinfo->descriptor = item; + fprintf(stdout, "getFontinfo: FontDescriptor at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } + } + } + + item = dictentry(fontdict, "ToUnicode"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { + fprintf(stdout, "getFontinfo: toUnicode item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "ToUnicode"); + if (item) { + fontinfo->toUnicode = item; + fprintf(stdout, "getFontinfo: toUnicode at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } + } + } + + item = dictentry(fontdict, "DescendantFonts"); + if (item) { + // dereference the reference + offset = 0; + item = resolve_item(aux, item, &offset, p_objdef); + if (item) { + fprintf(stdout, "getFontinfo: descendantFonts item description:\n"); + h_pprintln(stdout, item); + item = get_dictoftype(item, NULL, "DescendantFonts"); + if (item) { + fontinfo->descendantFonts = item; + fprintf(stdout, "getFontinfo: descendantFonts at offset %zu (%#zx):\n", + offset, offset); + pp_dict(stdout, item, 0, 0); + } + } } @@ -4026,7 +4155,7 @@ lookup_font(TextState_T *state, struct Env *aux) assert ( fentry->type == TS_Tf); struct fontref *fr = &fentry->fref; char *fn = fr->fn; - Fontinfo_T *fontinfo; + Fontinfo_T *fontinfo = NULL; Dict *dict = NULL; if (page->pgRsrc && (page->pgRsrc->numFonts > 0)) { fontlist = H_CAST(Dict, page->pgRsrc->fonts); @@ -4042,15 +4171,18 @@ lookup_font(TextState_T *state, struct Env *aux) } } else { // inherit + // DEBUG + fprintf(stdout, "\n\nlookup_font: Font Resource not found for FontState:\n"); + pp_fontstate(stdout, state); fprintf(stderr, "\n\nlookup_font: Inheritance not yet supported!\n\n"); } return (fontinfo); } -void parse_fonts(const HParsedToken *dictT, RsrcDict_T *pgRsrc) +void parse_fonts(const HParsedToken *dict_t, RsrcDict_T *pgRsrc) { - Dict *fontdict = H_CAST(Dict, dictT); + Dict *fontdict = H_CAST(Dict, dict_t); const HParsedToken *item = NULL; @@ -4059,6 +4191,7 @@ void parse_fonts(const HParsedToken *dictT, RsrcDict_T *pgRsrc) assert(item->token_type == TT_Dict); Dict *fontlist = H_CAST(Dict, item); fprintf(stdout, "Num fonts used in page = %lu \n", fontlist->used); + pp_dict(stdout, item, 0, 0); if (pgRsrc->fonts) { fprintf(stderr, "\n\nparse_fonts: Attempt to add fonts -- Supported??\n\n"); } @@ -4070,37 +4203,159 @@ void parse_fonts(const HParsedToken *dictT, RsrcDict_T *pgRsrc) } } -void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_T *pgNode) +void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux); + + +void parse_xobject( + const HParsedToken *dict_t, + PtNode_T *parent, + RsrcDict_T *pgRsrc, + struct Env *aux) +{ + + Dict *xodict = H_CAST(Dict, dict_t); + const HParsedToken *item = NULL; // generic token + const HParsedToken *tok = NULL; // resolved token + const Dict *xobj_d = NULL; // dictionary associated with reference token + const HParsedToken *xobj_t = NULL; // xobject resource token + + +// HArena *arena = xodict->arena; +// const Dict *dict = NULL; // generic dictionary +// const HParsedToken *xobjr_t = NULL; // reference to an xobject token + + + item = dictentry(xodict, "XObject"); + if (item) { + assert(item->token_type == TT_Dict); + Dict *xolist = H_CAST(Dict, item); + // DEBUG + fprintf(stdout, "\nparse_xobject: Node for Parent = "); + if (parent->me) pp_ref(stdout, parent->me, 0, 0); + fprintf(stdout, "\nNum xobjects used in page = %lu \n", xolist->used); + pp_dict(stdout, item, 0, 0); + + size_t oldC = aux->catalog.xoCount; // previous count + aux->catalog.xoCount += xolist->used; + aux->catalog.xObjs = realloc(aux->catalog.xObjs, aux->catalog.xoCount * + sizeof(XoNode_T)); + // work on the newly created nodes + for (int i=oldC; i<aux->catalog.xoCount; i++) { + XoNode_T *xobj_r = &aux->catalog.xObjs[i]; + HBytes k = H_INDEX_BYTES(xolist->elements[i-oldC], 0); + xobj_r->name = h_arena_malloc(xodict->arena, k.len+1); + + memcpy(xobj_r->name, k.token, k.len); + xobj_r->name[k.len] = '\0'; + const HParsedToken *ref = H_INDEX_TOKEN(xolist->elements[i-oldC], 1); + assert(ref->token_type == TT_Ref); + xobj_r->node = h_arena_malloc(xodict->arena, sizeof(PtNode_T)); + xobj_r->node->type = XO_NODE; + xobj_r->node->parent = parent->me; + xobj_r->node->pgRsrc = pgRsrc; + xobj_r->node->me = ref; + xobj_r->node->offset = 0; // TODO: get the offset to the stream + tok = resolve_item(aux, ref, &xobj_r->node->offset, p_objdef); + + if (tok == NULL) continue; + + // DEBUG + fprintf(stdout, "\nparse_xobject: XObject Reference = : "); + pp_ref(stdout, ref, 0, 0); + fprintf(stdout, "\n"); + h_pprintln(stdout, tok); + + xobj_t = H_INDEX_TOKEN(tok, 0); // expecting an xobject dictionary token + if (xobj_t == NULL) continue; + + xobj_t = get_dictoftype(xobj_t, NULL, "XObject"); // test it + if (xobj_t == NULL) continue; + + xobj_d = H_CAST(Dict, xobj_t); + item = dictentry(xobj_d, "Subtype"); + if (item == NULL || item->token_type != TT_BYTES) + continue; // no "Subtype" field + + /* + * TODO:: external objects can be images, forms, or postscript objects + * We are only handling forms at the moment + */ + if (bytes_eq(item->bytes, "Form")) { + fprintf(stdout, "\n\nparse_xobject:: Parsing Form XObject\n"); + const HParsedToken *xoRsrc_t; + const HParsedToken *rsrcdict_t; + size_t offset = 0; + + xoRsrc_t = dictentry(xobj_d, "Resources"); + if (xoRsrc_t) { + fprintf(stdout, "\n\nparse_xobject: Found resources in node\n"); + rsrcdict_t = resolve_item(aux, xoRsrc_t, &offset, p_objdef); + // DEBUG + fprintf(stdout, "\nparse_xobject: Resource token type = %u\n",rsrcdict_t->token_type); + h_pprintln(stdout, rsrcdict_t); + + parse_rsrcdict(xodict->arena, rsrcdict_t, xobj_r->node, aux); + + // set the text state to this xobject + // parse the text stream, which is field 2 of the sequence + aux->tstate.page = xobj_r->node; + xobj_t = H_INDEX_TOKEN(tok, 1); // expecting an HParseResult token + const HParseResult *res = H_CAST(HParseResult, xobj_t); + // DEBUG + fprintf(stdout, "\nparse_xobject: Byte Stream = : "); + h_pprintln(stdout, res->ast); + HBytes stm = H_CAST_BYTES(res->ast); + uint8_t *bstm = h_arena_malloc(xodict->arena, sizeof(uint8_t) * stm.len); + memcpy(bstm, stm.token, stm.len); + res = h_parse(p_textstream, bstm, stm.len); + // DEBUG + fprintf(stdout, "\nparse_xobject: Parsing text : "); + h_pprintln(stdout, res->ast); + } + } + } + } +} +void parse_rsrcdict(HArena *arena, const HParsedToken *dict_t, PtNode_T *pgNode, struct Env *aux) { RsrcDict_T *rsrc = NULL; // Process the dictionary - if ( (dictT->token_type == TT_Dict) || (dictT->token_type == TT_Objstm) ) { + if ( (dict_t->token_type == TT_Dict) || (dict_t->token_type == TT_Objstm) ) { rsrc = h_arena_malloc(arena, sizeof(RsrcDict_T)); - rsrc->resources = dictT; + rsrc->resources = dict_t; rsrc->fonts = NULL; rsrc->numFonts = 0; - rsrc->seenCmaps = NULL; - rsrc->numCmapsSeen = 0; + rsrc->xobj = NULL; +// rsrc->seenCmaps = NULL; +// rsrc->numCmapsSeen = 0; } else { - fprintf (stderr, "\nparse_rsrcdict: What token type is this? - %u\n", dictT->token_type); + fprintf (stderr, "\nparse_rsrcdict: What token type is this? - %u\n", + dict_t->token_type); } // Resource is a simple dictionary - if (dictT->token_type == TT_Dict) { - parse_fonts(dictT, rsrc); + if (dict_t->token_type == TT_Dict) { + // DEBUG + fprintf(stdout, "\nparse_rsrcdict: Simple dictionary:\n"); + h_pprintln(stdout, dict_t); + + parse_fonts(dict_t, rsrc); + parse_xobject(dict_t, pgNode, rsrc, aux); } - else if (dictT->token_type == TT_Objstm) { - const Objstm *strmc = H_CAST(Objstm, dictT); - fprintf(stdout, "\nparse_rsrcdict: numObjs = %lu\n", strmc->numObjs); - h_pprintln(stdout, dictT); + else if (dict_t->token_type == TT_Objstm) { + const Objstm *strmc = H_CAST(Objstm, dict_t); + fprintf(stdout, "\nparse_rsrcdict: stream object -numObjs = %lu\n", strmc->numObjs); + h_pprintln(stdout, dict_t); + for (int i=0; i<strmc->numObjs; i++) { fprintf(stdout, "\nparse_rsrcdict: oid = [%zu, %zu]\n", strmc->tok[i].oid.nr, strmc->tok[i].oid.gen); if (strmc->tok[i].obj->token_type == TT_Dict) { parse_fonts(strmc->tok[i].obj, rsrc); + parse_xobject(strmc->tok[i].obj, pgNode, rsrc, aux); } } } @@ -4108,18 +4363,6 @@ void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_T *pgNode) pgNode->pgRsrc = rsrc; return; -#if 0 - item = dictentry(fontdict, "FontDescriptor"); - if (item) - fprintf(stdout, "\n\nparse_fontdict: Found FontDescriptor [%.*s] dictionary in node\n", - (int) item->bytes.len, item->bytes.token); - item = dictentry(fontdict, "FontName"); - if (item) - fprintf(stdout, "\n\nparse_fontdict: Found FontName [%.*s] dictionary in node\n", - (int) item->bytes.len, item->bytes.token); - h_pprintln(stdout, dt); -#endif - } void parse_pagenode ( @@ -4138,9 +4381,12 @@ void parse_pagenode ( const HParsedToken *contents = NULL; // resolved token const HParsedToken *entry = NULL; const HParsedToken *rsrcdict_t = NULL; -// HParser *bytes_p, *dict_p, *value_p; size_t nOffset = 0; + // DEBUG + fprintf(stdout, "\nparse_pagenode: parsing Page Node = "); + pp_ref(stdout, myRef, 0, 0); + // set some global state variables aux->tstate.page = myNode; @@ -4170,7 +4416,7 @@ void parse_pagenode ( fprintf(stdout, "\n\nparse_pagenode: Found resources in node\n"); rsrcdict_t = resolve(aux, item); fprintf(stdout, "\nparse_pagenode: Resource token type = %u\n",rsrcdict_t->token_type); - parse_rsrcdict(arena, rsrcdict_t, myNode); + parse_rsrcdict(arena, rsrcdict_t, myNode, aux); } @@ -4184,7 +4430,7 @@ void parse_pagenode ( contents_r = H_CAST(Ref, contents_t); fprintf(stdout, "parse_pagenode: ref.nr = %ld, ref.gen=%ld\n", contents_r->nr, contents_r->gen); - contents = resolve_item(aux, contents_t, &nOffset, p_cstream); + contents = resolve_item(aux, contents_t, &nOffset, p_cstream); assert(contents); myNode->offset = nOffset; HParsedToken *res_strm = H_INDEX_TOKEN(contents, 1); @@ -4286,14 +4532,17 @@ parse_pagetree( PtNode_T *kid = NULL; const HParsedToken *kidRef = NULL; // page tree or page node reference const HParsedToken *kidDict_t = NULL; + const HParsedToken *pageDict_t = NULL; + const HParsedToken *treeDict_t = NULL; const Dict *kidDict = NULL; const HParsedToken *rsrcdict_t = NULL; + fprintf(stdout, "\nparse_pagetree: parsing Page Tree Node = "); + pp_ref(stdout, myRef, 0, 0); + + myNode->type = PG_TREE; - // Initialize the xobject structure that falls outside of the page traversal - myNode->xn.xobjs = NULL; - myNode->xn.count = 0; item = dictentry(myDict, "Parent"); // if root node ==> parent should be NULL myNode->parent = item; @@ -4340,16 +4589,15 @@ parse_pagetree( if (kidDict_t) { // Look for a tree node - kidDict = get_dictoftype(kidDict_t, myRef, "Pages"); - if (kidDict) { - - const HParsedToken *dt = h_make(pgTable->arena, TT_Dict, (Dict *)kidDict); - pp_dict(stdout, dt, 10, 5); + treeDict_t = get_dictoftype(kidDict_t, myRef, "Pages"); + if (treeDict_t) { + kidDict = H_CAST(Dict, treeDict_t); parse_pagetree(aux, kid, kidRef, kidDict, myRef, curr); } // Look for a page node - kidDict = get_dictoftype(kidDict_t, myRef, "Page"); - if (kidDict) { + pageDict_t = get_dictoftype(kidDict_t, myRef, "Page"); + if (pageDict_t) { + kidDict = H_CAST(Dict, pageDict_t); if (++curr > aux->catalog.pgCount) { fprintf(stderr, "parse_pagetree: More kids then specified leaves!\n"); // TODO:: probably just a warning is enough here -- run the VIOL parser? @@ -4364,7 +4612,7 @@ parse_pagetree( fprintf(stdout, "\n\nparse_pagenode: Found resources in node\n"); rsrcdict_t = resolve(aux, item); fprintf(stdout, "\nparse_pagenode: Resource token type = %u\n",rsrcdict_t->token_type); - parse_rsrcdict(pgTable->arena, rsrcdict_t, myNode); + parse_rsrcdict(pgTable->arena, rsrcdict_t, myNode, aux); } } @@ -4402,14 +4650,23 @@ parse_catalog(struct Env *aux, const HParsedToken *root) aux->catalog.catalog = NULL; aux->catalog.pRoot = NULL; aux->catalog.pgCount = 0; + // Initialize the xobject structure that falls outside of the page traversal + aux->catalog.xObjs = NULL; + aux->catalog.xoCount = 0; + // DEBUG + fprintf(stdout, "\nparse_catalog: parsing Catalog = "); + if (root->token_type == TT_Ref) + pp_ref(stdout, root, 0, 0); + else if (root->token_type == TT_Dict) + pp_dict(stdout, root, 0, 0); // Ensure the reference is to the catalog dictionary dict_t = resolve(aux, root); - catalog = get_dictoftype(dict_t, NULL, "Catalog"); - aux->catalog.catalog = dict_t; // catalog dictionary token + aux->catalog.catalog = get_dictoftype(dict_t, NULL, "Catalog"); // catalog dictionary token + catalog = H_CAST(Dict, aux->catalog.catalog); // Catalog found -- Now get the root of the page tree associated with the catalog @@ -4423,7 +4680,8 @@ parse_catalog(struct Env *aux, const HParsedToken *root) /* resolve and process the page tree root reference to extract the dictionary --> Page Tree Object */ dict_t = resolve(aux, ptRef); // page tree root node - ptRoot = get_dictoftype(dict_t, NULL, "Pages"); // page tree root dictionary (parent is NULL) + dict_t = get_dictoftype(dict_t, NULL, "Pages"); // page tree root dictionary (parent is NULL) + ptRoot = H_CAST(Dict, dict_t); if (ptRoot == NULL) { fprintf(stderr, "parse_catalog: No page table!\n"); @@ -4509,34 +4767,7 @@ decode_stream(const Dict *d, HBytes b, HParser *p) HParser *p_xrefdata__m(HAllocator *, const Dict *); -void parse_xobject(HAllocator *mm__, const Dict *dict, struct Env *aux) -{ - aux->catalog.pgTree.xn.count += 1; - aux->catalog.pgTree.xn.xobjs = realloc(aux->catalog.pgTree.xn.xobjs, aux->catalog.pgTree.xn.count * - sizeof(struct PtNode_S)); - // work on the newly created node - PtNode_T *xobj = &aux->catalog.pgTree.xn.xobjs[aux->catalog.pgTree.xn.count-1]; - xobj->type = XO_NODE; - xobj->parent = NULL; - xobj->pgRsrc = NULL; - xobj->me = NULL; // TODO: get the stream id - xobj->offset = 0; // TODO: get the offset to the stream - - // get resources of the xobject if exists - const HParsedToken *item; - const HParsedToken *rsrcdict_t; - - item = dictentry(dict, "Resources"); - if (item) { - fprintf(stdout, "\n\npparse_xobject: Found resources in node\n"); - rsrcdict_t = resolve(aux, item); - fprintf(stdout, "\nparse_xobject: Resource token type = %u\n",rsrcdict_t->token_type); - parse_rsrcdict(rsrcdict_t->seq->arena, rsrcdict_t, xobj); - } - // set the text state to this xobject - aux->tstate.page = xobj; -} HParser * p_stream_data__m(HAllocator *mm__, const Dict *dict, struct Env *aux) { @@ -4550,8 +4781,10 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict, struct Env *aux) if (bytes_eq(v->bytes, "XRef")) return p_xrefdata__m(mm__, dict); - if (bytes_eq(v->bytes, "ObjStm")) + if (bytes_eq(v->bytes, "ObjStm")) { + fprintf(stdout, "\np_stream_data__m:: Parsing object stream\n"); return p_objstm__m(mm__, dict); + } if (bytes_eq(v->bytes, "XObject")) { /* @@ -4559,11 +4792,14 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict, struct Env *aux) * We are not handling them at the moment */ v = dictentry(dict, "Subtype"); +#if 0 if (bytes_eq(v->bytes, "Form")) { - fprintf(stdout, "\n\np_stream_data_m:: Parsing XObject\n\n"); + fprintf(stdout, "\n\np_stream_data_m:: Parsing XObject\n"); + fprintf(stdout, "p_stream_data_m:: Current XObject count = %lu\n", aux->catalog.pgTree.xn.count); parse_xobject(mm__, dict, aux); return p_textstream; } +#endif } return NULL; /* unrecognized type */ } @@ -4849,6 +5085,9 @@ act_ostm(const HParseResult *p, void *u) // const HCountedArray *ostrm = H_FIELD_SEQ(1); const HParsedToken *tok = H_MAKE(Objstm, ostrm); + + // DEBUG + fprintf (stdout, "act_ostm:: Object Stream Details:\n"); pp_objstm(stdout, tok, 0, 0); return (HParsedToken *)tok; @@ -4863,7 +5102,7 @@ p_objstm__m(HAllocator *mm__, const Dict *dict) v = dictentry(dict, "N"); if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || (uint64_t)v->sint > SIZE_MAX) { - fprintf(stderr, "missing /N on object stream\n"); + fprintf(stderr, "p_objstm__m: missing /N on object stream\n"); return p_fail; } N = v->sint; @@ -5108,16 +5347,6 @@ main(int argc, char *argv[]) aux = (struct Env){infile, input, sz}; init_parser(&aux); -#if 0 - // test -// char *str = "/056b570d-3ecb-465b-8c40-0feeba19107a"; - const char *str = "/056b570d-3ecb-465b-8c40-0feeba19107a 13.00 Tf"; - int slen = strlen(str); - res = h_parse(p_tf, (uint8_t *)str, slen); - fprintf(stdout, "This is a test\n"); - // test -#endif - /* parse all cross-reference sections and trailer dictionaries */ parse_xrefs(&aux);