From 24714c8784f60d3f3feaca267e781d221ef69097 Mon Sep 17 00:00:00 2001 From: "sumit.ray@baesystems.com" <sumit.ray@baesystems.com> Date: Fri, 9 Jul 2021 22:16:37 -0400 Subject: [PATCH] Added the page node pointer to the text nodes --- pdf.c | 58 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/pdf.c b/pdf.c index f5436aa..efae767 100644 --- a/pdf.c +++ b/pdf.c @@ -122,7 +122,7 @@ typedef HCountedArray Dict; // Catalog Tree -typedef struct RsrcDict_T { +typedef struct RsrcDict_S { const HParsedToken *resources; // font references dictionary (resources == NULL) ==> inherit const HParsedToken **fonts; // Serialized list of fonts used in this page size_t numFonts; @@ -130,12 +130,12 @@ typedef struct RsrcDict_T { // size_t numSeenFonts; const HParsedToken **seenCmaps; // memoized cmaps (should this be a bytestream? size_t numCmapsSeen; -} RsrcDict_S; -struct PtNode_T; -typedef struct PtNode_T { +} RsrcDict_T; +struct PtNode_S; +typedef struct PtNode_S { enum {PG_TREE, PG_NODE, XO_NODE} type; const HParsedToken *parent; // Type = Page tree -- reference - RsrcDict_S *pgRsrc; // resource structure + RsrcDict_T *pgRsrc; // resource structure const HParsedToken *me; // Reference for me size_t offset; // union { @@ -144,20 +144,20 @@ typedef struct PtNode_T { const HParsedToken *textStream; // content stream -- may be a result of concatenating array of content streams } pn; struct { - struct PtNode_T *kids; // page table + struct PtNode_S *kids; // page table size_t count; // number of kids size_t leaves; // number of pages in tree } pt; }; -} PtNode_S; +} PtNode_T; -typedef struct { +typedef struct Catalog_S { const HParsedToken *catalog; // reference const HParsedToken *pRoot; // reference - PtNode_S pgTree; + PtNode_T pgTree; size_t pgCount; -} Catalog_S; +} Catalog_T; // Forward declaration of text extraction related structures @@ -281,7 +281,7 @@ struct Env { struct textnode *txttail; /* parsed 
text objects from the file */ size_t ntextobjs; - Catalog_S catalog; /* Catalog object and document structure */ + Catalog_T catalog; /* Catalog object and document structure */ TextState_T tstate; /* current text state */ }; @@ -1845,8 +1845,8 @@ HParser * ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env) { - struct Env *aux = env; - + struct Env *aux = env; + struct textnode *txtnd; assert (x->token_type == TT_SEQUENCE); int n_tobjs = x->seq->used; @@ -1880,10 +1880,7 @@ ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env) // store the string in the environment - // not sure whether we need to actually store the string in malloc'ed area - // currently, we are reusing the token memory previously created - struct textnode *txtnd = (struct textnode *) malloc( - sizeof(struct textnode)); + txtnd = h_alloc(mm__, sizeof(struct textnode)); txtnd->tstr = tstr; txtnd->next = NULL; if (aux->txthead == NULL) @@ -3806,7 +3803,7 @@ HParsedToken *create_strmdict(HArena *arena, size_t len) const HParsedToken * -lookup_font(PtNode_S *page, char *name) +lookup_font(PtNode_T *page, char *name) { const HParsedToken *tok = NULL; Dict *font; @@ -3831,7 +3828,7 @@ lookup_font(PtNode_S *page, char *name) return (tok); } -void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_S *pgRsrc) +void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_T *pgRsrc) { Dict *fontdict = H_CAST(Dict, dictT); const HParsedToken *item = NULL; @@ -3861,14 +3858,14 @@ void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_S *pgRsrc) } } -void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_S *pgNode) +void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_T *pgNode) { - RsrcDict_S *rsrc = NULL; + RsrcDict_T *rsrc = NULL; // Process the dictionary if ( (dictT->token_type == TT_Dict) || (dictT->token_type == TT_Objstm) ) { - rsrc = h_arena_malloc(arena, sizeof(RsrcDict_S)); + rsrc = h_arena_malloc(arena, 
sizeof(RsrcDict_T)); rsrc->resources = dictT; rsrc->fonts = NULL; rsrc->numFonts = 0; @@ -3915,7 +3912,7 @@ void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_S *pgNode) void parse_pagenode ( struct Env *aux, - PtNode_S *myNode, + PtNode_T *myNode, const HParsedToken *myRef, // my page node reference const Dict *myDict, // my page node specification const HParsedToken *parent, @@ -3933,9 +3930,12 @@ void parse_pagenode ( size_t sz = 0, nOffset = 0; - myNode->type = PG_NODE; - myNode->me = myRef; - myNode->pn.dict = myDict; + // set some global state variables + aux->tstate.page = myNode; + myNode->type = PG_NODE; + myNode->me = myRef; + myNode->pn.dict = myDict; + item = dictentry(myDict, "Parent"); @@ -4068,7 +4068,7 @@ end: void parse_pagetree( struct Env *aux, - PtNode_S *myNode, + PtNode_T *myNode, const HParsedToken *myRef, // my page tree node reference const Dict *myDict, // my page tree specification const HParsedToken *pRefT, // parent reference token @@ -4078,7 +4078,7 @@ parse_pagetree( const HParsedToken *item = NULL; const HParsedToken *kids = NULL; - PtNode_S *kid = NULL; + PtNode_T *kid = NULL; const HParsedToken *kidRef = NULL; // page tree or page node reference const HParsedToken *kidDict_t = NULL; const Dict *kidDict = NULL; @@ -4119,7 +4119,7 @@ parse_pagetree( // get the kids (pgTable) HCountedArray *pgTable = H_CAST_SEQ(kids); size_t pgtSz = pgTable->used; - myNode->pt.kids = (PtNode_S*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_S)); + myNode->pt.kids = (PtNode_T*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_T)); myNode->pt.count = pgtSz; // Process the kids -- GitLab