From 24714c8784f60d3f3feaca267e781d221ef69097 Mon Sep 17 00:00:00 2001
From: "sumit.ray@baesystems.com" <sumit.ray@baesystems.com>
Date: Fri, 9 Jul 2021 22:16:37 -0400
Subject: [PATCH] Added the page node pointer to the text nodes

---
 pdf.c | 58 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/pdf.c b/pdf.c
index f5436aa..efae767 100644
--- a/pdf.c
+++ b/pdf.c
@@ -122,7 +122,7 @@ typedef HCountedArray Dict;
 
 
 // Catalog Tree
-typedef struct RsrcDict_T {
+typedef struct RsrcDict_S {
 	const HParsedToken  *resources;             // font references dictionary (resources == NULL) ==> inherit
 	const HParsedToken  **fonts;                // Serialized list of fonts used in this page
 	size_t                numFonts;
@@ -130,12 +130,12 @@ typedef struct RsrcDict_T {
 //	size_t                numSeenFonts;
 	const HParsedToken  **seenCmaps;            // memoized cmaps (should this be a bytestream?
 	size_t                numCmapsSeen;
-} RsrcDict_S;
-struct PtNode_T;
-typedef struct PtNode_T {
+} RsrcDict_T;
+struct PtNode_S;
+typedef struct PtNode_S {
 	enum {PG_TREE, PG_NODE, XO_NODE} type;
 	const HParsedToken  *parent;                // Type = Page tree -- reference
-	RsrcDict_S          *pgRsrc;                // resource structure
+	RsrcDict_T          *pgRsrc;                // resource structure
 	const HParsedToken  *me;                    // Reference for me
 	size_t               offset;                //
 	union {
@@ -144,20 +144,20 @@ typedef struct PtNode_T {
 			const HParsedToken  *textStream;    // content stream -- may be a result of concatenating array of content streams
 		} pn;
 		struct {
-			struct PtNode_T   *kids;            // page table
+			struct PtNode_S   *kids;            // page table
 			size_t             count;           // number of kids
 			size_t             leaves;          // number of pages in tree
 		} pt;
 	};
 
-} PtNode_S;
+} PtNode_T;
 
-typedef struct {
+typedef struct Catalog_S {
 	const  HParsedToken  *catalog;   // reference
 	const  HParsedToken  *pRoot;     // reference
-	PtNode_S              pgTree;
+	PtNode_T              pgTree;
 	size_t                pgCount;
-} Catalog_S;
+} Catalog_T;
 
 
 // Forward declaration of text extraction related structures
@@ -281,7 +281,7 @@ struct Env {
 	struct textnode     *txttail;  /* parsed text objects from the file */
 	size_t               ntextobjs;
 
-	Catalog_S            catalog;  /* Catalog object and document structure */
+	Catalog_T            catalog;  /* Catalog object and document structure */
 	TextState_T          tstate;   /* current text state */
 
 };
@@ -1845,8 +1845,8 @@ HParser *
 ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
 {
 
-	struct Env *aux = env;
-
+	struct Env      *aux = env;
+	struct textnode *txtnd;
 
 	assert (x->token_type == TT_SEQUENCE);
 	int n_tobjs = x->seq->used;
@@ -1880,10 +1880,7 @@ ktxtstream(HAllocator *mm__, const HParsedToken *x, void *env)
 
 
 		// store the string in the environment
-		// not sure whether we need to actually store the string in malloc'ed area
-		// currently, we are reusing the token memory previously created
-		struct textnode *txtnd = (struct textnode *) malloc(
-				sizeof(struct textnode));
+		txtnd = h_alloc(mm__, sizeof(struct textnode));
 		txtnd->tstr = tstr;
 		txtnd->next = NULL;
 		if (aux->txthead == NULL)
@@ -3806,7 +3803,7 @@ HParsedToken *create_strmdict(HArena *arena, size_t len)
 
 
 const HParsedToken *
-lookup_font(PtNode_S *page, char *name)
+lookup_font(PtNode_T *page, char *name)
 {
 	const HParsedToken *tok = NULL;
 	Dict               *font;
@@ -3831,7 +3828,7 @@ lookup_font(PtNode_S *page, char *name)
 	return (tok);
 }
 
-void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_S *pgRsrc)
+void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_T *pgRsrc)
 {
 	Dict               *fontdict = H_CAST(Dict, dictT);
 	const HParsedToken *item = NULL;
@@ -3861,14 +3858,14 @@ void parse_fonts(HArena *arena, const HParsedToken *dictT, RsrcDict_S *pgRsrc)
 	}
 }
 
-void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_S *pgNode)
+void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_T *pgNode)
 {
-	RsrcDict_S        *rsrc = NULL;
+	RsrcDict_T        *rsrc = NULL;
 
 
 	// Process the dictionary
 	if ( (dictT->token_type == TT_Dict) || (dictT->token_type == TT_Objstm) ) {
-		rsrc = h_arena_malloc(arena, sizeof(RsrcDict_S));
+		rsrc = h_arena_malloc(arena, sizeof(RsrcDict_T));
 		rsrc->resources   = dictT;
 		rsrc->fonts       = NULL;
 		rsrc->numFonts    = 0;
@@ -3915,7 +3912,7 @@ void parse_rsrcdict(HArena *arena, const HParsedToken *dictT, PtNode_S *pgNode)
 
 void parse_pagenode (
 		struct Env         *aux,
-		PtNode_S           *myNode,
+		PtNode_T           *myNode,
 		const HParsedToken *myRef,     // my page node reference
 		const Dict         *myDict,    // my page node specification
 		const HParsedToken *parent,
@@ -3933,9 +3930,12 @@ void parse_pagenode (
 	size_t             sz = 0, nOffset = 0;
 
 
-	myNode->type    = PG_NODE;
-	myNode->me      = myRef;
-	myNode->pn.dict = myDict;
+	// record this page node in the current text state, then fill in the node itself
+	aux->tstate.page = myNode;
+	myNode->type     = PG_NODE;
+	myNode->me       = myRef;
+	myNode->pn.dict  = myDict;
+
 
 
 	item = dictentry(myDict, "Parent");
@@ -4068,7 +4068,7 @@ end:
 void
 parse_pagetree(
 		struct Env         *aux,
-		PtNode_S           *myNode,
+		PtNode_T           *myNode,
 		const HParsedToken *myRef,     // my page tree node reference
 		const Dict         *myDict,    // my page tree specification
 		const HParsedToken *pRefT,     // parent reference token
@@ -4078,7 +4078,7 @@ parse_pagetree(
 
 	const HParsedToken *item      = NULL;
 	const HParsedToken *kids      = NULL;
-	PtNode_S           *kid       = NULL;
+	PtNode_T           *kid       = NULL;
 	const HParsedToken *kidRef    = NULL; // page tree or page node reference
 	const HParsedToken *kidDict_t = NULL;
 	const Dict         *kidDict   = NULL;
@@ -4119,7 +4119,7 @@ parse_pagetree(
 	// get the kids (pgTable)
 	HCountedArray  *pgTable = H_CAST_SEQ(kids);
 	size_t          pgtSz   = pgTable->used;
-	myNode->pt.kids  = (PtNode_S*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_S));
+	myNode->pt.kids  = (PtNode_T*)h_arena_malloc(pgTable->arena, pgtSz * sizeof(PtNode_T));
 	myNode->pt.count = pgtSz;
 
 	// Process the kids
-- 
GitLab