From 76e546cef80d23e018a051db4b5b29ed1f409b20 Mon Sep 17 00:00:00 2001 From: "Sven M. Hallberg" <pesco@khjk.org> Date: Tue, 14 Mar 2023 17:02:45 +0000 Subject: [PATCH] process page content after main parser and only for text extraction Factors this code out of parse_xrefs() where it never belonged, into a new function process_page_content() that is called from main after the main parse has succeeded and only if text extraction was requested, i.e. -x or -X was passed on the command line. Also adjusts the code for style and drops some related XXXs. Fixes #49. --- pdf.c | 118 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/pdf.c b/pdf.c index 49fc63a..72d1ed3 100644 --- a/pdf.c +++ b/pdf.c @@ -4356,6 +4356,60 @@ parse_catalog(struct Env *aux, const HParsedToken *root) } } + +void +process_page_content(struct Env *aux) +{ + const HParsedToken *tok, *xref; + Dict *trailer; + size_t Size; + + /* use the last xref section */ + if (aux->nxrefs == 0) + return; + xref = aux->xrefs[aux->nxrefs - 1]; + + /* make sure we parsed a valid trailer dictionary */ + tok = H_INDEX_TOKEN(xref, 1); + trailer = H_CAST(Dict, tok); + if (trailer == NULL) { + // XXX this cannot actually happen (rule xr_td) + log_message(7, "VIOLATION[7]: Invalid Trailer Section " + "or Trailer Section not found\n"); + return; + } + + /* fetch Size (a required field) from the trailer dictionary */ + tok = dictentry(trailer, "Size"); + if (tok == NULL || tok->token_type != TT_SINT) { + log_message(5, "VIOLATION[5]: error parsing trailer section!" + "Missing or malformed /Size field.\n"); + return; + } + Size = H_CAST_SINT(tok); + + // XXX this likely belongs somewhere else + // and it is wrong, since nxrefs is the number of xref _sections_, not + // the number of xref entries (objects) as this code assumes. m( +#if 0 + if (aux->nxrefs > Size) { + log_message(7, "%s: Number of xrefs found (%zu) " + "Greater than specified /Size (%zu).\n" + "Ignoring objects numbered greater than (%zu)!\n", + infile, aux->nxrefs, Size, n); + aux->nxrefs = Size; + } +#else + (void)Size; // XXX silence warning +#endif + + /* process the document starting from the /Root object */ + tok = dictentry(trailer, "Root"); + if (tok != NULL && tok->token_type == TT_Ref) + parse_catalog(aux, tok); +} + + /* * ******************************************************************** * End Catalog parsing @@ -4853,11 +4907,9 @@ parse_xrefs(struct Env *aux) const HParsedToken *tok = NULL; size_t n = 0, nfwd = 0; size_t offset = 0; - bool processRoot = true; - size_t maxObjNum = 0; - Dict *trailer = NULL; // Initialize the environment variables + // XXX not needed?! aux->xrefs = xrefs; aux->nxrefs = n; @@ -4907,25 +4959,6 @@ parse_xrefs(struct Env *aux) err(2, "parse_xrefs"); xrefs[n++] = res->ast; - - /* process the root */ - // XXX this validation likely belongs someplace else - if (processRoot) { - // Size is a required field in the trailer dictionary - const HParsedToken *trailer_t = H_INDEX_TOKEN(res->ast, 1); - trailer = H_CAST(Dict, trailer_t); - const HParsedToken *size_t = dictentry(trailer, "Size"); - if (size_t == NULL || size_t->token_type != TT_SINT) { - log_message(5, "VIOLATION[5]: error parsing trailer section!" - "Missing or malformed -Size- field\n"); - return; - } - maxObjNum = H_CAST_SINT(size_t); - - processRoot = false; - } - - /* look up the next offset (to the previous xref section) */ tok = dictentry(H_INDEX(Dict, res->ast, 1), "Prev"); if (tok == NULL) @@ -4958,42 +4991,10 @@ parse_xrefs(struct Env *aux) offset = (size_t)tok->sint; } - - - /* Make sure we parsed a valid trailer section */ - if (! trailer) { - log_message(7, "VIOLATION[7]: Invalid Trailer Section or Trailer Section not found\n"); - return; - } - - - aux->xrefs = xrefs; aux->nxrefs = n; - - - - // XXX likely belongs somewhere else (see XXX in the loop above) - if (n > maxObjNum) { - log_message(7, "%s: Number of xrefs found -%ld- " - "Greater than specified /Size -%ld-.\n" - "Ignoring objects numbered greater than -%ld-!\n", - infile, n, maxObjNum, n); - aux->nxrefs = maxObjNum; - } - - - // Process the trailer dictionary - // XXX content processing does not belong in this function - const HParsedToken *root = dictentry(trailer, "Root"); - if(root != NULL && root->token_type == TT_Ref) - parse_catalog(aux, root); - - return; } - - /* * ******************************************************************** * End xref parsing @@ -5210,9 +5211,12 @@ main(int argc, char *argv[]) h_pprintln(stdout, res->ast); } - /* Save the extracted text */ - if (aux.ntextobjs > 0) - text_extract(&aux, xfile, Xfile); + /* extract text if requested */ + if (xfile != NULL || Xfile != NULL) { + process_page_content(&aux); + if (aux.ntextobjs > 0) // XXX always create output file? + text_extract(&aux, xfile, Xfile); + } return 0; } -- GitLab