
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (602)
Showing with 5573 additions and 675 deletions
@@ -2,7 +2,7 @@
*~
Session.vim
pdf
pdf.1
*.core
hammer
lib
t/*.pdf
Copyright (c) 2019, 2020 Sven M. Hallberg <pesco@khjk.org>
Copyright (c) 2019 - 2022 Sven M. Hallberg <pesco@khjk.org>
Copyright (c) 2020 - 2022 pompolic <pompolic@special-circumstanc.es>
Copyright (c) 2020 Paul Vines <paul.vines@baesystems.com>
Copyright (c) 2020, 2021 Kragen Sitaker <xentrac@special-circumstanc.es>
Copyright (c) 2021, 2022 Sumit Ray <sumit.ray@baesystems.com>
Copyright (c) 2022 Meg Gordon <meg@special-circumstanc.es>
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
[...]
CFLAGS += -std=c99 -Wall -Werror
CFLAGS += -std=c99 -Wall -Werror -DLOG -D_POSIX_C_SOURCE=2
# find our hammer build - adjust this to your needs
# i have, for instance, symlinks:
@@ -9,16 +9,33 @@ HAMMER_LIB = ./lib
CFLAGS += -I$(HAMMER_INCLUDE)
LDFLAGS += -L$(HAMMER_LIB)
.PHONY: all test clean
all: pdf
SOURCES = pdf.c lzw.c
TARGETS = pdf
DOCS = pdf.1.txt
.PHONY: all test clean doc
all: $(TARGETS)
doc: $(DOCS)
test: pdf
LD_LIBRARY_PATH=$(HAMMER_LIB) sh -c \
'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done'
@true
LD_LIBRARY_PATH=$(HAMMER_LIB) test/run.sh
pdf: pdf.c
$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $< -lhammer -lz
pdf: $(SOURCES)
$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz
clean:
rm -f pdf
rm -f $(TARGETS)
MANDOC ?= mandoc
MANDOCFLAGS += -Ios= -Wall
.SUFFIXES: .mdoc .txt .pdf .html
.mdoc:
$(MANDOC) $(MANDOCFLAGS) -Tman $< > $@
.mdoc.txt:
$(MANDOC) $(MANDOCFLAGS) -Tascii $< | col -b > $@
.mdoc.pdf:
$(MANDOC) $(MANDOCFLAGS) -Tpdf $< > $@
.mdoc.html:
$(MANDOC) $(MANDOCFLAGS) -Thtml $< > $@
Beginnings of a PDF parser in Hammer
====================================
- Currently needs a custom Hammer branch. You'll need to build against this:
https://gitlab.special-circumstanc.es/pesco/hammer/tree/pdf
BUILDING
- Help the default Makefile find Hammer
Simply call 'make' in the top level directory.
$ ln -s ../hammer/src hammer
$ ln -s ../hammer/build/opt/src lib
$ make
- Build/Usage:
The environment variables CC, CFLAGS, and LDFLAGS can be used in the usual
way to select the compiler, compiler flags, and linker flags, respectively.
$ make pdf
$ ./pdf test.pdf
This program uses the Hammer parser combinator library. It needs a recent
version, which can be obtained from:
https://gitlab.special-circumstanc.es/hammer/hammer/
See the file README.md in that repository for build/install instructions.
It is recommended to install Hammer as a system library. See also the
TROUBLESHOOTING section below.
USAGE
./pdf [options] input.pdf [oid]
The 'pdf' utility attempts to parse and validate the given PDF file. If
successful, it prints the resulting AST to stdout using a JSON format.
It exits 0 on success, 1 if the input file was found to be invalid, and >1
if an error occurs. The optional oid argument selects a specific object to
print instead of the whole document.
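For example, to print only the latest object with number 3, or
specifically its generation 0:

$ ./pdf input.pdf 3
$ ./pdf input.pdf 3.0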
Refer to the supplied manual page 'pdf.1' for details.
TROUBLESHOOTING
<hammer/hammer.h> or libhammer.so not found:
If Hammer is not installed as a system library or in a nonstandard
location, cc and ld will fail to locate its headers and library. The
quick fix for this is to create symlinks called 'hammer' and 'lib'
pointing to Hammer's source and build output directories, respectively:
$ ln -s ../hammer/src hammer
$ ln -s ../hammer/build/opt/src lib
$ make
Likewise, when running 'pdf' directly, ld.so will fail to locate
libhammer.so. The quick fix is to point LD_LIBRARY_PATH to the 'lib' dir:
$ export LD_LIBRARY_PATH=$PWD/lib
$ ./pdf <filename>
EVALUATING TEST RESULTS
A suite of example files is provided in the test/ directory. To run the
test suite:
# place some test files in the t/ directory...
$ make test
For every file in the test/valid/ and test/invalid/ subdirectories, the pdf
parser is invoked.
For the valid samples, a message of the following form is displayed on a
successful parse (exit code 0):
OK: test/valid/<filename>
Non-fatal messages may be displayed above it, but the presence of the "OK"
line indicates that the test passed. On any nonzero exit, i.e. if either
the file is deemed invalid or the program encountered an unexpected error,
error messages are displayed above a line of the following form that
includes the exact exit code:
FAIL (exit <n>): test/valid/<filename>
For the invalid samples, messages about parse errors are suppressed and an
"OK" is displayed if and only if pdf exits with 1 ("invalid input"). An
exit code of 0 or abnormal termination will produce the "FAIL" message with
any program output appearing above it.
COPYRIGHT
Various authors. Released under the terms of the ISC license.
See LICENSE for full copyright and licensing notice.
- move main routine(s) into separate source file.
- move filter implementation(s) into separate source file.
- fix the object stream parser to split input at logical boundaries, as
provided by the object index ("N pairs of integers") at the beginning of the
stream data.
this follows discussion with Peter Wyatt, who initially said that the
objects should be delimited by normal PDF token rules; PDFA later came
to the conclusion that this was in fact a mistake and that the logical
begin/end info should delimit things. i.e. if your index says that an object
begins at offset 0 and ends at offset 3, followed by one that ends at 6, and
the input is "123456", this parses as two numbers, 123 and 456.
currently the code follows the incorrect former approach, (re-)using the
"elemr" parser that is otherwise used with arrays. the above example would
parse as one element, the number 123456, in contradiction to the index
(which we parse but ignore).
we have to explicitly walk the index, run our "obj" parser on each
respective snippet of input, and wrap the results up in a parse result
(see the sketch below). we should also validate conditions on the index
beforehand. these are thankfully sane (monotonic offsets etc.) and
mentioned in the spec.
we'd like a combinator similar to our hand-rolled p_take(n,env) that works
on any input, not just the global input buffer passed from the environment.
as of Feb 2023, h_bytes(n), which does just that, is available in hammer.
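a minimal sketch of that walk (struct idxent, p_obj, and parse_objstm_body
are illustrative names, not actual code; for clarity this version calls
h_parse on each snippet from the outside, where the in-grammar fix would
slice the input with h_bytes(n) instead):

    #include <stdbool.h>
    #include <hammer/hammer.h>

    extern HParser *p_obj;      /* hypothetical: parses one basic object */

    struct idxent { size_t num, off; };    /* one "pair of integers" */

    static bool
    parse_objstm_body(const uint8_t *data, size_t len,
        const struct idxent *idx, size_t n)
    {
        size_t i, end;

        for (i = 0; i < n; i++) {
            /* an object extends to the next offset, or the end of data */
            end = (i + 1 < n) ? idx[i + 1].off : len;
            if (idx[i].off > end || end > len)
                return false;           /* offsets must be monotonic */

            HParseResult *res = h_parse(p_obj, data + idx[i].off,
                end - idx[i].off);
            if (res == NULL)
                return false;
            /* ... wrap res->ast into the enclosing parse result ... */
        }
        return true;
    }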
- move main routine(s) and filter implementation(s) into separate source
files. e.g.:
- main.c: main function and helpers; starting from its include block
- pdf.c: parser proper; grammar and basic semantic actions
- filter.c: filters
- maybe another file just for xref or stream stuff?
- refactor / clean up the (ascii) filter implementations.
- rid VIOL() of the internal parser for getting at the severity parameter.
this is, i guess, an artefact of h_action() taking a single void pointer of
context, so it was not trivial to pass two arguments (message and severity)
to the action.
- just prefix the message string with a single digit or something; this
can be as simple as we need it to be (see the sketch below).
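a minimal sketch of the digit prefix (the message string is the single
void * of context that h_action() passes along; names are illustrative):

    #include <stdio.h>
    #include <hammer/hammer.h>

    static HParsedToken *
    act_viol(const HParseResult *p, void *u)
    {
        const char *s = u;
        int severity = s[0] - '0';    /* leading digit: 0..9 */

        fprintf(stderr, "violation (severity %d): %s\n", severity, s + 1);
        return NULL;
    }

    /* hypothetical use: h_action(p, act_viol, "2missing EOL after 'obj'") */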
- rework VIOL to produce a "violation" token in the AST (via h_action)?
- a validation (h_attr_bool) would let the parse fail if applicable
(severity vs. strictness).
- it is not clear how to externally store/retrieve information about fatal
violations, since they may occur in branches of the parse that may end
up being backed out of due to unrelated reasons.
- extract and print non-fatal violations after the parse?
- just leaving the violation tokens in the AST would mean that any
semantic could encounter them anywhere and would have to handle that
correctly. this seems ugly and prone to being forgotten.
- the current design mixes validation and semantic parsing ("is it valid? what
does it mean?") with diagnostic parsing ("what happened? what did you
see?").
- a strict division of the two jobs was originally intended, with the
pdf_dbg parser (pdf_diag would be a better name) only running after the
strictly validating parser proper failed.
- pdf_dbg was a quick and rough first stab that follows a more lenient
grammar to get at least an approximate location of whatever caused the
first parser to fail.
- running the parser twice (with all its memory allocation and so on) is not
efficient but seemed good enough for a first step. one could fold both
parsers into one at the very top and distinguish the results by token
type. this could probably reuse many objects from the packrat cache.
- working with parser combinators, we can use the full abstraction
facilities of our programming language. we should be able to use them to
factor out similarities between the two parsers (avoid code duplication).
- (maybe?) change stream parsing to just stop at "endstream endobj" when
/Length is indirect and the filter or postordinate parser doesn't delimit
itself (see the sketch below). this is not strictly to-spec, but probably
an OK restriction to make in practice. a consistency check can be made
against the length after all objects have been parsed.
note: the current design aims to follow the spec to the letter in that the
/Length entry of a stream determines its length, and nothing else. from this
it follows that we must find and parse these lengths in "island style".
thus, the current code is a hybrid of linear and island parsing. if the
reliance on /Length can be broken, the island-based resolver can go and we
can have a proper split between two separate parsers - one pure linear and
one pure island.
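a rough sketch of that fallback in plain hammer combinators (rule and
function names are made up; a real rule would also have to handle the
EOL before the keyword and consistency-check /Length afterwards):

    #include <hammer/hammer.h>
    #include <hammer/glue.h>

    /* consume body bytes up to and including the "endstream" keyword */
    static HParser *
    stream_body_fallback(void)
    {
        H_RULE(endstream, h_token((const uint8_t *)"endstream", 9));
        H_RULE(body_byte, h_right(h_not(endstream), h_uint8()));

        return h_left(h_many(body_byte), endstream);
    }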
- parse and print content streams.
- parse/validate additional stream types/filters (images...).
- implement random-access ("island") parser (walking objects from /Root).
i'm not sure how much we need to know about the "DOM" for this. maybe
nothing? since everything is built out of basic objects and we can just
blindly follow references?
- check linear and random-access parses for consistency.
- investigate memory use on big documents (millions of objects).
- replace disparate parsing routines (applied to different pieces of input)
with one big HParser that uses h_seek() to move around. this will enable
@@ -11,19 +97,10 @@
- parse stream objects without reference to their /Length entry by simply
trying all possible ways and consistency-checking them against the xref
table in the end, via h_attr_bool().
XXX is this actually possible (without unreasonable complications)?
- include position information, at least for objects, in the (JSON) output.
- format warnings/errors (stderr) as JSON, too.
- make custom token types for all appropriate parts of the parse result.
- parse content streams.
- implement random-access parser (walking objects from /Root).
- check linear and random-access parses for consistency.
- handle garbage before %PDF- and after %%EOF
- handle garbage at other points in the input?
- investigate memory use on big documents (millions of objects).
- add ASCII filter types.
- add LZW filter.
- make custom token types for all appropriate parts of the parse result so
that they can be properly distinguished in the output.
- include position information, at least for objects, in the (JSON) output.
#include <hammer/hammer.h>
#include <hammer/glue.h>
#include <assert.h>
#include <stdlib.h> /* malloc, free */
#include <string.h> /* memcpy */
#include "lzw.h"
struct context {
/*
* Storing byte sequences represented by each LZW code.
*
* Codes 0-255 are predefined representing literals.
* Codes 256 and 257 are the special clear and eod (end of data) codes.
* Codes >257 are dynamically defined by the input.
*
* Each dynamically defined code is an extension of a previously
* defined code. We therefore need only store the code being extended
* and the byte being added.
*
* Thus the entries of this array form linked lists. To terminate the
* lists while making our memory allocation easy, we store the length
* of each code's output sequence.
*
* Finally, we redundantly store the first byte of the sequence so we
* don't have to walk the list during updates (see act_output).
*/
struct {
size_t len; /* length of the sequence */
int prefix; /* code representing the seq's prefix (len-1) */
uint8_t last; /* final byte of the sequence */
uint8_t first; /* first byte of the sequence */
} table[4097]; /* 4096 codes + one dummy (see act_output) */
/*
* The next code to be assigned, i.e. the current size of the table.
*/
int next;
/*
* earlychange = 1 means the bit size is increased "one code early".
* earlychange = 0 is "code length increases shall be postponed as
* long as possible".
*/
int earlychange;
};
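/*
 * Worked example (illustrative): if the input defines code 258 as an
 * extension of 'A' (0x41) by 'B', the entry becomes
 *
 *     table[258] = { .len = 2, .prefix = 0x41, .first = 'A', .last = 'B' }
 *
 * and a further code 259 extending 258 by 'C' becomes
 *
 *     table[259] = { .len = 3, .prefix = 258, .first = 'A', .last = 'C' }
 *
 * Decoding 259 walks the prefix chain 259 -> 258 -> 0x41 -> -1 (see
 * lzw_code_string), filling the output "ABC" from last byte to first.
 */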
/*
* Helpers for working with the table:
*/
static void
lzw_clear_table(struct context *ctx)
{
ctx->next = 258;
}
/*
* Update the dictionary with a new entry that extends the given code by one
* byte to be filled in later.
*/
static void
lzw_table_extend(struct context *ctx, int code)
{
ctx->table[ctx->next].prefix = code;
ctx->table[ctx->next].first = ctx->table[code].first;
ctx->table[ctx->next].len = ctx->table[code].len + 1;
ctx->table[ctx->next].last = 0xFF;
ctx->next++;
}
/*
* Assemble the output sequence represented by the given code word.
* The given buffer must have the appropriate size.
* Returns the number of bytes written.
*/
static size_t
lzw_code_string(struct context *ctx, int code, uint8_t *buf)
{
size_t i, n;
/* traverse the list, filling buf from last to first byte */
n = ctx->table[code].len;
for (i = 0; i < n; i++) {
buf[n - 1 - i] = ctx->table[code].last;
code = ctx->table[code].prefix;
}
assert(code == -1); /* reached the end */
return n;
}
/*
* Global variables:
*/
HParser *p_lzwdata;
static struct context *context;
/*
* Semantic actions and validations:
*/
static HParsedToken *
act_clear(const HParseResult *p, void *u)
{
struct context *ctx = u;
lzw_clear_table(ctx);
return NULL;
}
static bool
validate_clear(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
return (code == 256);
}
static bool
validate_eod(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
return (code == 257);
}
static bool
validate_output(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
struct context *ctx = u;
return (code != 256 && code != 257 && code < ctx->next);
}
static HParsedToken *
act_output(const HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
struct context *ctx = u;
assert(ctx->next >= 258);
assert(ctx->next <= 4097);
assert(code < ctx->next);
assert(code != 256);
assert(code != 257);
/* Fill in the missing last byte of the last assigned code, if any. */
if (ctx->next > 258)
ctx->table[ctx->next - 1].last = ctx->table[code].first;
/*
* Update the dictionary with a new entry that is missing the last
* byte which we will only learn when we process the next code.
*
* Note that the value 4097 is intentional here. Rather than going
* through the effort of ensuring that the last code is only updated
* once, we simply assign one more code as a dummy.
*/
if (ctx->next < 4097)
lzw_table_extend(ctx, code);
/* Pass the code through. Output is generated in act_lzwblock below. */
return (HParsedToken *)p->ast; // XXX casting away the const OK?
}
/*
* Assemble the string represented by a block of code words under a given
* table. The incoming HParsedToken is a sequence of code words (TT_UINT).
*/
static HParsedToken *
act_lzwblock(const HParseResult *p, void *u)
{
HCountedArray *seq = H_CAST_SEQ(p->ast);
struct context *ctx = u;
uint8_t *buf, *cur;
size_t sz, i;
int code;
/* determine total output size, alloc buffer */
sz = 0;
for (i = 0; i < seq->used; i++) {
code = (int) H_CAST_UINT(seq->elements[i]);
sz += ctx->table[code].len;
}
buf = h_arena_malloc(p->arena, sz);
/* go through sequence, merge output bytes into buf */
cur = buf;
for (i = 0; i < seq->used; i++) {
code = (int) H_CAST_UINT(seq->elements[i]);
cur += lzw_code_string(ctx, code, cur);
}
assert(cur == buf + sz);
return H_MAKE_BYTES(buf, sz);
}
/*
* Concatenate blocks to form the final output string.
* The incoming HParsedToken is a sequence of HBytes.
*/
static HParsedToken *
act_lzwdata(const HParseResult *p, void *u)
{
HCountedArray *seq = H_CAST_SEQ(p->ast);
HBytes bs;
uint8_t *buf, *cur;
size_t sz, i;
/* fast path: single element? nothing to do */
if (seq->used == 1)
return seq->elements[0];
/* determine total output size, alloc buffer */
sz = 0;
for (i = 0; i < seq->used; i++)
sz += H_CAST_BYTES(seq->elements[i]).len;
buf = h_arena_malloc(p->arena, sz);
/* go through sequence, copying bytes into buf */
cur = buf;
for (i = 0; i < seq->used; i++) {
bs = H_CAST_BYTES(seq->elements[i]);
memcpy(cur, bs.token, bs.len);
cur += bs.len;
}
assert(cur == buf + sz);
return H_MAKE_BYTES(buf, sz);
}
/*
* Continuation for h_bind() in the 'codeword' grammar rule. It inspects the
* lzw context passed as 'env' and returns the parser of the correct size.
*
* NB: We create the returned parsers statically in init_LZW_parser() to avoid
* allocation during the parse.
*/
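/*
 * Example (with earlychange = 1, the PDF default): code words are read
 * at 9 bits while ctx->next <= 511, at 10 bits while ctx->next <= 1023,
 * and so on up to 12 bits - one code earlier than the table size alone
 * would require.
 */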
static HParser *p_code9, *p_code10, *p_code11, *p_code12;
static HParser *
kcodeword(HAllocator *mm__, const HParsedToken *x, void *env)
{
struct context *ctx = env;
if (ctx->next <= 512 - ctx->earlychange)
return p_code9;
else if (ctx->next <= 1024 - ctx->earlychange)
return p_code10;
else if (ctx->next <= 2048 - ctx->earlychange)
return p_code11;
else
return p_code12;
}
/*
* Exposed interface:
*/
void
init_LZW_parser(void)
{
int i;
/* initialize global context variable, incl. static table entries */
context = malloc(sizeof *context);
assert(context != NULL);
for(i = 0; i < 256; i++)
{
context->table[i].len = 1;
context->table[i].prefix = -1; /* none */
context->table[i].first = i;
context->table[i].last = i;
}
init_LZW_context(1);
/* static parsers for code words of all possible sizes */
p_code9 = h_bits(9, false);
p_code10 = h_bits(10, false);
p_code11 = h_bits(11, false);
p_code12 = h_bits(12, false);
/* kcodeword() selects the appropriate parser based on context */
H_RULE (codeword, h_bind(h_epsilon_p(), kcodeword, context));
H_VRULE (eod, codeword);
H_AVDRULE(clear, codeword, context);
H_AVDRULE(output, codeword, context);
H_ADRULE(lzwblock, h_right(clear, h_many(output)), context);
H_ARULE (lzwdata, h_left(h_many1(lzwblock), eod));
// XXX validate that the last byte is zero-padded?
// XXX require h_end_p()?
p_lzwdata = lzwdata;
}
HParseResult *
parse_LZW_data(const uint8_t *input, size_t length)
{
HParseResult *res = h_parse(p_lzwdata, input, length);
return res;
}
void
init_LZW_context(int earlychange)
{
lzw_clear_table(context);
context->earlychange = !!earlychange;
}
#ifndef PDF_LZW_H
#define PDF_LZW_H
#include <hammer/hammer.h>
void init_LZW_parser(void);
HParseResult *parse_LZW_data(const uint8_t *input, size_t length);
void init_LZW_context(int earlychange);
#endif // PDF_LZW_H
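A minimal usage sketch for this interface (the caller is hypothetical and
assumes init_LZW_parser() has been called once at startup):

    #include <stdio.h>
    #include "lzw.h"

    static void
    decode_stream(const uint8_t *data, size_t len, int earlychange)
    {
        HParseResult *res;

        init_LZW_context(earlychange);    /* reset code table per stream */
        res = parse_LZW_data(data, len);
        if (res == NULL) {
            fprintf(stderr, "LZW: parse error\n");
            return;
        }

        /* the result AST is a single bytes token with the decoded data */
        fwrite(res->ast->bytes.token, 1, res->ast->bytes.len, stdout);
    }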
.Dd $Mdocdate$
.Dt PDF 1
.Os
.Sh NAME
.Nm pdf
.Nd validation and inspection of PDF files
.Sh SYNOPSIS
.Nm pdf
.Op Fl qsv
.Op Fl d Ar what
.Op Fl x Ar txtfile
.Ar input.pdf
.Op Ar oid
.Sh DESCRIPTION
The
.Nm
utility attempts to parse and validate the given PDF file.
It prints the resulting AST to standard output using a JSON format.
.Pp
The optional
.Ar oid
argument selects a specific object to be printed instead of the whole document.
It is expected to be of the form
.Dq Va n . Ns Va g
where
.Va n
and
.Va g
are object and generation numbers, respectively.
The generation number may be omitted to select the latest object matching
.Va n .
.Pp
The options are as follows:
.Bl -tag -width Ds
.It Fl d Cm s
Dump the body data, after filter decoding, of a given stream object.
An
.Ar oid
argument is required.
.It Fl q
Query/quiet mode.
Do not print to standard output and suppress any messages about parse errors.
Just indicate success or failure via the exit status.
.It Fl s
Strict mode.
Treat most
.Dq benign
format violations as parse errors.
.It Fl v
Verbose mode.
Show additional informational messages.
.It Fl x Ar txtfile
Extract the text content of the input document and write it as plain
text to
.Ar txtfile .
.El
.Sh EXIT STATUS
The program exits 0 on successful execution with valid (conforming) input.
An exit code of 1 indicates that the parser identified the input file as
invalid but otherwise executed normally.
Exit codes >1 indicate abnormal termination, i.e. program failure with
indeterminate parse result.
.Sh STANDARDS
.Rs
.%R ISO 32000-1
.%T Document management \(em Portable document format \(em \
Part 1: PDF 1.7
.%D 2008
.Re
.Pp
.Rs
.%R ISO 32000-2
.%T Document management \(em Portable document format \(em \
Part 2: PDF 2.0
.%D 2020
.Re
PDF(1) General Commands Manual PDF(1)
NAME
pdf - validation and inspection of PDF files
SYNOPSIS
pdf [-qsv] [-d what] [-x txtfile] input.pdf [oid]
DESCRIPTION
The pdf utility attempts to parse and validate the given PDF file. It
prints the resulting AST to standard output using a JSON format.
The optional oid argument selects a specific object to be printed instead
of the whole document. It is expected to be of the form "n.g" where n
and g are object and generation numbers, respectively. The generation
number may be omitted to select the latest object matching n.
The options are as follows:
-d s Dump the body data, after filter decoding, of a given stream
object. An oid argument is required.
-q Query/quiet mode. Do not print to standard output and suppress
any messages about parse errors. Just indicate success or
failure via the exit status.
-s Strict mode. Treat most "benign" format violations as parse
errors.
-v Verbose mode. Show additional informational messages.
-x txtfile
Extract the text content of the input document and write it as
plain text to txtfile.
EXIT STATUS
The program exits 0 on successful execution with valid (conforming)
input. An exit code of 1 indicates that the parser identified the input
file as invalid but otherwise executed normally. Exit codes >1 indicate
abnormal termination, i.e. program failure with indeterminate parse
result.
STANDARDS
Document management -- Portable document format -- Part 1: PDF 1.7, ISO
32000-1, 2008.
Document management -- Portable document format -- Part 2: PDF 2.0, ISO
32000-2, 2020.
January 6, 2023
[one diff collapsed; 9 files added]