Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (438)
......@@ -2,7 +2,7 @@
*~
Session.vim
pdf
pdf.1
*.core
hammer
lib
t/*.pdf
Copyright (c) 2019, 2020 Sven M. Hallberg <pesco@khjk.org>
Copyright (c) 2020 pompolic <pompolic@special-circumstanc.es>
Copyright (c) 2019 - 2022 Sven M. Hallberg <pesco@khjk.org>
Copyright (c) 2020 - 2022 pompolic <pompolic@special-circumstanc.es>
Copyright (c) 2020 Paul Vines <paul.vines@baesystems.com>
Copyright (c) 2020, 2021 Kragen Sitaker <xentrac@special-circumstanc.es>
Copyright (c) 2021, 2022 Sumit Ray <sumit.ray@baesystems.com>
Copyright (c) 2022 Meg Gordon <meg@special-circumstanc.es>
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
......
CFLAGS += -std=c99 -Wall -Werror -DLOG
CFLAGS += -std=c99 -Wall -Werror -DLOG -D_POSIX_C_SOURCE=2
# find our hammer build - adjust this to your needs
# i have, for instance, symlinks:
......@@ -8,18 +8,34 @@ HAMMER_INCLUDE = .
HAMMER_LIB = ./lib
CFLAGS += -I$(HAMMER_INCLUDE)
LDFLAGS += -L$(HAMMER_LIB)
SOURCES = pdf.c lzw-lib.c
.PHONY: all test clean
all: pdf
SOURCES = pdf.c lzw.c
TARGETS = pdf
DOCS = pdf.1.txt
.PHONY: all test clean doc
all: $(TARGETS)
doc: $(DOCS)
test: pdf
LD_LIBRARY_PATH=$(HAMMER_LIB) sh -c \
'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done'
@true
LD_LIBRARY_PATH=$(HAMMER_LIB) test/run.sh
pdf: $(SOURCES)
$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz
clean:
rm -f pdf
rm -f $(TARGETS)
MANDOC ?= mandoc
MANDOCFLAGS += -Ios= -Wall
.SUFFIXES: .mdoc .txt .pdf .html
.mdoc:
$(MANDOC) $(MANDOCFLAGS) -Tman $< > $@
.mdoc.txt:
$(MANDOC) $(MANDOCFLAGS) -Tascii $< | col -b > $@
.mdoc.pdf:
$(MANDOC) $(MANDOCFLAGS) -Tpdf $< > $@
.mdoc.html:
$(MANDOC) $(MANDOCFLAGS) -Thtml $< > $@
Beginnings of a PDF parser in Hammer
====================================
- Currently needs a custom Hammer branch. You'll need to build against this:
https://gitlab.special-circumstanc.es/pesco/hammer/tree/pdf
BUILDING
For detailed build instructions, see README.md in that repository.
Simply call 'make' in the top level directory.
- Help the default Makefile find Hammer
$ make
$ ln -s ../hammer/src hammer # needed for building pdf, include files
$ ln -s ../hammer/build/opt/src lib # needed for running pdf, to locate libhammer.so
The environment variables CC, CFLAGS, and LDFLAGS can be used in the usual
way to control the compiler, the compiler flags, and the linker flags,
respectively.
- Notes for 2020-04-27 release:
This program uses the Hammer parser combinator library. It needs a recent
version, which can be obtained from:
The release branch has been tested to build with the 2020-04-27_RELEASE branch located at https://gitlab.special-circumstanc.es/pesco/hammer/tree/2020-04-27_RELEASE
https://gitlab.special-circumstanc.es/hammer/hammer/
- Build:
See the file README.md in that repository for build/install instructions.
It is recommended to install Hammer as a system library. See also the
TROUBLESHOOTING section below.
$ pushd ../hammer; scons; popd # build Hammer
$ make pdf
- Usage:
USAGE
$ export LD_LIBRARY_PATH=./lib # see Troubleshooting section below to see if this is needed
$ ldd ./pdf | grep libhammer # verify that libhammer.so was found
$ ./pdf <filename>
./pdf [options] input.pdf [oid]
# place some test files in the t/ directory...
$ make test
The 'pdf' utility attempts to parse and validate the given PDF file. If
successful, it prints the resulting AST to stdout using a JSON format.
It exits 0 on success, 1 if the input file was found to be invalid, and >1
if an error occurs. The optional oid argument selects a specific object to
print instead of the whole document.
Refer to the supplied manual page 'pdf.1' for details.
TROUBLESHOOTING
<hammer/hammer.h> or libhammer.so not found:
- Troubleshooting:
If Hammer is not installed as a system library or in a nonstandard
location, cc and ld will fail to locate its headers and library. The
quick fix for this is to create symlinks called 'hammer' and 'lib'
pointing to Hammer's source and build output directories, respectively:
libhammer.so not found:
$ ln -s ../hammer/src hammer
$ ln -s ../hammer/build/opt/src lib
$ make
If Hammer is not installed as a system library, ld may fail to locate libhammer.so. The quick fix for this is altering LD_LIBRARY_PATH before running pdf:
Likewise, when running 'pdf' directly, ld.so will fail to locate
libhammer.so. The quick fix is to point LD_LIBRARY_PATH to the 'lib' dir:
$ export LD_LIBRARY_PATH=./lib
$ make test
$ export LD_LIBRARY_PATH=$PWD/lib
$ ./pdf <filename>
The second solution is executing "scons install" when building Hammer, which will install it in ld's usual search path:
$ pushd ../hammer; scons install; popd
# ... Update ldconfig cache if needed
$ make pdf
$ make test
EVALUATING TEST RESULTS
- Evaluating test results:
A suite of example files is provided in the test/ directory. To run the
test suite:
$ make test
For every file in the t/ directory, the pdf parser is executed. On successful parse, a message of the following form is displayed:
For every file in the test/valid/ and test/invalid/ subdirectories, the pdf
parser is invoked.
For the valid samples, a message of the following form is displayed on a
successful parse (exit code 0):
OK: test/valid/<filename>
Non-fatal messages may be displayed above it, but the presence of the "OK"
indicates that the test passed. On any nonzero exit, i.e. if the file is
deemed invalid or the program encounters an unexpected error, error messages
are displayed above an indication of the following form, which includes the
exact exit code:
OK: t/<filename>
FAIL (exit <n>): test/valid/<filename>
In case of a non-fatal parse error, error messages may be displayed, but presence of the "OK" indicates pdf exited successfully. On a failed test run, only parse error messages are displayed.
For the invalid samples, messages about parse errors are suppressed and an
"OK" is displayed if and only if pdf exits with 1 ("invalid input"). An
exit code of 0 or abnormal termination will produce the "FAIL" message with
any program output appearing above it.
- Copyright:
- pesco 2019,2020
- pompolic 2020
- Paul Vines 2020
- David Bryant (modified lzw-ab code)
COPYRIGHT
See LICENSE and lzw-ab-license.txt for full copyright and licensing notice.
Various authors. Released under the terms of the ISC license.
See LICENSE for full copyright and licensing notice.
......@@ -19,6 +19,10 @@
should also validate conditions on the index beforehand. these are
thankfully sane (monotonic offsets etc.) and mentioned in the spec.
we'd like a combinator similar to our hand-rolled p_take(n,env) that works
on any input, not just the global input buffer passed from the environment.
as of feb 2023, h_bytes(n) which does just that is available in hammer.
- move main routine(s) and filter implementation(s) into separate source
files. e.g.:
- main.c: main function and helpers; starting from its include block
......@@ -28,14 +32,38 @@
- refactor / clean up the (ascii) filter implementations.
- rework VIOL to produce a "violation" token in the AST (via h_action). then,
a validation (h_attr_bool) should let the parse fail if applicable (severity
vs. strictness). non-fatal violations should be extracted and printed to
stderr after the parse.
- somehow rid VIOL() of the internal parser for getting at the severity
parameter. this is, i guess, an artefact of h_action() taking a single void
pointer of context, so it was not trivial to pass two arguments (message and
severity) to the action.
- rid VIOL() of the internal parser for getting at the severity parameter.
this is, i guess, an artefact of h_action() taking a single void pointer of
context, so it was not trivial to pass two arguments (message and severity)
to the action.
- just prefix the message string with a single digit or something.
this can be as simple as we need it to be.
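  e.g. a rough sketch of the idea (helper name and message purely
  illustrative, not the actual VIOL plumbing):

    #include <stdio.h>

    /* severity is encoded as the first character of the context string, so a
     * single void pointer can carry both the message and the severity */
    static void report_violation(const char *s)
    {
        int severity = s[0] - '0';   /* e.g. "2endobj missing" -> severity 2 */
        const char *msg = s + 1;
        fprintf(stderr, "violation (severity %d): %s\n", severity, msg);
    }

    int main(void) { report_violation("2endobj missing"); return 0; }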
- rework VIOL to produce a "violation" token in the AST (via h_action)?
- a validation (h_attr_bool) would let the parse fail if applicable
(severity vs. strictness).
- it is not clear how to externally store/retrieve information about fatal
violations, since they may occur in branches of the parse that may end
up being backed out of due to unrelated reasons.
- extract and print non-fatal violations after the parse?
- just leaving the violation tokens in the AST would mean that any
semantic could encounter them anywhere and would have to handle that
correctly. this seems ugly and prone to being forgotten.
- the current design mixes validation and semantic parsing ("is it valid? what
does it mean?") with diagnostic parsing ("what happened? what did you
see?").
- a strict division of the two jobs was originally intended, with the
pdf_dbg parser (pdf_diag would be a better name) only running after the
strictly validating parser proper failed.
- pdf_dbg was a quick and rough first stab that follows a more lenient
grammar to get at least an approximate location of whatever caused the
first parser to fail.
- running the parser twice (with all its memory allocation and so on) is not
efficient but seemed good enough for a first step. one could fold both
parsers into one at the very top and distinguish the results by token
type. this could probably reuse many objects from the packrat cache.
- working with parser combinators, we can use the full abstraction
facilities of our programming language. we should be able to use them to
factor out similarities between the two parsers (avoid code duplication).
- (maybe?) change stream parsing to just stop at "endstream endobj" when
/Length is indirect and the filter or postordinate parser doesn't delimit
......@@ -54,31 +82,6 @@
- parse and print content streams.
- parse/validate additional stream types/filters (images...).
- consider reviving the effort to get "obj" to parse with LALR. the messy
grammar for arrays with "elemd", "elemr", etc. still stems from that effort, as
does the explicit handling of whitespace -- note that TOK() is only used in
KW() and that no instances of KW() remain under "obj".
alternatively, consider fully reverting the grammar to its clearer PEG form.
i would probably keep the explicit whitespace, though.
what stopped me before was the difficulty of resolving some things without
precedence rules; specifically line endings in string literals.
is <CR><LF> a "crlf" or a "cr" followed by an "lf"? LALR cannot decide
unless you encode that anything following a "cr" doesn't start with <LF>.
string literals are currently defined differently. the best way to do it,
AFAICS, would be to match (in string literals) all subsequent line endings
in one nonterminal and to encode there that a plain "cr" is never followed
by "lf".
FWIW, the motivation for LALR parsing of "obj" was the prospect of parsing
an object stream incrementally, as chunks come in from the decompressor
(or an arbitrary filter chain).
NB: the reason why we must distinguish "crlf" from "cr" "lf" at all is of
course that in a string literal, the former means "\n" and the latter means
"\n\n".
- implement random-access ("island") parser (walking objects from /Root).
i'm not sure how much we need to know about the "DOM" for this. maybe
nothing? since everything is built out of basic objects and we can just
......
Copyright (c) David Bryant
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Conifer Software nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
////////////////////////////////////////////////////////////////////////////
// **** LZW-AB **** //
// Adjusted Binary LZW Compressor/Decompressor //
// Copyright (c) 2016 David Bryant //
// All Rights Reserved //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lzw-lib.h"
/* This library implements the LZW general-purpose data compression algorithm.
* The algorithm was originally described as a hardware implementation by
* Terry Welch here:
*
* Welch, T.A. “A Technique for High-Performance Data Compression.”
* IEEE Computer 17,6 (June 1984), pp. 8-19.
*
* Since then there have been innumerable refinements and variations on the
* basic technique, and this implementation is no different. The target of
* the present implementation is embedded systems, and so emphasis was placed
* on simplicity, fast execution, and minimal RAM usage.
*
* The symbols are stored in adjusted binary, which provides considerably
* better compression performance with virtually no speed penalty compared to
* the fixed sizes normally used. To ensure good performance on data with
* varying characteristics (like executable images) the encoder resets as
* soon as the dictionary is full. Also, worst-case performance is limited
* to about 8% inflation by catching poor performance and forcing an early
* reset before longer symbols are sent.
*
* The maximum symbol size is configurable on the encode side (from 9 bits
* to 12 bits) and determines the RAM footprint required by both sides and,
* to a large extent, the compression performance. This information is
* communicated to the decoder in the first stream byte so that it can
* allocate accordingly. The RAM requirements are as follows:
*
* maximum encoder RAM decoder RAM
* symbol size requirement requirement
* -----------------------------------------
* 9-bit 1792 bytes 1024 bytes
* 10-bit 4352 bytes 3072 bytes
* 11-bit 9472 bytes 7168 bytes
* 12-bit 19712 bytes 15360 bytes
*
* This implementation uses malloc(), but obviously an embedded version could
* use static arrays instead if desired (assuming that the maxbits was
* controlled outside).
*/
#define NULL_CODE -1 // indicates a NULL prefix
#define CLEAR_CODE 256 // code to flush dictionary and restart decoder
#define EOD_CODE 257 // used in PDF's LZWDecode to signal end of data
#define FIRST_STRING 258 // code of first dictionary string, PDF edition
/* This macro writes the adjusted-binary symbol "code" given the maximum
* symbol "maxcode". A macro is used here just to avoid the duplication in
* the lzw_compress() function. The idea is that if "maxcode" is not one
* less than a power of two (which it rarely will be) then this code can
* often send fewer bits than would be required with a fixed-sized code.
*
* For example, the first code we send will have a "maxcode" of 257, so
* every "code" would normally consume 9 bits. But with adjusted binary we
* can actually represent any code from 0 to 253 with just 8 bits -- only
* the 4 codes from 254 to 257 take 9 bits.
*/
#define WRITE_CODE(code,maxcode) do { \
int code_bits = (maxcode) < 1024 ? \
((maxcode) < 512 ? 8 : 9) : \
((maxcode) < 2048 ? 10 : 11); \
int extras = (1 << (code_bits + 1)) - (maxcode) - 1; \
if ((code) < extras) { \
shifter |= ((long)(code) << bits); \
bits += code_bits; \
} \
else { \
shifter |= ((long)(((code) + extras) >> 1) << bits); \
bits += code_bits; \
shifter |= ((long)(((code) + extras) & 1) << bits++); \
} \
do { (*dst)(shifter); shifter >>= 8; output_bytes++; \
} while ((bits -= 8) >= 8); \
} while (0)
/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
* "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines
* the RAM requirement and, to a large extent, the level of compression achievable. A return
* value of EOF from the "src" callback terminates the compression process. A non-zero return
* value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().
*/
int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits)
{
int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c;
unsigned long input_bytes = 0, output_bytes = 0;
short *first_references, *next_references;
unsigned char *terminators;
unsigned long shifter = 0;
if (maxbits < 9 || maxbits > 12) // check for valid "maxbits" setting
return 1;
// based on the "maxbits" parameter, compute total codes and allocate dictionary storage
total_codes = 1 << maxbits;
first_references = malloc (total_codes * sizeof (first_references [0]));
next_references = malloc ((total_codes - 256) * sizeof (next_references [0]));
terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
if (!first_references || !next_references || !terminators)
return 1; // failed malloc()
// clear the dictionary
memset (first_references, 0, total_codes * sizeof (first_references [0]));
memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
(*dst)(maxbits - 9); // first byte in output stream indicates the maximum symbol bits
// This is the main loop where we read input bytes and compress them. We always keep track of the
// "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
// has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"
// variables and are sent to the output every time 8 bits are available (done in the macro).
while ((c = (*src)()) != EOF) {
int cti; // coding table index
input_bytes++;
if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix
prefix = c;
continue;
}
if ((cti = first_references [prefix])) { // if any longer strings are built on the current prefix...
while (1)
if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix
prefix = cti; // to that string and continue without sending anything
break;
}
else if (!next_references [cti - 256]) { // this string did not match the new character and
next_references [cti - 256] = next; // there aren't any more, so we'll add a new string
cti = 0; // and point to it with "next_reference"
break;
}
else
cti = next_references [cti - 256]; // there are more possible matches to check, so loop back
}
else // no longer strings are based on the current prefix, so now
first_references [prefix] = next; // the current prefix plus the new byte will be the next string
// If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
// dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
// dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.
if (!cti) {
WRITE_CODE (prefix, next); // send symbol for current prefix (0 to next-1)
terminators [next - 256] = c; // newly created string has current byte as the terminator
prefix = c; // current byte also becomes new prefix for next string
// This is where we bump the next string index and decide whether to clear the dictionary and start over.
// The triggers for that are either the dictionary is full or we've been outputting too many bytes and
// decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we
// do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant.
if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) {
if (next < total_codes)
WRITE_CODE (CLEAR_CODE, next);
// clear the dictionary and reset the byte counters -- basically everything starts over
// except that we keep the last pending "prefix" (which, of course, was never sent)
memset (first_references, 0, total_codes * sizeof (first_references [0]));
memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
input_bytes = output_bytes = 0;
next = FIRST_STRING;
}
}
}
// we're done with input, so if we've received anything we still need to send that pesky pending prefix...
if (prefix != NULL_CODE) {
WRITE_CODE (prefix, next);
if (++next == total_codes) // watch for clearing to the first string to stay in step with the decoder!
next = FIRST_STRING; // (this was actually a corner-case bug that did not trigger often)
}
WRITE_CODE (next, next); // the maximum possible code is always reserved for our END_CODE
if (bits) // finally, flush any pending bits from the shifter
(*dst)(shifter);
free (terminators); free (next_references); free (first_references);
return 0;
}
/* LZW decompression function. Bytes (8-bit) are read and written through callbacks.
 * A non-zero return value indicates an error, which in this case can be a failed
 * malloc() or an EOF read from the input stream before the data terminates
 * naturally with EOD_CODE (the latter should not normally occur).
 */
int lzw_decompress (void (*dst)(int), int (*src)(void))
{
int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes;
unsigned char *terminators, *reverse_buffer;
unsigned long shifter = 0;
short *prefixes;
// PDF specific change: maxbits is not in the input stream
// we'll just be pessimistic and allocate the maximal size buffer
total_codes = 4096;
reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0]));
prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0]));
terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
if (!reverse_buffer || !prefixes || !terminators) // check for malloc() failure
return 1;
// This is the main loop where we read input symbols. The values range from 0 to the code value
// of the "next" string in the dictionary. Note that receiving an EOF from the input
// stream is actually an error because we should have gotten the EOD_CODE first.
while (1) {
int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12) ), code;
#define TOP_BITMASK (((1 << code_bits) - 1) << (bits - code_bits) )
#define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1)
do {
if ((read_byte = ((*src)())) == EOF) {
free (terminators); free (prefixes); free (reverse_buffer);
return 1;
}
/* shifter reworked: everything shifted left by a byte,
* and the byte we just read becomes the least significant
* byte */
// prepare to shift in next byte
shifter <<= 8;
/* the bitstrings forming the symbols are stored MSB first,
* so we can just OR in the next byte */
shifter |= (unsigned long) read_byte;
} while ((bits += 8) < code_bits);
/* for a 12-bit code, the shifter's bits now look like
* from MSB to LSB: 00...0cccccccccn...n
* where c are the bits of our code
* and n are the bits we're not yet interested in
* the number of times n is repeated is bits - code_bits
* ie. the number of bits read in minus the bits we're interested in */
// shift our code bits into their proper place, and save the result as the final code
code = (int) shifter >> (bits - code_bits);
/* we can now clear the shifter's top bits. the result looks like:
* 00...0n...n
* number of n is bits-code_bits
* */
shifter &= BOTTOM_BITMASK;
// update the count of bits remaining in the shifter
bits -= code_bits;
if (code == EOD_CODE) // In PDF, EOD is signalled by 257, rather than the max code
break;
else if (code == CLEAR_CODE) // otherwise check for a CLEAR_CODE to start over early
next = FIRST_STRING;
else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent
(*dst)(code); // literally and becomes our initial prefix
next++;
}
// Otherwise we have a valid prefix so we step through the string from end to beginning storing the
// bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
// we have to handle here is that the string might be the same one that is actually being defined
// now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and
// not allocated, so that messes things up a bit.
else {
int cti = (code == next-1) ? prefix : code;
unsigned char *rbp = reverse_buffer, c;
do *rbp++ = cti < 256 ? cti : terminators [cti - 256]; // step backward through string...
while ((cti = (cti < 256) ? NULL_CODE : prefixes [cti - 256]) != NULL_CODE);
c = *--rbp; // the first byte in this string is the terminator for the last string, which is
// the one that we'll create a new dictionary entry for this time
do (*dst)(*rbp); // send string in corrected order (except for the terminator
while (rbp-- != reverse_buffer); // which we don't know yet)
if (code == next-1)
(*dst)(c);
prefixes [next - 1 - 256] = prefix; // now update the next dictionary entry with the new string
terminators [next - 1 - 256] = c; // (but we're always one behind, so it's not the string just sent)
if (++next == total_codes) // check for full dictionary, which forces a reset (and, BTW,
next = FIRST_STRING; // means we'll never use the dictionary entry we just wrote)
}
prefix = code; // the code we just received becomes the prefix for the next dictionary string entry
// (which we'll create once we find out the terminator)
}
free (terminators); free (prefixes); free (reverse_buffer);
return 0;
}
////////////////////////////////////////////////////////////////////////////
// **** LZW-AB **** //
// Adjusted Binary LZW Compressor/Decompressor //
// Copyright (c) 2016 David Bryant //
// All Rights Reserved //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////
#ifndef LZWLIB_H_
#define LZWLIB_H_
int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits);
int lzw_decompress (void (*dst)(int), int (*src)(void));
#endif /* LZWLIB_H_ */
#include <hammer/hammer.h>
#include <hammer/glue.h>
#include <stdlib.h> /* malloc, free */
#include <string.h> /* memcpy */
#include <assert.h> /* assert */
#include "lzw.h"
struct context {
/*
* Storing byte sequences represented by each LZW code.
*
* Codes 0-255 are predefined representing literals.
* Codes 256 and 257 are the special clear and eod (end of data) codes.
* Codes >257 are dynamically defined by the input.
*
* Each dynamically defined code is an extension of a previously
* defined code. We therefore need only store the code being extended
* and the byte being added.
*
* Thus the entries of this array form linked lists. To terminate the
* lists while making our memory allocation easy, we store the length
* of each code's output sequence.
*
* Finally, we redundantly store the first byte of the sequence so we
* don't have to walk the list during updates (see act_output).
*/
struct {
size_t len; /* length of the sequence */
int prefix; /* code representing the seq's prefix (len-1) */
uint8_t last; /* final byte of the sequence */
uint8_t first; /* first byte of the sequence */
} table[4097]; /* 4096 codes + one dummy (see act_output) */
/*
* The next code to be assigned, i.e. the current size of the table.
*/
int next;
/*
* earlychange = 1 means the bit size is increased "one code early".
* earlychange = 0 is "code length increases shall be postponed as
* long as possible".
*/
int earlychange;
};
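/*
 * Illustrative example (not from the spec): directly after a clear code, the
 * first two output codes 65 ('A') and 66 ('B') seen by act_output() yield
 *
 *     table[258] = { .len = 2, .prefix = 65, .first = 'A', .last = 'B' }  ("AB")
 *
 * and open table[259] = { .len = 2, .prefix = 66, .first = 'B', .last = ? },
 * whose final byte is filled in when the following code arrives. With
 * earlychange = 1, kcodeword() below switches from 9- to 10-bit codes as soon
 * as next reaches 512, one code earlier than with earlychange = 0.
 */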
/*
* Helpers for working with the table:
*/
static void
lzw_clear_table(struct context *ctx)
{
ctx->next = 258;
}
/*
* Update the dictionary with a new entry that extends the given code by one
* byte to be filled in later.
*/
static void
lzw_table_extend(struct context *ctx, int code)
{
ctx->table[ctx->next].prefix = code;
ctx->table[ctx->next].first = ctx->table[code].first;
ctx->table[ctx->next].len = ctx->table[code].len + 1;
ctx->table[ctx->next].last = 0xFF;
ctx->next++;
}
/*
* Assemble the output sequence represented by the given code word.
* The given buffer must have the appropriate size.
* Returns the number of bytes written.
*/
static size_t
lzw_code_string(struct context *ctx, int code, uint8_t *buf)
{
size_t i, n;
/* traverse the list, filling buf from last to first byte */
n = ctx->table[code].len;
for (i = 0; i < n; i++) {
buf[n - 1 - i] = ctx->table[code].last;
code = ctx->table[code].prefix;
}
assert(code == -1); /* reached the end */
return n;
}
/*
* Global variables:
*/
HParser *p_lzwdata;
static struct context *context;
/*
* Semantic actions and validations:
*/
static HParsedToken *
act_clear(const HParseResult *p, void *u)
{
struct context *ctx = u;
lzw_clear_table(ctx);
return NULL;
}
static bool
validate_clear(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
return (code == 256);
}
static bool
validate_eod(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
return (code == 257);
}
static bool
validate_output(HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
struct context *ctx = u;
return (code != 256 && code != 257 && code < ctx->next);
}
static HParsedToken *
act_output(const HParseResult *p, void *u)
{
uint64_t code = H_CAST_UINT(p->ast);
struct context *ctx = u;
assert(ctx->next >= 258);
assert(ctx->next <= 4097);
assert(code < ctx->next);
assert(code != 256);
assert(code != 257);
/* Fill in the missing last byte of the last assigned code, if any. */
if (ctx->next > 258)
ctx->table[ctx->next - 1].last = ctx->table[code].first;
/*
* Update the dictionary with a new entry that is missing the last
* byte which we will only learn when we process the next code.
*
* Note that the value 4097 is intentional here. Rather than going
* through the effort of ensuring that the last code is only updated
* once, we simply assign one more code as a dummy.
*/
if (ctx->next < 4097)
lzw_table_extend(ctx, code);
/* Pass the code through. Output is generated in act_lzwblock below. */
return (HParsedToken *)p->ast; // XXX casting away the const OK?
}
/*
* Assemble the string represented by a block of code words under a given
* table. The incoming HParsedToken is a sequence of code words (TT_UINT).
*/
static HParsedToken *
act_lzwblock(const HParseResult *p, void *u)
{
HCountedArray *seq = H_CAST_SEQ(p->ast);
struct context *ctx = u;
uint8_t *buf, *cur;
size_t sz, i;
int code;
/* determine total output size, alloc buffer */
sz = 0;
for (i = 0; i < seq->used; i++) {
code = (int) H_CAST_UINT(seq->elements[i]);
sz += ctx->table[code].len;
}
buf = h_arena_malloc(p->arena, sz);
/* go through sequence, merge output bytes into buf */
cur = buf;
for (i = 0; i < seq->used; i++) {
code = (int) H_CAST_UINT(seq->elements[i]);
cur += lzw_code_string(ctx, code, cur);
}
assert(cur == buf + sz);
return H_MAKE_BYTES(buf, sz);
}
/*
* Concatenate blocks to form the final output string.
* The incoming HParsedToken is a sequence of HBytes.
*/
static HParsedToken *
act_lzwdata(const HParseResult *p, void *u)
{
HCountedArray *seq = H_CAST_SEQ(p->ast);
HBytes bs;
uint8_t *buf, *cur;
size_t sz, i;
/* fast path: single element? nothing to do */
if (seq->used == 1)
return seq->elements[0];
/* determine total output size, alloc buffer */
sz = 0;
for (i = 0; i < seq->used; i++)
sz += H_CAST_BYTES(seq->elements[i]).len;
buf = h_arena_malloc(p->arena, sz);
/* go through sequence, copying bytes into buf */
cur = buf;
for (i = 0; i < seq->used; i++) {
bs = H_CAST_BYTES(seq->elements[i]);
memcpy(cur, bs.token, bs.len);
cur += bs.len;
}
assert(cur == buf + sz);
return H_MAKE_BYTES(buf, sz);
}
/*
* Continuation for h_bind() in the 'codeword' grammar rule. It inspects the
* lzw context passed as 'env' and returns the parser of the correct size.
*
* NB: We create the returned parsers statically in init_LZW_parser() to avoid
* allocation during the parse.
*/
static HParser *p_code9, *p_code10, *p_code11, *p_code12;
static HParser *
kcodeword(HAllocator *mm__, const HParsedToken *x, void *env)
{
struct context *ctx = env;
if (ctx->next <= 512 - ctx->earlychange)
return p_code9;
else if (ctx->next <= 1024 - ctx->earlychange)
return p_code10;
else if (ctx->next <= 2048 - ctx->earlychange)
return p_code11;
else
return p_code12;
}
/*
* Exposed interface:
*/
void
init_LZW_parser()
{
int i;
/* initialize global context variable, incl. static table entries */
context = malloc(sizeof *context);
assert(context != NULL);
for(i = 0; i < 256; i++)
{
context->table[i].len = 1;
context->table[i].prefix = -1; /* none */
context->table[i].first = i;
context->table[i].last = i;
}
init_LZW_context(1);
/* static parsers for code words of all possible sizes */
p_code9 = h_bits(9, false);
p_code10 = h_bits(10, false);
p_code11 = h_bits(11, false);
p_code12 = h_bits(12, false);
/* kcodeword() selects the appropriate parser based on context */
H_RULE (codeword, h_bind(h_epsilon_p(), kcodeword, context));
H_VRULE (eod, codeword);
H_AVDRULE(clear, codeword, context);
H_AVDRULE(output, codeword, context);
H_ADRULE(lzwblock, h_right(clear, h_many(output)), context);
H_ARULE (lzwdata, h_left(h_many1(lzwblock), eod));
// XXX validate that the last byte is zero-padded?
// XXX require h_end_p()?
p_lzwdata = lzwdata;
}
HParseResult *
parse_LZW_data(const uint8_t *input, size_t length)
{
HParseResult *res = h_parse(p_lzwdata, input, length);
return res;
}
void
init_LZW_context(int earlychange)
{
lzw_clear_table(context);
context->earlychange = !!earlychange;
}
#ifndef PDF_LZW_H
#define PDF_LZW_H
#include <hammer/hammer.h>
void init_LZW_parser();
HParseResult * parse_LZW_data(const uint8_t* input, size_t length);
void init_LZW_context(int earlychange);
#endif // PDF_LZW_H
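/*
 * Minimal usage sketch for the interface above (hypothetical driver, not part
 * of the pdf sources): decode an LZW-compressed buffer already in memory and
 * write the decoded bytes to stdout.
 */
#include <stdio.h>
#include "lzw.h"

int
decode_lzw_buffer(const uint8_t *data, size_t len)
{
    HParseResult *res;

    init_LZW_parser();      /* set up the grammar and code-word parsers */
    init_LZW_context(1);    /* PDF default: /EarlyChange 1 */

    res = parse_LZW_data(data, len);
    if (res == NULL)
        return -1;          /* not a valid LZW code stream */

    /* on success the AST is a single TT_BYTES token holding the decoded data */
    fwrite(res->ast->bytes.token, 1, res->ast->bytes.len, stdout);
    h_parse_result_free(res);
    return 0;
}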
.Dd $Mdocdate$
.Dt PDF 1
.Os
.Sh NAME
.Nm pdf
.Nd validation and inspection of PDF files
.Sh SYNOPSIS
.Nm pdf
.Op Fl qsv
.Op Fl d Ar what
.Op Fl x Ar txtfile
.Ar input.pdf
.Op Ar oid
.Sh DESCRIPTION
The
.Nm
utility attempts to parse and validate the given PDF file.
It prints the resulting AST to standard output using a JSON format.
.Pp
The optional
.Ar oid
argument selects a specific object to be printed instead of the whole document.
It is expected to be of the form
.Dq Va n . Ns Va g
where
.Va n
and
.Va g
are object and generation numbers, respectively.
The generation number may be omitted to select the latest object matching
.Va n .
.Pp
The options are as follows:
.Bl -tag -width Ds
.It Fl d Cm s
Dump the body data, after filter decoding, of a given stream object.
An
.Ar oid
argument is required.
.It Fl q
Query/quiet mode.
Do not print to standard output and suppress any messages about parse errors.
Just indicate success or failure via the exit status.
.It Fl s
Strict mode.
Treat most
.Dq benign
format violations as parse errors.
.It Fl v
Verbose mode.
Show additional informational messages.
.It Fl x Ar txtfile
Extract the text content of the input document and write it as plain
text to
.Ar txtfile .
.El
.Sh EXIT STATUS
The program exits 0 on successful execution with valid (conforming) input.
An exit code of 1 indicates that the parser identified the input file as
invalid but otherwise executed normally.
Exit codes >1 indicate abnormal termination, i.e. program failure with
indeterminate parse result.
.Sh STANDARDS
.Rs
.%R ISO 32000-1
.%T Document management \(em Portable document format \(em \
Part 1: PDF 1.7
.%D 2008
.Re
.Pp
.Rs
.%R ISO 32000-2
.%T Document management \(em Portable document format \(em \
Part 2: PDF 2.0
.%D 2020
.Re
PDF(1) General Commands Manual PDF(1)
NAME
pdf - validation and inspection of PDF files
SYNOPSIS
pdf [-qsv] [-d what] [-x txtfile] input.pdf [oid]
DESCRIPTION
The pdf utility attempts to parse and validate the given PDF file. It
prints the resulting AST to standard output using a JSON format.
The optional oid argument selects a specific object to be printed instead
of the whole document. It is expected to be of the form "n.g" where n
and g are object and generation numbers, respectively. The generation
number may be omitted to select the latest object matching n.
The options are as follows:
-d s Dump the body data, after filter decoding, of a given stream
object. An oid argument is required.
-q Query/quiet mode. Do not print to standard output and suppress
any messages about parse errors. Just indicate success or
failure via the exit status.
-s Strict mode. Treat most "benign" format violations as parse
errors.
-v Verbose mode. Show additional informational messages.
-x txtfile
Extract the text content of the input document and write it as
plain text to txtfile.
EXIT STATUS
The program exits 0 on successful execution with valid (conforming)
input. An exit code of 1 indicates that the parser identified the input
file as invalid but otherwise executed normally. Exit codes >1 indicate
abnormal termination, i.e. program failure with indeterminate parse
result.
STANDARDS
Document management -- Portable document format -- Part 1: PDF 1.7, ISO
32000-1, 2008.
Document management -- Portable document format -- Part 2: PDF 2.0, ISO
32000-2, 2020.
January 6, 2023