diff --git a/Makefile b/Makefile index 25dac3b12e99a090ccbe0ab705c0a0ede4fadb6f..6154e1d8363343e12153e52a27e9826581afdc3a 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,7 @@ HAMMER_INCLUDE = . HAMMER_LIB = ./lib CFLAGS += -I$(HAMMER_INCLUDE) LDFLAGS += -L$(HAMMER_LIB) +SOURCES = pdf.c lzw-lib.c .PHONY: all test clean all: pdf @@ -17,8 +18,8 @@ test: pdf 'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done' @true -pdf: pdf.c - $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $< -lhammer -lz +pdf: $(SOURCES) + $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz clean: rm -f pdf diff --git a/lzw-ab-license.txt b/lzw-ab-license.txt new file mode 100644 index 0000000000000000000000000000000000000000..65d4a2e4b96304208852290e3d3bf6ee7ce3dde8 --- /dev/null +++ b/lzw-ab-license.txt @@ -0,0 +1,25 @@ + Copyright (c) David Bryant + All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Conifer Software nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/lzw-lib.c b/lzw-lib.c new file mode 100644 index 0000000000000000000000000000000000000000..38ecd30a3ed38605d2cb752efb66dc8ecb4861c6 --- /dev/null +++ b/lzw-lib.c @@ -0,0 +1,318 @@ +//////////////////////////////////////////////////////////////////////////// +// **** LZW-AB **** // +// Adjusted Binary LZW Compressor/Decompressor // +// Copyright (c) 2016 David Bryant // +// All Rights Reserved // +// Distributed under the BSD Software License (see license.txt) // +//////////////////////////////////////////////////////////////////////////// + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "lzw-lib.h" + +/* This library implements the LZW general-purpose data compression algorithm. + * The algorithm was originally described as a hardware implementation by + * Terry Welch here: + * + * Welch, T.A. "A Technique for High-Performance Data Compression." + * IEEE Computer 17,6 (June 1984), pp. 8-19. + * + * Since then there have been innumerable refinements and variations on the + * basic technique, and this implementation is no different. The target of + * the present implementation is embedded systems, and so emphasis was placed + * on simplicity, fast execution, and minimal RAM usage. + * + * The symbols are stored in adjusted binary, which provides considerably + * better compression performance with virtually no speed penalty compared to + * the fixed sizes normally used.
/* To ensure good performance on data with varying characteristics (like
 * executable images) the encoder resets as soon as the dictionary is full.
 * Also, worst-case performance is limited to about 8% inflation by catching
 * poor performance and forcing an early reset before longer symbols are sent.
 *
 * The maximum symbol size is configurable on the encode side (from 9 bits
 * to 12 bits) and determines the RAM footprint required by both sides and,
 * to a large extent, the compression performance. This information is
 * written by the encoder as the first stream byte so that a matching decoder
 * can allocate accordingly. The RAM requirements are as follows:
 *
 *    maximum      encoder RAM    decoder RAM
 *  symbol size    requirement    requirement
 * -------------------------------------------
 *     9-bit        1792 bytes     1024 bytes
 *    10-bit        4352 bytes     3072 bytes
 *    11-bit        9472 bytes     7168 bytes
 *    12-bit       19712 bytes    15360 bytes
 *
 * This implementation uses the heap, but obviously an embedded version could
 * use static arrays instead if desired (assuming that the maxbits was
 * controlled outside).
 */

#define NULL_CODE       (-1)    // indicates a NULL prefix
#define CLEAR_CODE      256     // code to flush dictionary and restart decoder
#define EOD_CODE        257     // used in PDF's LZWDecode to signal end of data
#define FIRST_STRING    258     // code of first dictionary string, PDF edition

/* Output state of the encoder: symbols are packed least-significant-bit first
 * into "shifter" and handed to the byte sink eight bits at a time.
 */

struct lzw_bit_writer {
    void (*dst)(int);               // byte sink supplied by the caller
    unsigned long shifter;          // bits that have not been sent yet
    int bits;                       // number of valid bits in "shifter" (0-7 between calls)
    unsigned long output_bytes;     // bytes sent since the last dictionary reset
};

/* Writes the adjusted-binary symbol "code" given the maximum symbol "maxcode"
 * (256 <= maxcode <= 4095, 0 <= code <= maxcode). The idea is that if
 * "maxcode" is not one less than a power of two (which it rarely will be)
 * then this can often send fewer bits than would be required with a
 * fixed-sized code.
 *
 * For example, the first code we send has a "maxcode" of 258, so every "code"
 * would normally consume 9 bits. With adjusted binary the codes from 0 to 252
 * are sent in just 8 bits -- only the 6 codes from 253 to 258 take 9 bits.
 *
 * Every complete byte is flushed to the sink, masked to 8 bits so that the
 * callback never sees the bits that still belong to the following byte.
 */

static void lzw_write_code (struct lzw_bit_writer *out, int code, int maxcode)
{
    int code_bits = maxcode < 1024 ?
        (maxcode < 512 ? 8 : 9) :
        (maxcode < 2048 ? 10 : 11);
    int extras = (1 << (code_bits + 1)) - maxcode - 1;

    if (code < extras) {                // short form: code_bits bits
        out->shifter |= (unsigned long) code << out->bits;
        out->bits += code_bits;
    }
    else {                              // long form: code_bits bits plus one extra low bit
        out->shifter |= (unsigned long) ((code + extras) >> 1) << out->bits;
        out->bits += code_bits;
        out->shifter |= (unsigned long) ((code + extras) & 1) << out->bits;
        out->bits++;
    }

    // at least 8 bits are pending here because every symbol adds 8 or more
    do {
        (*out->dst)((int) (out->shifter & 0xFF));
        out->shifter >>= 8;
        out->output_bytes++;
    } while ((out->bits -= 8) >= 8);
}

/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
 * "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines
 * the RAM requirement and, to a large extent, the level of compression achievable. A return
 * value of EOF from the "src" callback terminates the compression process; every other value
 * returned by "src" must be in the range 0-255. The first byte written is (maxbits - 9), and
 * the stream ends with the highest currently possible code as end marker.
 *
 * Returns 0 on success. A non-zero return value indicates one of the two possible errors --
 * bad "maxbits" param (nothing is written) or failed allocation (nothing is written and
 * nothing is leaked).
 */

int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits)
{
    struct lzw_bit_writer out = { dst, 0, 0, 0 };
    int next = FIRST_STRING, prefix = NULL_CODE, status = 1, total_codes, c;
    short *first_references, *next_references;
    unsigned long input_bytes = 0;
    unsigned char *terminators;
    size_t string_slots;

    if (maxbits < 9 || maxbits > 12)    // check for valid "maxbits" setting
        return 1;

    // based on the "maxbits" parameter, compute total codes and allocate the (zeroed) dictionary

    total_codes = 1 << maxbits;
    string_slots = (size_t) total_codes - 256;      // the 256 literal codes need no string entry

    first_references = calloc ((size_t) total_codes, sizeof (first_references [0]));
    next_references = calloc (string_slots, sizeof (next_references [0]));
    terminators = calloc (string_slots, sizeof (terminators [0]));

    if (!first_references || !next_references || !terminators)
        goto done;                      // failed allocation; release whatever we did get

    (*dst)(maxbits - 9);    // first byte in output stream indicates the maximum symbol bits

    // This is the main loop where we read input bytes and compress them. We always keep track of the
    // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
    // has not been sent to the decoder yet.

    while ((c = (*src)()) != EOF) {
        int cti;                        // coding table index

        input_bytes++;

        if (prefix == NULL_CODE) {      // only for the very first byte, when there is no prefix yet
            prefix = c;
            continue;
        }

        cti = first_references [prefix];

        if (!cti)                       // no longer strings are based on the current prefix, so
            first_references [prefix] = (short) next;   // prefix plus the new byte becomes the next string
        else
            while (1) {                 // walk the list of strings that extend the current prefix
                if (terminators [cti - 256] == c) {
                    prefix = cti;       // match: extend the prefix and send nothing
                    break;
                }

                if (!next_references [cti - 256]) {
                    next_references [cti - 256] = (short) next;    // end of list: append the new string
                    cti = 0;
                    break;
                }

                cti = next_references [cti - 256];
            }

        if (cti)                        // prefix was extended, keep reading
            continue;

        // No dictionary match, so we send the symbol representing the current "prefix" and add the new
        // string to the dictionary. The current byte was not part of that symbol and becomes the new prefix.

        lzw_write_code (&out, prefix, next);        // send symbol for current prefix (0 to next-1)
        terminators [next - 256] = (unsigned char) c;
        prefix = c;

        // Bump the next string index and decide whether to clear the dictionary and start over. The
        // triggers are either a full dictionary or output that grows too fast compared to the input.
        // For the full-dictionary case no CLEAR_CODE is sent because the matching decoder resets itself.

        if (++next == total_codes || out.output_bytes > 8 + input_bytes + (input_bytes >> 4)) {
            if (next < total_codes)
                lzw_write_code (&out, CLEAR_CODE, next);

            // everything starts over except the pending "prefix" (which, of course, was never sent)

            memset (first_references, 0, (size_t) total_codes * sizeof (first_references [0]));
            memset (next_references, 0, string_slots * sizeof (next_references [0]));
            memset (terminators, 0, string_slots * sizeof (terminators [0]));
            input_bytes = out.output_bytes = 0;
            next = FIRST_STRING;
        }
    }

    // we're done with input, so if we've received anything we still need to send the pending prefix

    if (prefix != NULL_CODE) {
        lzw_write_code (&out, prefix, next);

        if (++next == total_codes)      // stay in step with a decoder that resets on a full dictionary
            next = FIRST_STRING;
    }

    lzw_write_code (&out, next, next);  // the maximum possible code is always reserved as end marker

    if (out.bits)                       // finally, flush any pending bits from the shifter
        (*dst)((int) (out.shifter & 0xFF));

    status = 0;

done:
    free (terminators); free (next_references); free (first_references);
    return status;
}
/* LZW decompression function for the code layout used by PDF's LZWDecode filter.
 * Bytes (8-bit) are read and written through callbacks. Codes are packed most
 * significant bit first; 256 clears the dictionary, 257 ends the data and 258 is
 * the first string code. Unlike the encoder's stream format there is no leading
 * "maxbits" byte, so the tables are always sized for 12-bit codes.
 *
 * The code width is derived from "next": 9 bits while next < 512, 10 bits while
 * next < 1024, 11 bits while next < 2048 and 12 bits after that.
 *
 * Returns 0 when the end-of-data code was reached. A non-zero return value
 * indicates an error: failed malloc(), an EOF from "src" before the end-of-data
 * code, or a code that cannot be valid at its position in the stream (a string
 * code directly after a clear, or a code that refers to a dictionary entry that
 * has not been defined yet). Bytes decoded before the error have already been
 * passed to "dst".
 *
 * NOTE(review): when "next" reaches 4096 the dictionary is restarted silently
 * and the code width drops back to 9 bits, which is the behaviour this function
 * inherited. Whether PDF producers rely on an explicit clear code instead has
 * not been checked here -- confirm against real-world streams.
 */

int lzw_decompress (void (*dst)(int), int (*src)(void))
{
    // code assignments are kept local so that the function does not depend on file-level macros
    enum {
        LZW_LITERALS    = 256,      // codes below this stand for themselves
        LZW_CLEAR       = 256,      // flush the dictionary and start over
        LZW_EOD         = 257,      // end of data
        LZW_FIRST       = 258,      // first code that names a dictionary string
        LZW_TOTAL_CODES = 4096      // 12-bit codes are the widest we accept
    };

    const size_t slots = LZW_TOTAL_CODES - LZW_LITERALS;    // literals need no table entry
    int next = LZW_FIRST, prefix = LZW_CLEAR, bits = 0, status = 1;
    unsigned char *terminators, *reverse_buffer;
    unsigned long shifter = 0;
    short *prefixes;

    reverse_buffer = malloc (slots * sizeof (reverse_buffer [0]));
    prefixes = malloc (slots * sizeof (prefixes [0]));
    terminators = malloc (slots * sizeof (terminators [0]));

    if (!reverse_buffer || !prefixes || !terminators)   // check for malloc() failure
        goto done;

    // This is the main loop where we read input symbols. Receiving an EOF from the input
    // stream is an error because we should have gotten the end-of-data code first.

    while (1) {
        int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12));
        int code, cti, c;
        size_t len = 0;

        // shift whole bytes in at the low end until a complete code is available

        do {
            int read_byte = (*src)();

            if (read_byte == EOF)
                goto done;

            shifter = (shifter << 8) | ((unsigned long) read_byte & 0xFF);
        } while ((bits += 8) < code_bits);

        // the code is the top "code_bits" bits; whatever is below stays for the next round

        bits -= code_bits;
        code = (int) (shifter >> bits);
        shifter &= (1UL << bits) - 1;

        if (code == LZW_EOD) {
            status = 0;
            break;
        }

        if (code == LZW_CLEAR) {        // restart with an empty dictionary
            next = LZW_FIRST;
            prefix = LZW_CLEAR;
            continue;
        }

        if (prefix == LZW_CLEAR) {      // first symbol after a clear (or at stream start):
            if (code >= LZW_LITERALS)   // it has to be a literal because no strings exist yet
                goto done;

            (*dst)(code);
            next++;
            prefix = code;
            continue;
        }

        // Entries FIRST..next-2 are defined, and next-1 is the entry that this very symbol
        // completes. Anything beyond that would read table slots that were never written.

        if (code >= next)
            goto done;

        // Step through the string from end to beginning, storing the bytes in "reverse_buffer".
        // If the code names the entry being defined right now (code == next-1) its text is the
        // previous string followed by that string's own first byte.

        cti = (code == next - 1) ? prefix : code;

        do {
            if (len == slots)           // cannot happen for valid tables; guards the buffer anyway
                goto done;

            if (cti < LZW_LITERALS) {
                reverse_buffer [len++] = (unsigned char) cti;
                cti = -1;               // a literal ends the chain
            }
            else {
                reverse_buffer [len++] = terminators [cti - LZW_LITERALS];
                cti = prefixes [cti - LZW_LITERALS];
            }
        } while (cti != -1);

        c = reverse_buffer [len - 1];   // first byte of this string, which is also the terminator
                                        // of the dictionary entry we create this time
        while (len)
            (*dst)(reverse_buffer [--len]);     // send the string in its proper order

        if (code == next - 1)
            (*dst)(c);

        prefixes [next - 1 - LZW_LITERALS] = (short) prefix;
        terminators [next - 1 - LZW_LITERALS] = (unsigned char) c;

        if (++next == LZW_TOTAL_CODES)  // full dictionary forces a reset
            next = LZW_FIRST;

        prefix = code;                  // becomes the prefix of the next dictionary entry
    }

done:
    free (terminators); free (prefixes); free (reverse_buffer);
    return status;
}
*p) return res; } +/* LZW helpers */ + +typedef struct +{ + uint8_t *lzw_buf; + size_t total_buf_size; + size_t write_head; + size_t write_tail; + uint8_t write_checksum; + size_t eof_loc; + + HBytes *input_stream; + size_t read_head; + size_t read_tail; + uint8_t read_checksum; +} lzwspec; + +lzwspec *cur_lzw_spec; + +/* used by write_lzw_buffer to get more space for decoding if needed */ +void +grow_lzw_buffer(size_t amount) +{ + uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); + if(ret_buf != NULL) + { + cur_lzw_spec->total_buf_size += amount; + cur_lzw_spec->lzw_buf = ret_buf; + } + else + { + fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); + return; + } +} + +lzwspec * +new_lzw_spec(HBytes *bytes) +{ + size_t const BUFSIZE = sizeof(uint8_t) * 1024; + lzwspec *ret = malloc(sizeof(lzwspec)); + ret->input_stream = bytes; + ret->lzw_buf = malloc(BUFSIZE); + ret->total_buf_size = BUFSIZE; + return ret; +} + +void +delete_lzw_spec(lzwspec *spec) +{ + free(spec->lzw_buf); + free(spec); +} + +void +bind_lzw_spec(lzwspec *spec) +{ + cur_lzw_spec = spec; +} + + +#include "lzw-lib.h" + +/* Buffer writer function for the lzw-ab implementation, with a fixed signature. + * Although the type is defined as int, it is expected to write one byte at a time. + * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */ + +void +write_lzw_buffer(int value) +{ + size_t const BUFSIZE = sizeof(uint8_t) * 1024; + + if(!cur_lzw_spec->lzw_buf) + { + fprintf(stderr, "LZWDecode: lzw_buf is null!"); + assert(cur_lzw_spec->lzw_buf != NULL); + } + + assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size); + + if (value == EOF) { + cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value; + cur_lzw_spec->eof_loc = cur_lzw_spec->write_head; + cur_lzw_spec->write_head++; + return; + } + + /* We can get away with this cast due to writing single bytes. 
*/ + cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value; + + /* If you looked at lzw-ab's code, the write head is reset here + * This function uses write_head as the offset of the last written item */ + if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size) + { + grow_lzw_buffer(BUFSIZE); + } + + cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value; +} + + +/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. Set cur_lzw_spec + * with bind_lzw_spec() */ +int read_lzw_buffer(void) +{ + uint8_t byte_read; + int ret_value; + + /* Input data is already waiting in the buffer */ + if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail) + cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len; + + if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail) + { + byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++]; + cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read; + ret_value = byte_read; + } + else + ret_value = EOF; + + return ret_value; +} + + +HParseResult * +LZWDecode(const Dict *parms, HBytes b, HParser *p) +{ + struct predictor pred = {1, 1, 8, 1}; + int (*depredict)(struct predictor *, uint8_t *, size_t); + HParseResult *res; + int done; + int ret; + const HParsedToken *v; + + /* set up the predictor (if any) */ + #define SETPARM(VAR,STR) do { \ + v = dictentry(parms, (STR)); \ + if (v != NULL) { \ + if (v->token_type != TT_SINT || v->sint < 0) \ + return NULL; \ + VAR = v->sint; \ + } } while(0) + SETPARM(pred.num, "Predictor"); + SETPARM(pred.colors, "Colors"); + SETPARM(pred.bpc, "BitsPerComponent"); + SETPARM(pred.columns, "Columns"); + #undef SETPARM + if (pred.num == 1) + depredict = depred_none; + else { + if (pred.num >= 10 && pred.num <= 15) + depredict = depred_png; + else if (pred.num == 2) { + /* for 8-bpc TIFF pred. 
2, we can reuse PNG Sub */ + if (pred.bpc == 8) { + pred.predfun = pp_sub; /* predict left */ + depredict = depred_png; + } else { + // XXX add general TIFF predictor (bpc != 8) + fprintf(stderr, "LZWDecode: /Predictor %d " + "not supported for /BitsPerComponent %d\n", + pred.num, pred.bpc); + return NULL; + } + } else { + fprintf(stderr, "LZWDecode: /Predictor %d" + " not supported\n", pred.num); + return NULL; + } + + /* allocate row buffer */ + if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) { + fprintf(stderr, "LZWDecode: overflow\n"); + return NULL; + } + pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; + pred.buf = calloc(1, pred.rowsz); + if (pred.buf == NULL) + err(1, "LZWDecode"); + } + + lzwspec *lzw_spec = new_lzw_spec(&b); + bind_lzw_spec(lzw_spec); + + ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer); + if (ret) { + fprintf(stderr, "lzw_decompress: error (%d)\n", ret); + assert(!"LZWDecode: failed to decompress\n"); + } + done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1); + + if(!done) + { + // happens if depred() thinks it needs more bytes or when depred is depred_none + fprintf(stderr, "LZWDecode: warning, depredict() returns 0, but lzw_decompress finished without error\n"); + } + + res = h_parse(p, pred.out, pred.nout); + free(pred.out); + + bind_lzw_spec(NULL); + delete_lzw_spec(lzw_spec); + + assert(res->ast && res->ast->token_type == TT_BYTES); + res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); + + return res; +} + HParseResult * RunLengthDecode(const Dict *parms, HBytes b, HParser *p) { @@ -1677,6 +1887,8 @@ decode_stream(const Dict *d, HBytes b, HParser *p) filter = ASCII85Decode; else if (bytes_eq(v->bytes, "RunLengthDecode")) filter = RunLengthDecode; + else if (bytes_eq(v->bytes, "LZWDecode")) + filter = LZWDecode; else return NULL; /* filter not supported */ diff --git a/t/hello_lzwdecode_march.pdf b/t/hello_lzwdecode_march.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..7f887ccce2c1c995ea65f1524e28685713f435d3 Binary files /dev/null and b/t/hello_lzwdecode_march.pdf differ