diff --git a/Makefile b/Makefile index 0aaf446eb971efa7ca3f7513a4b005037e7943cd..04612316734667ac8e9f41fee599c36cad1de7a1 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ HAMMER_INCLUDE = . HAMMER_LIB = ./lib CFLAGS += -I$(HAMMER_INCLUDE) # (-pg :: profile using gprof) (-g :: debug info) LDFLAGS += -L$(HAMMER_LIB) -SOURCES = pdf.c lzw-lib.c +SOURCES = pdf.c lzw.c .PHONY: all test clean all: pdf diff --git a/lzw-ab-license.txt b/lzw-ab-license.txt deleted file mode 100644 index 65d4a2e4b96304208852290e3d3bf6ee7ce3dde8..0000000000000000000000000000000000000000 --- a/lzw-ab-license.txt +++ /dev/null @@ -1,25 +0,0 @@ - Copyright (c) David Bryant - All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Conifer Software nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/lzw-lib.c b/lzw-lib.c deleted file mode 100644 index 38ecd30a3ed38605d2cb752efb66dc8ecb4861c6..0000000000000000000000000000000000000000 --- a/lzw-lib.c +++ /dev/null @@ -1,318 +0,0 @@ -//////////////////////////////////////////////////////////////////////////// -// **** LZW-AB **** // -// Adjusted Binary LZW Compressor/Decompressor // -// Copyright (c) 2016 David Bryant // -// All Rights Reserved // -// Distributed under the BSD Software License (see license.txt) // -//////////////////////////////////////////////////////////////////////////// - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include "lzw-lib.h" - -/* This library implements the LZW general-purpose data compression algorithm. - * The algorithm was originally described as a hardware implementation by - * Terry Welsh here: - * - * Welch, T.A. “A Technique for High-Performance Data Compression.†- * IEEE Computer 17,6 (June 1984), pp. 8-19. - * - * Since then there have been enumerable refinements and variations on the - * basic technique, and this implementation is no different. The target of - * the present implementation is embedded systems, and so emphasis was placed - * on simplicity, fast execution, and minimal RAM usage. - * - * The symbols are stored in adjusted binary, which provides considerably - * better compression performance with virtually no speed penalty compared to - * the fixed sizes normally used. 
To ensure good performance on data with - * varying characteristics (like executable images) the encoder resets as - * soon as the dictionary is full. Also, worst-case performance is limited - * to about 8% inflation by catching poor performance and forcing an early - * reset before longer symbols are sent. - * - * The maximum symbol size is configurable on the encode side (from 9 bits - * to 12 bits) and determines the RAM footprint required by both sides and, - * to a large extent, the compression performance. This information is - * communicated to the decoder in the first stream byte so that it can - * allocate accordingly. The RAM requirements are as follows: - * - * maximum encoder RAM decoder RAM - * symbol size requirement requirement - * ----------------------------------------- - * 9-bit 1792 bytes 1024 bytes - * 10-bit 4352 bytes 3072 bytes - * 11-bit 9472 bytes 7168 bytes - * 12-bit 19712 bytes 15360 bytes - * - * This implementation uses malloc(), but obviously an embedded version could - * use static arrays instead if desired (assuming that the maxbits was - * controlled outside). - */ - -#define NULL_CODE -1 // indicates a NULL prefix -#define CLEAR_CODE 256 // code to flush dictionary and restart decoder -#define EOD_CODE 257 // used in PDF's LZWDecode to signal end of data -#define FIRST_STRING 258 // code of first dictionary string, PDF edition - -/* This macro writes the adjusted-binary symbol "code" given the maximum - * symbol "maxcode". A macro is used here just to avoid the duplication in - * the lzw_compress() function. The idea is that if "maxcode" is not one - * less than a power of two (which it rarely will be) then this code can - * often send fewer bits that would be required with a fixed-sized code. - * - * For example, the first code we send will have a "maxcode" of 257, so - * every "code" would normally consume 9 bits. 
But with adjusted binary we - * can actually represent any code from 0 to 253 with just 8 bits -- only - * the 4 codes from 254 to 257 take 9 bits. - */ - -#define WRITE_CODE(code,maxcode) do { \ - int code_bits = (maxcode) < 1024 ? \ - ((maxcode) < 512 ? 8 : 9) : \ - ((maxcode) < 2048 ? 10 : 11); \ - int extras = (1 << (code_bits + 1)) - (maxcode) - 1; \ - if ((code) < extras) { \ - shifter |= ((long)(code) << bits); \ - bits += code_bits; \ - } \ - else { \ - shifter |= ((long)(((code) + extras) >> 1) << bits); \ - bits += code_bits; \ - shifter |= ((long)(((code) + extras) & 1) << bits++); \ - } \ - do { (*dst)(shifter); shifter >>= 8; output_bytes++; \ - } while ((bits -= 8) >= 8); \ -} while (0) - -/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the - * "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines - * the RAM requirement and, to a large extent, the level of compression achievable. A return - * value of EOF from the "src" callback terminates the compression process. A non-zero return - * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc(). 
- */ - -int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits) -{ - int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c; - unsigned long input_bytes = 0, output_bytes = 0; - short *first_references, *next_references; - unsigned char *terminators; - unsigned long shifter = 0; - - if (maxbits < 9 || maxbits > 12) // check for valid "maxbits" setting - return 1; - - // based on the "maxbits" parameter, compute total codes and allocate dictionary storage - - total_codes = 1 << maxbits; - first_references = malloc (total_codes * sizeof (first_references [0])); - next_references = malloc ((total_codes - 256) * sizeof (next_references [0])); - terminators = malloc ((total_codes - 256) * sizeof (terminators [0])); - - if (!first_references || !next_references || !terminators) - return 1; // failed malloc() - - // clear the dictionary - - memset (first_references, 0, total_codes * sizeof (first_references [0])); - memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0])); - memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0])); - - (*dst)(maxbits - 9); // first byte in output stream indicates the maximum symbol bits - - // This is the main loop where we read input bytes and compress them. We always keep track of the - // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that - // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits" - // variables and are sent to the output every time 8 bits are available (done in the macro). - - while ((c = (*src)()) != EOF) { - int cti; // coding table index - - input_bytes++; - - if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix - prefix = c; - continue; - } - - if ((cti = first_references [prefix])) { // if any longer strings are built on the current prefix... 
- while (1) - if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix - prefix = cti; // to that string and continue without sending anything - break; - } - else if (!next_references [cti - 256]) { // this string did not match the new character and - next_references [cti - 256] = next; // there aren't any more, so we'll add a new string - cti = 0; // and point to it with "next_reference" - break; - } - else - cti = next_references [cti - 256]; // there are more possible matches to check, so loop back - } - else // no longer strings are based on the current prefix, so now - first_references [prefix] = next; // the current prefix plus the new byte will be the next string - - // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a - // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the - // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix. - - if (!cti) { - WRITE_CODE (prefix, next); // send symbol for current prefix (0 to next-1) - terminators [next - 256] = c; // newly created string has current byte as the terminator - prefix = c; // current byte also becomes new prefix for next string - - // This is where we bump the next string index and decide whether to clear the dictionary and start over. - // The triggers for that are either the dictionary is full or we've been outputting too many bytes and - // decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we - // do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant. 
- - if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) { - if (next < total_codes) - WRITE_CODE (CLEAR_CODE, next); - - // clear the dictionary and reset the byte counters -- basically everything starts over - // except that we keep the last pending "prefix" (which, of course, was never sent) - - memset (first_references, 0, total_codes * sizeof (first_references [0])); - memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0])); - memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0])); - input_bytes = output_bytes = 0; - next = FIRST_STRING; - } - } - } - - // we're done with input, so if we've received anything we still need to send that pesky pending prefix... - - if (prefix != NULL_CODE) { - WRITE_CODE (prefix, next); - - if (++next == total_codes) // watch for clearing to the first string to stay in step with the decoder! - next = FIRST_STRING; // (this was actually a corner-case bug that did not trigger often) - } - - WRITE_CODE (next, next); // the maximum possible code is always reserved for our END_CODE - - if (bits) // finally, flush any pending bits from the shifter - (*dst)(shifter); - - free (terminators); free (next_references); free (first_references); - return 0; -} - -/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. - * A return value of EOF from the "src" callback terminates the compression process - * (although this should not normally occur). A non-zero return value - * indicates an error, which in this case can be a - * failed malloc(), or if an EOF is read from the input stream before the compression - * terminates naturally with END_CODE. 
- */ - -int lzw_decompress (void (*dst)(int), int (*src)(void)) -{ - int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes; - unsigned char *terminators, *reverse_buffer; - unsigned long shifter = 0; - short *prefixes; - - // PDF specific change: maxbits is not in the input stream - // we'll just be pessimistic and allocate the maximal size buffer - - total_codes = 4096; - reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0])); - prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0])); - terminators = malloc ((total_codes - 256) * sizeof (terminators [0])); - - if (!reverse_buffer || !prefixes || !terminators) // check for mallco() failure - return 1; - - // This is the main loop where we read input symbols. The values range from 0 to the code value - // of the "next" string in the dictionary. Note that receiving an EOF from the input - // stream is actually an error because we should have gotten the END_CODE first. - - while (1) { - int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12) ), code; - - #define TOP_BITMASK (((1 << code_bits) - 1) << (bits - code_bits) ) - #define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1) - - do { - if ((read_byte = ((*src)())) == EOF) { - free (terminators); free (prefixes); free (reverse_buffer); - return 1; - } - - /* shifter reworked: everything shifted left by a byte, - * and the byte we just read becomes the least significant - * byte */ - - // prepare to shift in next byte - shifter <<= 8; - /* the bitstrings forming the symbols are stored MSB first, - * so we can just OR in the next */ - shifter |= (unsigned long) read_byte; - } while ((bits += 8) < code_bits); - - - /* for a 12-bit code, the shifter's bits now look like - * from MSB to LSB: 00...0cccccccccn...n - * where c are the bits of our code - * and n are the bits we're not yet interested in - * the number of times n is repeated is bits - code_bits - * ie. 
the number of bits read in minus the bits we're interested in */ - - // shift our code bits into thier proper place, and save it as the final code - code = (int) shifter >> (bits - code_bits); - /* we can now clear the shifter's top bits. the result looks like: - * 00...0n...n - * number of n is bits-code_bits - * */ - shifter &= BOTTOM_BITMASK; - // update the count of bytes in the shifter - bits -= code_bits; - - if (code == EOD_CODE) // In PDF, EOD is signalled by 257, rather than the max code - break; - else if (code == CLEAR_CODE) // otherwise check for a CLEAR_CODE to start over early - next = FIRST_STRING; - else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent - (*dst)(code); // literally and becomes our initial prefix - next++; - } - // Otherwise we have a valid prefix so we step through the string from end to beginning storing the - // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case - // we have to handle here is that the string might be the same one that is actually being defined - // now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and - // not allocated, so that messes things up a bit. - else { - int cti = (code == next-1) ? prefix : code; - unsigned char *rbp = reverse_buffer, c; - - do *rbp++ = cti < 256 ? cti : terminators [cti - 256]; // step backward through string... - while ((cti = (cti < 256) ? 
NULL_CODE : prefixes [cti - 256]) != NULL_CODE); - - c = *--rbp; // the first byte in this string is the terminator for the last string, which is - // the one that we'll create a new dictionary entry for this time - - do (*dst)(*rbp); // send string in corrected order (except for the terminator - while (rbp-- != reverse_buffer); // which we don't know yet) - - if (code == next-1) - (*dst)(c); - - prefixes [next - 1 - 256] = prefix; // now update the next dictionary entry with the new string - terminators [next - 1 - 256] = c; // (but we're always one behind, so it's not the string just sent) - - if (++next == total_codes) // check for full dictionary, which forces a reset (and, BTW, - next = FIRST_STRING; // means we'll never use the dictionary entry we just wrote) - } - - prefix = code; // the code we just received becomes the prefix for the next dictionary string entry - // (which we'll create once we find out the terminator) - } - - free (terminators); free (prefixes); free (reverse_buffer); - return 0; -} diff --git a/lzw-lib.h b/lzw-lib.h deleted file mode 100644 index 81fdeb15e6ade7ef0fd6089a4fdc3d3f2d593578..0000000000000000000000000000000000000000 --- a/lzw-lib.h +++ /dev/null @@ -1,15 +0,0 @@ -//////////////////////////////////////////////////////////////////////////// -// **** LZW-AB **** // -// Adjusted Binary LZW Compressor/Decompressor // -// Copyright (c) 2016 David Bryant // -// All Rights Reserved // -// Distributed under the BSD Software License (see license.txt) // -//////////////////////////////////////////////////////////////////////////// - -#ifndef LZWLIB_H_ -#define LZWLIB_H_ - -int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits); -int lzw_decompress (void (*dst)(int), int (*src)(void)); - -#endif /* LZWLIB_H_ */ diff --git a/lzw.c b/lzw.c new file mode 100644 index 0000000000000000000000000000000000000000..364e18a97fec77cb665323fead35475c9ecf7450 --- /dev/null +++ b/lzw.c @@ -0,0 +1,455 @@ +/* Table for storing sequences 
represented by an LZW code */
+#include <hammer/hammer.h>
+#include <hammer/glue.h>
+// malloc, calloc, free
+#include <stdlib.h>
+// memcpy, memset
+#include <string.h>
+
+#include "lzw.h"
+
+// Stream for the debug traces. Every statement in this file that uses it is currently commented out.
+FILE *debug; // DEBUG
+
+/*
+ * Exclusive upper bounds on ctx->next for reading a code with 9, 10 and 11 bits
+ * (see the validate_LZW_*bit* functions). With EarlyChange set, each bound is one lower.
+ * The macros expand to an expression that mentions "ctx", so they can only be used
+ * where a LZW_context_T *ctx is in scope.
+ */
+#define BITLIMIT_9 (ctx->earlychange ? 511 : 512)
+#define BITLIMIT_10 (ctx->earlychange ? 1023 : 1024)
+#define BITLIMIT_11 (ctx->earlychange ? 2047 : 2048)
+/*
+ * Since bit lengths larger than 12 aren't allowed, EarlyChange doesn't matter here.
+ */
+#define BITLIMIT_12 4096
+
+
+/*
+ * Frees every dictionary string from code 258 up to (not including) ctx->next and
+ * sets those table slots to NULL. ctx->next itself is not changed here; the caller
+ * resets it afterwards (act_LZW_clear does so right after the call).
+ */
+void LZW_clear_table(LZW_context_T *ctx)
+{
+    /*
+     * Optimizations: since we leave the entries 0-257 fixed or empty, we don't need to free() them explicitly.
+     * And since codes are added to the table sequentially, we don't need to look past ctx->next;
+     */
+    for(int i = 258; i < ctx->next; ++i)
+    {
+        HBytes * sequence = ctx->lzw_code_table[i];
+        if(sequence != NULL)
+        {
+            /*
+             * Assumption: only the HBytes in the LZW table refer to the particular uint8_t arrays we're freeing.
+             */
+            free((uint8_t *)sequence->token);
+            free(sequence);
+        }
+        ctx->lzw_code_table[i] = NULL;
+    }
+}
+
+/*
+ * Creates a HBytes from an array of bytes and its length, and inserts it into the lzw dictionary in ctx
+ * at index ctx->next. Also increments ctx->next.
+ * The HBytes keeps the token pointer (no copy is made), so the caller hands over ownership of "token";
+ * it is freed later in LZW_clear_table or init_LZW_context.
+ * NOTE(review): the malloc() result is not checked, and ctx->next is not checked against the table size
+ * (4096 entries, see lzw.h) before it is used as an index.
+ */
+void lzw_table_insert(LZW_context_T *ctx, uint8_t *token, size_t token_len)
+{
+    HBytes * next_entry = malloc(sizeof(HBytes));
+    next_entry->token = token;
+    next_entry->len = token_len;
+    ctx->lzw_code_table[ctx->next] = next_entry;
+    ctx->next++;
+}
+
+// Top-level LZW parser; assigned by init_LZW_parser() and used by parse_LZW_data().
+HParser *p_lzwdata;
+// Decoder state shared by all rules; allocated by init_LZW_parser(), reset by init_LZW_context().
+LZW_context_T * context;
+
+/*
+ * First "code" in input. 
We output it literally, and set "old"
+ *
+ * The grammar only restricts the first code to values below 258, so a clear
+ * code (256) or EOD code (257) can reach this action too. Neither stands for a
+ * byte: they yield an empty result and leave "old" at 257, a table slot that is
+ * never filled, so the code that follows is again treated as a first code.
+ */
+HParsedToken*
+act_LZW_firstcode(const HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    uint8_t *output = H_ALLOC(uint8_t);
+
+    *output = (uint8_t) code;
+    if(code >= 256)
+    {
+        ctx->old = 257;
+        return H_MAKE_BYTES(output, 0);
+    }
+
+    ctx->old = code;
+    return H_MAKE_BYTES(output, 1);
+}
+
+/*
+ * Semantic action for the clear code (256). Empties the dictionary and forgets
+ * the previous code, so that the code after the clear is output as-is instead
+ * of defining a string built on pre-clear state. Returns an empty TT_BYTES.
+ */
+HParsedToken*
+act_LZW_clear(const HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+
+    LZW_clear_table(ctx);
+    ctx->next = 258; // Caution: moving this before the call to LZW_clear_table() will cause a memory leak
+    ctx->old = 257;  // no previous string any more; slot 257 is always empty
+    return H_MAKE_BYTES(NULL, 0);
+}
+
+/*
+ * True if the code just read can name a dictionary string: it is at least 258
+ * and at most ctx->next. A code equal to ctx->next is the entry that is about
+ * to be defined; anything larger cannot come from a well-formed stream.
+ * Codes 256 and 257 are rejected so that act_LZW_codeword never runs on the
+ * clear or EOD code.
+ * The input is untrusted, so this is a parse failure rather than an assert().
+ */
+static bool
+lzw_is_dictionary_code(const HParseResult *p, const LZW_context_T *ctx)
+{
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (code >= 258 && code <= (uint64_t) ctx->next);
+}
+
+/*
+ * validate_LZW_<n>bitcodeword: accept an n-bit read only while ctx->next says
+ * that codes are currently n bits wide, and only if the value is a plausible
+ * dictionary code (see lzw_is_dictionary_code).
+ */
+bool
+validate_LZW_9bitcodeword(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    return (ctx->next < BITLIMIT_9 && lzw_is_dictionary_code(p, ctx));
+}
+
+bool
+validate_LZW_10bitcodeword(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    return (ctx->next >= BITLIMIT_9 && ctx->next < BITLIMIT_10 && lzw_is_dictionary_code(p, ctx));
+}
+
+bool
+validate_LZW_11bitcodeword(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    return (ctx->next >= BITLIMIT_10 && ctx->next < BITLIMIT_11 && lzw_is_dictionary_code(p, ctx));
+}
+
+bool
+validate_LZW_12bitcodeword(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    return (ctx->next >= BITLIMIT_11 && ctx->next < BITLIMIT_12 && lzw_is_dictionary_code(p, ctx));
+}
+
+/*
+ * validate_LZW_<n>bitlitspec: accept an n-bit read only while codes are n bits
+ * wide and the value is a literal or special code (below 258).
+ */
+bool
+validate_LZW_9bitlitspec(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (ctx->next < BITLIMIT_9 && code < 258);
+}
+
+bool
+validate_LZW_10bitlitspec(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (ctx->next >= BITLIMIT_9 && ctx->next < BITLIMIT_10 && code < 258);
+}
+
+bool
+validate_LZW_11bitlitspec(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (ctx->next >= BITLIMIT_10 && ctx->next < BITLIMIT_11 && code < 258);
+}
+
+bool
+validate_LZW_12bitlitspec(HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (ctx->next >= BITLIMIT_11 && ctx->next < BITLIMIT_12 && code < 258);
+}
+
+/* The clear code is 256. */
+bool
+validate_LZW_clear(HParseResult *p, void *u)
+{
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (code == 256);
+}
+
+/* The end-of-data code is 257. */
+bool
+validate_LZW_eod(HParseResult *p, void *u)
+{
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (code == 257);
+}
+
+/* Literals are the codes 0-255. */
+bool
+validate_LZW_literal(HParseResult *p, void *u)
+{
+    uint64_t code = H_CAST_UINT(p->ast);
+    return (code < 256);
+}
+
+/*
+ * Adds the string "prev_string followed by last" to the dictionary at ctx->next.
+ *
+ * Does nothing when there is no previous string (prev_string == NULL, which is
+ * the case for the first code after a clear), when the table is full, or when
+ * the allocation fails.
+ * The system allocator is used on purpose: LZW_clear_table/init_LZW_context
+ * free() these strings.
+ */
+static void
+lzw_extend_dictionary(LZW_context_T *ctx, const HBytes *prev_string, uint8_t last)
+{
+    size_t capacity = sizeof ctx->lzw_code_table / sizeof ctx->lzw_code_table[0];
+    size_t next_entry_size;
+    uint8_t * next_entry_token;
+
+    if(prev_string == NULL || ctx->next < 0 || (size_t) ctx->next >= capacity)
+    {
+        return;
+    }
+
+    next_entry_size = prev_string->len + 1;
+    next_entry_token = malloc(next_entry_size);
+    if(next_entry_token == NULL)
+    {
+        return; // out of memory: skip the entry rather than dereference NULL
+    }
+    memcpy(next_entry_token, prev_string->token, prev_string->len);
+    next_entry_token[prev_string->len] = last;
+    lzw_table_insert(ctx, next_entry_token, next_entry_size); // the table now owns next_entry_token
+}
+
+/*
+ * Semantic action for a literal (0-255): outputs the byte, adds
+ * "previous string + this byte" to the dictionary (if there is a previous
+ * string) and makes the literal the new previous code.
+ */
+HParsedToken*
+act_LZW_literal(const HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    /*
+     * Literals go from 0-255, so they are guaranteed to fit into 1 byte. See also: validate_LZW_literal
+     */
+    uint8_t *output = H_ALLOC(uint8_t);
+    *output = (uint8_t) code;
+
+    lzw_extend_dictionary(ctx, ctx->lzw_code_table[ctx->old], (uint8_t) code);
+
+    ctx->old = code;
+    return H_MAKE_BYTES(output, 1);
+}
+
+/*
+ * Semantic action for a dictionary code (258 and up).
+ *
+ * - Code already in the table: output its string and add
+ *   "previous string + first byte of that string" to the dictionary.
+ * - Code not in the table yet: the string is
+ *   "previous string + first byte of previous string". That is both the output
+ *   and the new dictionary entry, so it is inserted exactly once.
+ * - Code not in the table and no previous string: the stream is malformed.
+ *   Nothing is output and the context is left untouched.
+ *
+ * The output bytes are allocated from the parse arena, like the output of
+ * act_LZW_literal; act_LZW_body copies them into its own buffer.
+ */
+HParsedToken*
+act_LZW_codeword(const HParseResult *p, void *u)
+{
+    LZW_context_T * ctx = (LZW_context_T *) u;
+    uint64_t code = H_CAST_UINT(p->ast);
+    const HBytes * prev_string = ctx->lzw_code_table[ctx->old];
+    const HBytes * code_str;
+    uint8_t * output_token;
+    size_t output_length;
+
+    if(code >= sizeof ctx->lzw_code_table / sizeof ctx->lzw_code_table[0])
+    {
+        return H_MAKE_BYTES(NULL, 0); // cannot index the table with this
+    }
+    code_str = ctx->lzw_code_table[code];
+
+    if(code_str != NULL) // code is in the table
+    {
+        output_length = code_str->len;
+        output_token = h_arena_malloc(p->arena, output_length);
+        memcpy(output_token, code_str->token, output_length);
+
+        lzw_extend_dictionary(ctx, prev_string, code_str->token[0]);
+    }
+    else if(prev_string != NULL) // code is not in the table
+    {
+        uint8_t prefix = prev_string->token[0];
+
+        /*
+         * Output is one byte longer than prev_string, and the last byte is the first character of the previous string
+         */
+        output_length = prev_string->len + 1;
+        output_token = h_arena_malloc(p->arena, output_length);
+        memcpy(output_token, prev_string->token, prev_string->len);
+        output_token[prev_string->len] = prefix;
+
+        lzw_extend_dictionary(ctx, prev_string, prefix);
+    }
+    else // unknown code and nothing to build it from
+    {
+        return H_MAKE_BYTES(NULL, 0);
+    }
+
+    ctx->old = code;
+    return H_MAKE_BYTES(output_token, output_length);
+}
+
+/*
+ * Semantic action for the body: concatenates the TT_BYTES fragments produced
+ * for the individual codes into one arena-allocated buffer.
+ */
+HParsedToken*
+act_LZW_body(const HParseResult *p, void *u)
+{
+    size_t num_fragments = h_seq_len(p->ast);
+    size_t total_buffer_size = 0;
+    size_t index = 0;
+    uint8_t * buffer;
+
+    /* sum total bytes in array, alloc buffer */
+    for(size_t i = 0; i < num_fragments; i++)
+    {
+        total_buffer_size += H_FIELD_BYTES(i).len;
+    }
+
+    /* never ask the arena for zero bytes */
+    buffer = h_arena_malloc(p->arena, total_buffer_size > 0 ? total_buffer_size : 1);
+
+    /* go through parse result, merge bytes */
+    for(size_t i = 0; i < num_fragments; i++)
+    {
+        HBytes fragment = H_FIELD_BYTES(i);
+        if(fragment.len == 0)
+        {
+            continue; // empty fragments may carry a NULL token
+        }
+        memcpy(&buffer[index], fragment.token, fragment.len);
+        index += fragment.len;
+    }
+
+    return H_MAKE_BYTES(buffer, total_buffer_size);
+}
+
+
+/*
+ * Semantic action for the whole LZW stream: joins the output of the first code
+ * and of the body into one buffer and returns it as TT_BYTES.
+ * The buffer comes from calloc(), not from the parse arena.
+ * NOTE(review): the calloc() result is not checked, and nothing in this file frees the buffer.
+ */
+HParsedToken*
+act_LZW_data(const HParseResult *p, void *u)
+{
+    /* The AST this semantic action receives is a sequence that looks something like this:
+       elements[0] -> TT_BYTES (empty) representing the initial clear code
+       elements[1] -> TT_BYTES representing the first code (should be a literal)
+       elements[2] -> TT_BYTES containing the decompressed data (except for the first code)
+       elements[3] -> TT_UINT representing the EOD code
+       elements[4] -> the remaining bits from EOD to the end of the input, one element per bit (built with h_many, so presumably a TT_SEQUENCE of TT_UINT); should all be 0
+    */
+
+    //HCountedArray * seq = H_CAST_SEQ(p->ast);
+    //LZW_context_T *ctx = (LZW_context_T*) u; // DEBUG
+    size_t total_buffer_size = 0;
+    uint8_t * buffer;
+    HBytes first = H_FIELD_BYTES(1);
+    HBytes rest = H_FIELD_BYTES(2);
+
+    total_buffer_size = first.len + rest.len;
+
+    buffer = calloc(total_buffer_size, sizeof(uint8_t));
+    memcpy(buffer, first.token, first.len);
+    memcpy(buffer+first.len, rest.token, rest.len);
+    // XXX: Memory use would be greatly decreased if first.token and rest.token could be freed here (allocated in act_LZW_firstcode and act_LZW_body)
+
+    //fprintf(debug, "\n\n"); // DEBUG
+    /*for(int i = 258; i < ctx->next; ++i) // DEBUG
+    {
+        fprintf(debug, "i: %u, str: ", i);
+        fwrite(ctx->lzw_code_table[i]->token, ctx->lzw_code_table[i]->len, 1, debug);
+        fprintf(debug, "\n");
+    }
+    fflush(debug); // DEBUG */
+
+    return H_MAKE_BYTES(buffer, total_buffer_size);
+}
+
+
+/*
+ * Allocates the global decoder context, pre-fills the literal entries 0-255 of
+ * the code table, and builds the parser that is stored in p_lzwdata.
+ * NOTE(review): none of the malloc() results are checked, and calling this a second
+ * time overwrites "context" without freeing the previous one.
+ */
+void init_LZW_parser()
+{
+    context = malloc(sizeof(LZW_context_T));
+    memset(context, 0, sizeof(*context));
+    context->next = 258;
+    /* set up literals in LZW code table */
+    for(int i = 0; i < 256; i++)
+    {
+        uint8_t *token = malloc(sizeof(uint8_t));
+        *token = i;
+        HBytes *lit = malloc(sizeof(HBytes));
+        lit->token = token;
+        lit->len = 1;
+        context->lzw_code_table[i] = lit;
+    }
+    context->earlychange = 1;
+    context->old = 257; // slots 256 and 257 stay NULL after the memset above
+
+    /* Raw 9- to 12-bit reads, each gated by the matching validate_LZW_<n>bitcodeword */
+    H_VDRULE(LZW_9bitcodeword, h_bits(9, false), context);
+    H_VDRULE(LZW_10bitcodeword, h_bits(10, false), context);
+    H_VDRULE(LZW_11bitcodeword, h_bits(11, false), context);
+    H_VDRULE(LZW_12bitcodeword, h_bits(12, false), context);
+
+    /* The same reads, gated by validate_LZW_<n>bitlitspec (value below 258) */
+    H_VDRULE(LZW_9bitlitspec, h_bits(9, false), context);
+    H_VDRULE(LZW_10bitlitspec, h_bits(10, false), context);
+    H_VDRULE(LZW_11bitlitspec, h_bits(11, false), context);
+    H_VDRULE(LZW_12bitlitspec, h_bits(12, false), context);
+
+    H_RULE(LZW_remainingbits, h_many(h_bits(1, false))); //XXX: could validate that these bits are 0?
+
+    H_ADRULE(LZW_firstcode, LZW_9bitlitspec, context); // First code is expected to be a literal (the litspec rule admits any value below 258), sets ctx->old
+
+    /* Each rule below tries the four code widths in turn; the validators let only the current width through */
+    H_AVDRULE(LZW_clear, h_choice(LZW_9bitlitspec, LZW_10bitlitspec, LZW_11bitlitspec, LZW_12bitlitspec, NULL), context);
+    H_VDRULE(LZW_eod, h_choice(LZW_9bitlitspec, LZW_10bitlitspec, LZW_11bitlitspec, LZW_12bitlitspec, NULL), context);
+    H_AVDRULE(LZW_literal, h_choice(LZW_9bitlitspec, LZW_10bitlitspec, LZW_11bitlitspec, LZW_12bitlitspec, NULL), context);
+    H_ADRULE(LZW_codeword, h_choice(LZW_9bitcodeword, LZW_10bitcodeword, LZW_11bitcodeword, LZW_12bitcodeword, NULL), context);
+
+    /* One or more codes up to, but not including, the EOD code; clear codes leave no element in the result */
+    H_ADRULE(LZW_body, h_many1(h_butnot(h_choice(LZW_literal, h_ignore(LZW_clear), LZW_codeword, NULL), LZW_eod)), context);
+
+    H_ADRULE(LZW_data, h_sequence(LZW_clear, LZW_firstcode, LZW_body, LZW_eod, LZW_remainingbits, NULL), context);
+    p_lzwdata = LZW_data;
+}
+
+
+/*
+ * Runs the parser built by init_LZW_parser() over "length" bytes at "input" and
+ * returns whatever h_parse() returns.
+ */
+HParseResult* parse_LZW_data(const uint8_t* input, size_t length)
+{
+    //debug = fopen("lzw_debug.txt", "a"); // DEBUG
+    HParseResult *res = h_parse(p_lzwdata, input, length);
+    //fclose(debug); // DEBUG
+    return res;
+}
+
+
+/*
+ * Resets the global context for a new stream: frees all dictionary strings
+ * (codes 258-4095), sets next to 258 and old to 257, and stores "earlychange".
+ * The literal entries 0-255 are kept.
+ * NOTE(review): "context" is dereferenced unconditionally, so init_LZW_parser() has to have run first.
+ */
+void init_LZW_context(int earlychange)
+{
+    for(int i = 258; i < 4096; ++i)
+    {
+        if(context->lzw_code_table[i] != NULL)
+        {
+            free((uint8_t *) context->lzw_code_table[i]->token); // These can be freed without issue, because HParsedTokens containing them have separate deep copies
+            free(context->lzw_code_table[i]);
+        }
+        context->lzw_code_table[i] = NULL;
+    }
+    context->next = 258;
+    context->old = 257;
+    context->earlychange = earlychange;
+}
diff --git 
a/lzw.h b/lzw.h
new file mode 100644
index 0000000000000000000000000000000000000000..183ee301cbdf38fca9bbb6bfc66d5adff28cda39
--- /dev/null
+++ b/lzw.h
@@ -0,0 +1,50 @@
+#ifndef PDF_LZW_H
+#define PDF_LZW_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <hammer/hammer.h>
+
+
+/*
+ * Decoder state shared by the LZW parsers: the code table plus the bookkeeping
+ * needed to extend it while a stream is being decoded.
+ */
+typedef struct LZW_context_S
+{
+    /*
+     * Table for storing sequences represented by an LZW code.
+     * Codes 0-255 are the one-byte literals; init_LZW_parser() pre-fills those slots
+     * and leaves every other slot NULL. Code 256 is the reset (clear) code.
+     * init_LZW_context() frees and clears slots 258-4095.
+     */
+    HBytes * lzw_code_table[4096];
+
+    /*
+     * Holds the next expected LZW code. We also use this for telling LZW_9bitcodeword, LZW_10bitcodeword, etc. apart. Parses fail if "next" is larger than what can be represented on that many bits.
+     */
+    int next;
+
+    /*
+     * Previous LZW code, used to construct the next string added to the table.
+     */
+    uint64_t old;
+
+    /*
+     * EarlyChange = 1 means the bit size is increased "one code early".
+     * EarlyChange = 0 means "code length increases shall be postponed as long as possible".
+     */
+    int earlychange;
+} LZW_context_T;
+
+/* Allocates the global LZW context and builds the LZW parser; call before the two functions below. */
+void init_LZW_parser(void);
+
+/* Runs the LZW parser over input[0..length) and returns the result of h_parse(). */
+HParseResult * parse_LZW_data(const uint8_t* input, size_t length);
+
+/* Resets the global LZW context (table slots 258-4095, next, old) and stores the EarlyChange setting. */
+void init_LZW_context(int earlychange);
+
+#endif // PDF_LZW_H
diff --git a/pdf.c b/pdf.c index 78f284d1e65765df0ff023b57d03751bf1a39321..3268e0775b4e5044902bf3718d91cf5b8912934b 100644 --- a/pdf.c +++ b/pdf.c @@ -3197,133 +3197,7 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) #endif - - -/* LZW helpers */ - -typedef struct -{ - uint8_t *lzw_buf; - size_t total_buf_size; - size_t write_head; - size_t write_tail; - uint8_t write_checksum; - size_t eof_loc; - - HBytes *input_stream; - size_t read_head; - size_t read_tail; - uint8_t read_checksum; -} lzwspec; - -lzwspec *cur_lzw_spec; - -/* used by write_lzw_buffer to get more space for decoding if needed */ -void -grow_lzw_buffer(size_t amount) -{ - uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); - if(ret_buf != NULL) - { -
cur_lzw_spec->total_buf_size += amount; - cur_lzw_spec->lzw_buf = ret_buf; - } - else - { - fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); - return; - } -} - -lzwspec * -new_lzw_spec(HBytes *bytes) -{ - size_t const BUFSIZE = sizeof(uint8_t) * 1024; - lzwspec *ret = malloc(sizeof(lzwspec)); - memset(ret, 0, sizeof(lzwspec)); - ret->input_stream = bytes; - ret->lzw_buf = malloc(BUFSIZE); - ret->total_buf_size = BUFSIZE; - return ret; -} - -void -delete_lzw_spec(lzwspec *spec) -{ - free(spec->lzw_buf); - free(spec); -} - -void -bind_lzw_spec(lzwspec *spec) -{ - cur_lzw_spec = spec; -} - - -#include "lzw-lib.h" - -/* Buffer writer function for the lzw-ab implementation, with a fixed signature. - * Although the type is defined as int, it is expected to write one byte at a time. - * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */ - -void -write_lzw_buffer(int value) -{ - size_t const BUFSIZE = sizeof(uint8_t) * 1024; - - if(!cur_lzw_spec->lzw_buf) - { - fprintf(stderr, "LZWDecode: lzw_buf is null!"); - assert(cur_lzw_spec->lzw_buf != NULL); - } - - assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size); - - if (value == EOF) { - cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value; - cur_lzw_spec->eof_loc = cur_lzw_spec->write_head; - cur_lzw_spec->write_head++; - return; - } - - /* We can get away with this cast due to writing single bytes. */ - cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value; - - /* If you looked at lzw-ab's code, the write head is reset here - * This function uses write_head as the offset of the last written item */ - if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size) - { - grow_lzw_buffer(BUFSIZE); - } - - cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value; -} - - -/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. 
Set cur_lzw_spec - * with bind_lzw_spec() */ -int read_lzw_buffer(void) -{ - uint8_t byte_read; - int ret_value; - - /* Input data is already waiting in the buffer */ - if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail) - cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len; - - if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail) - { - byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++]; - cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read; - ret_value = byte_read; - } - else - ret_value = EOF; - - return ret_value; -} - +#include "lzw.h" HParseResult * LZWDecode(const Dict *parms, HBytes b, HParser *p) @@ -3331,9 +3205,11 @@ LZWDecode(const Dict *parms, HBytes b, HParser *p) struct predictor pred = {1, 1, 8, 1}; int (*depredict)(struct predictor *, uint8_t *, size_t); HParseResult *res; + HParseResult *tmp_res; int done; - int ret; + //int ret; const HParsedToken *v; + int earlychange; /* set up the predictor (if any) */ #define SETPARM(VAR,STR) do { \ @@ -3382,23 +3258,40 @@ LZWDecode(const Dict *parms, HBytes b, HParser *p) err(1, "LZWDecode"); } - lzwspec *lzw_spec = new_lzw_spec(&b); - bind_lzw_spec(lzw_spec); + v = dictentry(parms, "EarlyChange"); + if(v != NULL && v->token_type == TT_SINT && v->sint == 0) + { + earlychange = 0; + } + else + { + earlychange = 1; + } + + init_LZW_context(earlychange); + tmp_res = parse_LZW_data(b.token, b.len); - ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer); - if (ret) { - fprintf(stderr, "lzw_decompress: error (%d)\n", ret); - assert(!"LZWDecode: failed to decompress\n"); + if(!tmp_res) + { + fprintf(stderr, "parse error in LZWDecode filter"); + return NULL; } - done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1); - assert(!done); // XXX ITERATIVE + + assert(tmp_res->ast->token_type == TT_BYTES); + + uint8_t * tmp_buf = malloc(sizeof(uint8_t) * tmp_res->ast->bytes.len); + memcpy(tmp_buf, tmp_res->ast->bytes.token, 
tmp_res->ast->bytes.len); + done = depredict(&pred, tmp_buf, tmp_res->ast->bytes.len); + assert(!done); + + //done = depredict(&pred, res->ast->bytes.token, res->ast->bytes.len); + //assert(!done); // SR::TODO:: Do a H_MAKE rather than a parse and let the caller do the parse - res = h_parse(p, pred.out, pred.nout); + res = h_parse(p, pred.out, pred.nout); // XXX: should kstream try to decode streams with no Type? + //res = h_parse(p, tmp_res->ast->bytes.token, tmp_res->ast->bytes.len); // XXX depred buffer free(pred.out); - - bind_lzw_spec(NULL); - delete_lzw_spec(lzw_spec); + free(tmp_buf); return res; } @@ -5622,6 +5515,7 @@ main(int argc, char *argv[]) /* build parsers */ aux = (struct Env){infile, input, sz}; init_parser(&aux); + init_LZW_parser(); /* parse all cross-reference sections and trailer dictionaries */