diff --git a/lzw-lib.c b/lzw-lib.c index ed0ae3f3d5c49d97706ac65156f721d026bed633..004cd6cd88783e96226f3f3b53abda6eeb3f079a 100644 --- a/lzw-lib.c +++ b/lzw-lib.c @@ -53,7 +53,8 @@ #define NULL_CODE -1 // indicates a NULL prefix #define CLEAR_CODE 256 // code to flush dictionary and restart decoder -#define FIRST_STRING 257 // code of first dictionary string +#define EOD_CODE 257 // used in PDF's LZWDecode to signal end of data +#define FIRST_STRING 258 // code of first dictionary string, PDF edition /* This macro writes the adjusted-binary symbol "code" given the maximum * symbol "maxcode". A macro is used here just to avoid the duplication in @@ -201,11 +202,10 @@ int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits) return 0; } -/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. The - * "maxbits" parameter is read as the first byte in the stream and controls how much memory - * is allocated for decoding. A return value of EOF from the "src" callback terminates the - * compression process (although this should not normally occur). A non-zero return value - * indicates an error, which in this case can be a bad "maxbits" read from the stream, a +/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. + * A return value of EOF from the "src" callback terminates the compression process + * (although this should not normally occur). A non-zero return value + * indicates an error, which in this case can be a * failed malloc(), or if an EOF is read from the input stream before the compression * terminates naturally with END_CODE. 
*/ @@ -217,12 +217,10 @@ int lzw_decompress (void (*dst)(int), int (*src)(void)) unsigned long shifter = 0; short *prefixes; - if ((read_byte = ((*src)())) == EOF || (read_byte & 0xfc)) //sanitize first byte - return 1; - - // based on the "maxbits" parameter, compute total codes and allocate dictionary storage + // PDF specific change: maxbits is not in the input stream + // we'll just be pessimistic and allocate the maximal size buffer - total_codes = 512 << (read_byte & 0x3); + total_codes = 4096; reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0])); prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0])); terminators = malloc ((total_codes - 256) * sizeof (terminators [0])); @@ -231,49 +229,52 @@ int lzw_decompress (void (*dst)(int), int (*src)(void)) return 1; // This is the main loop where we read input symbols. The values range from 0 to the code value - // of the "next" string in the dictionary (although the actual "next" code cannot be used yet, - // and so we reserve that code for the END_CODE). Note that receiving an EOF from the input + // of the "next" string in the dictionary. Note that receiving an EOF from the input // stream is actually an error because we should have gotten the END_CODE first. while (1) { - int code_bits = next < 1024 ? (next < 512 ? 8 : 9) : (next < 2048 ? 10 : 11), code; + int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 
11 : 12) ), code; int extras = (1 << (code_bits + 1)) - next - 1; + #define TOP_BITMASK (((1 << code_bits) - 1) << (bits - code_bits) ) + #define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1) + do { if ((read_byte = ((*src)())) == EOF) { free (terminators); free (prefixes); free (reverse_buffer); return 1; } - shifter |= (long) read_byte << bits; + /* shifter reworked: everything shifted left by a byte, + * and the byte we just read becomes the least significant + * byte */ + + // prepare to shift in next byte + shifter <<= 8; + /* the bitstrings forming the symbols are stored MSB first, + * so we can just OR in the next */ + shifter |= (unsigned long) read_byte; } while ((bits += 8) < code_bits); - // first we assume the code will fit in the minimum number of required bits - code = (int) shifter & ((1 << code_bits) - 1); - shifter >>= code_bits; + /* for a 12-bit code, the shifter's bits now look like + * from MSB to LSB: 00...0cccccccccn...n + * where c are the bits of our code + * and n are the bits we're not yet interested in + * the number of times n is repeated is bits - code_bits + * i.e. the number of bits read in minus the bits we're interested in */ + + // shift our code bits into their proper place, and save it as the final code + code = (int) shifter >> (bits - code_bits); + /* we can now clear the shifter's top bits. 
the result looks like: + * 00...0n...n + * number of n is bits-code_bits + * */ + shifter &= BOTTOM_BITMASK; + // update the count of bits in the shifter bits -= code_bits; - // but if code >= extras, then we need to read another bit to calculate the real code - // (this is the "adjusted binary" part) - - if (code >= extras) { - if (!bits) { - if ((read_byte = ((*src)())) == EOF) { - free (terminators); free (prefixes); free (reverse_buffer); - return 1; - } - - shifter = (long) read_byte; - bits = 8; - } - - code = (code << 1) - extras + (shifter & 1); - shifter >>= 1; - bits--; - } - - if (code == next) // sending the maximum code is reserved for the end of the file + if (code == EOD_CODE) // In PDF, EOD is signalled by 257, rather than the max code break; else if (code == CLEAR_CODE) // otherwise check for a CLEAR_CODE to start over early next = FIRST_STRING;