diff --git a/pdf.c b/pdf.c
index 05d0d3d19cffca89e7dd58bd5c8c764a71daf2f5..cff81e4a32c15667f81b96a0f359ea1da82f8bf0 100644
--- a/pdf.c
+++ b/pdf.c
@@ -3326,22 +3326,89 @@ int read_lzw_buffer(void)
 }
 
 /* Table for storing sequences represented by an LZW code */
-char * lzw_code_table[4096];
-int next;
-uint64_t old;
+// XXX lookup is O(1) like this, but maybe memory use will be bad
+// XXX unify lzw_context_t and lzwspec
+
+typedef struct LZW_context_S
+{
+	/*
+	 * Table for storing sequences represented by an LZW code
+	 * Codes 0-255 and 256 are special: they represent literals and the reset code. We could explicitly pre-fill them, but it's probably not necessary.
+	 */
+	const char * lzw_code_table[4096];
+
+	/*
+	 * Holds the next expected LZW code. We also use this for telling LZW_9bitcodeword, LZW_10bitcodeword, etc. apart. Parses fail if "next" is larger than what can be represented on that many bits.
+	 */
+	int next;
+
+	/*
+	 * Previous LZW code, used to construct the next string added to the table.
+	 */
+	uint64_t old;
+} LZW_context_T;
+
+void LZW_clear_table(LZW_context_T *ctx)
+{
+	/*
+	 * Optimizations: entries below 257 are left empty (see the table comment above), so they never need free().
+	 * And since codes are added to the table sequentially, we don't need to look past ctx->next.
+	 */
+	for(int i = 257; i < ctx->next; ++i)
+	{
+		/*
+		 * free(NULL) is a no-op, so no NULL check is needed. The cast drops the const qualifier, which free() does not accept.
+		 * Resetting the slot to NULL keeps later table lookups and later clears from touching freed memory.
+		 * XXX: act_LZW_codeword() hands table strings to H_MAKE_BYTES() without copying, so tokens produced before a clear still point at the freed memory.
+		 */
+		free((void *) ctx->lzw_code_table[i]);
+		ctx->lzw_code_table[i] = NULL;
+	}
+}
 
 HParser *p_lzwdata;
 
+/*
+ * First "code" in input. We output it literally, and set "old"
+ */
+HParsedToken*
+act_LZW_firstcode(const HParseResult *p, void *u)
+{
+	LZW_context_T * ctx = (LZW_context_T *) u;
+	uint64_t code = H_CAST_UINT(p->ast);
+	uint8_t * literal = h_arena_malloc(p->arena, sizeof *literal);
+
+	/*
+	 * H_MAKE_BYTES() takes a pointer to the bytes, not the byte value itself, so the literal needs storage of its own.
+	 * XXX: validations - codes above 255 are not literals and get truncated here
+	 */
+	*literal = (uint8_t) code;
+	ctx->old = code;
+	return H_MAKE_BYTES(literal, 1);
+}
+
+HParsedToken*
+act_LZW_clear(const HParseResult *p, void *u)
+{
+	LZW_context_T * ctx = (LZW_context_T *) u;
+	LZW_clear_table(ctx);
+	ctx->next = 258; // Caution: moving this before the call to LZW_clear_table() will cause a memory leak
+	return H_MAKE_BYTES(NULL, 0);
+}
+
+// XXX: validations
+// compare against expected next code, fail the parse if doesn't fit bit length
 
 // TODO: maybe a continuation can be used to remember the previous code
 // But then each codeword would need to get used as input twice
 HParsedToken*
-act_lzw_codeword(const HParseResult *p, void *u)
+act_LZW_codeword(const HParseResult *p, void *u)
 {
 	char * string;
 	char * output;
 	char * entry;
 	uint64_t code = H_CAST_UINT(p->ast);
+	LZW_context_T * ctx = (LZW_context_T *) u;
 
 
 	if(lzw_code_table[code] != NULL) // code is in the table
@@ -3353,6 +3420,7 @@ act_lzw_codeword(const HParseResult *p, void *u)
 		strncpy(output, entry, strlen(entry));
 		output[strlen(entry)] = postfix;
 		output[strlen(entry)+1] = '\0';
+		ctx->old = code;
 		return H_MAKE_BYTES(string, strlen(string));
 	}
 	else // code is not in the table
@@ -3364,14 +3432,18 @@ act_lzw_codeword(const HParseResult *p, void *u)
 		output[strlen(entry)] = postfix;
 		output[strlen(entry)+1] = '\0';
 		lzw_code_table[next] = output;
+		ctx->old = code;
 		return H_MAKE_BYTES(output, strlen(output)); //XXX: strlen and null-terminated strings may not be appropriate here. using fixed size strings would be preferable (HCountedArray?)
 	}
-	old = code;
 }
 
 void init_lzw_parser()
 {
-
+	H_RULE(LZW_9bitcodeword, h_nothing_p()); // XXX grammar
+	H_RULE(LZW_10bitcodeword, h_nothing_p());
+	H_RULE(LZW_11bitcodeword, h_nothing_p());
+	H_RULE(LZW_12bitcodeword, h_nothing_p());
+	H_ARULE(LZW_codeword, h_nothing_p());
 }
 
 HParseResult *