diff --git a/lzw.c b/lzw.c index b00a48d24a7c4a02edb10eaf881379237c40287c..cf6a221a0365f4ffbe8a0d56ac46a36d76ded1e0 100644 --- a/lzw.c +++ b/lzw.c @@ -22,11 +22,29 @@ typedef struct LZW_context_S { /* - * Table for storing sequences represented by an LZW code. - * 0-255 are predefined representing literals. - * 256 and 267 are the clear and eod (end of data) codes. + * Storing byte sequences represented by each LZW code. + * + * Codes 0-255 are predefined representing literals. + * Codes 256 and 257 are the special clear and eod (end of data) codes. + * Codes >257 are dynamically defined by the input. + * + * Each dynamically defined code is an extension of a previously + * defined code. We therefore need only store the code being extended + * and the byte being added. + * + * Thus the entries of this array form linked lists. To terminate the + * lists while making our memory allocation easy, we store the length + * of each code's output sequence. + * + * Finally, we redundantly store the first byte of the sequence so we + * don't have to walk the list during updates (see act_output). */ - HBytes lzw_code_table[4096]; + struct { + size_t len; /* length of the sequence */ + int prefix; /* code representing the seq's prefix (len-1) */ + uint8_t last; /* final byte of the sequence */ + uint8_t first; /* first byte of the sequence */ + } table[4096]; /* * Holds the next expected LZW code. We also use this for telling LZW_9bitcodeword, LZW_10bitcodeword, etc. apart. Parses fail if "next" is larger than what can be represented on that many bits. @@ -43,26 +61,41 @@ typedef struct LZW_context_S static void lzw_clear_table(LZW_context_T *ctx) { - /* - * Only the HBytes in the LZW table refer to the strings we're freeing, - * since the HParsedTokens created from them get their own copies. 
- */ - for(int i = 258; i < ctx->next; ++i) - free((uint8_t *) ctx->lzw_code_table[i].token); ctx->next = 258; } /* - * Creates a HBytes from an array of bytes and its length, and inserts it into the lzw dictionary in ctx. - * Also increments ctx->next. The HBytes will keep the token pointer, to be freed later in lzw_clear_table or init_lzw_context. + * Update the dictionary with a new entry that extends the given code by one + * byte to be filled in later. */ static void -lzw_table_insert(LZW_context_T *ctx, uint8_t *token, size_t token_len) +lzw_table_extend(LZW_context_T *ctx, int code) { - ctx->lzw_code_table[ctx->next] = (HBytes){token, token_len}; + ctx->table[ctx->next].prefix = code; + ctx->table[ctx->next].first = ctx->table[code].first; + ctx->table[ctx->next].len = ctx->table[code].len + 1; + ctx->table[ctx->next].last = 0xFF; ctx->next++; } +/* + * Assemble the output sequence represented by the given code word. + * The given buffer must have the appropriate size. + */ +static void +lzw_code_string(LZW_context_T *ctx, int code, uint8_t *buf) +{ + size_t i, n; + + /* traverse the list, filling buf from last to first byte */ + n = ctx->table[code].len; + for (i = 0; i < n; i++) { + buf[n - 1 - i] = ctx->table[code].last; + code = ctx->table[code].prefix; + } + assert(code == -1); /* reached the end */ +} + HParser *p_lzwdata; static LZW_context_T *context; @@ -147,11 +180,9 @@ validate_output(HParseResult *p, void *u) static HParsedToken* act_output(const HParseResult *p, void *u) { - HBytes code_str; - HBytes last_str; uint64_t code = H_CAST_UINT(p->ast); uint8_t * output_token; - uint8_t * next_entry_token; + size_t output_length; LZW_context_T * ctx = (LZW_context_T *) u; //fprintf(debug, "code: %lu, next: %u\n", code, ctx->next); // DEBUG @@ -165,41 +196,25 @@ act_output(const HParseResult *p, void *u) assert(code != 257); /* - * Retrieve the output from the dictionary. - * This is what we'll wrap in a HBytes for returning. 
- */ - code_str = ctx->lzw_code_table[code]; - assert(code_str.len > 0); - - /* - * Fill in the missing last character of a previously assigned code, + * Fill in the missing last byte of a previously assigned code, * if there is one. */ - if (ctx->next > 258) { - last_str = ctx->lzw_code_table[ctx->next - 1]; - ((uint8_t *)last_str.token)[last_str.len - 1] = - code_str.token[0]; - // XXX casting away the const. we know what we're doing. - // could avoid HBytes by using our own struct but come on. - // a different design might avoid byte arrays in the - // table altogether by storing just the last character - // and the code of the prefix. - } + if (ctx->next > 258) + ctx->table[ctx->next - 1].last = ctx->table[code].first; /* * Update the dictionary with a new entry that is missing the last - * character which we will only learn when we process the next code. + * byte which we will only learn when we process the next code. */ - next_entry_token = calloc(code_str.len + 1, sizeof(uint8_t)); - memcpy(next_entry_token, code_str.token, code_str.len); - lzw_table_insert(ctx, next_entry_token, code_str.len + 1); + lzw_table_extend(ctx, code); /* - * Return a copy of the output. + * Assemble and return the output string. */ - output_token = h_arena_malloc(p->arena, code_str.len); - memcpy(output_token, code_str.token, code_str.len); - return H_MAKE_BYTES(output_token, code_str.len); + output_length = ctx->table[code].len; + output_token = h_arena_malloc(p->arena, output_length); + lzw_code_string(ctx, code, output_token); + return H_MAKE_BYTES(output_token, output_length); } static HParsedToken* @@ -267,9 +282,10 @@ void init_LZW_parser() /* set up literals in LZW code table */ for(int i = 0; i < 256; i++) { - uint8_t *token = malloc(sizeof(uint8_t)); - *token = i; - context->lzw_code_table[i] = (HBytes){token, 1}; + context->table[i].len = 1; + context->table[i].prefix = -1; /* none */ + context->table[i].first = i; + context->table[i].last = i; } context->earlychange = 1;