From a6ea35eb8a1b7135b99ccb05ed79a4e5a69db83a Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Mon, 19 Dec 2022 23:10:11 +0000
Subject: [PATCH] lzw: switch to a fixed-size table with internally linked
 codes

This saves us from allocating and freeing the HBytes that were stored in
the table. It should also save memory since it essentially shares common
prefixes between codes.

The only remaining call to malloc() is the one allocating the global
context object itself.
---
 lzw.c | 108 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 62 insertions(+), 46 deletions(-)

diff --git a/lzw.c b/lzw.c
index b00a48d..cf6a221 100644
--- a/lzw.c
+++ b/lzw.c
@@ -22,11 +22,29 @@
 typedef struct LZW_context_S
 {
 	/*
-	 * Table for storing sequences represented by an LZW code.
-	 * 0-255 are predefined representing literals.
-	 * 256 and 267 are the clear and eod (end of data) codes.
+	 * Stores the byte sequence represented by each LZW code.
+	 *
+	 * Codes 0-255 are predefined representing literals.
+	 * Codes 256 and 257 are the special clear and eod (end of data) codes.
+	 * Codes >257 are dynamically defined by the input.
+	 *
+	 * Each dynamically defined code is an extension of a previously
+	 * defined code. We therefore need only store the code being extended
+	 * and the byte being added.
+	 *
+	 * Thus the entries of this array form linked lists. To terminate the
+	 * lists while making our memory allocation easy, we store the length
+	 * of each code's output sequence.
+	 *
+	 * Finally, we redundantly store the first byte of the sequence so we
+	 * don't have to walk the list during updates (see act_output).
 	 */
-	HBytes lzw_code_table[4096];
+	struct {
+		size_t len;	/* length of the sequence */
+		int prefix;	/* code representing the seq's prefix (len-1) */
+		uint8_t last;	/* final byte of the sequence */
+		uint8_t first;	/* first byte of the sequence */
+	} table[4096];
 
 	/*
 	 * Holds the next expected LZW code. We also use this for telling LZW_9bitcodeword, LZW_10bitcodeword, etc. apart. Parses fail if "next" is larger than what can be represented on that many bits.
@@ -43,26 +61,41 @@ typedef struct LZW_context_S
 static void
 lzw_clear_table(LZW_context_T *ctx)
 {
-	/*
-	 * Only the HBytes in the LZW table refer to the strings we're freeing,
-	 * since the HParsedTokens created from them get their own copies.
-	 */
-	for(int i = 258; i < ctx->next; ++i)
-		free((uint8_t *) ctx->lzw_code_table[i].token);
 	ctx->next = 258;
 }
 
 /*
- * Creates a HBytes from an array of bytes and its length, and inserts it into the lzw dictionary in ctx.
- * Also increments ctx->next. The HBytes will keep the token pointer, to be freed later in lzw_clear_table or init_lzw_context.
+ * Update the dictionary with a new entry that extends the given code by one
+ * byte to be filled in later.
  */
 static void
-lzw_table_insert(LZW_context_T *ctx, uint8_t *token, size_t token_len)
+lzw_table_extend(LZW_context_T *ctx, int code)
 {
-	ctx->lzw_code_table[ctx->next] = (HBytes){token, token_len};
+	ctx->table[ctx->next].prefix = code;
+	ctx->table[ctx->next].first = ctx->table[code].first;
+	ctx->table[ctx->next].len = ctx->table[code].len + 1;
+	ctx->table[ctx->next].last = 0xFF;
 	ctx->next++;
 }
 
+/*
+ * Assemble the output sequence represented by the given code word.
+ * The given buffer must hold at least ctx->table[code].len bytes.
+ */
+static void
+lzw_code_string(LZW_context_T *ctx, int code, uint8_t *buf)
+{
+	size_t i, n;
+
+	/* traverse the list, filling buf from last to first byte */
+	n = ctx->table[code].len;
+	for (i = 0; i < n; i++) {
+		buf[n - 1 - i] = ctx->table[code].last;
+		code = ctx->table[code].prefix;
+	}
+	assert(code == -1);	/* reached the end */
+}
+
 HParser *p_lzwdata;
 static LZW_context_T *context;
 
@@ -147,11 +180,9 @@ validate_output(HParseResult *p, void *u)
 static HParsedToken*
 act_output(const HParseResult *p, void *u)
 {
-	HBytes code_str;
-	HBytes last_str;
 	uint64_t code = H_CAST_UINT(p->ast);
 	uint8_t * output_token;
-	uint8_t * next_entry_token;
+	size_t output_length;
 	LZW_context_T * ctx = (LZW_context_T *) u;
 
 	//fprintf(debug, "code: %lu, next: %u\n", code, ctx->next); // DEBUG
@@ -165,41 +196,25 @@ act_output(const HParseResult *p, void *u)
 	assert(code != 257);
 
 	/*
-	 * Retrieve the output from the dictionary.
-	 * This is what we'll wrap in a HBytes for returning.
-	 */
-	code_str = ctx->lzw_code_table[code];
-	assert(code_str.len > 0);
-
-	/*
-	 * Fill in the missing last character of a previously assigned code,
+	 * Fill in the missing last byte of a previously assigned code,
 	 * if there is one.
 	 */
-	if (ctx->next > 258) {
-		last_str = ctx->lzw_code_table[ctx->next - 1];
-		((uint8_t *)last_str.token)[last_str.len - 1] =
-		    code_str.token[0];
-		    // XXX casting away the const. we know what we're doing.
-		    // could avoid HBytes by using our own struct but come on.
-		    // a different design might avoid byte arrays in the
-		    // table altogether by storing just the last character
-		    // and the code of the prefix.
-	}
+	if (ctx->next > 258)
+		ctx->table[ctx->next - 1].last = ctx->table[code].first;
 
 	/*
 	 * Update the dictionary with a new entry that is missing the last
-	 * character which we will only learn when we process the next code.
+	 * byte which we will only learn when we process the next code.
 	 */
-	next_entry_token = calloc(code_str.len + 1, sizeof(uint8_t));
-	memcpy(next_entry_token, code_str.token, code_str.len);
-	lzw_table_insert(ctx, next_entry_token, code_str.len + 1);
+	lzw_table_extend(ctx, code);
 
 	/*
-	 * Return a copy of the output.
+	 * Assemble and return the output string.
 	 */
-	output_token = h_arena_malloc(p->arena, code_str.len);
-	memcpy(output_token, code_str.token, code_str.len);
-	return H_MAKE_BYTES(output_token, code_str.len);
+	output_length = ctx->table[code].len;
+	output_token = h_arena_malloc(p->arena, output_length);
+	lzw_code_string(ctx, code, output_token);
+	return H_MAKE_BYTES(output_token, output_length);
 }
 
 static HParsedToken*
@@ -267,9 +282,10 @@ void init_LZW_parser()
 	/* set up literals in LZW code table */
 	for(int i = 0; i < 256; i++)
 	{
-		uint8_t *token = malloc(sizeof(uint8_t));
-		*token = i;
-		context->lzw_code_table[i] = (HBytes){token, 1};
+		context->table[i].len = 1;
+		context->table[i].prefix = -1;	/* none */
+		context->table[i].first = i;
+		context->table[i].last = i;
 	}
 	context->earlychange = 1;
 
-- 
GitLab