From b1c02c9183ca5767c85fbc69cf2f0afea25735d6 Mon Sep 17 00:00:00 2001
From: "Sven M. Hallberg" <pesco@khjk.org>
Date: Mon, 19 Dec 2022 23:10:11 +0000
Subject: [PATCH] lzw: parse/process input in blocks

This avoids creating an HBytes for each and every code word. Instead, the
code words following each clear code are collected into a block, and each
block is translated into a single HBytes in one pass. The per-block results
are then concatenated to form the final output.
---
 lzw.c | 113 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 63 insertions(+), 50 deletions(-)

diff --git a/lzw.c b/lzw.c
index cf6a221..995a985 100644
--- a/lzw.c
+++ b/lzw.c
@@ -81,8 +81,9 @@ lzw_table_extend(LZW_context_T *ctx, int code)
 /*
  * Assemble the output sequence represented by the given code word.
  * The given buffer must have the appropriate size.
+ * Returns the number of bytes written.
  */
-static void
+static size_t
 lzw_code_string(LZW_context_T *ctx, int code, uint8_t *buf)
 {
 	size_t i, n;
@@ -94,6 +95,8 @@ lzw_code_string(LZW_context_T *ctx, int code, uint8_t *buf)
 		code = ctx->table[code].prefix;
 	}
 	assert(code == -1);	/* reached the end */
+
+	return n;
 }
 
 HParser *p_lzwdata;
@@ -181,8 +184,6 @@ static HParsedToken*
 act_output(const HParseResult *p, void *u)
 {
 	uint64_t code = H_CAST_UINT(p->ast);
-	uint8_t * output_token;
-	size_t output_length;
 	LZW_context_T * ctx = (LZW_context_T *) u;
 
 	//fprintf(debug, "code: %lu, next: %u\n", code, ctx->next); // DEBUG
@@ -209,68 +210,80 @@ act_output(const HParseResult *p, void *u)
 	lzw_table_extend(ctx, code);
 
 	/*
-	 * Assemble and return the output string.
+	 * Just return the code again.
+	 * We will assemble the output in act_lzwblock() below.
 	 */
-	output_length = ctx->table[code].len;
-	output_token = h_arena_malloc(p->arena, output_length);
-	lzw_code_string(ctx, code, output_token);
-	return H_MAKE_BYTES(output_token, output_length);
+	return (HParsedToken *)p->ast;	// XXX casting away the const OK?
 }
 
-static HParsedToken*
-act_lzwbody(const HParseResult *p, void *u)
+/*
+ * Assemble the string represented by a block of code words under a given
+ * table. The incoming HParsedToken is a sequence of code words (TT_UINT).
+ */
+static HParsedToken *
+act_lzwblock(const HParseResult *p, void *u)
 {
-	size_t index = 0;
-	size_t total_buffer_size = 0;
-	size_t num_fragments = h_seq_len(p->ast);
-	uint8_t * buffer;
-
-	/* sum total bytes in array, alloc buffer */
-	for(int i = 0; i < num_fragments; i++)
-	{
-		total_buffer_size += H_FIELD_BYTES(i).len;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	LZW_context_T *ctx = u;
+	uint8_t *buf, *cur;
+	size_t sz, i;
+	int code;
+
+	/* determine total output size, alloc buffer */
+	sz = 0;
+	for (i = 0; i < seq->used; i++) {
+		code = (int) H_CAST_UINT(seq->elements[i]);
+		sz += ctx->table[code].len;
 	}
+	buf = h_arena_malloc(p->arena, sz);
 
-	buffer = h_arena_malloc(p->arena, sizeof(uint8_t) * total_buffer_size); // XXX arena alloc, calloc
-
-	/* go through parse result, merge bytes */
-	for(int i = 0; i < num_fragments; i++)
-	{
-		size_t len = H_FIELD_BYTES(i).len;
-		memcpy(&buffer[index], H_FIELD_BYTES(i).token, len);
-		index += len;
+	/* go through sequence, merge output bytes into buf */
+	cur = buf;
+	for (i = 0; i < seq->used; i++) {
+		code = (int) H_CAST_UINT(seq->elements[i]);
+		cur += lzw_code_string(ctx, code, cur);
 	}
+	assert(cur == buf + sz);
 
 	//fprintf(debug, "\n\n"); // DEBUG
-	//fwrite(buffer, 1, total_buffer_size, debug); // DEBUG
+	//fwrite(buf, 1, sz, debug); // DEBUG
 	//fflush(debug); // DEBUG
 
-	return H_MAKE_BYTES(buffer, total_buffer_size);
+	return H_MAKE_BYTES(buf, sz);
 }
 
-
-static HParsedToken*
+/*
+ * Concatenate blocks to form the final output string.
+ * The incoming HParsedToken is a sequence of HBytes.
+ */
+static HParsedToken *
 act_lzwdata(const HParseResult *p, void *u)
 {
-	/* The AST this semantic action receives is a sequence that looks something like this:
-		elements[0] -> TT_BYTES representing the initial clear code
-		elements[1] -> TT_BYTES containing the decompressed data
-		elements[2] -> TT_UINT representing the EOD code
-	*/
-
-	//HCountedArray * seq = H_CAST_SEQ(p->ast);
-	//LZW_context_T *ctx = (LZW_context_T*) u; // DEBUG
-
-	//fprintf(debug, "\n\n"); // DEBUG
-	/*for(int i = 258; i < ctx->next; ++i) // DEBUG
-	{
-		fprintf(debug, "i: %u, str: ", i);
-		fwrite(ctx->lzw_code_table[i].token, ctx->lzw_code_table[i].len, 1, debug);
-		fprintf(debug, "\n");
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HBytes bs;
+	uint8_t *buf, *cur;
+	size_t sz, i;
+
+	/* fast path: single element? nothing to do */
+	if (seq->used == 1)
+		return seq->elements[0];
+
+	/* determine total output size, alloc buffer */
+	sz = 0;
+	for (i = 0; i < seq->used; i++)
+		sz += H_CAST_BYTES(seq->elements[i]).len;
+	buf = h_arena_malloc(p->arena, sz);
+
+	/* go through sequence, copying bytes into buf */
+	cur = buf;
+	for (i = 0; i < seq->used; i++) {
+		bs = H_CAST_BYTES(seq->elements[i]);
+		memcpy(cur, bs.token, bs.len);
+		cur += bs.len;
 	}
-	fflush(debug); // DEBUG */
+	assert(cur == buf + sz);
 
-	return H_FIELD_TOKEN(1);
+	return H_MAKE_BYTES(buf, sz);
 }
 
 
@@ -299,8 +312,8 @@ void init_LZW_parser()
 	H_VDRULE (eod,		codeword, context);
 	H_AVDRULE(output,	codeword, context);
 
-	H_ARULE(lzwbody,	h_many(h_choice(clear, output, NULL)));
-	H_ARULE(lzwdata,	h_sequence(clear, lzwbody, eod, NULL));
+	H_ADRULE(lzwblock,	h_right(clear, h_many(output)), context);
+	H_ARULE (lzwdata,	h_left(h_many1(lzwblock), eod));
 	    // XXX validate that the last byte is zero-padded?
 	    // XXX require h_end_p()?
 
-- 
GitLab