From 041c1345e664e191c967c01273f0bb5721c2a7ae Mon Sep 17 00:00:00 2001
From: Andrea Shepard <andrea@persephoneslair.org>
Date: Mon, 5 Dec 2016 18:21:22 +0000
Subject: [PATCH] Implement h_token() by sequential comparisons for short
 tokens

---
 src/SConscript                 |   2 +-
 src/backends/llvm/llvm.c       |  11 ++-
 src/backends/llvm/llvm.h       |   6 ++
 src/backends/llvm/llvm_bytes.c |  92 ++++++++++++++++++++++
 src/parsers/token.c            | 136 ++++++++++++++++++++++++++++++++-
 src/t_parser.c                 |   1 +
 6 files changed, 244 insertions(+), 4 deletions(-)
 create mode 100644 src/backends/llvm/llvm_bytes.c

diff --git a/src/SConscript b/src/SConscript
index 9b89730d..6ec943c3 100644
--- a/src/SConscript
+++ b/src/SConscript
@@ -70,7 +70,7 @@ backends = ['backends/%s.c' % s for s in
 
 # Add LLVM backend if enabled
 if GetOption("use_llvm"):
-    llvm_backend_files = ['llvm.c', 'llvm_charset.c', 'llvm_suint.c']
+    llvm_backend_files = ['llvm.c', 'llvm_bytes.c', 'llvm_charset.c', 'llvm_suint.c']
     backends = backends + ['backends/llvm/%s' % s for s in llvm_backend_files]
 
 misc_hammer_parts = [
diff --git a/src/backends/llvm/llvm.c b/src/backends/llvm/llvm.c
index 2c6602b0..1e673b67 100644
--- a/src/backends/llvm/llvm.c
+++ b/src/backends/llvm/llvm.c
@@ -26,12 +26,19 @@ HParseResult* make_result(HArena *arena, HParsedToken *tok) {
 }
 
 void h_llvm_declare_common(HLLVMParserCompileContext *ctxt) {
-#if SIZE_MAX == 0xffffffffffffffff
+#if SIZE_MAX == UINT64_MAX
   ctxt->llvm_size_t = LLVMInt64Type();
-#elif SIZE_MAX == 0xffffffff
+#elif SIZE_MAX == UINT32_MAX
   ctxt->llvm_size_t = LLVMInt32Type();
 #else
 #error "SIZE_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for size_t"
+#endif
+#if UINTPTR_MAX == UINT64_MAX
+  ctxt->llvm_intptr_t = LLVMInt64Type();
+#elif UINTPTR_MAX == UINT32_MAX
+  ctxt->llvm_intptr_t = LLVMInt32Type();
+#else
+#error "UINTPTR_MAX is not consistent with either 64 or 32-bit platform, couldn't guess LLVM type for intptr"
 #endif
   ctxt->llvm_inputstream = LLVMStructCreateNamed(LLVMGetGlobalContext(), "struct.HInputStream_");
   LLVMTypeRef llvm_inputstream_struct_types[] = {
diff --git a/src/backends/llvm/llvm.h b/src/backends/llvm/llvm.h
index 49d681c4..aab534a5 100644
--- a/src/backends/llvm/llvm.h
+++ b/src/backends/llvm/llvm.h
@@ -20,7 +20,10 @@ struct HLLVMParserCompileContext_ {
   LLVMValueRef func;
   LLVMBuilderRef builder;
   /* Typerefs */
+  /* We determine typerefs for some standard C types we'll need later up front */
   LLVMTypeRef llvm_size_t;
+  LLVMTypeRef llvm_intptr_t;
+  /* LLVM types for Hammer structs and pointers */
   LLVMTypeRef llvm_inputstream;
   LLVMTypeRef llvm_inputstreamptr;
   LLVMTypeRef llvm_arena;
@@ -39,6 +42,9 @@ struct HLLVMParserCompileContext_ {
 bool h_llvm_make_charset_membership_test(HLLVMParserCompileContext *ctxt,
                                          LLVMValueRef r, HCharset cs,
                                          LLVMBasicBlockRef yes, LLVMBasicBlockRef no);
+void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt,
+                                const uint8_t *bytes, size_t len,
+                                LLVMValueRef *mr_out);
 void h_llvm_make_tt_suint(HLLVMParserCompileContext *ctxt,
                           uint8_t length, uint8_t signedp,
                           LLVMValueRef r, LLVMValueRef *mr_out);
diff --git a/src/backends/llvm/llvm_bytes.c b/src/backends/llvm/llvm_bytes.c
new file mode 100644
index 00000000..9a63f6f9
--- /dev/null
+++ b/src/backends/llvm/llvm_bytes.c
@@ -0,0 +1,92 @@
+#ifdef HAMMER_LLVM_BACKEND
+
+#include <llvm-c/Analysis.h>
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#include <llvm-c/Core.h>
+#pragma GCC diagnostic pop
+#include <llvm-c/ExecutionEngine.h>
+#include "../../internal.h"
+#include "llvm.h"
+
+/*
+ * Emit LLVM IR that allocates a TT_BYTES token with a compile-time constant
+ * value and wraps it with make_result(), at the builder's current position.
+ * Parameters:
+ *  - ctxt [in]: an HLLVMParserCompileContext
+ *  - bytes [in]: array of bytes; only its address is embedded in the IR (no
+ *    copy is made), so it must stay valid while the emitted code may run
+ *  - len [in]: size of bytes
+ *  - mr_out [out]: the return value from make_result()
+ */
+
+void h_llvm_make_tt_bytes_fixed(HLLVMParserCompileContext *ctxt,
+                                const uint8_t *bytes, size_t len,
+                                LLVMValueRef *mr_out) {
+  /* Call h_arena_malloc(arena, sizeof(HParsedToken)); the size is built as i32 */
+  LLVMValueRef tok_size = LLVMConstInt(LLVMInt32Type(), sizeof(HParsedToken), 0);
+  LLVMValueRef amalloc_args[] = { ctxt->arena, tok_size };
+  /* %h_arena_malloc = call void* @h_arena_malloc(%struct.HArena_.1* %1, i32 48) */
+  LLVMValueRef amalloc = LLVMBuildCall(ctxt->builder,
+      LLVMGetNamedFunction(ctxt->mod, "h_arena_malloc"),
+      amalloc_args, 2, "h_arena_malloc");
+  /* %tok = bitcast void* %h_arena_malloc to %struct.HParsedToken_.2* */
+  LLVMValueRef tok = LLVMBuildBitCast(ctxt->builder, amalloc, ctxt->llvm_parsedtokenptr, "tok");
+
+  /*
+   * tok->token_type = TT_BYTES;
+   */
+  LLVMValueRef toktype = LLVMBuildStructGEP(ctxt->builder, tok, 0, "token_type");
+  LLVMBuildStore(ctxt->builder, LLVMConstInt(LLVMInt32Type(), TT_BYTES, 0), toktype);
+
+  /*
+   * XXX the way LLVM handles unions is batshit insane and forces IR writers
+   * to figure out which element of the union is largest just to declare the
+   * type, and then get all the alignments right - in effect, manually crufting
+   * up something compatible with their C compiler's ABI.  This is not so much
+   * a portability bug as a portability bug queen with a bone-penetrating
+   * ovipositor for laying her eggs in one's brain.
+   *
+   * The sole saving grace here is that the limited number of platforms LLVM
+   * can JIT on make it conceivable I may get this right for the cases that come
+   * up in practice if not for the general case.  If it breaks horribly, the
+   * slightly slower but safe option is to implement a function to set the
+   * relevant union fields from its arguments in C and build a call to it.
+   *
+   * The equivalent C that prompted this rant is quite depressingly simple:
+   *
+   * tok->bytes.token = bytes;
+   * tok->bytes.len = len;
+   */
+
+  LLVMValueRef hbytes_gep_tmp =
+    LLVMBuildStructGEP(ctxt->builder, tok, 1, "tok_union");
+  LLVMValueRef hbytes_gep = LLVMBuildBitCast(ctxt->builder, hbytes_gep_tmp,
+      ctxt->llvm_hbytesptr, "hbytes");
+  LLVMValueRef hbytes_token_gep =
+    LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 0, "hbytes_token");
+  /*
+   * We have to do this silly (uintptr_t) / LLVMConstIntToPtr() dance because
+   * LLVM doesn't seem to offer any way to construct a compile-time pointer
+   * constant other than NULL directly.
+   */
+  LLVMBuildStore(ctxt->builder,
+      LLVMConstIntToPtr(LLVMConstInt(ctxt->llvm_intptr_t, (uintptr_t)bytes, 0),
+        LLVMPointerType(LLVMInt8Type(), 0)),
+      hbytes_token_gep);
+  LLVMValueRef hbytes_len_gep =
+    LLVMBuildStructGEP(ctxt->builder, hbytes_gep, 1, "hbytes_len");
+  LLVMBuildStore(ctxt->builder, LLVMConstInt(ctxt->llvm_size_t, len, 0), hbytes_len_gep);
+
+  /*
+   * Now call make_result(arena, tok) and hand its value back via mr_out
+   */
+  LLVMValueRef result_args[] = { ctxt->arena, tok };
+  LLVMValueRef mr = LLVMBuildCall(ctxt->builder,
+      LLVMGetNamedFunction(ctxt->mod, "make_result"),
+      result_args, 2, "make_result");
+
+  *mr_out = mr;
+}
+
+#endif /* defined(HAMMER_LLVM_BACKEND) */
diff --git a/src/parsers/token.c b/src/parsers/token.c
index 19029726..6f016064 100644
--- a/src/parsers/token.c
+++ b/src/parsers/token.c
@@ -1,4 +1,11 @@
 #include <assert.h>
+#ifdef HAMMER_LLVM_BACKEND
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#include <llvm-c/Core.h>
+#pragma GCC diagnostic pop
+#include "../backends/llvm/llvm.h"
+#endif
 #include "parser_internal.h"
 
 typedef struct {
@@ -19,7 +26,6 @@ static HParseResult* parse_token(void *env, HParseState *state) {
   return make_result(state->arena, tok);
 }
 
-
 static HParsedToken *reshape_token(const HParseResult *p, void* user_data) {
   // fetch sequence of uints from p
   assert(p->ast);
@@ -67,12 +73,140 @@ static bool token_ctrvm(HRVMProg *prog, void *env) {
   return true;
 }
 
+#ifdef HAMMER_LLVM_BACKEND
+
+/*
+ * Emit LLVM IR to recognize a longer token by comparing it to a string in the
+ * LLVM module globals.  TODO: not implemented yet; until then delegate to the
+ * sequential emitter below rather than reporting success with no IR emitted.
+ */
+static bool token_llvm_with_sequential_comparisons(HLLVMParserCompileContext *ctxt, HToken *t);
+static bool token_llvm_with_global(HLLVMParserCompileContext *ctxt, HToken *t) {
+  return token_llvm_with_sequential_comparisons(ctxt, t);
+}
+
+/*
+ * Emit LLVM IR to recognize a token (short or zero-length) one byte at a time:
+ * any mismatch returns a null result, a full match returns a TT_BYTES result.
+ */
+
+static bool token_llvm_with_sequential_comparisons(HLLVMParserCompileContext *ctxt, HToken *t) {
+  HAllocator *mm__;
+  LLVMValueRef bits, r, c, icmp, mr, rv;
+  LLVMValueRef bits_args[3];
+  LLVMBasicBlockRef entry, success, end, next_char;
+  char name[64];
+  int i;
+
+  /* Allocator handle; presumably what h_new()/h_free() below pick up as mm__ */
+  mm__ = ctxt->mm__;
+
+  /* Set up basic blocks: entry, success and exit branches */
+  entry = LLVMAppendBasicBlock(ctxt->func, "tok_seq_entry");
+  success = LLVMAppendBasicBlock(ctxt->func, "tok_seq_success");
+  end = LLVMAppendBasicBlock(ctxt->func, "tok_seq_end");
+
+  /* Branch to entry block */
+  LLVMBuildBr(ctxt->builder, entry);
+  LLVMPositionBuilderAtEnd(ctxt->builder, entry);
+
+  /* Phi inputs: slot 0 is the success path, slots 1..len the mismatch exits */
+  LLVMBasicBlockRef *bbs_into_phi = h_new(LLVMBasicBlockRef, 1 + t->len);
+  LLVMValueRef *values_into_phi = h_new(LLVMValueRef, 1 + t->len);
+
+  /* Arguments reused for every h_read_bits() call in the loop below */
+  bits_args[0] = ctxt->stream;
+  bits_args[1] = LLVMConstInt(LLVMInt32Type(), 8, 0);
+  bits_args[2] = LLVMConstInt(LLVMInt8Type(), 0, 0);
+  /* Track the current basic block */
+  LLVMBasicBlockRef curr_char = entry;
+  for (i = 0; i < t->len; ++i) {
+    /* Read a char */
+    bits = LLVMBuildCall(ctxt->builder,
+        LLVMGetNamedFunction(ctxt->mod, "h_read_bits"), bits_args, 3, "read_bits");
+    /* Clamp to i8 */
+    r = LLVMBuildTrunc(ctxt->builder, bits, LLVMInt8Type(), "");
+    /* Compare against the expected byte t->str[i] */
+    c = LLVMConstInt(LLVMInt8Type(), t->str[i], 0);
+    snprintf(name, 64, "t->str[%d] == r", i);
+    icmp = LLVMBuildICmp(ctxt->builder, LLVMIntEQ, c, r, name);
+    /* Block to continue in when this byte matched */
+    snprintf(name, 64, "tok_matched_%d", i);
+    next_char = LLVMAppendBasicBlock(ctxt->func, name);
+    /* Matched: continue in next_char; otherwise jump straight to end */
+    LLVMBuildCondBr(ctxt->builder, icmp, next_char, end);
+    /* A mismatch leaving this block feeds a null result into the phi */
+    bbs_into_phi[1 + i] = curr_char;
+    values_into_phi[1 + i] = LLVMConstNull(ctxt->llvm_parseresultptr);
+    /* Start from next_char */
+    LLVMPositionBuilderAtEnd(ctxt->builder, next_char);
+    /* Update the current basic block */
+    curr_char = next_char;
+  }
+
+  /* If we got here, accept the token */
+  LLVMBuildBr(ctxt->builder, success);
+
+  /* Success block: make a token */
+  LLVMPositionBuilderAtEnd(ctxt->builder, success);
+  h_llvm_make_tt_bytes_fixed(ctxt, t->str, t->len, &mr);
+  /* Fill in our row in the phi tables */
+  bbs_into_phi[0] = success;
+  values_into_phi[0] = mr;
+  /* Branch to end so we can return the token */
+  LLVMBuildBr(ctxt->builder, end);
+
+  /* End block: return a token if we made one */
+  LLVMPositionBuilderAtEnd(ctxt->builder, end);
+  /* phi the token or a null depending on where we came from */
+  rv = LLVMBuildPhi(ctxt->builder, ctxt->llvm_parseresultptr, "rv");
+  LLVMAddIncoming(rv, values_into_phi, bbs_into_phi, 1 + t->len);
+  /* Free the stuff we allocated to build the phi */
+  h_free(bbs_into_phi);
+  h_free(values_into_phi);
+  /* Emit the return of the phi'd result */
+  LLVMBuildRet(ctxt->builder, rv);
+
+  return true;
+}
+
+#define TOKEN_LENGTH_USE_GLOBAL_CUTOFF 4
+
+/*
+ * LLVM code generation for a token parser; env is the parser's HToken.
+ * There are two code-generation strategies: emit a series of per-character
+ * read/test steps, or put the string in the LLVM module globals and compare
+ * in a loop.  Strings longer than the cutoff use the latter, all others
+ * (the empty string included) the former.
+ *
+ * XXX Like with charsets, we should also think about memoizing these
+ * for recurring strings.
+ */
+static bool token_llvm(HLLVMParserCompileContext *ctxt, void* env) {
+  HToken *tok = (HToken *)env;
+
+  /* Nothing can be emitted without a compile context */
+  if (!ctxt) {
+    return false;
+  }
+
+  /* A length above the (positive) cutoff is necessarily non-zero */
+  return tok->len > TOKEN_LENGTH_USE_GLOBAL_CUTOFF
+    ? token_llvm_with_global(ctxt, tok)
+    : token_llvm_with_sequential_comparisons(ctxt, tok);
+}
+
+#endif /* defined(HAMMER_LLVM_BACKEND) */
+
 const HParserVtable token_vt = {
   .parse = parse_token,
   .isValidRegular = h_true,
   .isValidCF = h_true,
   .desugar = desugar_token,
   .compile_to_rvm = token_ctrvm,
+#ifdef HAMMER_LLVM_BACKEND
+  .llvm = token_llvm, /* IR emitter, only built with the LLVM backend */
+#endif /* defined(HAMMER_LLVM_BACKEND) */
   .higher = false,
 };
 
diff --git a/src/t_parser.c b/src/t_parser.c
index 30e9fc77..69e9dd6a 100644
--- a/src/t_parser.c
+++ b/src/t_parser.c
@@ -1037,6 +1037,7 @@ void register_parser_tests(void) {
   g_test_add_data_func("/core/parser/glr/token_position", GINT_TO_POINTER(PB_GLR), test_token_position);
 
 #ifdef HAMMER_LLVM_BACKEND
+  g_test_add_data_func("/core/parser/llvm/token", GINT_TO_POINTER(PB_LLVM), test_token);
   g_test_add_data_func("/core/parser/llvm/ch", GINT_TO_POINTER(PB_LLVM), test_ch);
   g_test_add_data_func("/core/parser/llvm/ch_range", GINT_TO_POINTER(PB_LLVM), test_ch_range);
   g_test_add_data_func("/core/parser/llvm/int64", GINT_TO_POINTER(PB_LLVM), test_int64);
-- 
GitLab