Skip to content
Snippets Groups Projects
Forked from Hammer / hammer
Up to date with the upstream repository.
lr.c 17.49 KiB
#include <assert.h>
#include <ctype.h>
#include "../parsers/parser_internal.h"
#include "lr.h"



/* Comparison and hashing functions */

// compare symbols - terminals by value, others by pointer
bool h_eq_symbol(const void *p, const void *q)
{
  const HCFChoice *x=p, *y=q;
  return (x==y
          || (x->type==HCF_END && y->type==HCF_END)
          || (x->type==HCF_CHAR && y->type==HCF_CHAR && x->chr==y->chr));
}

// hash symbols - terminals by value, others by pointer
HHashValue h_hash_symbol(const void *p)
{
  const HCFChoice *x=p;
  if(x->type == HCF_END)
    return 0;
  else if(x->type == HCF_CHAR)
    return x->chr * 33;
  else
    return h_hash_ptr(p);
}

// compare LR items by value
static bool eq_lr_item(const void *p, const void *q)
{
  const HLRItem *a=p, *b=q;

  if(!h_eq_symbol(a->lhs, b->lhs)) return false;
  if(a->mark != b->mark) return false;
  if(a->len != b->len) return false;

  for(size_t i=0; i<a->len; i++)
    if(!h_eq_symbol(a->rhs[i], b->rhs[i])) return false;

  return true;
}

// hash LALR items
static inline HHashValue hash_lr_item(const void *p)
{
  const HLRItem *x = p;
  HHashValue hash = 0;

  hash += h_hash_symbol(x->lhs);
  for(HCFChoice **p=x->rhs; *p; p++)
    hash += h_hash_symbol(*p);
  hash += x->mark;

  return hash;
}

// compare item sets (DFA states)
bool h_eq_lr_itemset(const void *p, const void *q)
{
  return h_hashset_equal(p, q);
}

// hash LR item sets (DFA states) - hash the elements and sum
HHashValue h_hash_lr_itemset(const void *p)
{
  HHashValue hash = 0;
  H_FOREACH_KEY((const HHashSet *)p, HLRItem *item)
    hash += hash_lr_item(item);
  H_END_FOREACH

  return hash;
}

bool h_eq_transition(const void *p, const void *q)
{
  const HLRTransition *a=p, *b=q;
  return (a->from == b->from && a->to == b->to && h_eq_symbol(a->symbol, b->symbol));
}

HHashValue h_hash_transition(const void *p)
{
  const HLRTransition *t = p;
  return (h_hash_symbol(t->symbol) + t->from + t->to); // XXX ?
}



/* Constructors */

HLRItem *h_lritem_new(HArena *a, HCFChoice *lhs, HCFChoice **rhs, size_t mark)
{
  HLRItem *ret = h_arena_malloc(a, sizeof(HLRItem));

  size_t len = 0;
  for(HCFChoice **p=rhs; *p; p++) len++;
  assert(mark <= len);

  ret->lhs = lhs;
  ret->rhs = rhs;
  ret->len = len;
  ret->mark = mark;

  return ret;
}

HLRState *h_lrstate_new(HArena *arena)
{
  return h_hashset_new(arena, eq_lr_item, hash_lr_item);
}

HLRTable *h_lrtable_new(HAllocator *mm__, size_t nrows)
{
  HArena *arena = h_new_arena(mm__, 0);    // default blocksize
  assert(arena != NULL);

  HLRTable *ret = h_new(HLRTable, 1);
  ret->nrows = nrows;
  ret->ntmap = h_arena_malloc(arena, nrows * sizeof(HHashTable *));
  ret->tmap = h_arena_malloc(arena, nrows * sizeof(HStringMap *));
  ret->forall = h_arena_malloc(arena, nrows * sizeof(HLRAction *));
  ret->inadeq = h_slist_new(arena);
  ret->arena = arena;
  ret->mm__ = mm__;

  for(size_t i=0; i<nrows; i++) {
    ret->ntmap[i] = h_hashtable_new(arena, h_eq_symbol, h_hash_symbol);
    ret->tmap[i] = h_stringmap_new(arena);
    ret->forall[i] = NULL;
  }

  return ret;
}

void h_lrtable_free(HLRTable *table)
{
  HAllocator *mm__ = table->mm__;
  h_delete_arena(table->arena);
  h_free(table);
}

HLRAction *h_shift_action(HArena *arena, size_t nextstate)
{
  HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction));
  action->type = HLR_SHIFT;
  action->nextstate = nextstate;
  return action;
}

HLRAction *h_reduce_action(HArena *arena, const HLRItem *item)
{
  HLRAction *action = h_arena_malloc(arena, sizeof(HLRAction));
  action->type = HLR_REDUCE;
  action->production.lhs = item->lhs;
  action->production.length = item->len;
#ifndef NDEBUG
  action->production.rhs = item->rhs;
#endif
  return action;
}

// adds 'new' to the branches of 'action'
// returns 'action' if it is already of type HLR_CONFLICT
// allocates a new HLRAction otherwise
HLRAction *h_lr_conflict(HArena *arena, HLRAction *action, HLRAction *new)
{
  if(action->type != HLR_CONFLICT) {
    HLRAction *old = action;
    action = h_arena_malloc(arena, sizeof(HLRAction));
    action->type = HLR_CONFLICT;
    action->branches = h_slist_new(arena);
    h_slist_push(action->branches, old);
    h_slist_push(action->branches, new);
  } else {
    // check if 'new' is already among branches
    HSlistNode *x;
    for(x=action->branches->head; x; x=x->next) {
      if(x->elem == new)
        break;
    }
    // add 'new' if it is not already in list
    if(x == NULL)
      h_slist_push(action->branches, new);
  }

  return action;
}

bool h_lrtable_row_empty(const HLRTable *table, size_t i)
{
  return (h_hashtable_empty(table->ntmap[i])
          && h_stringmap_empty(table->tmap[i]));
}



/* LR driver */

static
HLREngine *h_lrengine_new_(HArena *arena, HArena *tarena, const HLRTable *table)
{
  HLREngine *engine = h_arena_malloc(tarena, sizeof(HLREngine));

  engine->table = table;
  engine->state = 0;
  engine->stack = h_slist_new(tarena);
  engine->merged[0] = NULL;
  engine->merged[1] = NULL;
  engine->arena = arena;
  engine->tarena = tarena;

  return engine;
}

HLREngine *h_lrengine_new(HArena *arena, HArena *tarena, const HLRTable *table,
                          const HInputStream *stream)
{
  HLREngine *engine = h_lrengine_new_(arena, tarena, table);
  engine->input = *stream;
  return engine;
}

static const HLRAction *
terminal_lookup(const HLREngine *engine, const HInputStream *stream)
{
  const HLRTable *table = engine->table;
  size_t state = engine->state;

  assert(state < table->nrows);
  if(table->forall[state]) {
    assert(h_lrtable_row_empty(table, state));  // that would be a conflict
    return table->forall[state];
  } else {
    return h_stringmap_get_lookahead(table->tmap[state], *stream);
  }
}

static const HLRAction *
nonterminal_lookup(const HLREngine *engine, const HCFChoice *symbol)
{
  const HLRTable *table = engine->table;
  size_t state = engine->state;

  assert(state < table->nrows);
  assert(!table->forall[state]);    // contains only reduce entries
                                    // we are only looking for shifts
  return h_hashtable_get(table->ntmap[state], symbol);
}

const HLRAction *h_lrengine_action(const HLREngine *engine)
{
  return terminal_lookup(engine, &engine->input);
}

static HParsedToken *consume_input(HLREngine *engine)
{
  HParsedToken *v;

  uint8_t c = h_read_bits(&engine->input, 8, false);

  if(engine->input.overrun) {     // end of input
    v = NULL;
  } else {
    v = h_arena_malloc(engine->arena, sizeof(HParsedToken));
    v->token_type = TT_UINT;
    v->uint = c;
    v->index = engine->input.pos + engine->input.index - 1;
    v->bit_offset = engine->input.bit_offset;
  }

  return v;
}

// run LR parser for one round; returns false when finished
bool h_lrengine_step(HLREngine *engine, const HLRAction *action)
{
  // short-hand names
  HSlist *stack = engine->stack;
  HArena *arena = engine->arena;
  HArena *tarena = engine->tarena;

  if(action == NULL)
    return false;   // no handle recognizable in input, terminate

  assert(action->type == HLR_SHIFT || action->type == HLR_REDUCE);

  if(action->type == HLR_REDUCE) {
    size_t len = action->production.length;
    HCFChoice *symbol = action->production.lhs;

    // semantic value of the reduction result
    HParsedToken *value = h_arena_malloc(arena, sizeof(HParsedToken));
    value->token_type = TT_SEQUENCE;
    value->seq = h_carray_new_sized(arena, len);
    
    // pull values off the stack, rewinding state accordingly
    HParsedToken *v = NULL;
    for(size_t i=0; i<len; i++) {
      v = h_slist_drop(stack);
      engine->state = (uintptr_t)h_slist_drop(stack);

      // collect values in result sequence
      value->seq->elements[len-1-i] = v;
      value->seq->used++;
    }
    if(v) {
      // result position equals position of left-most symbol
      value->index = v->index;
      value->bit_offset = v->bit_offset;
    } else {
      // result position is current input position  XXX ?
      value->index = engine->input.pos + engine->input.index;
      value->bit_offset = engine->input.bit_offset;
    }

    // perform token reshape if indicated
    if(symbol->reshape) {
      v = symbol->reshape(make_result(arena, value), symbol->user_data);
      if(v) {
        v->index = value->index;
        v->bit_offset = value->bit_offset;
      } else {
        h_arena_free(arena, value);
      }
      value = v;
    }

    // call validation and semantic action, if present
    if(symbol->pred && !symbol->pred(make_result(tarena, value), symbol->user_data))
      return false;     // validation failed -> no parse; terminate
    if(symbol->action)
      value = symbol->action(make_result(arena, value), symbol->user_data);

    // this is LR, building a right-most derivation bottom-up, so no reduce can
    // follow a reduce. we can also assume no conflict follows for GLR if we
    // use LALR tables, because only terminal symbols (lookahead) get reduces.
    const HLRAction *shift = nonterminal_lookup(engine, symbol);
    if(shift == NULL)
      return false;     // parse error
    assert(shift->type == HLR_SHIFT);

    // piggy-back the shift right here, never touching the input
    h_slist_push(stack, (void *)(uintptr_t)engine->state);
    h_slist_push(stack, value);
    engine->state = shift->nextstate;

    // check for success
    if(engine->state == HLR_SUCCESS) {
      assert(symbol == engine->table->start);
      return false;
    }
  } else {
    assert(action->type == HLR_SHIFT);
    HParsedToken *value = consume_input(engine);
    h_slist_push(stack, (void *)(uintptr_t)engine->state);
    h_slist_push(stack, value);
    engine->state = action->nextstate;
  }

  return true;
}

HParseResult *h_lrengine_result(HLREngine *engine)
{
  // parsing was successful iff the engine reaches the end state
  if(engine->state == HLR_SUCCESS) {
    // on top of the stack is the start symbol's semantic value
    assert(!h_slist_empty(engine->stack));
    HParsedToken *tok = engine->stack->head->elem;
    HParseResult *res =  make_result(engine->arena, tok);
    res->bit_length = (engine->input.pos + engine->input.index) * 8;
    return res;
  } else {
    return NULL;
  }
}

HParseResult *h_lr_parse(HAllocator* mm__, const HParser* parser, HInputStream* stream)
{
  HLRTable *table = parser->backend_data;
  if(!table)
    return NULL;

  HArena *arena  = h_new_arena(mm__, 0);    // will hold the results
  HArena *tarena = h_new_arena(mm__, 0);    // tmp, deleted after parse
  HLREngine *engine = h_lrengine_new(arena, tarena, table, stream);

  // out-of-memory handling
  jmp_buf except;
  h_arena_set_except(arena, &except);
  h_arena_set_except(tarena, &except);
  if(setjmp(except)) {
    h_delete_arena(arena);
    h_delete_arena(tarena);
    return NULL;
  }

  // iterate engine to completion
  while(h_lrengine_step(engine, h_lrengine_action(engine)));

  HParseResult *result = h_lrengine_result(engine);
  if(!result)
    h_delete_arena(arena);
  h_delete_arena(tarena);
  return result;
}

void h_lr_parse_start(HSuspendedParser *s)
{
  HLRTable *table = s->parser->backend_data;
  assert(table != NULL);

  HArena *arena  = h_new_arena(s->mm__, 0); // will hold the results
  HArena *tarena = h_new_arena(s->mm__, 0); // tmp, deleted after parse
  HLREngine *engine = h_lrengine_new_(arena, tarena, table);

  s->backend_state = engine;
}

// cf. comment before run_trace in regex.c
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
#pragma GCC diagnostic ignored "-Wclobbered"
#endif
bool h_lr_parse_chunk(HSuspendedParser* s, HInputStream *stream)
{
  HLREngine *engine = s->backend_state;
  engine->input = *stream;

  bool run = true;

  // out-of-memory handling
  jmp_buf except;
  h_arena_set_except(engine->arena, &except);
  h_arena_set_except(engine->tarena, &except);
  if(setjmp(except)) {
    run = false;                            // done immediately
    assert(engine->state != HLR_SUCCESS);   // h_parse_finish will return NULL
  }

  while(run) {
    // check input against table to determine which action to take
    const HLRAction *action = h_lrengine_action(engine);
    if(action == NEED_INPUT) {
      // XXX assume lookahead 1
      assert(engine->input.length - engine->input.index == 0);
      break;
    }

    // execute action
    run = h_lrengine_step(engine, action);
    if(engine->input.overrun && !engine->input.last_chunk)
      break;
  }

  h_arena_set_except(engine->arena, NULL);
  h_arena_set_except(engine->tarena, NULL);

  *stream = engine->input;
  return !run;  // done if engine no longer running
}
// Reenable -Wclobber
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

HParseResult *h_lr_parse_finish(HSuspendedParser *s)
{
  HLREngine *engine = s->backend_state;

  HParseResult *result = h_lrengine_result(engine);
  if(!result)
    h_delete_arena(engine->arena);
  h_delete_arena(engine->tarena);
  return result;
}

/* Pretty-printers */

void h_pprint_lritem(FILE *f, const HCFGrammar *g, const HLRItem *item)
{
  h_pprint_symbol(f, g, item->lhs);
  fputs(" ->", f);

  HCFChoice **x = item->rhs;
  HCFChoice **mark = item->rhs + item->mark;
  if(*x == NULL) {
    fputc('.', f);
  } else {
    while(*x) {
      if(x == mark)
        fputc('.', f);
      else
        fputc(' ', f);

      if((*x)->type == HCF_CHAR) {
        // condense character strings
        fputc('"', f);
        h_pprint_char(f, (*x)->chr);
        for(x++; *x; x++) {
          if(x == mark)
            break;
          if((*x)->type != HCF_CHAR)
            break;
          h_pprint_char(f, (*x)->chr);
        }
        fputc('"', f);
      } else {
        h_pprint_symbol(f, g, *x);
        x++;
      }
    }
    if(x == mark)
      fputs(".", f);
  }
}

void h_pprint_lrstate(FILE *f, const HCFGrammar *g,
                      const HLRState *state, unsigned int indent)
{
  bool first = true;
  H_FOREACH_KEY(state, HLRItem *item)
    if(!first)
      for(unsigned int i=0; i<indent; i++) fputc(' ', f);
    first = false;
    h_pprint_lritem(f, g, item);
    fputc('\n', f);
  H_END_FOREACH
}

static void pprint_transition(FILE *f, const HCFGrammar *g, const HLRTransition *t)
{
  fputs("-", f);
  h_pprint_symbol(f, g, t->symbol);
  fprintf(f, "->%zu", t->to);
}

void h_pprint_lrdfa(FILE *f, const HCFGrammar *g,
                    const HLRDFA *dfa, unsigned int indent)
{
  for(size_t i=0; i<dfa->nstates; i++) {
    unsigned int indent2 = indent + fprintf(f, "%4zu: ", i);
    h_pprint_lrstate(f, g, dfa->states[i], indent2);
    for(HSlistNode *x = dfa->transitions->head; x; x = x->next) {
      const HLRTransition *t = x->elem;
      if(t->from == i) {
        for(unsigned int i=0; i<indent2-2; i++) fputc(' ', f);
        pprint_transition(f, g, t);
        fputc('\n', f);
      }
    }
  }
}

void pprint_lraction(FILE *f, const HCFGrammar *g, const HLRAction *action)
{
  switch(action->type) {
  case HLR_SHIFT:
    if(action->nextstate == HLR_SUCCESS)
      fputs("s~", f);
    else
      fprintf(f, "s%zu", action->nextstate);
    break;
  case HLR_REDUCE:
    fputs("r(", f);
    h_pprint_symbol(f, g, action->production.lhs);
    fputs(" -> ", f);
#ifdef NDEBUG
    // if we can't print the production, at least print its length
    fprintf(f, "[%zu]", action->production.length);
#else
    HCFSequence seq = {action->production.rhs};
    h_pprint_sequence(f, g, &seq);
#endif
    fputc(')', f);
    break;
  case HLR_CONFLICT:
    fputc('!', f);
    for(HSlistNode *x=action->branches->head; x; x=x->next) {
      HLRAction *branch = x->elem;
      assert(branch->type != HLR_CONFLICT); // no nesting
      pprint_lraction(f, g, branch);
      if(x->next) fputc('/', f);            // separator
    }
    break;
  default:
    assert_message(0, "not reached");
  }
}

static void valprint_lraction(FILE *file, void *env, void *val)
{
  const HLRAction *action = val;
  const HCFGrammar *grammar = env;
  pprint_lraction(file, grammar, action);
}

static void pprint_lrtable_terminals(FILE *file, const HCFGrammar *g,
                                     const HStringMap *map)
{
  h_pprint_stringmap(file, ' ', valprint_lraction, (void *)g, map);
}

void h_pprint_lrtable(FILE *f, const HCFGrammar *g, const HLRTable *table,
                      unsigned int indent)
{
  for(size_t i=0; i<table->nrows; i++) {
    for(unsigned int j=0; j<indent; j++) fputc(' ', f);
    fprintf(f, "%4zu:", i);
    if(table->forall[i]) {
      fputc(' ', f);
      pprint_lraction(f, g, table->forall[i]);
      if(!h_lrtable_row_empty(table, i))
        fputs(" !!", f);
    }
    H_FOREACH(table->ntmap[i], HCFChoice *symbol, HLRAction *action)
      fputc(' ', f);    // separator
      h_pprint_symbol(f, g, symbol);
      fputc(':', f);
      pprint_lraction(f, g, action);
    H_END_FOREACH
    fputc(' ', f);    // separator
    pprint_lrtable_terminals(f, g, table->tmap[i]);
    fputc('\n', f);
  }
#if 0
  fputs("inadeq=", f);
  for(HSlistNode *x=table->inadeq->head; x; x=x->next) {
    fprintf(f, "%zu ", (uintptr_t)x->elem);
  }
  fputc('\n', f);
#endif
}

HCFGrammar *h_pprint_lr_info(FILE *f, HParser *p)
{
  HAllocator *mm__ = &system_allocator;

  fprintf(f, "\n==== G R A M M A R ====\n");
  HCFGrammar *g = h_cfgrammar_(mm__, h_desugar_augmented(mm__, p));
  if (g == NULL) {
    fprintf(f, "h_cfgrammar failed\n");
    return NULL;
  }
  h_pprint_grammar(f, g, 0);

  fprintf(f, "\n==== D F A ====\n");
  HLRDFA *dfa = h_lr0_dfa(g);
  if (dfa) {
    h_pprint_lrdfa(f, g, dfa, 0);
  } else {
    fprintf(f, "h_lalr_dfa failed\n");
  }

  fprintf(f, "\n==== L R ( 0 )  T A B L E ====\n");
  HLRTable *table0 = h_lr0_table(g, dfa);
  if (table0) {
    h_pprint_lrtable(f, g, table0, 0);
  } else {
    fprintf(f, "h_lr0_table failed\n");
  }
  h_lrtable_free(table0);

  return g;
}