Forked from
Hammer / hammer
1199 commits behind the upstream repository.
-
Sven M. Hallberg authored
cfgrammar.c 13.62 KiB
/* Context-free grammar representation and analysis */
#include "cfgrammar.h"
#include "allocator.h"
#include <assert.h>
#include <ctype.h>
HCFGrammar *h_cfgrammar_new(HAllocator *mm__)
{
HCFGrammar *g = h_new(HCFGrammar, 1);
assert(g != NULL);
g->mm__ = mm__;
g->arena = h_new_arena(mm__, 0); // default blocksize
g->nts = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
g->geneps = NULL;
g->first = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
g->follow = h_hashtable_new(g->arena, h_eq_ptr, h_hash_ptr);
return g;
}
/* Frees the given grammar and associated data.
* Does *not* free parsers' CFG forms as created by h_desugar.
*/
void h_cfgrammar_free(HCFGrammar *g)
{
HAllocator *mm__ = g->mm__;
h_delete_arena(g->arena);
h_free(g);
}
// helpers
static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol);
static void collect_geneps(HCFGrammar *grammar);
// XXX to be consolidated with glue.c when merged upstream
const HParsedToken *h_act_first(const HParseResult *p)
{
assert(p->ast);
assert(p->ast->token_type == TT_SEQUENCE);
assert(p->ast->seq->used > 0);
return p->ast->seq->elements[0];
}
/* Convert 'parser' into CFG representation by desugaring and compiling the set
* of nonterminals.
* A NULL return means we are unable to represent the parser as a CFG.
*/
HCFGrammar *h_cfgrammar(HAllocator* mm__, const HParser *parser)
{
// convert parser to CFG form ("desugar").
HCFChoice *desugared = h_desugar(mm__, parser);
if(desugared == NULL)
return NULL; // -> backend not suitable for this parser
HCFGrammar *g = h_cfgrammar_new(mm__);
// recursively traverse the desugared form and collect all HCFChoices that
// represent a nonterminal (type HCF_CHOICE or HCF_CHARSET).
collect_nts(g, desugared);
if(h_hashset_empty(g->nts)) {
// desugared is a terminal. wrap it in a singleton HCF_CHOICE.
HCFChoice *nt = h_new(HCFChoice, 1);
nt->type = HCF_CHOICE;
nt->seq = h_new(HCFSequence *, 2);
nt->seq[0] = h_new(HCFSequence, 1);
nt->seq[0]->items = h_new(HCFChoice *, 2);
nt->seq[0]->items[0] = desugared;
nt->seq[0]->items[1] = NULL;
nt->seq[1] = NULL;
nt->reshape = h_act_first;
h_hashset_put(g->nts, nt);
g->start = nt;
} else {
g->start = desugared;
}
// determine which nonterminals generate epsilon
collect_geneps(g);
return g;
}
/* Add all nonterminals reachable from symbol to grammar. */
static void collect_nts(HCFGrammar *grammar, HCFChoice *symbol)
{
HCFSequence **s; // for the rhs (sentential form) of a production
HCFChoice **x; // for a symbol in s
if(h_hashset_present(grammar->nts, symbol))
return; // already visited, get out
switch(symbol->type) {
case HCF_CHAR:
case HCF_END:
break; // it's a terminal symbol, nothing to do
case HCF_CHARSET:
break; // NB charsets are considered terminal, too
case HCF_CHOICE:
// exploiting the fact that HHashSet is also a HHashTable to number the
// nonterminals.
// NB top-level (start) symbol gets 0.
h_hashtable_put(grammar->nts, symbol,
(void *)(uintptr_t)grammar->nts->used);
// each element s of symbol->seq (HCFSequence) represents the RHS of
// a production. call self on all symbols (HCFChoice) in s.
for(s = symbol->seq; *s != NULL; s++) {
for(x = (*s)->items; *x != NULL; x++) {
collect_nts(grammar, *x);
}
}
break;
default: // should not be reachable
assert_message(0, "unknown HCFChoice type");
}
}
/* Does the given symbol derive the empty string (under g)? */
bool h_symbol_derives_epsilon(HCFGrammar *g, const HCFChoice *symbol)
{
assert(g->geneps != NULL);
switch(symbol->type) {
case HCF_END: // the end token doesn't count as empty
case HCF_CHAR:
case HCF_CHARSET:
return false;
default: // HCF_CHOICE
return h_hashset_present(g->geneps, symbol);
}
}
/* Does the sentential form s derive the empty string? s NULL-terminated. */
bool h_sequence_derives_epsilon(HCFGrammar *g, HCFChoice **s)
{
// return true iff all symbols in s derive epsilon
for(; *s; s++) {
if(!h_symbol_derives_epsilon(g, *s))
return false;
}
return true;
}
/* Populate the geneps member of g; no-op if called multiple times. */
static void collect_geneps(HCFGrammar *g)
{
if(g->geneps != NULL)
return;
g->geneps = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
assert(g->geneps != NULL);
// iterate over the grammar's symbols, the elements of g->nts.
// add any we can identify as deriving epsilon to g->geneps.
// repeat until g->geneps no longer changes.
size_t prevused;
do {
prevused = g->geneps->used;
size_t i;
HHashTableEntry *hte;
for(i=0; i < g->nts->capacity; i++) {
for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
if(hte->key == NULL)
continue;
const HCFChoice *symbol = hte->key;
assert(symbol->type == HCF_CHOICE);
// this NT derives epsilon if any of its productions does.
HCFSequence **p;
for(p = symbol->seq; *p != NULL; p++) {
if(h_sequence_derives_epsilon(g, (*p)->items)) {
h_hashset_put(g->geneps, symbol);
break;
}
}
}
}
} while(g->geneps->used != prevused);
}
/* Compute first set of sentential form s. s NULL-terminated. */
HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s);
/* Compute first set of symbol x. Memoized. */
HHashSet *h_first_symbol(HCFGrammar *g, const HCFChoice *x)
{
HHashSet *ret;
HCFSequence **p;
uint8_t c;
// memoize via g->first
assert(g->first != NULL);
ret = h_hashtable_get(g->first, x);
if(ret != NULL)
return ret;
ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
assert(ret != NULL);
h_hashtable_put(g->first, x, ret);
switch(x->type) {
case HCF_END:
h_hashset_put(ret, (void *)end_token);
break;
case HCF_CHAR:
h_hashset_put(ret, (void *)char_token(x->chr));
break;
case HCF_CHARSET:
c=0;
do {
if(charset_isset(x->charset, c))
h_hashset_put(ret, (void *)char_token(c));
} while(c++ < 255);
break;
case HCF_CHOICE:
// this is a nonterminal
// return the union of the first sets of all productions
for(p=x->seq; *p; ++p)
h_hashset_put_all(ret, h_first_sequence(g, (*p)->items));
break;
default: // should not be reached
assert_message(0, "unknown HCFChoice type");
}
return ret;
}
HHashSet *h_first_sequence(HCFGrammar *g, HCFChoice **s)
{
// the first set of the empty sequence is empty
if(*s == NULL)
return h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
// first(X tail) = first(X) if X does not derive epsilon
// = first(X) u first(tail) otherwise
HCFChoice *x = s[0];
HCFChoice **tail = s+1;
HHashSet *first_x = h_first_symbol(g, x);
if(h_symbol_derives_epsilon(g, x)) {
// return the union of first(x) and first(tail)
HHashSet *first_tail = h_first_sequence(g, tail);
HHashSet *ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
h_hashset_put_all(ret, first_x);
h_hashset_put_all(ret, first_tail);
return ret;
} else {
return first_x;
}
}
/* Compute follow set of symbol x. Memoized. */
HHashSet *h_follow(HCFGrammar *g, const HCFChoice *x)
{
// consider all occurances of X in g
// the follow set of X is the union of:
// {$} if X is the start symbol
// given a production "A -> alpha X tail":
// if tail derives epsilon:
// first(tail) u follow(A)
// else:
// first(tail)
HHashSet *ret;
// memoize via g->follow
assert(g->follow != NULL);
ret = h_hashtable_get(g->follow, x);
if(ret != NULL)
return ret;
ret = h_hashset_new(g->arena, h_eq_ptr, h_hash_ptr);
assert(ret != NULL);
h_hashtable_put(g->follow, x, ret);
// if X is the start symbol, the end token is in its follow set
if(x == g->start)
h_hashset_put(ret, (void *)end_token);
// iterate over g->nts
size_t i;
HHashTableEntry *hte;
for(i=0; i < g->nts->capacity; i++) {
for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
if(hte->key == NULL)
continue;
const HCFChoice *a = hte->key; // production's left-hand symbol
assert(a->type == HCF_CHOICE);
// iterate over the productions for A
HCFSequence **p;
for(p=a->seq; *p; p++) {
HCFChoice **s = (*p)->items; // production's right-hand side
for(; *s; s++) {
if(*s == x) { // occurance found
HCFChoice **tail = s+1;
h_hashset_put_all(ret, h_first_sequence(g, tail));
if(h_sequence_derives_epsilon(g, tail))
h_hashset_put_all(ret, h_follow(g, a));
}
}
}
}
}
return ret;
}
static void pprint_char(FILE *f, char c)
{
switch(c) {
case '"': fputs("\\\"", f); break;
case '\\': fputs("\\\\", f); break;
case '\b': fputs("\\b", f); break;
case '\t': fputs("\\t", f); break;
case '\n': fputs("\\n", f); break;
case '\r': fputs("\\r", f); break;
default:
if(isprint(c)) {
fputc(c, f);
} else {
fprintf(f, "\\x%.2X", c);
}
}
}
static void pprint_charset_char(FILE *f, char c)
{
switch(c) {
case '"': fputc(c, f); break;
case '-': fputs("\\-", f); break;
case ']': fputs("\\-", f); break;
default: pprint_char(f, c);
}
}
static void pprint_charset(FILE *f, const HCharset cs)
{
int i;
fputc('[', f);
for(i=0; i<256; i++) {
if(charset_isset(cs, i))
pprint_charset_char(f, i);
// detect ranges
if(i+2<256 && charset_isset(cs, i+1) && charset_isset(cs, i+2)) {
fputc('-', f);
for(; i<256 && charset_isset(cs, i); i++);
i--; // back to the last in range
pprint_charset_char(f, i);
}
}
fputc(']', f);
}
static const char *nonterminal_name(const HCFGrammar *g, const HCFChoice *nt)
{
static char buf[16] = {0}; // 14 characters in base 26 are enough for 64 bits
// find nt's number in g
size_t n = (uintptr_t)h_hashtable_get(g->nts, nt);
// NB the start symbol (number 0) is always "A".
int i;
for(i=14; i>=0 && (n>0 || i==14); i--) {
buf[i] = 'A' + n%26;
n = n/26; // shift one digit
}
return buf+i+1;
}
static HCFChoice **pprint_string(FILE *f, HCFChoice **x)
{
fputc('"', f);
for(; *x; x++) {
if((*x)->type != HCF_CHAR)
break;
pprint_char(f, (*x)->chr);
}
fputc('"', f);
return x;
}
static void pprint_symbol(FILE *f, const HCFGrammar *g, const HCFChoice *x)
{
switch(x->type) {
case HCF_CHAR:
fputc('"', f);
pprint_char(f, x->chr);
fputc('"', f);
break;
case HCF_END:
fputc('$', f);
break;
case HCF_CHARSET:
pprint_charset(f, x->charset);
default:
fputs(nonterminal_name(g, x), f);
}
}
static void pprint_sequence(FILE *f, const HCFGrammar *g, const HCFSequence *seq)
{
HCFChoice **x = seq->items;
if(*x == NULL) { // the empty sequence
fputs(" \"\"", f);
} else {
while(*x) {
fputc(' ', f); // separator
if((*x)->type == HCF_CHAR) {
// condense character strings
x = pprint_string(f, x);
} else {
pprint_symbol(f, g, *x);
x++;
}
}
}
fputc('\n', f);
}
static
void pprint_ntrules(FILE *f, const HCFGrammar *g, const HCFChoice *nt,
int indent, int len)
{
int i;
int column = indent + len;
const char *name = nonterminal_name(g, nt);
// print rule head (symbol name)
for(i=0; i<indent; i++) fputc(' ', f);
fputs(name, f);
i += strlen(name);
for(; i<column; i++) fputc(' ', f);
fputs(" ->", f);
assert(nt->type == HCF_CHOICE);
HCFSequence **p = nt->seq;
if(*p == NULL) return; // shouldn't happen
pprint_sequence(f, g, *p++); // print first production on the same line
for(; *p; p++) { // print the rest below with "or" bars
for(i=0; i<column; i++) fputc(' ', f); // indent
fputs(" |", f);
pprint_sequence(f, g, *p);
}
}
void h_pprint_grammar(FILE *file, const HCFGrammar *g, int indent)
{
if(g->nts->used < 1)
return;
// determine maximum string length of symbol names
int len;
size_t s;
for(len=1, s=26; s < g->nts->used; len++, s*=26);
// iterate over g->nts
size_t i;
HHashTableEntry *hte;
for(i=0; i < g->nts->capacity; i++) {
for(hte = &g->nts->contents[i]; hte; hte = hte->next) {
if(hte->key == NULL)
continue;
const HCFChoice *a = hte->key; // production's left-hand symbol
assert(a->type == HCF_CHOICE);
pprint_ntrules(file, g, a, indent, len);
}
}
}
void h_pprint_symbolset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
{
int j;
for(j=0; j<indent; j++) fputc(' ', file);
fputc('{', file);
// iterate over set
size_t i;
HHashTableEntry *hte;
const HCFChoice *a = NULL;
for(i=0; i < set->capacity; i++) {
for(hte = &set->contents[i]; hte; hte = hte->next) {
if(hte->key == NULL)
continue;
if(a != NULL) // we're not on the first element
fputc(',', file);
a = hte->key; // production's left-hand symbol
pprint_symbol(file, g, a);
}
}
fputs("}\n", file);
}
void h_pprint_tokenset(FILE *file, const HCFGrammar *g, const HHashSet *set, int indent)
{
int j;
for(j=0; j<indent; j++) fputc(' ', file);
fputc('[', file);
// iterate over set
size_t i;
HHashTableEntry *hte;
for(i=0; i < set->capacity; i++) {
for(hte = &set->contents[i]; hte; hte = hte->next) {
if(hte->key == NULL)
continue;
HCFToken a = (HCFToken)hte->key;
if(a == end_token)
fputc('$', file);
else if(token_char(a) == '$')
fputs("\\$", file);
else
pprint_char(file, token_char(a));
}
}
fputs("]\n", file);
}