Add a term parser.

author: Peter Mikkelsen <peter@pmikkelsen.com> 2021-06-29 15:51:04 +0000
committer: Peter Mikkelsen <peter@pmikkelsen.com> 2021-06-29 15:51:04 +0000
commit: 02145f06ac007d730bc16930185fe18fa3e76c68 (patch)
tree: d71744b876974a4fd3062daf9dc334a984782f80 /parser.c
parent: 0b36426d023e45d6acbc7672c7083a91d10913a8 (diff)
1 files changed, 556 insertions, 0 deletions
diff --git a/parser.c b/parser.c
index 0a5e9d4..459d048 100644
--- a/parser.c
+++ b/parser.c
@@ -1,5 +1,561 @@
 #include <u.h>
 #include <libc.h>
+#include <bio.h>
 
 #include "dat.h"
 #include "fns.h"
+
+#define PrecedenceLevels 1200
+
+/* The parser doesn't produce an ast for now, it only acts like a recognizer,
+   printing errors if it fails */
+
+typedef struct Token Token;
+typedef struct Operator Operator;
+typedef struct OpInfo OpInfo;
+struct Token
+{
+	int	tag;
+	Rune	*text;
+	double	dval;
+	vlong	ival;
+};
+
+struct Operator
+{
+	int type;
+	int level;
+	Rune *spelling;
+	Operator *next;
+};
+
+struct OpInfo
+{
+	int level;
+	int type;
+};
+
+enum {
+	Xf	= 1<<0, /* 1 */
+	Yf	= 1<<1, /* 2 */
+	Xfx	= 1<<2, /* 4 */
+	Xfy	= 1<<3, /* 8 */
+	Yfx	= 1<<4, /* 16 */
+	Fy	= 1<<5, /* 32 */
+	Fx	= 1<<6, /* 64 */
+};
+
+enum {
+	AtomTok			= 1<<0, /* 1 */
+	FunctorTok		= 1<<1, /* 2 */
+	VarTok			= 1<<2, /* 4 */
+	StringTok		= 1<<3, /* 8 */
+	IntTok			= 1<<4, /* 16 */
+	FloatTok		= 1<<5, /* 32 */
+	ParenLeftTok		= 1<<6, /* 64 */
+	ParenRightTok		= 1<<7, /* 128 */
+	SquareBracketLeftTok	= 1<<8, /* 256 */
+	SquareBracketRightTok	= 1<<9, /* 512 */
+	CurlyBracketLeftTok	= 1<<10, /* 1024 */
+	CurlyBracketRightTok	= 1<<11, /* 2048 */
+	CommaTok		= 1<<12, /* 4096 */
+	EofTok			= 1<<13, /* 8192 */
+};
+
+static Biobuf *parsein;
+static Token lookahead;
+static Operator *operators[PrecedenceLevels];
+
+void initoperators(void);
+void addoperator(int, int, Rune *);
+Operator *getoperator(Rune *);
+void nexttoken(void);
+Term *fullterm(int, Rune *, Term *);
+Term *term(void);
+Term *compound(void);
+Term *parseoperators(Term *);
+void match(int);
+void syntaxerror(char *);
+void prologtext(void);
+
+void
+parse(int fd)
+{
+	parsein = Bfdopen(fd, OREAD);
+	if(parsein == nil){
+		print("Could not open file\n");
+		return;
+	}
+	initoperators();
+	nexttoken();
+
+	prologtext();
+}
+
+void
+prologtext(void)
+{
+	if(lookahead.tag == EofTok)
+		return;
+
+	Term *t = fullterm(AtomTok, L".", nil);
+	if(t->tag != CompoundTerm || runestrcmp(t->text, L":-") != 0 || !(t->arity == 1 || t->arity == 2)){
+		print("Expected directive or clause as toplevel\n");
+		print("Got %S\n", prettyprint(t));
+		print("Tag=%d arity=%d text=\"%S\"\n", t->tag, t->arity, t->text);
+		syntaxerror("prologtext");
+	}
+
+	if(t->arity == 1){
+		/* A directive */
+		print("Got directive: %S\n", prettyprint(t));
+	}else{
+		/* A clause */
+		print("Got clause: %S\n", prettyprint(t));
+	}
+
+	if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0)
+		match(AtomTok);
+	else
+		syntaxerror("prologtext");
+
+	prologtext();
+}
+
+Term *
+fullterm(int stoptag, Rune *stoptext, Term *list)
+{
+	if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0))
+		return parseoperators(list);
+
+	Term *t = term();
+	list = appendterm(list, t);
+	return fullterm(stoptag, stoptext, list);
+}
+
+Term *
+term(void)
+{
+	Term *result;
+
+	switch(lookahead.tag){
+	case AtomTok:
+		result = mkatom(lookahead.text);
+		match(AtomTok);
+		break;
+	case FunctorTok:
+		result = compound();
+		break;
+	case VarTok:
+		result = mkvariable(lookahead.text);
+		match(VarTok);
+		break;
+	case IntTok:
+		result = mknumber(NumberInt, lookahead.ival, 0);
+		match(IntTok);
+		break;
+	case FloatTok:
+		result = mknumber(NumberFloat, 0, lookahead.dval);
+		match(FloatTok);
+		break;
+	default:
+		print("Cant parse term of token type %d\n", lookahead.tag);
+		syntaxerror("term");
+		result = nil;
+	}
+
+	return result;
+}
+
+Term *
+compound(void)
+{
+	Rune *name = lookahead.text;
+	Term *args = nil;
+	int arity = 0;
+
+	match(FunctorTok);
+	match(ParenLeftTok);
+
+	while(lookahead.tag != ParenRightTok){
+		Term *t = fullterm(CommaTok|ParenRightTok, nil, nil);
+		if(lookahead.tag == CommaTok)
+			match(CommaTok);
+		args = appendterm(args, t);
+		arity++;
+	}
+
+	match(ParenRightTok);
+	return mkcompound(name, arity, args);
+}
+
+Term *
+parseoperators(Term *list)
+{
+	int i;
+	int length = termslength(list);
+	Term *t;
+	Term **terms = malloc(sizeof(Term *) * length);
+	OpInfo *infos = malloc(sizeof(OpInfo) * length);
+
+	for(i = 0, t = list; i < length; i++){
+		Operator *op = getoperator(t->text);
+		if(op){
+			infos[i].type = op->type;
+			infos[i].level = op->level;
+		}else{
+			infos[i].type = 0;
+			infos[i].level = 0;
+		}
+		terms[i] = t;
+		Term *tmp = t;
+		t = t->next;
+		tmp->next = nil;
+	}
+
+	while(length > 1){
+		int index = -1;
+		int lowest = PrecedenceLevels + 1;
+		for(i = 0; i < length; i++){
+			if(infos[i].type == 0)
+				continue;
+			if(infos[i].level == lowest && infos[i].type == Xfy && i == index+1)
+				index = i;
+			else if(infos[i].level < lowest){
+				index = i;
+				lowest = infos[i].level;
+			}
+		}
+
+		if(index == -1){
+			print("Can't parse, list contains no operators");
+			syntaxerror("parseoperators");
+		}
+
+		int infixlevel = infos[index].level & (Xfx|Xfy|Yfx);
+		int prefixlevel = infos[index].level & (Fx|Fy);
+		int postfixlevel = infos[index].level & (Xf|Yf);
+
+		if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){
+			infos[index-1].type = 0;
+			infos[index-1].level = infos[index].level;
+			Term *args = terms[index-1];
+			args->next = terms[index+1];
+			terms[index-1] = mkcompound(terms[index]->text, 2, args);
+			length -= 2;
+			for(i = index; i < length; i++){
+				infos[i].level = infos[i+2].level;
+				infos[i].type = infos[i+2].type;
+				terms[i] = terms[i+2];
+			}
+		}else if(prefixlevel && index != length-1 && infos[index+1].type == 0){
+			infos[index].type = 0;
+			terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]);
+			length--;
+			for(i = index+1; i < length; i++){
+				infos[i].level = infos[i+1].level;
+				infos[i].type = infos[i+1].type;
+				terms[i] = terms[i+1];
+			}
+		}else if(postfixlevel && index != 0 && infos[index-1].type == 0){
+			infos[index-1].type = 0;
+			infos[index-1].level = infos[index].level;
+			terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]);
+			length--;
+			for(i = index; i < length; i++){
+				infos[i].level = infos[i+1].level;
+				infos[i].type = infos[i+1].type;
+				terms[i] = terms[i+1];
+			}
+		}else{
+			print("Parse error when parsing operators\n");
+			syntaxerror("parseoperators");
+		}
+	}
+
+	Term *result = terms[0];
+	free(infos);
+	free(terms);
+	return result;
+}
+
+void
+initoperators(void)
+{
+	Operator *op;
+	int i;
+	for(i = 0; i < PrecedenceLevels; i++){
+		while(operators[i]){
+			op = operators[i];
+			operators[i] = op->next;
+			free(op);
+		}
+	}
+
+	addoperator(1200, Xfx, L":-");
+	addoperator(1200, Fx,  L":-");
+	addoperator(1100, Xfy, L";");
+	addoperator(1000, Xfy, L",");
+	addoperator(700,  Xfx, L"=");
+	addoperator(700,  Xfx, L"is");
+	addoperator(500,  Yfx, L"+");
+	addoperator(400,  Yfx, L"*");
+	addoperator(400,  Yfx, L"/");
+}
+
+void
+addoperator(int level, int type, Rune *spelling)
+{
+	Operator *op = malloc(sizeof(Operator));
+	op->type = type;
+	op->level = level;
+	op->spelling = spelling;
+	op->next = operators[level-1];
+	operators[level-1] = op;
+}
+
+Operator *
+getoperator(Rune *spelling)
+{
+	Operator *op = nil;
+	int level;
+
+	if(spelling == nil)
+		return nil;
+
+	for(level = 0; level < PrecedenceLevels && op == nil; level++){
+		Operator *tmp;
+		for(tmp = operators[level]; tmp != nil; tmp = tmp->next){
+			if(runestrcmp(tmp->spelling, spelling) == 0){
+				if(op == nil){
+					op = malloc(sizeof(Operator));
+					memcpy(op, tmp, sizeof(Operator));
+				}else
+					op->type |= tmp->type;
+			}
+		}
+	}
+	return op;
+}
+
+void
+nexttoken(void)
+{
+	Rune buf[1024];
+	Rune peek;
+	int i = 0;
+	double numD;
+	vlong numI = 0;
+	int negative = 0;
+	static long replaypeek = -1;
+
+	if(replaypeek == -1)
+		peek = Bgetrune(parsein);
+	else{
+		peek = replaypeek;
+		replaypeek = -1;
+	}
+
+	/* Skip whitespace */
+	while(isspacerune(peek))
+		peek = Bgetrune(parsein);
+
+	/* Skip single line comments */
+	if(peek == L'%'){
+		while(peek != L'\n')
+			peek = Bgetrune(parsein);
+		peek = Bgetrune(parsein);
+	}
+
+	/* Variables */
+	if(peek == L'_' || isupperrune(peek)){
+		while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){
+			/* FIXME Should handle input length, also elsewhere in this function */
+			buf[i++] = peek;
+			peek = Bgetrune(parsein);
+		}
+		buf[i] = '\0';
+		Bungetrune(parsein);
+
+		lookahead.tag = VarTok;
+		lookahead.text = runestrdup(buf);
+		return;
+	}
+
+	/* Strings */
+	if(peek == L'"'){
+		peek = Bgetrune(parsein);
+		while(peek != L'"'){
+			buf[i++] = peek;
+			peek = Bgetrune(parsein);
+		}
+		buf[i] = '\0';
+		lookahead.tag = StringTok;
+		lookahead.text = runestrdup(buf);
+		return;
+	}
+
+	/* Numbers */
+	if(peek == L'-'){
+		peek = Bgetrune(parsein);
+		if(isdigitrune(peek))
+			negative = 1;
+		else{
+			Bungetrune(parsein);
+			peek = L'-';
+		}
+	}
+	if(isdigitrune(peek)){
+		if(peek == L'0'){
+			peek = Bgetrune(parsein);
+			switch(peek){
+			case L'o': /* Octal */
+			case L'x': /* Hexadecimal */
+			case L'b': /* Binary */
+				print("Cant lex numbers in other bases than base 10 right now\n");
+				exits("lexer");
+			}
+		}
+		while(isdigitrune(peek)){
+			numI *= 10;
+			numI += peek - L'0';
+			peek = Bgetrune(parsein);
+		}
+		if(peek == L'.'){
+			int place = 1;
+			numD = numI;
+			peek = Bgetrune(parsein);
+			if(!isdigitrune(peek)){
+				Bungetrune(parsein);
+				replaypeek = L'.';
+				goto Integer;
+			}
+			while(isdigitrune(peek)){
+				numD += (peek - L'0') / (10 * place);
+				peek = Bgetrune(parsein);
+			}
+			/* Should also lex 123.45E10 */
+			lookahead.tag = FloatTok;
+			lookahead.dval = negative ? -numD : numD;
+		}else{
+			Bungetrune(parsein);
+Integer:
+			lookahead.tag = IntTok;
+			lookahead.ival = negative ? -numI : numI;
+		}
+		return;
+	}
+
+	/* Atoms */
+	/* Atoms in single quotes */
+	if(peek == L'\''){
+		peek = Bgetrune(parsein);
+		while(peek != L'\''){
+			buf[i++] = peek;
+			peek = Bgetrune(parsein);
+		}
+		buf[i] = '\0';
+		lookahead.tag = AtomTok;
+		lookahead.text = runestrdup(buf);
+		goto Functor;
+	}
+
+	/* Special atom [] */
+	if(peek == L'['){
+		peek = Bgetrune(parsein);
+		if(peek == L']'){
+			lookahead.tag = AtomTok;
+			lookahead.text = runestrdup(L"[]");
+			goto Functor;
+		}else{
+			Bungetrune(parsein);
+			lookahead.tag = SquareBracketLeftTok;
+			return;
+		}
+	}
+
+	/* Special atom {} */
+	if(peek == L'{'){
+		peek = Bgetrune(parsein);
+		if(peek == L'}'){
+			lookahead.tag = AtomTok;
+			lookahead.text = runestrdup(L"{}");
+			goto Functor;
+		}else{
+			Bungetrune(parsein);
+			lookahead.tag = CurlyBracketLeftTok;
+			return;
+		}
+	}
+
+	/* Graphic atom */
+	Rune *graphics = L"#$&*+-./:<=>?@^~\\";
+	if(runestrchr(graphics, peek)){
+		while(runestrchr(graphics, peek)){
+			buf[i++] = peek;
+			peek = Bgetrune(parsein);
+		}
+		buf[i] = '\0';
+		Bungetrune(parsein);
+		lookahead.tag = AtomTok;
+		lookahead.text = runestrdup(buf);
+		goto Functor;
+	}
+
+	/* simple atom */
+	if(islowerrune(peek)){
+		while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){
+			buf[i++] = peek;
+			peek = Bgetrune(parsein);
+		}
+		buf[i] = '\0';
+		Bungetrune(parsein);
+		lookahead.tag = AtomTok;
+		lookahead.text = runestrdup(buf);
+		goto Functor;
+	}
+
+	/* Other */
+	if(runestrchr(L",.()]}", peek)){
+		switch(peek){
+		case L',': lookahead.tag = CommaTok; break;
+		case L'(': lookahead.tag = ParenLeftTok; break;
+		case L')': lookahead.tag = ParenRightTok; break;
+		case L']': lookahead.tag = SquareBracketRightTok; break;
+		case L'}': lookahead.tag = CurlyBracketRightTok; break;
+		}
+		return;
+	}
+
+	/* EOF */
+	if(peek == Beof){
+		lookahead.tag = EofTok;
+		return;
+	}
+
+	print("Could not lex rune %C (%ud)\n", peek, peek);
+	exits("lexer");
+
+Functor:
+	peek = Bgetrune(parsein);
+	if(peek == L'(')
+		lookahead.tag = FunctorTok;
+	Bungetrune(parsein);
+	return;
+}
+
+void
+match(int tag)
+{
+	if(lookahead.tag == tag)
+		nexttoken();
+	else
+		syntaxerror("match");
+}
+
+void
+syntaxerror(char *where)
+{
+	print("Syntax error: Unexpected %d token in %s\n", lookahead.tag, where);
+	exits("syntax error");
+}
+\ No newline at end of file
author	Peter Mikkelsen <peter@pmikkelsen.com>	2021-06-29 15:51:04 +0000
committer	Peter Mikkelsen <peter@pmikkelsen.com>	2021-06-29 15:51:04 +0000
commit	02145f06ac007d730bc16930185fe18fa3e76c68 (patch)
tree	d71744b876974a4fd3062daf9dc334a984782f80 /parser.c
parent	0b36426d023e45d6acbc7672c7083a91d10913a8 (diff)