summaryrefslogtreecommitdiff
path: root/parser.c
diff options
context:
space:
mode:
authorPeter Mikkelsen <peter@pmikkelsen.com>2021-06-29 15:51:04 +0000
committerPeter Mikkelsen <peter@pmikkelsen.com>2021-06-29 15:51:04 +0000
commit02145f06ac007d730bc16930185fe18fa3e76c68 (patch)
treed71744b876974a4fd3062daf9dc334a984782f80 /parser.c
parent0b36426d023e45d6acbc7672c7083a91d10913a8 (diff)
Add a term parser.
Diffstat (limited to 'parser.c')
-rw-r--r--parser.c556
1 files changed, 556 insertions, 0 deletions
diff --git a/parser.c b/parser.c
index 0a5e9d4..459d048 100644
--- a/parser.c
+++ b/parser.c
@@ -1,5 +1,561 @@
#include <u.h>
#include <libc.h>
+#include <bio.h>
#include "dat.h"
#include "fns.h"
+
+#define PrecedenceLevels 1200
+
+/* The parser doesn't produce an ast for now, it only acts like a recognizer,
+ printing errors if it fails */
+
+typedef struct Token Token;
+typedef struct Operator Operator;
+typedef struct OpInfo OpInfo;
+struct Token
+{
+ int tag;
+ Rune *text;
+ double dval;
+ vlong ival;
+};
+
+struct Operator
+{
+ int type;
+ int level;
+ Rune *spelling;
+ Operator *next;
+};
+
+struct OpInfo
+{
+ int level;
+ int type;
+};
+
+enum {
+ Xf = 1<<0, /* 1 */
+ Yf = 1<<1, /* 2 */
+ Xfx = 1<<2, /* 4 */
+ Xfy = 1<<3, /* 8 */
+ Yfx = 1<<4, /* 16 */
+ Fy = 1<<5, /* 32 */
+ Fx = 1<<6, /* 64 */
+};
+
+enum {
+ AtomTok = 1<<0, /* 1 */
+ FunctorTok = 1<<1, /* 2 */
+ VarTok = 1<<2, /* 4 */
+ StringTok = 1<<3, /* 8 */
+ IntTok = 1<<4, /* 16 */
+ FloatTok = 1<<5, /* 32 */
+ ParenLeftTok = 1<<6, /* 64 */
+ ParenRightTok = 1<<7, /* 128 */
+ SquareBracketLeftTok = 1<<8, /* 256 */
+ SquareBracketRightTok = 1<<9, /* 512 */
+ CurlyBracketLeftTok = 1<<10, /* 1024 */
+ CurlyBracketRightTok = 1<<11, /* 2048 */
+ CommaTok = 1<<12, /* 4096 */
+ EofTok = 1<<13, /* 8192 */
+};
+
+static Biobuf *parsein;
+static Token lookahead;
+static Operator *operators[PrecedenceLevels];
+
+void initoperators(void);
+void addoperator(int, int, Rune *);
+Operator *getoperator(Rune *);
+void nexttoken(void);
+Term *fullterm(int, Rune *, Term *);
+Term *term(void);
+Term *compound(void);
+Term *parseoperators(Term *);
+void match(int);
+void syntaxerror(char *);
+void prologtext(void);
+
+void
+parse(int fd)
+{
+ parsein = Bfdopen(fd, OREAD);
+ if(parsein == nil){
+ print("Could not open file\n");
+ return;
+ }
+ initoperators();
+ nexttoken();
+
+ prologtext();
+}
+
+void
+prologtext(void)
+{
+ if(lookahead.tag == EofTok)
+ return;
+
+ Term *t = fullterm(AtomTok, L".", nil);
+ if(t->tag != CompoundTerm || runestrcmp(t->text, L":-") != 0 || !(t->arity == 1 || t->arity == 2)){
+ print("Expected directive or clause as toplevel\n");
+ print("Got %S\n", prettyprint(t));
+ print("Tag=%d arity=%d text=\"%S\"\n", t->tag, t->arity, t->text);
+ syntaxerror("prologtext");
+ }
+
+ if(t->arity == 1){
+ /* A directive */
+ print("Got directive: %S\n", prettyprint(t));
+ }else{
+ /* A clause */
+ print("Got clause: %S\n", prettyprint(t));
+ }
+
+ if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0)
+ match(AtomTok);
+ else
+ syntaxerror("prologtext");
+
+ prologtext();
+}
+
+Term *
+fullterm(int stoptag, Rune *stoptext, Term *list)
+{
+ if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0))
+ return parseoperators(list);
+
+ Term *t = term();
+ list = appendterm(list, t);
+ return fullterm(stoptag, stoptext, list);
+}
+
+Term *
+term(void)
+{
+ Term *result;
+
+ switch(lookahead.tag){
+ case AtomTok:
+ result = mkatom(lookahead.text);
+ match(AtomTok);
+ break;
+ case FunctorTok:
+ result = compound();
+ break;
+ case VarTok:
+ result = mkvariable(lookahead.text);
+ match(VarTok);
+ break;
+ case IntTok:
+ result = mknumber(NumberInt, lookahead.ival, 0);
+ match(IntTok);
+ break;
+ case FloatTok:
+ result = mknumber(NumberFloat, 0, lookahead.dval);
+ match(FloatTok);
+ break;
+ default:
+ print("Cant parse term of token type %d\n", lookahead.tag);
+ syntaxerror("term");
+ result = nil;
+ }
+
+ return result;
+}
+
+Term *
+compound(void)
+{
+ Rune *name = lookahead.text;
+ Term *args = nil;
+ int arity = 0;
+
+ match(FunctorTok);
+ match(ParenLeftTok);
+
+ while(lookahead.tag != ParenRightTok){
+ Term *t = fullterm(CommaTok|ParenRightTok, nil, nil);
+ if(lookahead.tag == CommaTok)
+ match(CommaTok);
+ args = appendterm(args, t);
+ arity++;
+ }
+
+ match(ParenRightTok);
+ return mkcompound(name, arity, args);
+}
+
+Term *
+parseoperators(Term *list)
+{
+ int i;
+ int length = termslength(list);
+ Term *t;
+ Term **terms = malloc(sizeof(Term *) * length);
+ OpInfo *infos = malloc(sizeof(OpInfo) * length);
+
+ for(i = 0, t = list; i < length; i++){
+ Operator *op = getoperator(t->text);
+ if(op){
+ infos[i].type = op->type;
+ infos[i].level = op->level;
+ }else{
+ infos[i].type = 0;
+ infos[i].level = 0;
+ }
+ terms[i] = t;
+ Term *tmp = t;
+ t = t->next;
+ tmp->next = nil;
+ }
+
+ while(length > 1){
+ int index = -1;
+ int lowest = PrecedenceLevels + 1;
+ for(i = 0; i < length; i++){
+ if(infos[i].type == 0)
+ continue;
+ if(infos[i].level == lowest && infos[i].type == Xfy && i == index+1)
+ index = i;
+ else if(infos[i].level < lowest){
+ index = i;
+ lowest = infos[i].level;
+ }
+ }
+
+ if(index == -1){
+ print("Can't parse, list contains no operators");
+ syntaxerror("parseoperators");
+ }
+
+ int infixlevel = infos[index].level & (Xfx|Xfy|Yfx);
+ int prefixlevel = infos[index].level & (Fx|Fy);
+ int postfixlevel = infos[index].level & (Xf|Yf);
+
+ if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){
+ infos[index-1].type = 0;
+ infos[index-1].level = infos[index].level;
+ Term *args = terms[index-1];
+ args->next = terms[index+1];
+ terms[index-1] = mkcompound(terms[index]->text, 2, args);
+ length -= 2;
+ for(i = index; i < length; i++){
+ infos[i].level = infos[i+2].level;
+ infos[i].type = infos[i+2].type;
+ terms[i] = terms[i+2];
+ }
+ }else if(prefixlevel && index != length-1 && infos[index+1].type == 0){
+ infos[index].type = 0;
+ terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]);
+ length--;
+ for(i = index+1; i < length; i++){
+ infos[i].level = infos[i+1].level;
+ infos[i].type = infos[i+1].type;
+ terms[i] = terms[i+1];
+ }
+ }else if(postfixlevel && index != 0 && infos[index-1].type == 0){
+ infos[index-1].type = 0;
+ infos[index-1].level = infos[index].level;
+ terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]);
+ length--;
+ for(i = index; i < length; i++){
+ infos[i].level = infos[i+1].level;
+ infos[i].type = infos[i+1].type;
+ terms[i] = terms[i+1];
+ }
+ }else{
+ print("Parse error when parsing operators\n");
+ syntaxerror("parseoperators");
+ }
+ }
+
+ Term *result = terms[0];
+ free(infos);
+ free(terms);
+ return result;
+}
+
+void
+initoperators(void)
+{
+ Operator *op;
+ int i;
+ for(i = 0; i < PrecedenceLevels; i++){
+ while(operators[i]){
+ op = operators[i];
+ operators[i] = op->next;
+ free(op);
+ }
+ }
+
+ addoperator(1200, Xfx, L":-");
+ addoperator(1200, Fx, L":-");
+ addoperator(1100, Xfy, L";");
+ addoperator(1000, Xfy, L",");
+ addoperator(700, Xfx, L"=");
+ addoperator(700, Xfx, L"is");
+ addoperator(500, Yfx, L"+");
+ addoperator(400, Yfx, L"*");
+ addoperator(400, Yfx, L"/");
+}
+
+void
+addoperator(int level, int type, Rune *spelling)
+{
+ Operator *op = malloc(sizeof(Operator));
+ op->type = type;
+ op->level = level;
+ op->spelling = spelling;
+ op->next = operators[level-1];
+ operators[level-1] = op;
+}
+
+Operator *
+getoperator(Rune *spelling)
+{
+ Operator *op = nil;
+ int level;
+
+ if(spelling == nil)
+ return nil;
+
+ for(level = 0; level < PrecedenceLevels && op == nil; level++){
+ Operator *tmp;
+ for(tmp = operators[level]; tmp != nil; tmp = tmp->next){
+ if(runestrcmp(tmp->spelling, spelling) == 0){
+ if(op == nil){
+ op = malloc(sizeof(Operator));
+ memcpy(op, tmp, sizeof(Operator));
+ }else
+ op->type |= tmp->type;
+ }
+ }
+ }
+ return op;
+}
+
+void
+nexttoken(void)
+{
+ Rune buf[1024];
+ Rune peek;
+ int i = 0;
+ double numD;
+ vlong numI = 0;
+ int negative = 0;
+ static long replaypeek = -1;
+
+ if(replaypeek == -1)
+ peek = Bgetrune(parsein);
+ else{
+ peek = replaypeek;
+ replaypeek = -1;
+ }
+
+ /* Skip whitespace */
+ while(isspacerune(peek))
+ peek = Bgetrune(parsein);
+
+ /* Skip single line comments */
+ if(peek == L'%'){
+ while(peek != L'\n')
+ peek = Bgetrune(parsein);
+ peek = Bgetrune(parsein);
+ }
+
+ /* Variables */
+ if(peek == L'_' || isupperrune(peek)){
+ while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){
+ /* FIXME Should handle input length, also elsewhere in this function */
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+
+ lookahead.tag = VarTok;
+ lookahead.text = runestrdup(buf);
+ return;
+ }
+
+ /* Strings */
+ if(peek == L'"'){
+ peek = Bgetrune(parsein);
+ while(peek != L'"'){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ lookahead.tag = StringTok;
+ lookahead.text = runestrdup(buf);
+ return;
+ }
+
+ /* Numbers */
+ if(peek == L'-'){
+ peek = Bgetrune(parsein);
+ if(isdigitrune(peek))
+ negative = 1;
+ else{
+ Bungetrune(parsein);
+ peek = L'-';
+ }
+ }
+ if(isdigitrune(peek)){
+ if(peek == L'0'){
+ peek = Bgetrune(parsein);
+ switch(peek){
+ case L'o': /* Octal */
+ case L'x': /* Hexadecimal */
+ case L'b': /* Binary */
+ print("Cant lex numbers in other bases than base 10 right now\n");
+ exits("lexer");
+ }
+ }
+ while(isdigitrune(peek)){
+ numI *= 10;
+ numI += peek - L'0';
+ peek = Bgetrune(parsein);
+ }
+ if(peek == L'.'){
+ int place = 1;
+ numD = numI;
+ peek = Bgetrune(parsein);
+ if(!isdigitrune(peek)){
+ Bungetrune(parsein);
+ replaypeek = L'.';
+ goto Integer;
+ }
+ while(isdigitrune(peek)){
+ numD += (peek - L'0') / (10 * place);
+ peek = Bgetrune(parsein);
+ }
+ /* Should also lex 123.45E10 */
+ lookahead.tag = FloatTok;
+ lookahead.dval = negative ? -numD : numD;
+ }else{
+ Bungetrune(parsein);
+Integer:
+ lookahead.tag = IntTok;
+ lookahead.ival = negative ? -numI : numI;
+ }
+ return;
+ }
+
+ /* Atoms */
+ /* Atoms in single quotes */
+ if(peek == L'\''){
+ peek = Bgetrune(parsein);
+ while(peek != L'\''){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* Special atom [] */
+ if(peek == L'['){
+ peek = Bgetrune(parsein);
+ if(peek == L']'){
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(L"[]");
+ goto Functor;
+ }else{
+ Bungetrune(parsein);
+ lookahead.tag = SquareBracketLeftTok;
+ return;
+ }
+ }
+
+ /* Special atom {} */
+ if(peek == L'{'){
+ peek = Bgetrune(parsein);
+ if(peek == L'}'){
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(L"{}");
+ goto Functor;
+ }else{
+ Bungetrune(parsein);
+ lookahead.tag = CurlyBracketLeftTok;
+ return;
+ }
+ }
+
+ /* Graphic atom */
+ Rune *graphics = L"#$&*+-./:<=>?@^~\\";
+ if(runestrchr(graphics, peek)){
+ while(runestrchr(graphics, peek)){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* simple atom */
+ if(islowerrune(peek)){
+ while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* Other */
+ if(runestrchr(L",.()]}", peek)){
+ switch(peek){
+ case L',': lookahead.tag = CommaTok; break;
+ case L'(': lookahead.tag = ParenLeftTok; break;
+ case L')': lookahead.tag = ParenRightTok; break;
+ case L']': lookahead.tag = SquareBracketRightTok; break;
+ case L'}': lookahead.tag = CurlyBracketRightTok; break;
+ }
+ return;
+ }
+
+ /* EOF */
+ if(peek == Beof){
+ lookahead.tag = EofTok;
+ return;
+ }
+
+ print("Could not lex rune %C (%ud)\n", peek, peek);
+ exits("lexer");
+
+Functor:
+ peek = Bgetrune(parsein);
+ if(peek == L'(')
+ lookahead.tag = FunctorTok;
+ Bungetrune(parsein);
+ return;
+}
+
+void
+match(int tag)
+{
+ if(lookahead.tag == tag)
+ nexttoken();
+ else
+ syntaxerror("match");
+}
+
+void
+syntaxerror(char *where)
+{
+ print("Syntax error: Unexpected %d token in %s\n", lookahead.tag, where);
+ exits("syntax error");
+} \ No newline at end of file