diff options
author | Peter Mikkelsen <peter@pmikkelsen.com> | 2021-06-29 15:51:04 +0000 |
---|---|---|
committer | Peter Mikkelsen <peter@pmikkelsen.com> | 2021-06-29 15:51:04 +0000 |
commit | 02145f06ac007d730bc16930185fe18fa3e76c68 (patch) | |
tree | d71744b876974a4fd3062daf9dc334a984782f80 /parser.c | |
parent | 0b36426d023e45d6acbc7672c7083a91d10913a8 (diff) |
Add a term parser.
Diffstat (limited to 'parser.c')
-rw-r--r-- | parser.c | 556 |
1 files changed, 556 insertions, 0 deletions
@@ -1,5 +1,561 @@ #include <u.h> #include <libc.h> +#include <bio.h> #include "dat.h" #include "fns.h" + +#define PrecedenceLevels 1200 + +/* The parser doesn't produce an ast for now, it only acts like a recognizer, + printing errors if it fails */ + +typedef struct Token Token; +typedef struct Operator Operator; +typedef struct OpInfo OpInfo; +struct Token +{ + int tag; + Rune *text; + double dval; + vlong ival; +}; + +struct Operator +{ + int type; + int level; + Rune *spelling; + Operator *next; +}; + +struct OpInfo +{ + int level; + int type; +}; + +enum { + Xf = 1<<0, /* 1 */ + Yf = 1<<1, /* 2 */ + Xfx = 1<<2, /* 4 */ + Xfy = 1<<3, /* 8 */ + Yfx = 1<<4, /* 16 */ + Fy = 1<<5, /* 32 */ + Fx = 1<<6, /* 64 */ +}; + +enum { + AtomTok = 1<<0, /* 1 */ + FunctorTok = 1<<1, /* 2 */ + VarTok = 1<<2, /* 4 */ + StringTok = 1<<3, /* 8 */ + IntTok = 1<<4, /* 16 */ + FloatTok = 1<<5, /* 32 */ + ParenLeftTok = 1<<6, /* 64 */ + ParenRightTok = 1<<7, /* 128 */ + SquareBracketLeftTok = 1<<8, /* 256 */ + SquareBracketRightTok = 1<<9, /* 512 */ + CurlyBracketLeftTok = 1<<10, /* 1024 */ + CurlyBracketRightTok = 1<<11, /* 2048 */ + CommaTok = 1<<12, /* 4096 */ + EofTok = 1<<13, /* 8192 */ +}; + +static Biobuf *parsein; +static Token lookahead; +static Operator *operators[PrecedenceLevels]; + +void initoperators(void); +void addoperator(int, int, Rune *); +Operator *getoperator(Rune *); +void nexttoken(void); +Term *fullterm(int, Rune *, Term *); +Term *term(void); +Term *compound(void); +Term *parseoperators(Term *); +void match(int); +void syntaxerror(char *); +void prologtext(void); + +void +parse(int fd) +{ + parsein = Bfdopen(fd, OREAD); + if(parsein == nil){ + print("Could not open file\n"); + return; + } + initoperators(); + nexttoken(); + + prologtext(); +} + +void +prologtext(void) +{ + if(lookahead.tag == EofTok) + return; + + Term *t = fullterm(AtomTok, L".", nil); + if(t->tag != CompoundTerm || runestrcmp(t->text, L":-") != 0 || !(t->arity == 1 || t->arity == 2)){ + print("Expected directive or clause as toplevel\n"); + print("Got %S\n", prettyprint(t)); + print("Tag=%d arity=%d text=\"%S\"\n", t->tag, t->arity, t->text); + syntaxerror("prologtext"); + } + + if(t->arity == 1){ + /* A directive */ + print("Got directive: %S\n", prettyprint(t)); + }else{ + /* A clause */ + print("Got clause: %S\n", prettyprint(t)); + } + + if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0) + match(AtomTok); + else + syntaxerror("prologtext"); + + prologtext(); +} + +Term * +fullterm(int stoptag, Rune *stoptext, Term *list) +{ + if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0)) + return parseoperators(list); + + Term *t = term(); + list = appendterm(list, t); + return fullterm(stoptag, stoptext, list); +} + +Term * +term(void) +{ + Term *result; + + switch(lookahead.tag){ + case AtomTok: + result = mkatom(lookahead.text); + match(AtomTok); + break; + case FunctorTok: + result = compound(); + break; + case VarTok: + result = mkvariable(lookahead.text); + match(VarTok); + break; + case IntTok: + result = mknumber(NumberInt, lookahead.ival, 0); + match(IntTok); + break; + case FloatTok: + result = mknumber(NumberFloat, 0, lookahead.dval); + match(FloatTok); + break; + default: + print("Cant parse term of token type %d\n", lookahead.tag); + syntaxerror("term"); + result = nil; + } + + return result; +} + +Term * +compound(void) +{ + Rune *name = lookahead.text; + Term *args = nil; + int arity = 0; + + match(FunctorTok); + match(ParenLeftTok); + + while(lookahead.tag != ParenRightTok){ + Term *t = fullterm(CommaTok|ParenRightTok, nil, nil); + if(lookahead.tag == CommaTok) + match(CommaTok); + args = appendterm(args, t); + arity++; + } + + match(ParenRightTok); + return mkcompound(name, arity, args); +} + +Term * +parseoperators(Term *list) +{ + int i; + int length = termslength(list); + Term *t; + Term **terms = malloc(sizeof(Term *) * length); + OpInfo *infos = malloc(sizeof(OpInfo) * length); + + for(i = 0, t = list; i < length; i++){ + Operator *op = getoperator(t->text); + if(op){ + infos[i].type = op->type; + infos[i].level = op->level; + }else{ + infos[i].type = 0; + infos[i].level = 0; + } + terms[i] = t; + Term *tmp = t; + t = t->next; + tmp->next = nil; + } + + while(length > 1){ + int index = -1; + int lowest = PrecedenceLevels + 1; + for(i = 0; i < length; i++){ + if(infos[i].type == 0) + continue; + if(infos[i].level == lowest && infos[i].type == Xfy && i == index+1) + index = i; + else if(infos[i].level < lowest){ + index = i; + lowest = infos[i].level; + } + } + + if(index == -1){ + print("Can't parse, list contains no operators"); + syntaxerror("parseoperators"); + } + + int infixlevel = infos[index].level & (Xfx|Xfy|Yfx); + int prefixlevel = infos[index].level & (Fx|Fy); + int postfixlevel = infos[index].level & (Xf|Yf); + + if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){ + infos[index-1].type = 0; + infos[index-1].level = infos[index].level; + Term *args = terms[index-1]; + args->next = terms[index+1]; + terms[index-1] = mkcompound(terms[index]->text, 2, args); + length -= 2; + for(i = index; i < length; i++){ + infos[i].level = infos[i+2].level; + infos[i].type = infos[i+2].type; + terms[i] = terms[i+2]; + } + }else if(prefixlevel && index != length-1 && infos[index+1].type == 0){ + infos[index].type = 0; + terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]); + length--; + for(i = index+1; i < length; i++){ + infos[i].level = infos[i+1].level; + infos[i].type = infos[i+1].type; + terms[i] = terms[i+1]; + } + }else if(postfixlevel && index != 0 && infos[index-1].type == 0){ + infos[index-1].type = 0; + infos[index-1].level = infos[index].level; + terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]); + length--; + for(i = index; i < length; i++){ + infos[i].level = infos[i+1].level; + infos[i].type = infos[i+1].type; + terms[i] = terms[i+1]; + } + }else{ + print("Parse error when parsing operators\n"); + syntaxerror("parseoperators"); + } + } + + Term *result = terms[0]; + free(infos); + free(terms); + return result; +} + +void +initoperators(void) +{ + Operator *op; + int i; + for(i = 0; i < PrecedenceLevels; i++){ + while(operators[i]){ + op = operators[i]; + operators[i] = op->next; + free(op); + } + } + + addoperator(1200, Xfx, L":-"); + addoperator(1200, Fx, L":-"); + addoperator(1100, Xfy, L";"); + addoperator(1000, Xfy, L","); + addoperator(700, Xfx, L"="); + addoperator(700, Xfx, L"is"); + addoperator(500, Yfx, L"+"); + addoperator(400, Yfx, L"*"); + addoperator(400, Yfx, L"/"); +} + +void +addoperator(int level, int type, Rune *spelling) +{ + Operator *op = malloc(sizeof(Operator)); + op->type = type; + op->level = level; + op->spelling = spelling; + op->next = operators[level-1]; + operators[level-1] = op; +} + +Operator * +getoperator(Rune *spelling) +{ + Operator *op = nil; + int level; + + if(spelling == nil) + return nil; + + for(level = 0; level < PrecedenceLevels && op == nil; level++){ + Operator *tmp; + for(tmp = operators[level]; tmp != nil; tmp = tmp->next){ + if(runestrcmp(tmp->spelling, spelling) == 0){ + if(op == nil){ + op = malloc(sizeof(Operator)); + memcpy(op, tmp, sizeof(Operator)); + }else + op->type |= tmp->type; + } + } + } + return op; +} + +void +nexttoken(void) +{ + Rune buf[1024]; + Rune peek; + int i = 0; + double numD; + vlong numI = 0; + int negative = 0; + static long replaypeek = -1; + + if(replaypeek == -1) + peek = Bgetrune(parsein); + else{ + peek = replaypeek; + replaypeek = -1; + } + + /* Skip whitespace */ + while(isspacerune(peek)) + peek = Bgetrune(parsein); + + /* Skip single line comments */ + if(peek == L'%'){ + while(peek != L'\n') + peek = Bgetrune(parsein); + peek = Bgetrune(parsein); + } + + /* Variables */ + if(peek == L'_' || isupperrune(peek)){ + while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){ + /* FIXME Should handle input length, also elsewhere in this function */ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + + lookahead.tag = VarTok; + lookahead.text = runestrdup(buf); + return; + } + + /* Strings */ + if(peek == L'"'){ + peek = Bgetrune(parsein); + while(peek != L'"'){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + lookahead.tag = StringTok; + lookahead.text = runestrdup(buf); + return; + } + + /* Numbers */ + if(peek == L'-'){ + peek = Bgetrune(parsein); + if(isdigitrune(peek)) + negative = 1; + else{ + Bungetrune(parsein); + peek = L'-'; + } + } + if(isdigitrune(peek)){ + if(peek == L'0'){ + peek = Bgetrune(parsein); + switch(peek){ + case L'o': /* Octal */ + case L'x': /* Hexadecimal */ + case L'b': /* Binary */ + print("Cant lex numbers in other bases than base 10 right now\n"); + exits("lexer"); + } + } + while(isdigitrune(peek)){ + numI *= 10; + numI += peek - L'0'; + peek = Bgetrune(parsein); + } + if(peek == L'.'){ + int place = 1; + numD = numI; + peek = Bgetrune(parsein); + if(!isdigitrune(peek)){ + Bungetrune(parsein); + replaypeek = L'.'; + goto Integer; + } + while(isdigitrune(peek)){ + numD += (peek - L'0') / (10 * place); + peek = Bgetrune(parsein); + } + /* Should also lex 123.45E10 */ + lookahead.tag = FloatTok; + lookahead.dval = negative ? -numD : numD; + }else{ + Bungetrune(parsein); +Integer: + lookahead.tag = IntTok; + lookahead.ival = negative ? -numI : numI; + } + return; + } + + /* Atoms */ + /* Atoms in single quotes */ + if(peek == L'\''){ + peek = Bgetrune(parsein); + while(peek != L'\''){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* Special atom [] */ + if(peek == L'['){ + peek = Bgetrune(parsein); + if(peek == L']'){ + lookahead.tag = AtomTok; + lookahead.text = runestrdup(L"[]"); + goto Functor; + }else{ + Bungetrune(parsein); + lookahead.tag = SquareBracketLeftTok; + return; + } + } + + /* Special atom {} */ + if(peek == L'{'){ + peek = Bgetrune(parsein); + if(peek == L'}'){ + lookahead.tag = AtomTok; + lookahead.text = runestrdup(L"{}"); + goto Functor; + }else{ + Bungetrune(parsein); + lookahead.tag = CurlyBracketLeftTok; + return; + } + } + + /* Graphic atom */ + Rune *graphics = L"#$&*+-./:<=>?@^~\\"; + if(runestrchr(graphics, peek)){ + while(runestrchr(graphics, peek)){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* simple atom */ + if(islowerrune(peek)){ + while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* Other */ + if(runestrchr(L",.()]}", peek)){ + switch(peek){ + case L',': lookahead.tag = CommaTok; break; + case L'(': lookahead.tag = ParenLeftTok; break; + case L')': lookahead.tag = ParenRightTok; break; + case L']': lookahead.tag = SquareBracketRightTok; break; + case L'}': lookahead.tag = CurlyBracketRightTok; break; + } + return; + } + + /* EOF */ + if(peek == Beof){ + lookahead.tag = EofTok; + return; + } + + print("Could not lex rune %C (%ud)\n", peek, peek); + exits("lexer"); + +Functor: + peek = Bgetrune(parsein); + if(peek == L'(') + lookahead.tag = FunctorTok; + Bungetrune(parsein); + return; +} + +void +match(int tag) +{ + if(lookahead.tag == tag) + nexttoken(); + else + syntaxerror("match"); +} + +void +syntaxerror(char *where) +{ + print("Syntax error: Unexpected %d token in %s\n", lookahead.tag, where); + exits("syntax error"); +}
\ No newline at end of file |