From 02145f06ac007d730bc16930185fe18fa3e76c68 Mon Sep 17 00:00:00 2001 From: Peter Mikkelsen Date: Tue, 29 Jun 2021 15:51:04 +0000 Subject: Add a term parser. --- dat.h | 27 +++ example.pl | 5 + fns.h | 13 ++ main.c | 12 ++ misc.c | 72 ++++++++ mkfile | 2 +- parser.c | 556 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ prettyprint.c | 61 +++++++ 8 files changed, 747 insertions(+), 1 deletion(-) create mode 100644 example.pl create mode 100644 misc.c create mode 100644 prettyprint.c diff --git a/dat.h b/dat.h index d6a72bb..7c7c0a9 100644 --- a/dat.h +++ b/dat.h @@ -1 +1,28 @@ +typedef struct Term Term; +struct Term +{ + int tag; + + Rune *text; + int arity; + Term *next; + Term *children; + int numbertype; + vlong ival; + double dval; +}; + +enum { + CompoundTerm, + AtomTerm, + VariableTerm, + NumberTerm, + StringTerm, +}; + +enum { + NumberInt, + NumberFloat, +}; + int debug; \ No newline at end of file diff --git a/example.pl b/example.pl new file mode 100644 index 0000000..b0c6ebe --- /dev/null +++ b/example.pl @@ -0,0 +1,5 @@ +:- dynamic(math/4). + +math(A,B,C,D) :- D is A + B + C * A. + +true :- 1 = 1. diff --git a/fns.h b/fns.h index e69de29..6878a24 100644 --- a/fns.h +++ b/fns.h @@ -0,0 +1,13 @@ +/* parser.c */ +void parse(int); + +/* prettyprint.c */ +Rune *prettyprint(Term *); + +/* misc.c */ +Term *appendterm(Term *, Term *); +int termslength(Term *); +Term *mkatom(Rune *); +Term *mkvariable(Rune *); +Term *mkcompound(Rune *, int, Term *); +Term *mknumber(int, vlong, double); \ No newline at end of file diff --git a/main.c b/main.c index ad862a2..504b02d 100644 --- a/main.c +++ b/main.c @@ -9,10 +9,15 @@ void usage(void); void main(int argc, char *argv[]) { + char *parsetestfile = nil; + ARGBEGIN{ case 'd': debug = 1; break; + case 'f': + parsetestfile = EARGF(usage()); + break; default: usage(); }ARGEND @@ -20,6 +25,13 @@ main(int argc, char *argv[]) if(argc != 0) usage(); + if(parsetestfile){ + int fd = open(parsetestfile, OREAD); + if(fd < 0) + exits("open"); + parse(fd); + } + exits(nil); } diff --git a/misc.c b/misc.c new file mode 100644 index 0000000..dcd5e17 --- /dev/null +++ b/misc.c @@ -0,0 +1,72 @@ +#include +#include + +#include "dat.h" +#include "fns.h" + +Term * +appendterm(Term *a, Term *b) +{ + if(a == nil) + return b; + + Term *tmp; + for(tmp = a; tmp->next != nil; tmp = tmp->next); + tmp->next = b; + return a; +} + +int +termslength(Term *list) +{ + int len; + for(len = 0; list != nil; len++, list = list->next); + return len; +} + +Term * +mkterm(int tag) +{ + Term *t = malloc(sizeof(Term)); + t->tag = tag; + t->next = nil; + t->children = nil; + t->text = nil; + return t; +} + +Term * +mkatom(Rune *name) +{ + Term *t = mkterm(AtomTerm); + t->text = name; + return t; +} + +Term * +mkvariable(Rune *name) +{ + Term *t = mkterm(VariableTerm); + t->text = name; + return t; +} + +Term * +mkcompound(Rune *name, int arity, Term *args) +{ + Term *t = mkterm(CompoundTerm); + t->text = name; + t->arity = arity; + t->children = args; + return t; +} + +Term * +mknumber(int type, vlong ival, double dval) +{ + Term *t = mkterm(NumberTerm); + t->numbertype = type; + t->ival = ival; + t->dval = dval; + return t; +} diff --git a/mkfile b/mkfile index 18fff33..0e6af56 100644 --- a/mkfile +++ b/mkfile @@ -2,7 +2,7 @@ TARG=pprolog -OFILES=main.$O parser.$O +OFILES=main.$O parser.$O prettyprint.$O misc.$O HFILES=dat.h fns.h diff --git a/parser.c b/parser.c index 0a5e9d4..459d048 100644 --- a/parser.c +++ b/parser.c @@ -1,5 +1,561 @@ #include #include +#include #include "dat.h" #include "fns.h" + +#define PrecedenceLevels 1200 + +/* The parser doesn't produce an ast for now, it only acts like a recognizer, + printing errors if it fails */ + +typedef struct Token Token; +typedef struct Operator Operator; +typedef struct OpInfo OpInfo; +struct Token +{ + int tag; + Rune *text; + double dval; + vlong ival; +}; + +struct Operator +{ + int type; + int level; + Rune *spelling; + Operator *next; +}; + +struct OpInfo +{ + int level; + int type; +}; + +enum { + Xf = 1<<0, /* 1 */ + Yf = 1<<1, /* 2 */ + Xfx = 1<<2, /* 4 */ + Xfy = 1<<3, /* 8 */ + Yfx = 1<<4, /* 16 */ + Fy = 1<<5, /* 32 */ + Fx = 1<<6, /* 64 */ +}; + +enum { + AtomTok = 1<<0, /* 1 */ + FunctorTok = 1<<1, /* 2 */ + VarTok = 1<<2, /* 4 */ + StringTok = 1<<3, /* 8 */ + IntTok = 1<<4, /* 16 */ + FloatTok = 1<<5, /* 32 */ + ParenLeftTok = 1<<6, /* 64 */ + ParenRightTok = 1<<7, /* 128 */ + SquareBracketLeftTok = 1<<8, /* 256 */ + SquareBracketRightTok = 1<<9, /* 512 */ + CurlyBracketLeftTok = 1<<10, /* 1024 */ + CurlyBracketRightTok = 1<<11, /* 2048 */ + CommaTok = 1<<12, /* 4096 */ + EofTok = 1<<13, /* 8192 */ +}; + +static Biobuf *parsein; +static Token lookahead; +static Operator *operators[PrecedenceLevels]; + +void initoperators(void); +void addoperator(int, int, Rune *); +Operator *getoperator(Rune *); +void nexttoken(void); +Term *fullterm(int, Rune *, Term *); +Term *term(void); +Term *compound(void); +Term *parseoperators(Term *); +void match(int); +void syntaxerror(char *); +void prologtext(void); + +void +parse(int fd) +{ + parsein = Bfdopen(fd, OREAD); + if(parsein == nil){ + print("Could not open file\n"); + return; + } + initoperators(); + nexttoken(); + + prologtext(); +} + +void +prologtext(void) +{ + if(lookahead.tag == EofTok) + return; + + Term *t = fullterm(AtomTok, L".", nil); + if(t->tag != CompoundTerm || runestrcmp(t->text, L":-") != 0 || !(t->arity == 1 || t->arity == 2)){ + print("Expected directive or clause as toplevel\n"); + print("Got %S\n", prettyprint(t)); + print("Tag=%d arity=%d text=\"%S\"\n", t->tag, t->arity, t->text); + syntaxerror("prologtext"); + } + + if(t->arity == 1){ + /* A directive */ + print("Got directive: %S\n", prettyprint(t)); + }else{ + /* A clause */ + print("Got clause: %S\n", prettyprint(t)); + } + + if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0) + match(AtomTok); + else + syntaxerror("prologtext"); + + prologtext(); +} + +Term * +fullterm(int stoptag, Rune *stoptext, Term *list) +{ + if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0)) + return parseoperators(list); + + Term *t = term(); + list = appendterm(list, t); + return fullterm(stoptag, stoptext, list); +} + +Term * +term(void) +{ + Term *result; + + switch(lookahead.tag){ + case AtomTok: + result = mkatom(lookahead.text); + match(AtomTok); + break; + case FunctorTok: + result = compound(); + break; + case VarTok: + result = mkvariable(lookahead.text); + match(VarTok); + break; + case IntTok: + result = mknumber(NumberInt, lookahead.ival, 0); + match(IntTok); + break; + case FloatTok: + result = mknumber(NumberFloat, 0, lookahead.dval); + match(FloatTok); + break; + default: + print("Cant parse term of token type %d\n", lookahead.tag); + syntaxerror("term"); + result = nil; + } + + return result; +} + +Term * +compound(void) +{ + Rune *name = lookahead.text; + Term *args = nil; + int arity = 0; + + match(FunctorTok); + match(ParenLeftTok); + + while(lookahead.tag != ParenRightTok){ + Term *t = fullterm(CommaTok|ParenRightTok, nil, nil); + if(lookahead.tag == CommaTok) + match(CommaTok); + args = appendterm(args, t); + arity++; + } + + match(ParenRightTok); + return mkcompound(name, arity, args); +} + +Term * +parseoperators(Term *list) +{ + int i; + int length = termslength(list); + Term *t; + Term **terms = malloc(sizeof(Term *) * length); + OpInfo *infos = malloc(sizeof(OpInfo) * length); + + for(i = 0, t = list; i < length; i++){ + Operator *op = getoperator(t->text); + if(op){ + infos[i].type = op->type; + infos[i].level = op->level; + }else{ + infos[i].type = 0; + infos[i].level = 0; + } + terms[i] = t; + Term *tmp = t; + t = t->next; + tmp->next = nil; + } + + while(length > 1){ + int index = -1; + int lowest = PrecedenceLevels + 1; + for(i = 0; i < length; i++){ + if(infos[i].type == 0) + continue; + if(infos[i].level == lowest && infos[i].type == Xfy && i == index+1) + index = i; + else if(infos[i].level < lowest){ + index = i; + lowest = infos[i].level; + } + } + + if(index == -1){ + print("Can't parse, list contains no operators"); + syntaxerror("parseoperators"); + } + + int infixlevel = infos[index].level & (Xfx|Xfy|Yfx); + int prefixlevel = infos[index].level & (Fx|Fy); + int postfixlevel = infos[index].level & (Xf|Yf); + + if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){ + infos[index-1].type = 0; + infos[index-1].level = infos[index].level; + Term *args = terms[index-1]; + args->next = terms[index+1]; + terms[index-1] = mkcompound(terms[index]->text, 2, args); + length -= 2; + for(i = index; i < length; i++){ + infos[i].level = infos[i+2].level; + infos[i].type = infos[i+2].type; + terms[i] = terms[i+2]; + } + }else if(prefixlevel && index != length-1 && infos[index+1].type == 0){ + infos[index].type = 0; + terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]); + length--; + for(i = index+1; i < length; i++){ + infos[i].level = infos[i+1].level; + infos[i].type = infos[i+1].type; + terms[i] = terms[i+1]; + } + }else if(postfixlevel && index != 0 && infos[index-1].type == 0){ + infos[index-1].type = 0; + infos[index-1].level = infos[index].level; + terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]); + length--; + for(i = index; i < length; i++){ + infos[i].level = infos[i+1].level; + infos[i].type = infos[i+1].type; + terms[i] = terms[i+1]; + } + }else{ + print("Parse error when parsing operators\n"); + syntaxerror("parseoperators"); + } + } + + Term *result = terms[0]; + free(infos); + free(terms); + return result; +} + +void +initoperators(void) +{ + Operator *op; + int i; + for(i = 0; i < PrecedenceLevels; i++){ + while(operators[i]){ + op = operators[i]; + operators[i] = op->next; + free(op); + } + } + + addoperator(1200, Xfx, L":-"); + addoperator(1200, Fx, L":-"); + addoperator(1100, Xfy, L";"); + addoperator(1000, Xfy, L","); + addoperator(700, Xfx, L"="); + addoperator(700, Xfx, L"is"); + addoperator(500, Yfx, L"+"); + addoperator(400, Yfx, L"*"); + addoperator(400, Yfx, L"/"); +} + +void +addoperator(int level, int type, Rune *spelling) +{ + Operator *op = malloc(sizeof(Operator)); + op->type = type; + op->level = level; + op->spelling = spelling; + op->next = operators[level-1]; + operators[level-1] = op; +} + +Operator * +getoperator(Rune *spelling) +{ + Operator *op = nil; + int level; + + if(spelling == nil) + return nil; + + for(level = 0; level < PrecedenceLevels && op == nil; level++){ + Operator *tmp; + for(tmp = operators[level]; tmp != nil; tmp = tmp->next){ + if(runestrcmp(tmp->spelling, spelling) == 0){ + if(op == nil){ + op = malloc(sizeof(Operator)); + memcpy(op, tmp, sizeof(Operator)); + }else + op->type |= tmp->type; + } + } + } + return op; +} + +void +nexttoken(void) +{ + Rune buf[1024]; + Rune peek; + int i = 0; + double numD; + vlong numI = 0; + int negative = 0; + static long replaypeek = -1; + + if(replaypeek == -1) + peek = Bgetrune(parsein); + else{ + peek = replaypeek; + replaypeek = -1; + } + + /* Skip whitespace */ + while(isspacerune(peek)) + peek = Bgetrune(parsein); + + /* Skip single line comments */ + if(peek == L'%'){ + while(peek != L'\n') + peek = Bgetrune(parsein); + peek = Bgetrune(parsein); + } + + /* Variables */ + if(peek == L'_' || isupperrune(peek)){ + while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){ + /* FIXME Should handle input length, also elsewhere in this function */ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + + lookahead.tag = VarTok; + lookahead.text = runestrdup(buf); + return; + } + + /* Strings */ + if(peek == L'"'){ + peek = Bgetrune(parsein); + while(peek != L'"'){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + lookahead.tag = StringTok; + lookahead.text = runestrdup(buf); + return; + } + + /* Numbers */ + if(peek == L'-'){ + peek = Bgetrune(parsein); + if(isdigitrune(peek)) + negative = 1; + else{ + Bungetrune(parsein); + peek = L'-'; + } + } + if(isdigitrune(peek)){ + if(peek == L'0'){ + peek = Bgetrune(parsein); + switch(peek){ + case L'o': /* Octal */ + case L'x': /* Hexadecimal */ + case L'b': /* Binary */ + print("Cant lex numbers in other bases than base 10 right now\n"); + exits("lexer"); + } + } + while(isdigitrune(peek)){ + numI *= 10; + numI += peek - L'0'; + peek = Bgetrune(parsein); + } + if(peek == L'.'){ + int place = 1; + numD = numI; + peek = Bgetrune(parsein); + if(!isdigitrune(peek)){ + Bungetrune(parsein); + replaypeek = L'.'; + goto Integer; + } + while(isdigitrune(peek)){ + numD += (peek - L'0') / (10 * place); + peek = Bgetrune(parsein); + } + /* Should also lex 123.45E10 */ + lookahead.tag = FloatTok; + lookahead.dval = negative ? -numD : numD; + }else{ + Bungetrune(parsein); +Integer: + lookahead.tag = IntTok; + lookahead.ival = negative ? -numI : numI; + } + return; + } + + /* Atoms */ + /* Atoms in single quotes */ + if(peek == L'\''){ + peek = Bgetrune(parsein); + while(peek != L'\''){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* Special atom [] */ + if(peek == L'['){ + peek = Bgetrune(parsein); + if(peek == L']'){ + lookahead.tag = AtomTok; + lookahead.text = runestrdup(L"[]"); + goto Functor; + }else{ + Bungetrune(parsein); + lookahead.tag = SquareBracketLeftTok; + return; + } + } + + /* Special atom {} */ + if(peek == L'{'){ + peek = Bgetrune(parsein); + if(peek == L'}'){ + lookahead.tag = AtomTok; + lookahead.text = runestrdup(L"{}"); + goto Functor; + }else{ + Bungetrune(parsein); + lookahead.tag = CurlyBracketLeftTok; + return; + } + } + + /* Graphic atom */ + Rune *graphics = L"#$&*+-./:<=>?@^~\\"; + if(runestrchr(graphics, peek)){ + while(runestrchr(graphics, peek)){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* simple atom */ + if(islowerrune(peek)){ + while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){ + buf[i++] = peek; + peek = Bgetrune(parsein); + } + buf[i] = '\0'; + Bungetrune(parsein); + lookahead.tag = AtomTok; + lookahead.text = runestrdup(buf); + goto Functor; + } + + /* Other */ + if(runestrchr(L",.()]}", peek)){ + switch(peek){ + case L',': lookahead.tag = CommaTok; break; + case L'(': lookahead.tag = ParenLeftTok; break; + case L')': lookahead.tag = ParenRightTok; break; + case L']': lookahead.tag = SquareBracketRightTok; break; + case L'}': lookahead.tag = CurlyBracketRightTok; break; + } + return; + } + + /* EOF */ + if(peek == Beof){ + lookahead.tag = EofTok; + return; + } + + print("Could not lex rune %C (%ud)\n", peek, peek); + exits("lexer"); + +Functor: + peek = Bgetrune(parsein); + if(peek == L'(') + lookahead.tag = FunctorTok; + Bungetrune(parsein); + return; +} + +void +match(int tag) +{ + if(lookahead.tag == tag) + nexttoken(); + else + syntaxerror("match"); +} + +void +syntaxerror(char *where) +{ + print("Syntax error: Unexpected %d token in %s\n", lookahead.tag, where); + exits("syntax error"); +} \ No newline at end of file diff --git a/prettyprint.c b/prettyprint.c new file mode 100644 index 0000000..499fc84 --- /dev/null +++ b/prettyprint.c @@ -0,0 +1,61 @@ +#include +#include + +#include "dat.h" +#include "fns.h" + +Rune *prettyprintlist(Term *, Rune *, int); + +Rune * +prettyprint(Term *t) +{ + Rune *result; + Rune *args; + + switch(t->tag){ + case CompoundTerm: + args = prettyprintlist(t->children, L", ", 0); + result = runesmprint("%S(%S)", t->text, args); + free(args); + break; + case AtomTerm: + case VariableTerm: + result = runesmprint("%S", t->text); + break; + case NumberTerm: + if(t->numbertype == NumberInt) + result = runesmprint("%lld", t->ival); + else + result = runesmprint("%f", t->dval); + break; + default: + result = runesmprint("cant print term with tag %d", t->tag); + break; + } + + return result; +} + +Rune * +prettyprintlist(Term *t, Rune *sep, int end) +{ + if(t == nil){ + if(end) + return runesmprint("%S", sep); + else + return runesmprint(""); + } + + Rune *str = prettyprint(t); + Rune *rest = prettyprintlist(t->next, sep, end); + Rune *result; + + if(t->next != nil) + result = runesmprint("%S%S%S", str, sep, rest); + else + result = runesmprint("%S%S", str, end ? rest : L""); + + free(str); + free(rest); + return result; +} \ No newline at end of file -- cgit v1.2.3