MBED port of https://github.com/ys1382/filagree . The only change is adding #define MBED
Diff: compile.c
- Revision:
- 0:1a89e28dea91
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/compile.c Wed May 30 21:13:01 2012 +0000 @@ -0,0 +1,1379 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <ctype.h> +#include <errno.h> + +#include "util.h" +#include "struct.h" +#include "serial.h" +#include "compile.h" +#include "vm.h" +#include "variable.h" + +#define TAG "compile" +#define ERROR_LEX "Lexigraphical error" +#define ERROR_PARSE "Parsological error" +#define ERROR_TOKEN "Unknown token" +#define QUOTE '\'' +#define ESCAPE '\\' +#define ESCAPED_NEWLINE 'n' +#define ESCAPED_TAB 't' +#define ESCAPED_QUOTE '\'' + +uint32_t line; +struct array* lex_list; +struct map *imports = NULL; +struct byte_array *read_file(const struct byte_array *filename); + +// token /////////////////////////////////////////////////////////////////// + + enum Lexeme { + LEX_NONE, + LEX_IMPORT, + LEX_LEFT_COMMENT, + LEX_RIGHT_COMMENT, + LEX_PLUS, + LEX_MINUS, + LEX_TIMES, + LEX_DIVIDE, + LEX_MODULO, + LEX_AND, + LEX_OR, + LEX_NOT, + LEX_BAND, + LEX_BOR, + LEX_INVERSE, + LEX_XOR, + LEX_LSHIFT, + LEX_RSHIFT, + LEX_GREATER, + LEX_LESSER, + LEX_SAME, + LEX_SET, + LEX_DIFFERENT, + LEX_COMMA, + LEX_PERIOD, + LEX_COLON, + LEX_LINE_COMMENT, + LEX_LEFTHESIS, + LEX_RIGHTHESIS, + LEX_LEFTSQUARE, + LEX_RIGHTSQUARE, + LEX_LEFTSTACHE, + LEX_RIGHTSTACHE, + LEX_IDENTIFIER, + LEX_STRING, + LEX_INTEGER, + LEX_TRUE, + LEX_FALSE, + LEX_IF, + LEX_THEN, + LEX_ELSE, + LEX_END, + LEX_WHILE, + LEX_FOR, + LEX_IN, + LEX_WHERE, + LEX_FUNCTION, + LEX_RETURN, + LEX_TRY, + LEX_CATCH, + LEX_THROW, + LEX_DO, + }; + +struct token { + enum Lexeme lexeme; + uint32_t number; + struct byte_array *string; + uint32_t at_line; +}; + +struct number_string lexemes[] = { + {LEX_IMPORT, "import"}, + {LEX_LEFT_COMMENT, "/*"}, + {LEX_RIGHT_COMMENT, "*/"}, + {LEX_LINE_COMMENT, "#"}, + {LEX_PLUS, "+"}, + {LEX_MINUS, "-"}, + {LEX_TIMES, "*"}, + {LEX_DIVIDE, "/"}, + {LEX_MODULO, "%"}, + {LEX_BAND, "&"}, + {LEX_BOR, "|"}, + {LEX_INVERSE, "~"}, + {LEX_XOR, "^"}, + {LEX_RSHIFT, ">>"}, + {LEX_LSHIFT, "<<"}, + {LEX_AND, "and"}, + {LEX_OR, "or"}, + {LEX_NOT, "not"}, + {LEX_GREATER, ">"}, + {LEX_LESSER, "<"}, + {LEX_SAME, "=="}, + {LEX_SET, "="}, + {LEX_DIFFERENT, "!="}, + {LEX_COMMA, ","}, + {LEX_PERIOD, "."}, + {LEX_COLON, ":"}, + {LEX_LEFTHESIS, "("}, + {LEX_RIGHTHESIS, ")"}, + {LEX_LEFTSQUARE, "["}, + {LEX_RIGHTSQUARE, "]"}, + {LEX_LEFTSTACHE, "{"}, + {LEX_RIGHTSTACHE, "}"}, + {LEX_TRUE, "true"}, + {LEX_FALSE, "false"}, + {LEX_IF, "if"}, + {LEX_THEN, "then"}, + {LEX_ELSE, "else"}, + {LEX_END, "end"}, + {LEX_WHILE, "while"}, + {LEX_FUNCTION, "function"}, + {LEX_FOR, "for"}, + {LEX_IN, "in"}, + {LEX_WHERE, "where"}, + {LEX_RETURN, "return"}, + {LEX_TRY, "try"}, + {LEX_CATCH, "catch"}, + {LEX_THROW, "throw"}, + {LEX_DO, "do"} +}; + +const char* lexeme_to_string(enum Lexeme lexeme) +{ + switch (lexeme) { + case LEX_IDENTIFIER: return "id"; + case LEX_STRING: return "string"; + case LEX_INTEGER: return "number"; + default: + return NUM_TO_STRING(lexemes, lexeme); + } +} + +// display //////////////////////////////////////////////////////////////////// + +#ifdef DEBUG + +void display_token(struct token *token, int depth) { + assert_message(depth < 10, "kablooie!"); + char* indent = (char*)malloc(sizeof(char)*(depth+1)); + int i; + for (i=0; i<depth; i++) + indent[i] = '\t'; + indent[i] = 0; + DEBUGPRINT("%s%d ", indent, token->lexeme); + + switch (token->lexeme) { + case LEX_NONE: break; + case LEX_INTEGER: DEBUGPRINT("%u", token->number); break; + case LEX_STRING: + case LEX_IDENTIFIER: { + char* s = byte_array_to_string(token->string); + DEBUGPRINT("%s:%s", lexeme_to_string(token->lexeme), s); + free(s); + } break; + case LEX_FUNCTION: + DEBUGPRINT("fdecl"); + break; + default: + DEBUGPRINT("%s", lexeme_to_string(token->lexeme)); + break; + } + DEBUGPRINT("\n"); + free(indent); +} + +void display_lex_list() { + int i, n = lex_list->length; + for (i=0; i<n; i++) { + DEBUGPRINT("\t%d: ", i); + display_token((struct token*)array_get(lex_list,i), 0); + } +} + +#endif + +// lex ///////////////////////////////////////////////////////////////////// + +struct array* lex(struct byte_array *binput); + +struct token *token_new(enum Lexeme lexeme, int at_line) { + struct token *t = (struct token*)malloc(sizeof(struct token)); + t->lexeme = lexeme; + t->string = NULL; + t->number = 0; + t->at_line = line; + return t; +} + +struct token *insert_token(enum Lexeme lexeme) { + struct token *token = token_new(lexeme, line); + array_add(lex_list, token); + return token; +} + +int insert_token_number(const char *input, int i) { + struct token *token = insert_token(LEX_INTEGER); + token->number = atoi(&input[i]); + //while (isdigit(input[i++])); + for (; isdigit(input[i]); i++); + return i; +} + +bool isiden(char c) +{ + return isalnum(c) || (c=='_'); +} + +bool isKeyword(int n) +{ + for (int c,i=0; (c = lexemes[n].chars[i]); i++) + if (!isalpha(c)) + return false; + return true; +} + +int insert_token_string(enum Lexeme lexeme, const char* input, int i) +{ + struct byte_array *string = byte_array_new(); + while ((lexeme==LEX_IDENTIFIER && isiden(input[i])) || + (lexeme==LEX_STRING && input[i] != QUOTE)) { + uint8_t c = input[i++]; + //DEBUGPRINT("@%d c=%3d %c\n", i, c, c); + if (c == ESCAPE) { + c = input[i++]; + switch (c) { + case ESCAPED_NEWLINE: c = '\n'; break; + case ESCAPED_TAB: c = '\t'; break; + case ESCAPED_QUOTE: c = '\''; break; + default: + return (VOID_INT)exit_message("unknown escape"); + } + } + byte_array_add_byte(string, c); + } + + int end = i + (lexeme==LEX_STRING); + struct token *token = token_new(lexeme, line); + token->string = string; + array_add(lex_list, token); + //display_token(token, 0); + return end; +} + +int insert_lexeme(int index) { + struct number_string sn = lexemes[index]; + insert_token((enum Lexeme)sn.number); + return strlen(sn.chars); +} + +int import(const char* input, int i) +{ + i += strlen(lexeme_to_string(LEX_IMPORT)); + while (isspace(input[i++])); + struct byte_array *path = byte_array_new(); + while (!isspace(input[i]) && input[i]!=QUOTE) + byte_array_add_byte(path, input[i++]); + byte_array_append(path, byte_array_from_string(EXTENSION_SRC)); + + if (!map_has(imports, path)) { + map_insert(imports, path, NULL); + struct byte_array *imported = read_file(path); + lex(imported); + } + + return i+1; +} + +struct array* lex(struct byte_array *binput) +{ + lex_list = array_new(); + imports = map_new(); + int i=0,j; + char c; + line = 1; + + const char* input = byte_array_to_string(binput); + const char* right_comment = lexeme_to_string(LEX_RIGHT_COMMENT); + int right_comment_len = strlen(right_comment); + +lexmore: + while (i < binput->length) { + + for (j=0; j<ARRAY_LEN(lexemes); j++) { + + char* lexstr = lexemes[j].chars; + + if (!strncmp(&input[i], lexstr, strlen(lexstr))) { + + if (isKeyword(j) && i<binput->length-1 && isalpha(input[i+strlen(lexemes[j].chars)])) { + //DEBUGPRINT("%s is a keyword, %c is alpha\n", lexemes[j].chars, input[i+strlen(lexemes[j].chars)]); + continue; // just an identifier + } + enum Lexeme lexeme = (enum Lexeme)lexemes[j].number; + + if (lexeme == LEX_LEFT_COMMENT) { // start comment with /* + while (strncmp(&input[i], right_comment, right_comment_len)) + if (input[i++] == '\n') + line++; + i += right_comment_len; + } else if (lexeme == LEX_LINE_COMMENT) { // start line comment with # + while (input[i++] != '\n'); + line++; + } else if (lexeme == LEX_IMPORT) + i = import(input, i); + else { + //DEBUGPRINT("lexeme=%d\n",lexeme); + i += insert_lexeme(j); + } + goto lexmore; + } + } + + c = input[i]; + + if (isdigit(c)) i = insert_token_number(input,i); + else if (isiden(c)) i = insert_token_string(LEX_IDENTIFIER, input, i); + else if (c == '\n') { line++; i++; } + else if (isspace(c)) i++; + else if (c=='\'') i = insert_token_string(LEX_STRING, input, i+1); + else + return (struct array*)exit_message(ERROR_LEX); + } +#ifdef DEBUG +// display_lex_list(); +#endif + return lex_list; +} + +/* parse /////////////////////////////////////////////////////////////////// + +BNF: + +<statements> --> ( <assignment> | <fcall> | <ifthenelse> | <loop> | <rejoinder> | <iterloop> | <trycatch> | <throw> )* +<trycatch> --> LEX_TRY <statements> LEX_CATCH <variable> <statements> LEX_END +<throw> --> LEX_THROW <expression> +<assignment> --> <destination>, LEX_SET <source>, +<destination> --> <variable> | ( <expression> <member> ) +<source> --> <expression> + +<ifthenelse> --> IF <expression> THEN <statements> + ( ELSE IF <expression> THEN <statements> )* + ( ELSE <statements>)? LEX_END +<loop> --> WHILE <expression> LEX_DO <statements> LEX_END + +<iterator> --> LEX_FOR LEX_IDENTIFIER LEX_IN <expression> ( LEX_WHERE <expression> )? +<iterloop> --> <iterator> LEX_DO <statements> LEX_END +<comprehension> --> LEX_LEFTSQUARE <expression> <iterator> LEX_RIGHTSQUARE + +<rejoinder> --> LEX_RETURN <source>, +<fdecl> --> FUNCTION LEX_LEFTHESIS <variable>, LEX_RIGHTHESIS <statements> LEX_END +<fcall> --> <expression> <call> +<call> --> LEX_LEFTHESIS <source>, LEX_RIGHTHESIS + +<expression> --> <exp2> ( ( LEX_SAME | LEX_DIFFERENT | LEX_GREATER | LEX_LESSER ) <exp2> )? +<exp2> --> <exp3> ( ( LEX_PLUS | LEX_MINUS ) <exp3> )* +<exp3> --> <exp4> ( ( LEX_PLUS | LEX_MINUS | LEX_TIMES | LEX_DIVIDE | LEX_MODULO | + LEX_BAND | LEX_BOR | LEX_XOR | LEX_LSHIFT | LEX_RSHIFT) <exp3> )* +<exp4> --> (LEX_NOT | LEX_MINUS | LEX_INVERSE)? <exp5> +<exp5> --> <exp6> ( <call> | <member> )* +<exp6> --> ( LEX_LEFTTHESIS <expression> LEX_RIGHTTHESIS ) | <atom> +<atom> --> LEX_IDENTIFIER | <float> | <integer> | <boolean> | <table> | <comprehension> | <fdecl> + +<variable> --> LEX_IDENTIFIER +<boolean> --> LEX_TRUE | LEX_FALSE +<floater> --> LEX_INTEGER LEX_PERIOD LEX_INTEGER +<string> --> LEX_STRING +<table> --> LEX_LEFTSQUARE <element>, LEX_RIGHTSQUARE +<element> --> <expression> ( LEX_COLON <expression> )? +<member> --> ( LEX_LEFTSQUARE <expression> LEX_RIGHTSQUARE ) | ( LEX_PERIOD LEX_STRING ) + +*/////////////////////////////////////////////////////////////////////////// + + enum Nonterminal { + SYMBOL_STATEMENTS, + SYMBOL_ASSIGNMENT, + SYMBOL_SOURCE, + SYMBOL_DESTINATION, + SYMBOL_IF_THEN_ELSE, + SYMBOL_LOOP, + SYMBOL_EXPRESSION, + SYMBOL_INTEGER, + SYMBOL_FLOAT, + SYMBOL_STRING, + SYMBOL_VARIABLE, + SYMBOL_TABLE, + SYMBOL_PAIR, + SYMBOL_FDECL, + SYMBOL_FCALL, + SYMBOL_MEMBER, + SYMBOL_RETURN, + SYMBOL_BOOLEAN, + SYMBOL_NIL, + SYMBOL_ITERATOR, + SYMBOL_ITERLOOP, + SYMBOL_COMPREHENSION, + SYMBOL_TRYCATCH, + SYMBOL_THROW, + } nonterminal; + +struct symbol { + enum Nonterminal nonterminal; + const struct token *token; + struct array* list; + struct symbol *index, *value; + float floater; + bool lhs; +}; + +struct number_string nonterminals[] = { + {SYMBOL_STATEMENTS, "statements"}, + {SYMBOL_ASSIGNMENT, "assignment"}, + {SYMBOL_SOURCE, "source"}, + {SYMBOL_DESTINATION, "destination"}, + {SYMBOL_IF_THEN_ELSE, "if-then-else"}, + {SYMBOL_LOOP, "loop"}, + {SYMBOL_EXPRESSION, "expression"}, + {SYMBOL_INTEGER, "integer"}, + {SYMBOL_FLOAT, "float"}, + {SYMBOL_STRING, "string"}, + {SYMBOL_VARIABLE, "variable"}, + {SYMBOL_TABLE, "table"}, + {SYMBOL_PAIR, "pair"}, + {SYMBOL_FDECL, "fdecl"}, + {SYMBOL_FCALL, "fcall"}, + {SYMBOL_MEMBER, "member"}, + {SYMBOL_RETURN, "return"}, + {SYMBOL_BOOLEAN, "boolean"}, + {SYMBOL_NIL, "nil"}, + {SYMBOL_ITERATOR, "iterator"}, + {SYMBOL_ITERLOOP, "iterloop"}, + {SYMBOL_COMPREHENSION, "comprehension"}, + {SYMBOL_TRYCATCH, "try-catch"}, + {SYMBOL_THROW, "throw"}, +}; + +struct array* parse_list; +uint32_t parse_index; +struct symbol *expression(); +struct symbol *statements(); +struct symbol *comprehension(); + +struct symbol *symbol_new(enum Nonterminal nonterminal) +{ + // DEBUGPRINT("symbol_new %s\n", NUM_TO_STRING(nonterminals, ARRAY_LEN(nonterminals), nonterminal)); + struct symbol *s = (struct symbol*)malloc(sizeof(struct symbol)); + s->nonterminal = nonterminal; + s->list = array_new(); + s->index = s->value = NULL; + s->lhs = false; + return s; +} + +struct symbol *symbol_add(struct symbol *s, struct symbol *t) +{ + null_check(s); + if (!t) + return NULL; + //DEBUGPRINT("symbol_add %s\n", nonterminals[t->nonterminal]); + array_add(s->list, t); + return s; +} + +#ifdef DEBUG + +void display_symbol(const struct symbol *symbol, int depth) +{ + // null_check(symbol); + if (!symbol) + return; + assert_message(depth < 100, "kablooie!"); + char* indent = (char*)malloc(sizeof(char)*(depth+1)); + int i; + for (i=0; i<depth; i++) + indent[i] = '\t'; + indent[i] = 0; + DEBUGPRINT("%s%d %s", indent, symbol->nonterminal, NUM_TO_STRING(nonterminals, symbol->nonterminal)); + + switch (symbol->nonterminal) { + case SYMBOL_INTEGER: + DEBUGPRINT(": %u\n", symbol->token->number); + break; + case SYMBOL_FLOAT: + DEBUGPRINT(": %f\n", symbol->floater); + break; + case SYMBOL_VARIABLE: { + char* s = byte_array_to_string(symbol->token->string); + DEBUGPRINT(": '%s (%s)'\n", s, symbol->lhs ? "lhs":"rhs"); + free(s); + } break; + case SYMBOL_STRING: { + char* s = byte_array_to_string(symbol->token->string); + DEBUGPRINT(": '%s'\n", s); + free(s); + } break; + case SYMBOL_EXPRESSION: + DEBUGPRINT(" %s\n", lexeme_to_string(symbol->token->lexeme)); + break; + case SYMBOL_TRYCATCH: + DEBUGPRINT("try\n"); + display_symbol(symbol->index, depth); + DEBUGPRINT("catch %s\n", byte_array_to_string(symbol->token->string)); + display_symbol(symbol->value, depth); + break; + default: + DEBUGPRINT("\n"); + depth++; + display_symbol(symbol->index, depth); + display_symbol(symbol->value, depth); + depth--; + break; + } + + const struct array *list = symbol->list; + if (list && list->length) { + DEBUGPRINT("%s\tlist:\n", indent); + depth++; + for (int k=0; k<list->length; k++) { + const struct symbol *item = array_get(list, k); + display_symbol(item, depth); + } + } + + free(indent); +} + +#endif + +#define LOOKAHEAD (lookahead(0)) +#define FETCH_OR_QUIT(x) if (!fetch(x)) return NULL; +#define OR_ERROR(x) { exit_message("missing %s at line %d", NUM_TO_STRING(lexemes, x), line); return NULL; } +#define FETCH_OR_ERROR(x) if (!fetch(x)) OR_ERROR(x); + + +enum Lexeme lookahead(int n) { + if (parse_index + n >= parse_list->length) + return LEX_NONE; + struct token *token = (struct token*)parse_list->data[parse_index+n]; + assert_message(token!=0, ERROR_NULL); + return token->lexeme; +} + +struct token *fetch(enum Lexeme lexeme) { + if (parse_index >= parse_list->length) + return NULL; + struct token *token = (struct token*)parse_list->data[parse_index]; + if (token->lexeme != lexeme) { + //DEBUGPRINT("fetched %s instead of %s at %d\n", lexeme_to_string(token->lexeme), lexeme_to_string(lexeme), parse_index); + return NULL; + } + //DEBUGPRINT("fetched %s at %d\n", lexeme_to_string(lexeme), parse_index); + // display_token(token, 0); + + parse_index++; + return token; +} + +typedef struct symbol*(Parsnip)(); + +struct symbol *one_of(Parsnip *p, ...) { + uint32_t start = parse_index; + struct symbol *t = NULL; + va_list argp; + va_start(argp, p); + for (; + p && !(t=p()); + p = va_arg(argp, Parsnip*)) + parse_index=start; + va_end(argp); + return t; +} + +struct token *fetch_lookahead(enum Lexeme lexeme, ...) { + struct token *t=NULL; + va_list argp; + va_start(argp, lexeme); + + for (; lexeme; lexeme = (enum Lexeme)va_arg(argp, int)) { + if (LOOKAHEAD == lexeme) { + t = fetch(lexeme); + break; + } + } + va_end(argp); + return t; +} + +struct symbol *symbol_fetch(enum Nonterminal n, enum Lexeme goal, ...) +{ + if (parse_index >= parse_list->length) + return NULL; + struct token *token = (struct token*)parse_list->data[parse_index]; + assert_message(token!=0, ERROR_NULL); + enum Lexeme lexeme = token->lexeme; + + struct symbol *symbol = NULL; + + va_list argp; + va_start(argp, goal); + + for (; goal; goal = (enum Lexeme)va_arg(argp, int)) { + if (lexeme == goal) { + + symbol = symbol_new(n); + symbol->token = token; + // DEBUGPRINT("fetched %s at %d\n", lexeme_to_string(lexeme), parse_index); + // display_token(token, 0); + + parse_index++; + break; + } + } + va_end(argp); + return symbol; +} + +struct symbol *symbol_adds(struct symbol *s, struct symbol *child, ...) { + va_list argp; + va_start(argp, child); + for (; child; child = va_arg(argp, struct symbol*)) + array_add(s->list, child); + va_end(argp); + return s; + +} + +// <x>, --> ( <x> ( LEX_COMMA <x> )* )? +// e.g. a list of zero or more <x>, separated by commas +struct symbol *repeated(enum Nonterminal nonterminal, Parsnip *p) +{ + struct symbol *r, *s = symbol_new(nonterminal); + do { + if (!(r=p())) + break; + symbol_add(s, r); + } while (fetch(LEX_COMMA)); + return s; +} + +//////////////////////////////// BNFs + + +// <destination> --> <variable> | ( <expression> <member> ) +struct symbol *destination() +{ + struct symbol *e = expression(); + if (!e || + (e->nonterminal != SYMBOL_MEMBER && + e->nonterminal != SYMBOL_VARIABLE)) + return NULL; + e->lhs = true; + return e; +}; + +//<variable> --> LEX_IDENTIFIER +struct symbol *variable() +{ + return symbol_fetch(SYMBOL_VARIABLE, LEX_IDENTIFIER, NULL); +} + + +// <fdecl> --> FUNCTION LEX_LEFTHESIS <variable>, LEX_RIGHTHESIS <statements> LEX_END +struct symbol *fdecl() +{ + FETCH_OR_QUIT(LEX_FUNCTION) + FETCH_OR_ERROR(LEX_LEFTHESIS); + struct symbol *s = symbol_new(SYMBOL_FDECL); + s->index = repeated(SYMBOL_DESTINATION, &destination); + FETCH_OR_ERROR(LEX_RIGHTHESIS) + s->value = statements(); + //DEBUGPRINT("fdecl: %d statements\n", s->value->list->length); + //DEBUGPRINT("lookahead=%s\n", NUM_TO_STRING(lexemes, LOOKAHEAD)); + FETCH_OR_ERROR(LEX_END); + return s; +} + +// <element> --> <expression> ( LEX_COLON <expression> )? +struct symbol *element() +{ + struct symbol *e = expression(); + if (fetch(LEX_COLON)) { // i.e. x:y + struct symbol *p = symbol_new(SYMBOL_PAIR); + p->index = e; + p->value = expression(); + return p; + } else { +// p->index = symbol_new(SYMBOL_NIL); + return e; + } +} + +// <table> --> LEX_LEFTSQUARE <element>, LEX_RIGHTSQUARE +struct symbol *table() { + //return list(SYMBOL_TABLE, LEX_LEFTSQUARE, LEX_RIGHTSQUARE, LEX_COLON); + FETCH_OR_QUIT(LEX_LEFTSQUARE); + struct symbol *s = repeated(SYMBOL_TABLE, &element); + FETCH_OR_ERROR(LEX_RIGHTSQUARE); + return s; +} + +struct symbol *integer() +{ + // DEBUGPRINT("integer\n"); + struct token *t = fetch(LEX_INTEGER); + if (!t) + return NULL; + struct symbol *s = symbol_new(SYMBOL_INTEGER); + s->token = t; + return s; +} + +struct symbol *boolean() +{ + struct token *t = fetch_lookahead(LEX_TRUE, LEX_FALSE, NULL); + if (!t) + return NULL; + struct symbol *s = symbol_new(SYMBOL_BOOLEAN); + s->token = t; + return s; +} + +struct symbol *floater() +{ + // DEBUGPRINT("floater\n"); + struct token *t = fetch(LEX_INTEGER); + if (!t) + return NULL; + FETCH_OR_QUIT(LEX_PERIOD); + struct token *u = fetch(LEX_INTEGER); + + float decimal = u->number; + while (decimal > 1) + decimal /= 10; + + struct symbol *s = symbol_new(SYMBOL_FLOAT); + s->floater = t->number + decimal; + return s; +} + +// <string> --> LEX_STRING +struct symbol *string() +{ + struct token *t = fetch(LEX_STRING); + if (!t) + return NULL; + struct symbol *s = symbol_new(SYMBOL_STRING); + s->token = t; + return s; +} + +// <atom> --> LEX_IDENTIFIER | <float> | <integer> | <boolean> | <table> | <comprehension> | <fdecl> +struct symbol *atom() +{ + return one_of(&variable, &string, &floater, &integer, &boolean, &comprehension, &table, &fdecl, NULL); +} + +// <exp5> --> ( LEX_LEFTTHESIS <expression> LEX_RIGHTTHESIS ) | <atom> +struct symbol *exp5() +{ + if (fetch(LEX_LEFTHESIS)) { + struct symbol *s = expression(); + fetch(LEX_RIGHTHESIS); + return s; + } + return atom(); +} + +// <member> --> ( LEX_LEFTSQUARE <expression> LEX_RIGHTSQUARE ) | ( LEX_PERIOD LEX_STRING ) +struct symbol *member() +{ + struct symbol *m = symbol_new(SYMBOL_MEMBER); + + if (fetch(LEX_PERIOD)) { + m->index = variable(); + m->index->nonterminal = SYMBOL_STRING; + } + else { + FETCH_OR_QUIT(LEX_LEFTSQUARE); + m->index = expression(); + FETCH_OR_QUIT(LEX_RIGHTSQUARE); + } + return m; +} + +// <call> --> LEX_LEFTHESIS <source>, LEX_RIGHTHESIS +struct symbol *call() +{ + struct symbol *s = symbol_new(SYMBOL_FCALL); + FETCH_OR_QUIT(LEX_LEFTHESIS); + s->index = repeated(SYMBOL_SOURCE, &expression); // arguments + FETCH_OR_ERROR(LEX_RIGHTHESIS); + return s; +} + +// <fcall> --> <expression> <call> +struct symbol *fcall() +{ + struct symbol *e = expression(); + return (e && e->nonterminal == SYMBOL_FCALL) ? e : NULL; +} + +// <exp4> --> <exp5> ( <call> | member )* +struct symbol *exp4() +{ + struct symbol *g, *f; + f = exp5(); + while (f && (g = one_of(&call, &member, NULL))) { + g->value = f; + f = g; + } + return f; +} + +// <exp3> --> (NOT | LEX_MINUS)? <exp4> +struct symbol *exp3() +{ + struct symbol *e; + if ((e = symbol_fetch(SYMBOL_EXPRESSION, LEX_MINUS, LEX_NOT, NULL))) + return symbol_add(e, exp3()); + return exp4(); +} + +// <exp2> --> (<exp3> ( ( LEX_PLUS | LEX_MINUS | LEX_TIMES | LEX_DIVIDE | MODULO ))* <exp3> +struct symbol *expTwo() +{ + struct symbol *e, *f; + e = exp3(); + while ((f = symbol_fetch(SYMBOL_EXPRESSION, LEX_PLUS, LEX_MINUS, LEX_TIMES, LEX_DIVIDE, LEX_MODULO, NULL))) + e = symbol_adds(f, e, exp3(), NULL); + return e; +} + +// <expression> --> <exp2> ( ( LEX_SAME | LEX_DIFFERENT | LEX_GREATER | LEX_LESSER ) <exp2> )? +struct symbol *expression() +{ + struct symbol *f, *e = expTwo(); + while ((f = symbol_fetch(SYMBOL_EXPRESSION, LEX_SAME, LEX_DIFFERENT, LEX_GREATER, LEX_LESSER, NULL))) + e = symbol_adds(f, e, expTwo(), NULL); + return e; +} + +// <assignment> --> <destination>, LEX_SET <source>, +struct symbol *assignment() +{ + struct symbol *d = repeated(SYMBOL_DESTINATION, &destination); + FETCH_OR_QUIT(LEX_SET); + struct symbol *s = symbol_new(SYMBOL_ASSIGNMENT); + s->value = repeated(SYMBOL_SOURCE, expression); + s->index = d; + return s; +} + +/* <ifthenelse> --> IF <expression> + THEN <statements> + (ELSE IF <expression> THEN <statements>)* + (ELSE <statements>)? + END */ +struct symbol *ifthenelse() +{ + FETCH_OR_QUIT(LEX_IF); + struct symbol *f = symbol_new(SYMBOL_IF_THEN_ELSE); + symbol_add(f, expression()); + fetch(LEX_THEN); + symbol_add(f, statements()); + + while (lookahead(0) == LEX_ELSE && lookahead(1) == LEX_IF) { + fetch(LEX_ELSE); + fetch(LEX_IF); + symbol_add(f, expression()); + fetch(LEX_THEN); + symbol_add(f, statements()); + } + + if (fetch_lookahead(LEX_ELSE)) + symbol_add(f, statements()); + fetch(LEX_END); + return f; +} + +// <loop> --> WHILE <expression> <statements> END +struct symbol *loop() +{ + FETCH_OR_QUIT(LEX_WHILE); + struct symbol *s = symbol_new(SYMBOL_LOOP); + s->index = expression(); + s->value = statements(); + FETCH_OR_ERROR(LEX_END); + return s; +} + +// <iterator> --> LEX_FOR LEX_IDENTIFIER LEX_IN <expression> ( LEX_WHERE <expression> )? +struct symbol *iterator() +{ + FETCH_OR_QUIT(LEX_FOR); + const struct token *t = fetch(LEX_IDENTIFIER); + struct symbol *s = symbol_new(SYMBOL_ITERATOR); + s->token = t; + + FETCH_OR_ERROR(LEX_IN); + s->value = expression(); + if (fetch_lookahead(LEX_WHERE)) + s->index = expression(); + return s; +} + +// <comprehension> --> LEX_LEFTSQUARE <expression> <iterator> LEX_RIGHTSQUARE +struct symbol *comprehension() +{ + // DEBUGPRINT("comprehension\n"); + FETCH_OR_QUIT(LEX_LEFTSQUARE); + struct symbol *s = symbol_new(SYMBOL_COMPREHENSION); + s->value = expression(); + s->index = iterator(); + if (!s->index) + return NULL; + FETCH_OR_ERROR(LEX_RIGHTSQUARE); + return s; +} + +// <iterloop> --> <iterator> <statements> LEX_END +struct symbol *iterloop() +{ + struct symbol *i = iterator(); + if (!i) + return NULL; + struct symbol *s = symbol_new(SYMBOL_ITERLOOP); + s->index = i; + s->value = statements(); + FETCH_OR_ERROR(LEX_END); + return s; +} + +// <rejoinder> --> // LEX_RETURN <expression> +struct symbol *rejoinder() +{ + FETCH_OR_QUIT(LEX_RETURN); + return repeated(SYMBOL_RETURN, &expression); // return values +} + +// <trycatch> --> LEX_TRY <statements> LEX_CATCH <destination> <statements> LEX_END +struct symbol *trycatch() +{ + FETCH_OR_QUIT(LEX_TRY); + struct symbol *s = symbol_new(SYMBOL_TRYCATCH); + s->index = statements(); + FETCH_OR_ERROR(LEX_CATCH); + if (!(s->token = fetch(LEX_IDENTIFIER))) + OR_ERROR(LEX_IDENTIFIER); + s->lhs = true; + s->value = statements(); + FETCH_OR_ERROR(LEX_END); + return s; +} + +// <throw> --> LEX_THROW <expression> +struct symbol *thrower() +{ + FETCH_OR_QUIT(LEX_THROW); + struct symbol *s = symbol_new(SYMBOL_THROW); + s->value = expression(); + return s; +} + +struct symbol *strings_and_variables() +{ + struct array *sav = array_new(); + while ((array_add(sav, variable()) || array_add(sav, string()))); + return (struct symbol*)sav; +} + +// <statements> --> ( <assignment> | <fcall> | <ifthenelse> | <loop> | <rejoinder> | <iterloop> ) * +struct symbol *statements() +{ + struct symbol *s = symbol_new(SYMBOL_STATEMENTS); + struct symbol *t; + while ((t = one_of(&assignment, &fcall, &ifthenelse, &loop, &rejoinder, &iterloop, &trycatch, &thrower, NULL))) + symbol_add(s, t); + return s; +} + +struct symbol *parse(struct array *list, uint32_t index) +{ + DEBUGPRINT("parse:\n"); + assert_message(list!=0, ERROR_NULL); + assert_message(index<list->length, ERROR_INDEX); + + parse_list = list; + parse_index = index; + + struct symbol *p = statements(); +#ifdef DEBUG + display_symbol(p, 1); +#endif + return p; +} + + +// generate //////////////////////////////////////////////////////////////// + +struct byte_array *generate_code(struct byte_array *code, struct symbol *root); + +void generate_step(struct byte_array *code, int count, int action,...) +{ + byte_array_add_byte(code, action); + + va_list argp; + uint8_t parameter; + for(va_start(argp, action); --count;) { + parameter = va_arg(argp, int); + byte_array_add_byte(code, (uint8_t)parameter); + } + va_end(argp); +} + +void generate_items(struct byte_array *code, const struct symbol* root) +{ + const struct array *items = root->list; + uint32_t num_items = items->length; + + for (int i=0; i<num_items; i++) { + struct symbol *item = (struct symbol*)array_get(items, i); + generate_code(code, item); + } +} + +void generate_items_then_op(struct byte_array *code, enum Opcode opcode, const struct symbol* root) +{ + generate_items(code, root); + generate_step(code, 1, opcode); + serial_encode_int(code, 0, root->list->length); +} + +void generate_statements(struct byte_array *code, struct symbol *root) { + if (root) + generate_items(code, root); +} + +void generate_return(struct byte_array *code, struct symbol *root) { + generate_items_then_op(code, VM_RET, root); +} + +void generate_nil(struct byte_array *code, struct symbol *root) { + generate_step(code, 1, VM_NIL); +} + +static void generate_jump(struct byte_array *code, int32_t offset) { + generate_step(code, 1, VM_JMP); + serial_encode_int(code, 0, offset); +} + +void generate_list(struct byte_array *code, struct symbol *root) { + generate_items_then_op(code, VM_LST, root); +} + +void generate_float(struct byte_array *code, struct symbol *root) +{ + generate_step(code, 1, VM_FLT); + serial_encode_float(code, 0, root->floater); +} + +void generate_integer(struct byte_array *code, struct symbol *root) { + generate_step(code, 1, VM_INT); + serial_encode_int(code, 0, root->token->number); +} + +void generate_string(struct byte_array *code, struct symbol *root) { + generate_step(code, 1, VM_STR); + serial_encode_string(code, 0, root->token->string); +} + +void generate_source(struct byte_array *code, struct symbol *root) { + generate_items_then_op(code, VM_SRC, root); +} + +void generate_destination(struct byte_array *code, struct symbol *root) +{ + generate_items(code, root); + generate_step(code, 1, VM_DST); +} + +void generate_assignment(struct byte_array *code, struct symbol *root) +{ + generate_code(code, root->value); + generate_code(code, root->index); +} + +void generate_variable(struct byte_array *code, struct symbol *root) +{ + generate_step(code, 1, root->lhs ? VM_SET : VM_VAR); + serial_encode_string(code, 0, root->token->string); +} + +void generate_boolean(struct byte_array *code, struct symbol *root) { + uint32_t value = 0; + switch (root->token->lexeme) { + case LEX_TRUE: value = 1; break; + case LEX_FALSE: value = 0; break; + default: exit_message("bad boolean value"); return; + } + generate_step(code, 1, VM_BOOL); + serial_encode_int(code, 0, value); +} + +void generate_fdecl(struct byte_array *code, struct symbol *root) +{ + struct byte_array *f = byte_array_new(); + generate_code(f, root->index); + generate_code(f, root->value); + generate_step(code, 1, VM_FNC); + serial_encode_string(code, 0, f); +} + +void generate_pair(struct byte_array *code, struct symbol *root) +{ + generate_code(code, root->index); + generate_code(code, root->value); + generate_step(code, 2, VM_MAP, 1); +} + +void generate_member(struct byte_array *code, struct symbol *root) +{ + generate_code(code, root->index); // array_get(root->branches, INDEX)); + generate_code(code, root->value); // array_get(root->branches, ITERABLE)); + + enum Opcode op = root->lhs ? VM_PUT : VM_GET; + generate_step(code, 1, op); +} + +void generate_fcall(struct byte_array *code, struct symbol *root) +{ + generate_code(code, root->index); // arguments + + if (root->value->nonterminal == SYMBOL_MEMBER) { + generate_code(code, root->value->index); + generate_code(code, root->value->value); + generate_step(code, 1, VM_MET); + } else { + generate_code(code, root->value); // function + generate_step(code, 1, VM_CAL); + } +} + +void generate_math(struct byte_array *code, struct symbol *root) +{ + enum Lexeme lexeme = root->token->lexeme; + generate_statements(code, root); + enum Opcode op = VM_NIL; + switch (lexeme) { + case LEX_PLUS: op = VM_ADD; break; + case LEX_MINUS: op = VM_SUB; break; + case LEX_TIMES: op = VM_MUL; break; + case LEX_DIVIDE: op = VM_DIV; break; + case LEX_MODULO: op = VM_MOD; break; + case LEX_AND: op = VM_AND; break; + case LEX_OR: op = VM_OR; break; + case LEX_NOT: op = VM_NOT; break; + case LEX_SAME: op = VM_EQU; break; + case LEX_DIFFERENT: op = VM_NEQ; break; + case LEX_GREATER: op = VM_GTN; break; + case LEX_LESSER: op = VM_LTN; break; + default: exit_message("bad math lexeme"); break; + } + generate_step(code, 1, op); +} + +void generate_ifthenelse(struct byte_array *code, struct symbol *root) +{ + struct array *gotos = array_new(); // for skipping to end of elseifs, if one is true + + for (int i=0; i<root->list->length; i+=2) { + + // then + struct symbol *thn = (struct symbol*)array_get(root->list, i+1); + assert_message(thn->nonterminal == SYMBOL_STATEMENTS, "branch syntax error"); + struct byte_array *thn_code = byte_array_new(); + generate_code(thn_code, thn); + generate_jump(thn_code, 0); // jump to end + + // if + struct symbol *iff = (struct symbol*)array_get(root->list, i); + generate_code(code, iff); + generate_step(code, 2, VM_IFF, thn_code->length); + byte_array_append(code, thn_code); + array_add(gotos, (void*)(VOID_INT)(code->length-1)); + + // else + if (root->list->length > i+2) { + struct symbol *els = (struct symbol*)array_get(root->list, i+2); + if (els->nonterminal == SYMBOL_STATEMENTS) { + assert_message(root->list->length == i+3, "else should be the last branch"); + generate_code(code, els); + break; + } + } + } + + // go back and fill in where to jump to the end of if-then-elseif-else when done + for (int j=0; j<gotos->length; j++) { + VOID_INT g = (VOID_INT)array_get(gotos, j); + code->data[g] = code->length - g; + } +} + +void generate_loop(struct byte_array *code, struct symbol *root) +{ + struct byte_array *ifa = byte_array_new(); + generate_code(ifa, root->index); + + struct byte_array *b = byte_array_new(); + generate_code(b, root->value); + + struct byte_array *thn = byte_array_new(); + generate_step(thn, 1, VM_IFF); + serial_encode_int(thn, 0, b->length + 2); + + struct byte_array *while_a_do_b = byte_array_concatenate(4, code, ifa, thn, b); + byte_array_append(code, while_a_do_b); + int32_t loop_length = ifa->length + thn->length + b->length; + generate_jump(code, -loop_length-1); +} + +// <iterator> --> LEX_FOR LEX_IDENTIFIER LEX_IN <expression> ( LEX_WHERE <expression> )? +void generate_iterator(struct byte_array *code, struct symbol *root, enum Opcode op) +{ + struct symbol *ator = root->index; + generate_code(code, ator->value); // IN b + generate_step(code, 1, op); // iterator or comprehension + serial_encode_string(code, 0, ator->token->string); // FOR a + + if (ator->index) { // WHERE c + struct byte_array *where = byte_array_new(); + generate_code(where, ator->index); + serial_encode_string(code, 0, where); + } + else + generate_nil(code, NULL); + + struct byte_array *what = byte_array_new(); + generate_code(what, root->value); + serial_encode_string(code, 0, what); // DO d +} + +// <iterloop> --> <iterator> <statements> LEX_END +void generate_iterloop(struct byte_array *code, struct symbol *root) { + generate_iterator(code, root, VM_ITR); +} + +// <comprehension> --> LEX_LEFTSQUARE <expression> <iterator> LEX_RIGHTSQUARE +void generate_comprehension(struct byte_array *code, struct symbol *root) { + generate_iterator(code, root, VM_COM); +} + +// <trycatch> --> LEX_TRY <statements> LEX_CATCH <variable> <statements> LEX_END +void generate_trycatch(struct byte_array *code, struct symbol *root) +{ + struct byte_array *trial = generate_code(NULL, root->index); + + generate_step(code, 1, VM_TRY); + serial_encode_string(code, 0, trial); + + serial_encode_string(code, 0, root->token->string); + struct byte_array *catcher = generate_code(NULL, root->value); + serial_encode_string(code, 0, catcher); +} + +void generate_throw(struct byte_array *code, struct symbol *root) +{ + generate_code(code, root->value); + generate_step(code, 1, VM_TRO); +} + +typedef void(generator)(struct byte_array*, struct symbol*); + +struct byte_array *generate_code(struct byte_array *code, struct symbol *root) +{ + if (!root) + return NULL; + generator *g = NULL; + + //DEBUGPRINT("generate_code %s\n", nonterminals[root->nonterminal]); + switch(root->nonterminal) { + case SYMBOL_NIL: g = generate_nil; break; + case SYMBOL_PAIR: g = generate_pair; break; + case SYMBOL_LOOP: g = generate_loop; break; + case SYMBOL_FDECL: g = generate_fdecl; break; + case SYMBOL_TABLE: g = generate_list; break; + case SYMBOL_FCALL: g = generate_fcall; break; + case SYMBOL_FLOAT: g = generate_float; break; + case SYMBOL_MEMBER: g = generate_member; break; + case SYMBOL_RETURN: g = generate_return; break; + case SYMBOL_SOURCE: g = generate_source; break; + case SYMBOL_STRING: g = generate_string; break; + case SYMBOL_INTEGER: g = generate_integer; break; + case SYMBOL_BOOLEAN: g = generate_boolean; break; + case SYMBOL_ITERLOOP: g = generate_iterloop; break; + case SYMBOL_VARIABLE: g = generate_variable; break; + case SYMBOL_STATEMENTS: g = generate_statements; break; + case SYMBOL_ASSIGNMENT: g = generate_assignment; break; + case SYMBOL_EXPRESSION: g = generate_math; break; + case SYMBOL_DESTINATION: g = generate_destination; break; + case SYMBOL_IF_THEN_ELSE: g = generate_ifthenelse; break; + case SYMBOL_COMPREHENSION: g = generate_comprehension; break; + case SYMBOL_TRYCATCH: g = generate_trycatch; break; + case SYMBOL_THROW: g = generate_throw; break; + default: + return (struct byte_array*)exit_message(ERROR_TOKEN); + } + + if (!code) + code = byte_array_new(); + if (g) + g(code, root); + return code; +} + +struct byte_array *generate_program(struct symbol *root) +{ + DEBUGPRINT("generate:\n"); + struct byte_array *code = byte_array_new(); + generate_code(code, root); + struct byte_array *program = serial_encode_int(0, 0, code->length); + byte_array_append(program, code); +#ifdef DEBUG + display_program(program); +#endif + return program; +} + +// build /////////////////////////////////////////////////////////////////// + +struct byte_array *build_string(const struct byte_array *input) { + assert_message(input, ERROR_NULL); + struct byte_array *input_copy = byte_array_copy(input); + DEBUGPRINT("lex %d:\n", input_copy->length); + struct array* list = lex(input_copy); + struct symbol *tree = parse(list, 0); + + return generate_program(tree); +} + +struct byte_array *build_file(const struct byte_array* filename) +{ + struct byte_array *input = read_file(filename); + return build_string(input); +} + +void compile_file(const char* str) +{ + struct byte_array *filename = byte_array_from_string(str); + struct byte_array *program = build_file(filename); + struct byte_array *dotfg = byte_array_from_string(EXTENSION_SRC); + struct byte_array *dotfgbc = byte_array_from_string(EXTENSION_BC); + int offset = byte_array_find(filename, dotfg, 0); + assert_message(offset > 0, "invalid source file name"); + byte_array_replace(filename, dotfgbc, offset, dotfg->length); + write_file(filename, program); +}