/*
* Assembler Tokeniser:
*/
#include "compiler.h"

#include <ctype.h>
#include <string.h>
/*
 * Input file line number, starts from 1 each time.
 * It is reset by initTokeniser, stamped onto every Token by newToken,
 * and adjusted by getChar / ungetChar whenever a newline passes.
 */
static long line_number = 1;
/*
 * Allow buffering of Tokens for lookahead:
 * current_token is the Token most recently handed out by getToken;
 * cached_token is a Token fetched ahead of time by peekToken, or NULL
 * when nothing has been peeked.
 */
static Token current_token = NULL;
static Token cached_token = NULL;
/*
 * Table of keywords and other token spellings, indexed by token kind.
 *
 * addKeywords registers entry i in the symbol table with kind i (the
 * ENDOFFILE entry is given kind EOF instead), and fetchToken looks up
 * TokenNames[NEWLINE], TokenNames[ERROR] and TokenNames[ENDOFFILE], so
 * the order of the entries must agree with the token kind constants.
 * NOTE(review): those constants are not defined in this file
 * (presumably compiler.h) - keep the two lists in step.
 */
char * TokenNames[] = {
"#ERROR#", "#EOF#", "#\\n#", "#IDENT#",
"int", "real", "string", "#INT#", "#DBL#", "#STR#",
"<<", ">>", "(", ")",
"{", "}", "[", "]",
"==", "!=", "<=", ">=", "<", ">", "=",
",", ";", "*", "/", "%", "+", "-",
"&", "|", "^", "~",
"\\", "\"", ".", ":",
"eof", "print",
"readint", "readline", "readreal",
"function",
"and", "or", "not",
"abs", "chr", "ord", "find", "sort", "size",
"if", "else", "for", "to", "while",
"break", "continue", "pass",
"return", "exit",
};
/*
* Symbol Table (for storing all Tokens)
*
* All keywords, punctuation and identifiers are placed
* into the symbol table. Pointers to Tokens are returned
* by the tokeniser, but the memory is entirely managed
* within the symbol table, so the rest of the program
* need never free memory. Tokens can simply be discarded.
*
* One function manages the symbol table memory: initTokeniser
* must be called every time a new file is being parsed.
*/
enum {
/* number of buckets in hash_table; hash() reduces modulo this value */
NHASH=1024,
/* size of the scratch buffer fetchToken uses for one token's text */
MAXTOKEN = 1024,
};
/* Set to 1 by initTokeniser once the table has been cleared and the keywords added. */
static int hash_table_used = 0;
/* Bucket heads; the Tokens in one bucket are chained through their next field. */
static Token hash_table[NHASH];
Token newToken(void)
{
Token t;
t = (Token) zalloc(sizeof(struct Token));
t->next = NULL;
t->line = line_number;
t->kind = ERROR;
t->num = 0;
t->real = 0.0;
t->name = NULL;
return t;
}
/*
 * Release a Token: its name string first, then the Token itself.
 * Both are handed to zfree.
 */
static void delToken(Token t)
{
    char *owned_name = t->name;

    zfree(owned_name);
    zfree(t);
}
/*
 * Multiply-by-31 string hash over the characters of name, reduced
 * modulo NHASH so the result is a valid index into hash_table.
 */
static int hash(char *name)
{
    unsigned long h = 0;
    char *cursor = name;

    while (*cursor) {
        h = h*31 + *cursor;
        cursor++;
    }
    return h%NHASH;
}
/*
* Set the hash table to all NULL.
*/
static void initHashTable(void)
{
int i;
for (i=0; i < NHASH; i++)
hash_table[i] = NULL;
}
/*
 * Hash lookup: return the Token stored under exactly this name, or
 * NULL when the symbol table holds no such Token.
 *
 * The bucket selected by hash(name) may hold several different names
 * that happen to collide, so each candidate in the chain is compared
 * with strcmp before it is accepted.
 */
static Token findToken(char *name)
{
    Token t;

    for (t = hash_table[hash(name)]; t; t = t->next) {
        if (t->name && strcmp(t->name, name) == 0)
            return t;
    }
    return NULL;
}
/*
 * Hash insert (ensures uniqueness).
 *
 * If findToken already reports an entry for t->name, that entry is
 * returned and t is released with delToken (unless t is that very
 * entry). Otherwise t is pushed onto the front of its bucket and
 * returned. Callers must therefore keep using the returned Token,
 * not the one they passed in.
 */
Token addToken(Token t)
{
    Token existing = findToken(t->name);
    int bucket;

    if (existing != NULL) {
        if (existing != t)
            delToken(t);
        return existing;
    }

    bucket = hash(t->name);
    t->next = hash_table[bucket];
    hash_table[bucket] = t;
    return t;
}
/*
 * Debug printing: write one Token's fields to stderr on a single line
 * (line, kind with its spelling from TokenNames, integer value, real
 * value and name). No trailing newline is written. A kind of EOF is
 * shown with the ENDOFFILE entry of TokenNames.
 */
void printToken(Token t)
{
    char *name = (t->kind == EOF) ? TokenNames[ENDOFFILE]
                                  : TokenNames[t->kind];

    fprintf(stderr, "line=%ld\t", t->line);
    fprintf(stderr, "kind=%2d %s\t", t->kind, name);
    fprintf(stderr, "int=%ld\t", t->num);
    fprintf(stderr, "real=%f\t", t->real);
    fprintf(stderr, "str=\"%s\"", t->name ? t->name : "");
}
/*
 * Debug printing: dump the whole hash table to stderr, one heading
 * per bucket followed by every Token chained in that bucket.
 */
void printAllTokens(void)
{
    int bucket = 0;
    Token entry;

    while (bucket < NHASH) {
        fprintf(stderr, "hash value %d:\n", bucket);
        entry = hash_table[bucket];
        while (entry) {
            printToken(entry);
            entry = entry->next;
        }
        bucket++;
    }
}
/*
 * A Token is a keyword when addKeywords created it: those entries are
 * marked with line 0, whereas Tokens made while reading input carry
 * the current input line number, which starts from 1.
 */
static int isKeyword(Token t)
{
    return (t->line <= 0);
}

/*
 * Remove all identifiers from the hash table, but leave keywords.
 *
 * Only the front of each chain needs to be examined: addToken pushes
 * new entries at the head of a bucket and the keywords are inserted
 * when the table is first built, so every non-keyword entry sits in
 * front of the first keyword of its bucket.
 */
static void clearIdentifiers(void)
{
    int i;
    Token t, next;

    for (i = 0; i < NHASH; i++) {
        t = hash_table[i];
        while (t && !isKeyword(t)) {
            next = t->next;
            hash_table[i] = next;   /* unlink before freeing */
            delToken(t);
            t = next;
        }
    }
}
/*
 * Add all keywords to the hash table.
 *
 * Every non-NULL entry of TokenNames becomes a Token whose kind is its
 * index in the table (the ENDOFFILE entry gets kind EOF instead) and
 * whose line is 0, which is what marks it as a keyword.
 */
static void addKeywords(void)
{
    const int count = (int) (sizeof(TokenNames) / sizeof(TokenNames[0]));
    int i;
    Token t;

    for (i = 0; i < count; i++) {
        if (TokenNames[i] == NULL)
            continue;
        t = newToken();
        t->line = 0; /* keyword */
        t->kind = (i == ENDOFFILE) ? EOF : i;
        t->name = zstrdup(TokenNames[i]);
        addToken(t);
    }
}
/*
 * Error handler: counts the error in p->error_count and, as long as
 * the count has not gone past MAX_ERRORS, prints msg (when non-NULL)
 * with the current line number on stderr.
 *
 * Nothing is aborted here. getChar checks the same counter and
 * returns EOF once it exceeds MAX_ERRORS.
 */
void parserError(Program p, char *msg)
{
    ++(p->error_count);
    if (p->error_count > MAX_ERRORS)
        return;
    if (msg == NULL)
        return;
    fprintf(stderr, "error: %s near line %ld\n", msg, line_number);
}
/*
* Low-level file input (reads binary or text, Dos, Unix or Mac):
*
* Returns the next input character, with '\r' mapped to '\n', and
* bumps line_number for every newline handed out. Returns EOF once
* more than MAX_ERRORS errors have been recorded on p.
*/
static int getChar(Program p)
{
int ch;
/* After too many errors, behave as if the input had ended. */
if (p->error_count > MAX_ERRORS)
return EOF;
/*
* NOTE(review): ch is never assigned before the tests below, so no
* character is actually read here and ch is used uninitialised
* (undefined behaviour). A read from p's input stream appears to be
* missing at this point. The inner test can never fail as written
* (ch is '\r' there); it looks like it was meant to examine a
* second character read after the '\r', to fold "\r\n" into a
* single newline - TODO restore both reads.
*/
if (ch == '\r') {
if (ch != '\n')
ch = '\n';
}
/* Keep the line counter in step with the newlines handed out. */
if (ch == '\n')
line_number++;
return ch;
}
/*
 * Hand a character back: undoes the line counting getChar did for a
 * newline and returns ch unchanged.
 *
 * NOTE(review): p is not used and nothing is pushed back onto any
 * input stream here - confirm whether a stream push-back is missing.
 */
static int ungetChar(int ch, Program p)
{
    if ((ch != EOF) && (ch == '\n'))
        line_number--;
    return ch;
}
/*
 * Look at the next character without consuming it: fetch it with
 * getChar, hand it straight back with ungetChar, and return it.
 */
static int peekChar(Program p)
{
    int next;

    next = getChar(p);
    (void) ungetChar(next, p);
    return next;
}
/*
 * Handle spacing:
 * consume characters for which isspace is true, stopping at EOF or at
 * the first character that is a newline or not a space. That
 * character is handed back with ungetChar.
 * Note: newlines are not counted as spaces by this tokeniser;
 * rather, they are tokens in their own right.
 */
static void skipSpaces(Program p)
{
    int ch;

    for (;;) {
        ch = getChar(p);
        if (ch == EOF)
            return;
        if ((ch != '\n') && isspace(ch))
            continue;   /* ordinary blank: keep skipping */
        ungetChar(ch, p);
        return;
    }
}
/*
 * Handle # comments:
 * Assume we are about to read the start of a comment.
 * If the next character is '#', everything up to (but not including)
 * the end of the line is discarded; the newline itself is handed back
 * so that it is still seen as a token. Any other character is handed
 * back untouched.
 */
static void skipComment(Program p)
{
    int ch = getChar(p);

    if (ch != '#') {
        ungetChar(ch, p);
        return;
    }

    /* discard the rest of the line */
    do {
        ch = getChar(p);
    } while ((ch != EOF) && (ch != '\n'));

    if (ch == '\n')
        ungetChar(ch, p);
}
/*
 * Return 1 when ch is a hexadecimal digit (0-9, a-f or A-F) and 0 for
 * anything else, including EOF and other negative values.
 *
 * Written as a switch on character constants so that it is safe for
 * any int argument and does not depend on the character encoding.
 */
static int ishexdigit(int ch)
{
    switch (ch) {
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        return 1;
    default:
        return 0;
    }
}
/*
 * Read a numeric literal into buf and return a new Token for it.
 *
 * Accepted forms:
 *   - decimal integers (kind INTVAL),
 *   - hexadecimal integers introduced by 0x or 0X (kind INTVAL),
 *   - decimal reals with a '.', optional fraction digits and an
 *     optional exponent (e or E, optional sign, digits); kind DOUBLEVAL.
 * A '.' after a hexadecimal number is left unread.
 *
 * Only the spelling is collected here; the Token's name and numeric
 * fields are not set by this function. buf is NUL-terminated on
 * return. No bounds checking is done: buf must be large enough for
 * the literal.
 */
static Token getNumberToken(Program p, char *buf)
{
    Token t;
    int ch, i=0;
    int base=10;

    // read in a plain number
    t = newToken();
    while ((ch = peekChar(p)) != EOF) {
        if ((i == 0) && (ch == '0')) {
            /* a leading 0 may introduce a hexadecimal literal */
            t->kind = INTVAL;
            buf[i++] = getChar(p);
            ch = peekChar(p);
            if ((ch == 'x') || (ch == 'X')) {
                buf[i++] = getChar(p);
                base=16;
            }
        }
        else if ((base == 16) && (ishexdigit(ch))) {
            t->kind = INTVAL;
            buf[i++] = getChar(p);
        }
        else if ((base == 16) && (ch == '.')) {
            break; /* found 0x1f7. */
        }
        else if (isdigit(ch)) {
            t->kind = INTVAL;
            buf[i++] = getChar(p);
        }
        else if (ch == '.') {
            t->kind = DOUBLEVAL;
            buf[i++] = getChar(p);
            ch = peekChar(p);
            if (! isdigit(ch))
                break; /* found 123.x */
            /* fraction digits */
            while (isdigit(peekChar(p)))
                buf[i++] = getChar(p);
            /* optional exponent */
            if (toupper(ch = peekChar(p)) == 'E') {
                buf[i++] = getChar(p);
                ch = peekChar(p);
                if ((ch == '+') || (ch == '-'))
                    buf[i++] = getChar(p);
                while (isdigit(peekChar(p)))
                    buf[i++] = getChar(p);
            }
            break;
        }
        else {
            break;
        }
    }
    buf[i] = '\0';
    return t;
}
/*
 * Read an identifier: an instruction, keyword, label,
 * register identifier (r0 ... r63) etc
 *
 * Characters are collected into buf for as long as they are letters,
 * digits or underscores; the first other character is handed back
 * with ungetChar. The returned Token has kind IDENT; its name is not
 * set here. buf is NUL-terminated on return. No bounds checking is
 * done: buf must be large enough for the identifier.
 */
static Token getIdentToken(Program p, char *buf)
{
    Token t;
    int ch, i=0;

    t = newToken();
    t->kind = IDENT;
    // read in an identifier
    while ((ch = getChar(p)) != EOF) {
        if (isalnum(ch) || (ch == '_'))
            buf[i++] = ch;
        else {
            ungetChar(ch, p);
            break;
        }
    }
    buf[i] = '\0';
    return t;
}
/*
 * Read in punctuation marks, including memory cell identifiers
 * such as [r7], [r4], [sp] etc. '[' followed by non-alphabetics is
 * returned as separate tokens '[' then whatever then ']' etc.
 *
 * Two-character operators are recognised by looking one character
 * ahead: "<<" "<=" ">>" ">=" "==" "!=". A character that is not
 * punctuation is handed back and buf is left empty.
 *
 * The kind of the returned Token is left as newToken set it; the
 * spelling collected in buf is what later identifies the token.
 * buf is NUL-terminated on return.
 *
 * NOTE(review): the '[' handling merges "[name]" into one token,
 * while TokenNames also lists "[" and "]" as separate tokens -
 * confirm that the merging is still wanted.
 */
static Token getPunctToken(Program p, char *buf)
{
    Token t;
    int ch, i=0;

    t = newToken();
    // read in a punctuation mark
    if ((ch = getChar(p)) != EOF) {
        if (ispunct(ch)) {
            buf[i++] = ch;
            switch (ch) {
            case '[':
                ch = peekChar(p);
                if (! isalpha(ch))
                    break;  /* plain '[' */
                while (ch != ']') {
                    ch = peekChar(p);
                    if (! isalnum(ch))
                        break;
                    buf[i++] = getChar(p);
                }
                if (ch == ']')
                    buf[i++] = getChar(p);
                break;
            case '<':
                ch = getChar(p);
                if ((ch == '<') || (ch == '='))
                    buf[i++] = ch;
                else
                    ungetChar(ch, p);
                break;
            case '>':
                ch = getChar(p);
                if ((ch == '>') || (ch == '='))
                    buf[i++] = ch;
                else
                    ungetChar(ch, p);
                break;
            case '=': case '!':
                ch = getChar(p);
                if (ch == '=')
                    buf[i++] = ch;
                else
                    ungetChar(ch, p);
                break;
            default:
                break;
            }
        }
        else {
            ungetChar(ch, p);
        }
    }
    buf[i] = '\0';
    return t;
}
/*
 * Read a double-quoted string literal into buf and return a new Token
 * of kind STRINGVAL. If the next character is not a double quote it
 * is handed back and a Token of kind ERROR is returned.
 *
 * What ends up in buf:
 *   - the opening and closing quote characters are kept;
 *   - an escaped quote (backslash, quote) is stored as the quote
 *     alone and does not end the string;
 *   - backslashes are otherwise kept as written; the second of a
 *     doubled backslash does not escape the character after it;
 *   - a literal tab is stored as the two characters backslash, t;
 *   - a newline inside the string is reported with parserError and
 *     is not stored.
 * Reading stops at the closing quote, at EOF, or when the text would
 * no longer fit in MAXTOKEN bytes (reported with parserError).
 * buf is NUL-terminated on return; the Token's name is not set here.
 */
static Token getStringToken(Program p, char *buf)
{
    Token t;
    int ch, prev, i=0;

    t = newToken();
    t->kind = STRINGVAL;
    // read in the string until we hit a quote char
    ch = getChar(p);
    if (ch != '\"') {
        t->kind = ERROR;
        ungetChar(ch, p); // error
        return t;
    }
    buf[i++] = ch; // keep quote char
    prev = '\0';
    for (;;) {
        /* one pass stores at most two characters; keep room for them and the NUL */
        if (i > MAXTOKEN - 3) {
            parserError(p, "string too long");
            break;
        }
        ch = getChar(p);
        if (ch == EOF)
            break;
        if (ch == '\"') {
            if (prev == '\\') {
                buf[i-1] = ch; /* escaped quote replaces its backslash */
            }
            else {
                buf[i++] = ch; // keep quote char
                break;
            }
        } else if (ch == '\\') {
            buf[i++] = ch;
            if (prev == '\\') {
                prev = ' '; /* avoid carry-over */
                continue;
            }
        } else if (ch == '\t') {
            buf[i++] = '\\';
            buf[i++] = 't';
        } else if (ch == '\n') {
            parserError(p, "newline within string");
        } else {
            buf[i++] = ch;
        }
        prev = ch;
    }
    buf[i] = '\0';
    return t;
}
/*
 * Fetch the next token from the input.
 *
 * Blanks and '#' comments are skipped. The first significant
 * character then selects the reader:
 *   newline               -> the NEWLINE entry of the symbol table
 *   digit                 -> getNumberToken
 *   letter or underscore  -> getIdentToken
 *   double quote          -> getStringToken
 *   other punctuation     -> getPunctToken
 *   anything else         -> the ERROR entry of the symbol table
 * At end of input the ENDOFFILE entry is returned.
 *
 * The order of the tests matters: '#', '"' and '_' are all
 * punctuation as far as ispunct is concerned, so they have to be
 * tested before the general punctuation case.
 *
 * A freshly read Token is given the collected spelling as its name
 * and passed through addToken, so the Token returned is always the
 * one owned by the symbol table.
 */
static Token fetchToken(Program p) /* fetch token from file */
{
    static char buf[MAXTOKEN];
    int ch;
    Token t = NULL;

    buf[0] = '\0';
    // find next token
    while ((ch=getChar(p)) != EOF) {
        // find token
        if (ch == '\n') {
            t = findToken(TokenNames[NEWLINE]);
            break;
        }
        else if (isspace(ch)) {
            ungetChar(ch, p);
            skipSpaces(p);
            continue; // loop
        }
        else if (ch == '#') {
            ungetChar(ch, p);
            skipComment(p);
            continue; // loop
        }
        else if (isdigit(ch)) {
            ungetChar(ch, p);
            t = getNumberToken(p, buf);
            t->name = zstrdup(buf);
        }
        else if (isalpha(ch) || (ch == '_')) {
            ungetChar(ch, p);
            t = getIdentToken(p, buf);
            t->name = zstrdup(buf);
        }
        else if (ch == '\"') {
            ungetChar(ch, p);
            t = getStringToken(p, buf);
            t->name = zstrdup(buf);
        }
        else if (ispunct(ch)) {
            ungetChar(ch, p);
            t = getPunctToken(p, buf);
            t->name = zstrdup(buf);
        }
        else {
            t = findToken(TokenNames[ERROR]);
            break;
        }
        // always break, except for comments and spaces
        break;
    }
    if ((ch == EOF) && (t == NULL)) {
        t = findToken(TokenNames[ENDOFFILE]);
    }
    t = addToken(t);
    /* NOTE(review): this traces every fetched token on stderr - confirm it is meant to stay enabled. */
    printToken(t);
    return t;
}
/*
 * Initialise (or re-initialise) the tokeniser hash table.
 *
 * On the first call the table is cleared and the keywords are added;
 * on later calls clearIdentifiers is run instead. In both cases the
 * line counter goes back to 1 and the lookahead buffer is emptied.
 */
void initTokeniser(void)
{
    if (! hash_table_used) {
        initHashTable();
        addKeywords();
        hash_table_used = 1;
    }
    else {
        clearIdentifiers();
    }

    cached_token = NULL;
    current_token = NULL;
    line_number = 1;
}
/*
 * Return the next token: the one previously fetched by peekToken if
 * there is one, otherwise a fresh one from fetchToken. The lookahead
 * slot is emptied, the result is remembered in current_token, and a
 * token of kind ERROR is reported through parserError.
 */
Token getToken(Program p) /* fetch next token from the file */
{
    Token tok = cached_token;

    cached_token = NULL;
    current_token = tok;
    if (tok == NULL) {
        tok = fetchToken(p);
        current_token = tok;
    }
    if (tok->kind == ERROR)
        parserError(p, "some weird error");
    return tok;
}
/*
 * Return the token the next getToken call will deliver, without
 * consuming it. The token is fetched at most once and kept in
 * cached_token until getToken takes it.
 */
Token peekToken(Program p) /* peek ahead at next token */
{
    if (cached_token != NULL)
        return cached_token;

    cached_token = fetchToken(p);
    return cached_token;
}
/*
 * Consume the next token and check that it has the expected kind.
 * Returns 1 on a match. Otherwise msg is reported through parserError
 * and 0 is returned; the token has been consumed either way.
 */
int eatToken(Program p, int kind, char *msg)
{
    Token t = getToken(p);

    if (t->kind == kind)
        return 1;

    parserError(p, msg);
    return 0;
}