/*
   File: lexer.c
   Does lexical analysis of Agfl assembler
  
   Copyright 2006 Radboud University of Nijmegen

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: lexer.c,v 1.9 2007/07/27 19:40:33 marcs Exp $"
*/

/* general includes */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

/* libabase includes */
#include <abase_error.h>
#include <abase_dstring.h>
#include <abase_memalloc.h>

/* local includes */
#include "lexer.h"

/* public variables: describe the token most recently produced by read_token () */
/* kind of the current token; UNDEFINED until the first token has been read */
token curr_token;
/* value of this_line when the current token started (copied in read_token) */
int curr_line;
/* value of this_column when the current token started (copied in read_token) */
int curr_column;
/* text of a SYMBOL or STRING token, as built by scan_identifier/scan_string */
char *curr_sval;
/* value of a NUMBER token; scan_signed_number resets it to 0 */
unsigned int curr_uval;
/* value of a SIGNED_NUMBER token */
int curr_nval;

/* Use line buffer for listing */
/* size in bytes of each of the two line buffers allocated by init_lexer */
#define MAX_LINE_LENGTH 1024
/* next_char () reads a new line into the buffer that is not in use and then
   switches to it, so the other buffer still holds the line read before */
static char *line_buffers[2];
/* index (0 or 1) of the buffer next_char () is currently taking characters from */
static int buffer_idx;

/* Error count */
/* incremented once for every call of lexical_error () */
int lexer_errors;

/* private variables */
/* input stream and its name, as handed to init_lexer () */
static FILE *this_file;
static char *this_fname;
/* one character lookahead; EOF once fgets () has failed */
static int this_char;
/* number of lines read so far, i.e. the line number of this_char */
static int this_line;
/* index of the next character to fetch from the current line buffer,
   which equals the 1-based column of this_char */
static int this_column;

/*
   Error reporting

   Format a printf-style message and report it, together with the name of
   the file being scanned and the given position, through abs_message ().
   Every call increments lexer_errors.

   lin, col: line and column to mention in the report
   format:   printf-style format string, followed by its arguments
*/
static void lexical_error (int lin, int col, char *format, ...)
	{ char buf[MAX_FMT_LEN + 1];
	  va_list arg_ptr;
	  va_start (arg_ptr, format);
	  /* Bounded formatting: a message that does not fit is truncated
	     instead of being written past the end of buf */
	  vsnprintf (buf, sizeof (buf), format, arg_ptr);
	  va_end (arg_ptr);
	  lexer_errors++;
	  abs_message ("lexical error in file %s, line %d, col %d: %s",
		       this_fname, lin, col, buf);
	};

/*
   Reading of characters and recognition of character classes
*/
/*
   Return the line buffer that the scanner is currently NOT reading from.
   next_char () fills that buffer and switches to it when it moves on to a
   new line, so after a switch this is the buffer of the line read before.
*/
char *get_current_line_buffer ()
	{ int other_idx = 1 - buffer_idx;
	  char *other_buffer = line_buffers[other_idx];
	  return (other_buffer);
	};

/*
   Advance the lookahead this_char by one character.

   Within a line the next character is taken from the current line buffer.
   When the lookahead is a newline, the next line is read with fgets () into
   the other buffer, the buffers are switched and the line count is bumped.
   When fgets () fails the lookahead becomes EOF and stays EOF.

   The text in a line buffer does not end in '\n' when the last line of the
   file lacks one or when a line is longer than the buffer.  The NUL that
   terminates the buffered text is therefore delivered as a newline, so the
   scanner never reads beyond the buffered text.  The remainder of an
   overlong line is then scanned (and counted) as if it were a further line.

   NOTE(review): when fgets () fails the buffers are not switched, so
   get_current_line_buffer () keeps returning the buffer that was current
   before the last line was read - confirm that the listing code copes.
*/
static void next_char ()
	{ if (this_char == EOF) return;
	  if (this_char != '\n')
	    { /* Fetch through unsigned char: a byte with its high bit set
		 (0xFF in particular) must not be mistaken for EOF */
	      this_char = (unsigned char) line_buffers[buffer_idx][this_column];
	      this_column++;
	    }
	  else if (fgets (line_buffers[1 - buffer_idx], MAX_LINE_LENGTH - 1, this_file) != NULL)
	    { buffer_idx = 1 - buffer_idx;
	      this_line++;
	      this_column = 1;
	      this_char = (unsigned char) line_buffers[buffer_idx][0];
	    }
	  else
	    { this_char = EOF;
	      return;
	    };

	  /* End of the buffered text reached without seeing a newline */
	  if (this_char == '\0') this_char = '\n';
	};

/*
   Yield 1 if the lookahead is an ASCII letter or an underscore, 0 otherwise.
   Nothing is consumed.
*/
static int ahead_letter ()
	{ int is_lower = (this_char >= 'a') && (this_char <= 'z');
	  int is_upper = (this_char >= 'A') && (this_char <= 'Z');
	  int is_underscore = (this_char == '_');
	  return (is_lower || is_upper || is_underscore);
	};

/*
   Yield 1 if the lookahead is a decimal digit, 0 otherwise.
   Nothing is consumed.
*/
static int ahead_digit ()
	{ return ((this_char >= '0') && (this_char <= '9'));
	};

/*
   Yield 1 if the lookahead is a letter, an underscore or a decimal digit,
   0 otherwise.  Nothing is consumed.
*/
static int ahead_letgit ()
	{ return (ahead_letter () || ahead_digit ());
	};

/*
   If the lookahead is a decimal digit, store its numeric value in *value,
   consume it and yield 1.  Otherwise yield 0, leaving both the input and
   *value untouched.
*/
static int is_decimal_digit (unsigned int *value)
	{ if ((this_char < '0') || (this_char > '9')) return (0);
	  *value = (unsigned int) (this_char - '0');
	  next_char ();
	  return (1);
	};

/*
   If the lookahead is a hexadecimal digit (0-9, A-F or a-f), store its
   numeric value (0..15) in *value, consume it and yield 1.  Otherwise yield
   0, leaving both the input and *value untouched.
*/
static int is_hexadecimal_digit (unsigned int *value)
	{ unsigned int digit;
	  if ((this_char >= '0') && (this_char <= '9'))
	    digit = (unsigned int) (this_char - '0');
	  else if ((this_char >= 'A') && (this_char <= 'F'))
	    digit = ((unsigned int) (this_char - 'A')) + 10;
	  else if ((this_char >= 'a') && (this_char <= 'f'))
	    digit = ((unsigned int) (this_char - 'a')) + 10;
	  else return (0);

	  *value = digit;
	  next_char ();
	  return (1);
	};

/*
   Skip blanks and tabs in front of the next token.
   The end of line is significant in assembler, so a newline is
   deliberately left in place for read_token () to turn into a token.
*/
static void may_skip_layout ()
	{ for (;;)
	    { if ((this_char != ' ') && (this_char != '\t')) return;
	      next_char ();
	    };
	};

/*
   Skip the rest of a line comment.  On return the lookahead is the newline
   that ends the comment, so the caller can turn it into an end of line token.
   The loop also stops at EOF: next_char () does not advance once the
   lookahead is EOF, so waiting for a newline alone could loop forever.
*/
static void skip_line_comment ()
	{ do { next_char (); }
	  while ((this_char != '\n') && (this_char != EOF));
	};

/*
   Scan an identifier.  The caller has established that the lookahead can
   start one; that character and every following letter, underscore or
   digit are collected.  Leaves the text in curr_sval and SYMBOL in
   curr_token.
*/
static void scan_identifier ()
	{ dstring name = abs_init_dstring (64);
	  do
	    { abs_append_dstring_c (name, (char) (this_char & 0xff));
	      next_char ();
	    }
	  while (ahead_letgit ());
	  curr_token = SYMBOL;
	  curr_sval = abs_finish_dstring (name);
	};

#define ESCAPE 0x1B
/*
   Recognise a backslash sequence inside a string.
   Yields 0, consuming nothing, when the lookahead is not a backslash.
   Otherwise the backslash is consumed and 1 is yielded with the character
   that followed it stored in *ptr.  That character is consumed as well,
   unless it is a newline or EOF: then a blank is stored instead and the
   newline/EOF stays in the lookahead for the caller to diagnose.
*/
static int is_special (char *ptr)
	{ if (this_char != '\\') return (0);
	  next_char ();
	  if ((this_char != '\n') && (this_char != EOF))
	    { *ptr = this_char;
	      next_char ();
	    }
	  else *ptr = ' ';	/* the caller's loop reports the error */
	  return (1);
	};

/*
   Scan a string; the lookahead is its opening double quote.
   Backslash sequences are copied unchanged (backslash plus the character
   following it), so a quote preceded by a backslash does not end the string.
   Leaves STRING in curr_token and the text between the quotes in curr_sval.
   A newline or EOF inside the string is reported as a lexical error and
   curr_sval becomes "<ERROR>"; the newline/EOF itself is not consumed.

   NOTE(review): on the error path the partially built dstring is abandoned
   without being released - presumably a small leak; confirm against the
   libabase dstring interface.
*/
static void scan_string ()
	{ dstring text = abs_init_dstring (64);
	  char escaped;
	  next_char ();			/* eat '"' */
	  for (;;)
	    { if ((this_char == '\n') || (this_char == EOF))
	        { lexical_error (this_line, this_column, "string contains newline or eof");
	          curr_token = STRING;
	          curr_sval = abs_new_string ("<ERROR>", "scan_string");
	          return;
	        };
	      if (is_special (&escaped))
	        { abs_append_dstring_c (text, '\\');
	          abs_append_dstring_c (text, escaped);
	          continue;
	        };
	      if (this_char == '"') break;

	      /* ordinary character */
	      abs_append_dstring_c (text, (char) (this_char & 0xff));
	      next_char ();
	    };
	  next_char ();			/* eat closing '"' */
	  curr_sval = abs_finish_dstring (text);
	  curr_token = STRING;
	};

/* Also parse 0xhex */
#define UMAXINTDIV10 429496729
#define UMAXINTMOD10 6
static void should_be_decimal_number ()
	{ unsigned int value = 0;
	  unsigned int dval;
	  if (!is_decimal_digit (&value))
	    { lexical_error (this_line, this_column, "Illegal decimal number");
	      curr_token = NUMBER;
	      curr_uval = 0;
	      return;
	    };
	  while (is_decimal_digit (&dval))
	    { if ((value > UMAXINTDIV10) ||
		  ((value == UMAXINTDIV10) && (UMAXINTMOD10 <= dval)))
		{ lexical_error (this_line, this_column, "Too large decimal number");
		  while (is_decimal_digit (&dval));
		  curr_token = NUMBER;
		  curr_uval = 0;
		  return;
		}
	      else value = value * 10 + dval;
	    };
	  curr_token = NUMBER;
	  curr_uval = value;
	};

/*
   Scan the digits of a hexadecimal number (the caller has consumed "0x").
   Always leaves NUMBER in curr_token.  curr_uval receives the value, or 0
   when no hexadecimal digit is present or the value does not fit; both
   cases are reported as lexical errors.  After an overflow the remaining
   digits are still consumed.

   The value of a NUMBER token lives in curr_uval, so that is the variable
   the error exits must reset (as should_be_decimal_number does); resetting
   curr_nval would leave the value of an earlier number in place.

   A value above MAXINTDIV16 cannot be multiplied by 16 without exceeding
   32 bits, i.e. the limit assumes a 32 bit unsigned int.
*/
#define MAXINTDIV16 0xfffffff
static void should_be_hexadecimal_number ()
	{ unsigned int value = 0;
	  unsigned int dval;
	  if (!is_hexadecimal_digit (&value))
	    { lexical_error (this_line, this_column, "Illegal hexadecimal number");
	      curr_token = NUMBER;
	      curr_uval = 0;
	      return;
	    };
	  while (is_hexadecimal_digit (&dval))
	    { if (value > MAXINTDIV16)
		{ lexical_error (this_line, this_column, "Too large hexadecimal number");
		  while (is_hexadecimal_digit (&dval));	/* eat the remaining digits */
		  curr_token = NUMBER;
		  curr_uval = 0;
		  return;
		}
	      else value = value * 16 + dval;
	    };
	  curr_token = NUMBER;
	  curr_uval = value;
	};

/*
   Scan an unsigned number; the lookahead is its first digit.
   A leading "0x" introduces a hexadecimal number, a lone "0" that is not
   followed by a digit is the number zero, anything else is decimal.
   Leaves NUMBER in curr_token and the value in curr_uval.
*/
static void scan_unsigned_number ()
	{ if (this_char != '0')
	    { should_be_decimal_number ();
	      return;
	    };

	  next_char ();			/* eat the leading '0' */
	  if (this_char == 'x')
	    { next_char ();
	      should_be_hexadecimal_number ();
	      return;
	    };
	  if (ahead_digit ())
	    { /* more digits follow the leading zero */
	      should_be_decimal_number ();
	      return;
	    };
	  curr_token = NUMBER;
	  curr_uval = 0;
	};

static void scan_signed_number ()
	{ next_char ();
	  should_be_decimal_number ();
	  curr_nval = - ((int) curr_uval);
	  curr_uval = 0;
	  curr_token = SIGNED_NUMBER;
	};

/*
   Handle a '/' in the lookahead.
   A slash followed by a star opens a C style comment, which is skipped up
   to and including the closing star-slash pair; curr_token then becomes
   UNDEFINED so that read_token () scans on.  If the input ends inside the
   comment, a lexical error is reported and curr_token becomes EOFSYMBOL.
   Any other slash is reported as an illegal character (the slash is
   consumed) and curr_token becomes UNDEFINED.

   Every star is examined as a possible start of the closing pair, so a
   comment that ends in several stars followed by a slash is closed properly.
*/
static void scan_rest_slash ()
	{ int closed = 0;
	  next_char ();			/* eat '/' */
	  if (this_char != '*')
	    { lexical_error (curr_line, curr_column, "Illegal character: '/'");
	      curr_token = UNDEFINED;
	      return;
	    };

	  next_char ();			/* eat the opening '*' */
	  while (!closed && (this_char != EOF))
	    { if (this_char == '*')
	        { /* Only step past the star: what follows is tested in the
		     next round too, as it may itself be a star */
	          next_char ();
	          if (this_char == '/') closed = 1;
	        }
	      else next_char ();
	    };

	  if (!closed)
	    { lexical_error (curr_line, curr_column, "Runaway C comment");
	      curr_token = EOFSYMBOL;
	    }
	  else
	    { next_char ();		/* eat the closing '/' */
	      curr_token = UNDEFINED;
	    };
	};

/*
   Deliver a token that consists of the single character in the lookahead:
   consume that character and make sy the current token.
*/
static void yield_symbol (int sy)
	{ next_char ();
	  curr_token = sy;
	};

void read_token ()
	{ may_skip_layout ();
	  curr_line = this_line;
	  curr_column = this_column;
	  if (curr_token == EOFSYMBOL) return;
	  else if (ahead_letter ()) scan_identifier ();
	  else if (ahead_digit ()) scan_unsigned_number ();
	  else
	    switch (this_char)
	      { case '\n': yield_symbol (EOLN); break;
		case '"': scan_string (); break;
		case '#': skip_line_comment ();
			  yield_symbol (EOLN);
			  break;
		case '/': scan_rest_slash ();
			  if (curr_token == UNDEFINED) read_token ();
			  break;
		case '(': yield_symbol (LEFTPARENTHESIS); break;
		case ')': yield_symbol (RIGHTPARENTHESIS); break;
		case ',': yield_symbol (COMMA); break;
		case '-': scan_signed_number (); break;
		case '.': yield_symbol (PERIOD); break;
		case ':': yield_symbol (COLON); break;
		case EOF: yield_symbol (EOFSYMBOL); break;
		default:
		  lexical_error (this_line, this_column, "Illegal character: '%c'", this_char);
		  next_char ();
		  read_token ();
	      };
	};

void dump_token ()
	{ abs_printf ("Line %d, Column %d: ", curr_line, curr_column);
	  switch (curr_token)
	    { case UNDEFINED: abs_message ("Undefined"); break;
	      case SYMBOL: abs_message ("Symbol %s", curr_sval); break;
	      case NUMBER: abs_message ("Number %u", curr_uval); break;
	      case SIGNED_NUMBER: abs_message ("Signed %d", curr_nval); break;
	      case STRING: abs_message ("String \"%s\"", curr_sval); break;
	      case COLON: abs_message (":"); break;
	      case COMMA: abs_message (","); break;
	      case PERIOD: abs_message ("."); break;
	      case LEFTPARENTHESIS: abs_message ("("); break;
	      case RIGHTPARENTHESIS: abs_message (")"); break;
	      case EOLN: abs_message ("end of line"); break;
	      case EOFSYMBOL: abs_message ("end of file"); break;
	      default: abs_message ("illegal symbol %d", curr_token);
	    };
	};

/*
   Prepare the lexer for scanning and read the first token.

   fd:    stream to scan
   fname: name of that stream, used in error reports; the pointer is kept,
          the text is not copied

   Allocates the two line buffers.  Both start out as empty strings, so
   that get_current_line_buffer () yields a valid string even for a buffer
   into which no line has been read yet.
*/
void init_lexer (FILE *fd, char *fname)
	{ line_buffers[0] = abs_malloc (MAX_LINE_LENGTH, "init_lexer");
	  line_buffers[1] = abs_malloc (MAX_LINE_LENGTH, "init_lexer");
	  line_buffers[0][0] = '\0';
	  line_buffers[1][0] = '\0';
	  buffer_idx = 1;
	  curr_token = UNDEFINED;
	  this_fname = fname;
	  this_file = fd;
	  this_line = 0;
	  this_column = 0;
	  /* Pretend a newline was just seen: next_char () then reads line 1 */
	  this_char = '\n';
	  next_char ();
	  read_token ();
	};

/*
   Start scanning the current file again from its beginning and read the
   first token.  The line buffers and the error count are left as they are.
*/
void reinit_lexer ()
	{ /* back to the state in which no line and no token have been read */
	  this_column = 0;
	  this_line = 0;
	  curr_token = UNDEFINED;
	  rewind (this_file);

	  /* Pretend a newline was just seen: next_char () then reads line 1 */
	  this_char = '\n';
	  next_char ();
	  read_token ();
	};
