// simon/SCCD
							using System;
using System.Collections.Generic;

namespace csharp_sccd_compiler
{
    /// <summary>
    /// A hand-written lexer that splits an input string into <see cref="Token"/> objects:
    /// single-character operators, words, numbers, quoted strings and (optionally) whitespace.
    /// Call <see cref="setInput"/> first, then pull tokens with <see cref="nextToken"/> or
    /// <see cref="iterateTokens"/>.
    /// </summary>
    public class Lexer
    {
        // When true, whitespace is consumed silently before each token instead of being returned.
        private readonly bool skip_white_space;
        // When true, a character that matches no rule yields an UNKNOWN token instead of an exception.
        private readonly bool accept_unknown_tokens;

        // Characters that form a complete token on their own, mapped to the token type they produce.
        private static readonly Dictionary<char, Token.Type> single_rules = new Dictionary<char, Token.Type>{
            { '/', Token.Type.SLASH },
            { '(', Token.Type.LBRACKET },
            { ')', Token.Type.RBRACKET },
            { ',', Token.Type.COMMA },
            { '.', Token.Type.DOT },
        };

        // The text being tokenized; null until setInput has been called.
        private string buf;
        // Index in buf of the next character that has not been consumed yet.
        private int pos;

        /// <summary>
        /// Create a lexer.
        /// </summary>
        /// <param name="skip_white_space">If true, whitespace between tokens is skipped and never returned as a token.</param>
        /// <param name="accept_unknown_tokens">If true, a character matching no rule is returned as an UNKNOWN token; otherwise it raises a LexerException.</param>
        public Lexer( bool skip_white_space = true, bool accept_unknown_tokens = false)
        {
            this.accept_unknown_tokens = accept_unknown_tokens;
            this.skip_white_space = skip_white_space;
        }

        /// <summary>
        /// Initialize the lexer with a buffer as input and rewind to its start.
        /// </summary>
        /// <param name="buffer">The text to tokenize. Must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="buffer"/> is null.</exception>
        public void setInput(string buffer)
        {
            // Fail here, at the call that is actually wrong, rather than with a
            // NullReferenceException on the first nextToken() call.
            if (buffer == null)
                throw new ArgumentNullException("buffer");

            this.buf = buffer;
            this.pos = 0;
        }

        /// <summary>
        /// Return the next token (a Token object) found in the input buffer. null is returned if the end of the buffer was reached.
        /// In case of a lexing error (the current chunk of the buffer matches no rule), a LexerException is raised.
        /// </summary>
        /// <returns>The next token, or null when the input is exhausted.</returns>
        /// <exception cref="InvalidOperationException">Thrown when no input has been set with <see cref="setInput"/>.</exception>
        public Token nextToken()
        {
            if (this.buf == null)
                throw new InvalidOperationException("No input has been set; call setInput before requesting tokens.");

            if (this.skip_white_space)
                this.skipWhiteSpace();
            if (this.pos >= this.buf.Length)
                return null;

            char c = this.buf[this.pos]; //first char of next token

            Token.Type result_type;
            if (Lexer.single_rules.TryGetValue(c, out result_type)) //check if it is an operator
            {
                Token token = new Token(result_type, c.ToString(), this.pos);
                this.pos += 1;
                return token;
            }

            //not an operator: dispatch on the class of the first character
            if (this.isAlpha(c))
                return this.processWord();
            if (this.isDigit(c))
                return this.processNumber();
            if (c == '\'' || c == '"')
                return this.processQuote();
            // Only reachable when skip_white_space is false; otherwise whitespace was consumed above.
            if (this.isWhiteSpace(c))
                return this.processWhiteSpace();

            //if we're here, no rule matched
            if (this.accept_unknown_tokens)
            {
                Token token = new Token(Token.Type.UNKNOWN, c.ToString(), this.pos);
                this.pos += 1;
                return token;
            }
            throw new LexerException(string.Format("Invalid character at position {0}.", this.pos));
        }

        /// <summary>
        /// Lazily enumerate all remaining tokens by calling <see cref="nextToken"/> until it returns null.
        /// Because this is an iterator, lexing (and any exception it raises) happens while enumerating,
        /// not when this method is called.
        /// </summary>
        public IEnumerable<Token> iterateTokens()
        {
            while (true)
            {
                Token tok = this.nextToken();
                if (tok == null)
                    break;
                yield return tok;
            }
        }

        /// <summary>
        /// Advance the position past any run of whitespace characters.
        /// </summary>
        private void skipWhiteSpace()
        {
            while (this.pos < this.buf.Length && this.isWhiteSpace(this.buf[this.pos]))
                this.pos += 1;
        }

        /// <summary>
        /// True for space, tab, carriage return and line feed.
        /// </summary>
        private bool isWhiteSpace(char c)
        {
            return c == ' ' || c == '\t' || c == '\r' || c == '\n';
        }

        /// <summary>
        /// True for a letter or an underscore, i.e. a character that may start a word.
        /// </summary>
        private bool isAlpha(char c)
        {
            return char.IsLetter(c) || c == '_';
        }

        /// <summary>
        /// True for a digit character.
        /// </summary>
        private bool isDigit(char c)
        {
            return char.IsDigit(c);
        }

        /// <summary>
        /// True for a character that may appear inside a word: letter, underscore or digit.
        /// </summary>
        private bool isAlphaNum(char c)
        {
            return this.isAlpha(c) || this.isDigit(c);
        }

        /// <summary>
        /// Consume the run of whitespace starting at the current position and return it as a WHITESPACE token.
        /// </summary>
        private Token processWhiteSpace()
        {
            int nextpos = this.pos + 1;
            while ( nextpos < this.buf.Length && this.isWhiteSpace(this.buf[nextpos]))
                nextpos += 1;
            Token token = new Token(Token.Type.WHITESPACE, this.buf.Substring(this.pos, nextpos-this.pos), this.pos);
            this.pos = nextpos;
            return token;
        }

        /// <summary>
        /// Consume the run of digits starting at the current position and return it as a NUMBER token.
        /// </summary>
        private Token processNumber()
        {
            int nextpos = this.pos + 1;
            while ( nextpos < this.buf.Length && this.isDigit(this.buf[nextpos]))
                nextpos += 1;
            Token token = new Token(Token.Type.NUMBER, this.buf.Substring(this.pos, nextpos-this.pos), this.pos);
            this.pos = nextpos;
            return token;
        }

        /// <summary>
        /// Consume the run of letters, underscores and digits starting at the current position and return it as a WORD token.
        /// </summary>
        private Token processWord()
        {
            int nextpos = this.pos + 1;
            while ( nextpos < this.buf.Length && this.isAlphaNum(this.buf[nextpos]))
                nextpos += 1;
            Token token = new Token(Token.Type.WORD, this.buf.Substring(this.pos, nextpos-this.pos), this.pos);
            this.pos = nextpos;
            return token;
        }

        /// <summary>
        /// Consume a quoted string and return it as a QUOTED token whose text includes both quote characters.
        /// The string ends at the next occurrence of the same quote character that opened it;
        /// there is no escape mechanism for embedding that quote character.
        /// </summary>
        /// <exception cref="LexerException">Thrown when no closing quote is found.</exception>
        private Token processQuote()
        {
            //this.pos points at the opening quote. Find the ending quote.
            int end_index = this.buf.IndexOf(this.buf[this.pos], this.pos + 1);
            if (end_index == -1)
                throw new LexerException(string.Format("Missing matching quote for the quote at position {0}.", this.pos));

            Token token = new Token(Token.Type.QUOTED, this.buf.Substring(this.pos, end_index-this.pos+1), this.pos);
            this.pos = end_index + 1;
            return token;
        }
    }
}