Lexer.cs 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. using System;
  2. using System.Collections.Generic;
  3. namespace csharp_sccd_compiler
  4. {
  /// <summary>
  /// Splits an input string into <see cref="Token"/> objects: single-character
  /// operators, words, numbers, quoted strings and (optionally) whitespace runs.
  /// Call <see cref="setInput"/> first, then pull tokens with
  /// <see cref="nextToken"/> or <see cref="iterateTokens"/>.
  /// </summary>
  public class Lexer
  {
      /// <summary>When true, whitespace is skipped before each token instead of being returned as a token.</summary>
      readonly bool skip_white_space;

      /// <summary>When true, a character matching no rule becomes an UNKNOWN token instead of an exception.</summary>
      readonly bool accept_unknown_tokens;

      /// <summary>Characters that form a complete token on their own, mapped to their token type.</summary>
      private static readonly Dictionary<char, Token.Type> single_rules = new Dictionary<char, Token.Type>
      {
          { '/', Token.Type.SLASH },
          { '(', Token.Type.LBRACKET },
          { ')', Token.Type.RBRACKET },
          { ',', Token.Type.COMMA },
          { '.', Token.Type.DOT },
      };

      // Starts out empty so that asking for a token before setInput() simply
      // reports end-of-input instead of failing with a NullReferenceException.
      string buf = string.Empty;

      /// <summary>Index in <c>buf</c> of the next character to be examined.</summary>
      int pos;

      /// <summary>
      /// Create a lexer.
      /// </summary>
      /// <param name="skip_white_space">Skip whitespace between tokens rather than returning WHITESPACE tokens.</param>
      /// <param name="accept_unknown_tokens">Return UNKNOWN tokens for unmatched characters rather than throwing.</param>
      public Lexer(bool skip_white_space = true, bool accept_unknown_tokens = false)
      {
          this.skip_white_space = skip_white_space;
          this.accept_unknown_tokens = accept_unknown_tokens;
      }

      /// <summary>
      /// Initialize the lexer with a buffer as input and rewind to its start.
      /// </summary>
      /// <param name="buffer">The text to tokenize.</param>
      /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
      public void setInput(string buffer)
      {
          // Fail at the call site; otherwise the null would only surface later,
          // inside nextToken(), as an unrelated NullReferenceException.
          if (buffer == null)
          {
              throw new ArgumentNullException("buffer");
          }

          this.buf = buffer;
          this.pos = 0;
      }

      /// <summary>
      /// Return the next token found in the input buffer, or null if the end of the buffer was reached.
      /// </summary>
      /// <returns>The next <see cref="Token"/>, or null at end of input.</returns>
      /// <exception cref="LexerException">
      /// The current character matches no rule and unknown tokens are not accepted,
      /// or an opening quote has no matching closing quote.
      /// </exception>
      public Token nextToken()
      {
          if (this.skip_white_space)
          {
              this.skipWhiteSpace();
          }

          if (this.pos >= this.buf.Length)
          {
              return null;
          }

          char c = this.buf[this.pos]; // first char of the next token

          Token.Type result_type;
          if (Lexer.single_rules.TryGetValue(c, out result_type))
          {
              return this.takeSingleChar(result_type, c);
          }

          if (isAlpha(c))
          {
              return this.processWord();
          }
          if (isDigit(c))
          {
              return this.processNumber();
          }
          if (c == '\'' || c == '"')
          {
              return this.processQuote();
          }
          if (isWhiteSpace(c))
          {
              // Only reachable when whitespace is not being skipped.
              return this.processWhiteSpace();
          }

          // No rule matched.
          if (this.accept_unknown_tokens)
          {
              return this.takeSingleChar(Token.Type.UNKNOWN, c);
          }

          throw new LexerException(string.Format("Invalid character at position {0}.", this.pos));
      }

      /// <summary>
      /// Lazily enumerate the remaining tokens by calling <see cref="nextToken"/> until it returns null.
      /// The enumeration advances this lexer's own position.
      /// </summary>
      public IEnumerable<Token> iterateTokens()
      {
          Token tok;
          while ((tok = this.nextToken()) != null)
          {
              yield return tok;
          }
      }

      /// <summary>Advance <c>pos</c> past any whitespace at the current position.</summary>
      private void skipWhiteSpace()
      {
          while (this.pos < this.buf.Length && isWhiteSpace(this.buf[this.pos]))
          {
              this.pos += 1;
          }
      }

      /// <summary>True for space, tab, carriage return and line feed.</summary>
      private static bool isWhiteSpace(char c)
      {
          return c == ' ' || c == '\t' || c == '\r' || c == '\n';
      }

      /// <summary>True for letters and the underscore.</summary>
      private static bool isAlpha(char c)
      {
          return char.IsLetter(c) || c == '_';
      }

      /// <summary>True for decimal digits as classified by <see cref="char.IsDigit(char)"/>.</summary>
      private static bool isDigit(char c)
      {
          return char.IsDigit(c);
      }

      /// <summary>True for characters allowed after the first character of a word.</summary>
      private static bool isAlphaNum(char c)
      {
          return isAlpha(c) || isDigit(c);
      }

      /// <summary>Build a one-character token at the current position and step past it.</summary>
      private Token takeSingleChar(Token.Type type, char c)
      {
          Token token = new Token(type, c.ToString(), this.pos);
          this.pos += 1;
          return token;
      }

      /// <summary>
      /// Build a token from the character at the current position plus every
      /// directly following character accepted by <paramref name="belongs"/>,
      /// and move the position just past that run.
      /// </summary>
      private Token consumeRun(Token.Type type, Func<char, bool> belongs)
      {
          int start = this.pos;
          int end = start + 1; // the first character was already classified by the caller
          while (end < this.buf.Length && belongs(this.buf[end]))
          {
              end += 1;
          }

          this.pos = end;
          return new Token(type, this.buf.Substring(start, end - start), start);
      }

      /// <summary>Consume a run of whitespace as a WHITESPACE token.</summary>
      private Token processWhiteSpace()
      {
          return this.consumeRun(Token.Type.WHITESPACE, isWhiteSpace);
      }

      /// <summary>Consume a run of digits as a NUMBER token.</summary>
      private Token processNumber()
      {
          return this.consumeRun(Token.Type.NUMBER, isDigit);
      }

      /// <summary>Consume a letter/underscore followed by letters, digits and underscores as a WORD token.</summary>
      private Token processWord()
      {
          return this.consumeRun(Token.Type.WORD, isAlphaNum);
      }

      /// <summary>
      /// Consume a quoted string as a QUOTED token. The token text includes both
      /// the opening and the closing quote character.
      /// </summary>
      /// <exception cref="LexerException">No matching closing quote exists in the rest of the buffer.</exception>
      private Token processQuote()
      {
          // this.pos points at the opening quote; the closing quote is the next
          // occurrence of that same character (no escape sequences are recognised).
          int start = this.pos;
          int end_index = this.buf.IndexOf(this.buf[start], start + 1);
          if (end_index == -1)
          {
              throw new LexerException(string.Format("Missing matching quote for the quote at position {0}.", start));
          }

          this.pos = end_index + 1;
          return new Token(Token.Type.QUOTED, this.buf.Substring(start, end_index - start + 1), start);
      }
  }
  143. }