# lexer.py

from utils import Enum

TokenType = Enum("SLASH",
                 "LBRACKET",
                 "RBRACKET",
                 "COMMA",
                 "DOT",
                 "NUMBER",
                 "WORD",
                 "QUOTED",
                 "WHITESPACE",
                 "UNKNOWN")


class Token(object):
    """ A simple Token structure: token type, value and position.
    """
    def __init__(self, token_type, val, pos):
        self.type = token_type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (TokenType.name_of(self.type), self.val, self.pos)


class LexerError(Exception):
    """ Lexer error exception.

        pos: position in the input buffer where the error occurred.
    """
    def __init__(self, pos):
        self.pos = pos


class Lexer(object):
    # Single-character tokens, mapped directly to their token type
    single_rules = {
        '/': TokenType.SLASH,
        '(': TokenType.LBRACKET,
        ')': TokenType.RBRACKET,
        ',': TokenType.COMMA,
        '.': TokenType.DOT,
    }

    def __init__(self, skip_white_space=True, accept_unknown_tokens=False):
        self.skip_white_space = skip_white_space
        self.accept_unknown_tokens = accept_unknown_tokens

    def input(self, buf):
        """ Initialize the lexer with a buffer as input.
        """
        self.buf = buf
        self.pos = 0
        self.buflen = len(buf)

    def nextToken(self):
        """ Return the next token (a Token object) found in the
            input buffer. None is returned if the end of the
            buffer was reached.
            In case of a lexing error (the current chunk of the
            buffer matches no rule), a LexerError is raised with
            the position of the error.
        """
        if self.skip_white_space:
            self.skipWhiteSpace()
        if self.pos >= self.buflen:
            return None
        # c is the first character of the next token
        c = self.buf[self.pos]
        # Check if it is a single-character operator
        result_type = self.single_rules.get(c, None)
        if result_type is not None:
            token = Token(result_type, c, self.pos)
            self.pos += 1
            return token
        else:  # not an operator
            if self.isAlpha(c):
                return self.processIdentifier()
            elif self.isDigit(c):
                return self.processNumber()
            elif c == "'" or c == '"':
                return self.processQuote()
            elif self.isWhiteSpace(c):
                return self.processWhiteSpace()
        # If we're here, no rule matched
        if self.accept_unknown_tokens:
            token = Token(TokenType.UNKNOWN, c, self.pos)
            self.pos += 1
            return token
        raise LexerError(self.pos)

    def tokens(self):
        """ Returns an iterator to the tokens found in the buffer.
        """
        while True:
            tok = self.nextToken()
            if tok is None:
                break
            yield tok

    def skipWhiteSpace(self):
        while self.pos < self.buflen:
            if self.isWhiteSpace(self.buf[self.pos]):
                self.pos += 1
            else:
                break

    def isAlpha(self, c):
        return c.isalpha() or c == '_'

    def isAlphaNum(self, c):
        return c.isalnum() or c == '_'

    def isDigit(self, c):
        return c.isdigit()

    def isWhiteSpace(self, c):
        return c == ' ' or c == '\t' or c == '\r' or c == '\n'

    def processNumber(self):
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isDigit(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.NUMBER, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token

    def processIdentifier(self):
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isAlphaNum(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.WORD, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token

    def processQuote(self):
        # self.pos points at the opening quote. Find the ending quote.
        end_index = self.buf.find(self.buf[self.pos], self.pos + 1)
        if end_index == -1:
            # No matching closing quote was found
            raise LexerError(self.pos)
        token = Token(TokenType.QUOTED, self.buf[self.pos:end_index + 1], self.pos)
        self.pos = end_index + 1
        return token

    def processWhiteSpace(self):
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isWhiteSpace(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.WHITESPACE, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token
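

# A minimal usage sketch, not part of the lexer itself. It assumes that
# utils.Enum provides the name_of helper already used by Token.__str__ above,
# and the sample input string below is illustrative only. Running the file
# directly tokenizes the sample and prints each token with its type, value
# and position; with accept_unknown_tokens=False, an unmatched character
# would raise LexerError instead of producing an UNKNOWN token.
if __name__ == '__main__':
    lexer = Lexer(skip_white_space=True, accept_unknown_tokens=True)
    lexer.input("foo/bar(1, 'baz').qux")
    for tok in lexer.tokens():
        print(tok)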