lexer.py

from utils import Enum

TokenType = Enum("SLASH",
                 "LBRACKET",
                 "RBRACKET",
                 "COMMA",
                 "DOT",
                 "NUMBER",
                 "WORD",
                 "QUOTED",
                 "WHITESPACE",
                 "BINARYOPERATOR",
                 "UNARYOPERATOR",
                 "UNKNOWN")


class Token(object):
    """ A simple token structure: token type, value and position. """
    def __init__(self, token_type, val, pos):
        self.type = token_type
        self.val = val
        self.pos = pos

    def __str__(self):
        return '%s(%s) at %s' % (TokenType.name_of(self.type), self.val, self.pos)


class LexerError(Exception):
    """ Raised when no lexing rule matches the current input. """
    # The raise sites below pass a full message string, so the exception
    # accepts a message rather than a bare position.
    def __init__(self, msg):
        super(LexerError, self).__init__(msg)


class Lexer(object):
    # Operator rules; keys may be one or two characters long.
    operator_rules = {
        '/': TokenType.SLASH,
        '(': TokenType.LBRACKET,
        ')': TokenType.RBRACKET,
        ',': TokenType.COMMA,
        '.': TokenType.DOT,
        '+': TokenType.BINARYOPERATOR,
        '-': TokenType.BINARYOPERATOR,
        '<': TokenType.BINARYOPERATOR,
        '>': TokenType.BINARYOPERATOR,
        '==': TokenType.BINARYOPERATOR,
        '<=': TokenType.BINARYOPERATOR,
        '>=': TokenType.BINARYOPERATOR,
        '=': TokenType.BINARYOPERATOR,
        '+=': TokenType.BINARYOPERATOR,
        '-=': TokenType.BINARYOPERATOR,
        '&&': TokenType.BINARYOPERATOR,
        '||': TokenType.BINARYOPERATOR,
        '!': TokenType.UNARYOPERATOR}

    def __init__(self, skip_white_space=True, accept_unknown_tokens=False):
        self.skip_white_space = skip_white_space
        self.accept_unknown_tokens = accept_unknown_tokens

    def input(self, buf):
        """ Initialize the lexer with a buffer as input. """
        self.buf = buf
        self.pos = 0
        self.buflen = len(buf)

    def nextToken(self):
        """ Return the next token (a Token object) found in the
            input buffer. None is returned if the end of the
            buffer was reached.
            In case of a lexing error (the current chunk of the
            buffer matches no rule), a LexerError is raised.
        """
        if self.skip_white_space:
            self.skipWhiteSpace()
        if self.pos >= self.buflen:
            return None
        # Try a two-character operator first, so that e.g. '<=' is not
        # lexed as '<' followed by '='. Checking two characters up front
        # also covers '&&' and '||', whose first character is not an
        # operator on its own and would otherwise never match.
        if self.pos < self.buflen - 1:
            c2 = self.buf[self.pos:self.pos + 2]
            result_type = self.operator_rules.get(c2)
            if result_type is not None:
                token = Token(result_type, c2, self.pos)
                self.pos += 2
                return token
        c = self.buf[self.pos]
        result_type = self.operator_rules.get(c)
        if result_type is not None:
            token = Token(result_type, c, self.pos)
            self.pos += 1
            return token
        # Not an operator: dispatch on the character class.
        if self.isAlpha(c):
            return self.processIdentifier()
        elif self.isDigit(c):
            return self.processNumber()
        elif c == "'" or c == '"':
            return self.processQuote()
        elif self.isWhiteSpace(c):
            return self.processWhiteSpace()
        # If we're here, no rule matched.
        if self.accept_unknown_tokens:
            token = Token(TokenType.UNKNOWN, c, self.pos)
            self.pos += 1
            return token
        raise LexerError('Invalid character at position %d.' % self.pos)

    def tokens(self):
        """ Return an iterator over the tokens found in the buffer. """
        while True:
            tok = self.nextToken()
            if tok is None:
                break
            yield tok

    def skipWhiteSpace(self):
        while self.pos < self.buflen:
            if self.isWhiteSpace(self.buf[self.pos]):
                self.pos += 1
            else:
                break

    def isAlpha(self, c):
        return c.isalpha() or c == '_'

    def isAlphaNum(self, c):
        return c.isalnum() or c == '_'

    def isDigit(self, c):
        return c.isdigit()

    def isWhiteSpace(self, c):
        return c == ' ' or c == '\t' or c == '\r' or c == '\n'

    def processNumber(self):
        # Only unsigned integer literals are lexed here; '.' is a
        # separate DOT token.
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isDigit(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.NUMBER, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token

    def processIdentifier(self):
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isAlphaNum(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.WORD, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token

    def processQuote(self):
        # self.pos points at the opening quote. Find the matching closing
        # quote; escaped quotes inside the string are not handled.
        end_index = self.buf.find(self.buf[self.pos], self.pos + 1)
        if end_index == -1:
            raise LexerError('Missing matching quote for the quote at position %d.' % self.pos)
        token = Token(TokenType.QUOTED, self.buf[self.pos:end_index + 1], self.pos)
        self.pos = end_index + 1
        return token

    def processWhiteSpace(self):
        nextpos = self.pos + 1
        while nextpos < self.buflen and self.isWhiteSpace(self.buf[nextpos]):
            nextpos += 1
        token = Token(TokenType.WHITESPACE, self.buf[self.pos:nextpos], self.pos)
        self.pos = nextpos
        return token
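

# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal demonstration of the lexer, assuming the utils.Enum helper
# imported above behaves as Token.__str__ implies (i.e. it provides
# name_of()). The input string here is made up for illustration.
if __name__ == '__main__':
    lexer = Lexer(skip_white_space=True, accept_unknown_tokens=True)
    lexer.input('foo(bar, 42) && x <= "quoted text"')
    for tok in lexer.tokens():
        print(tok)
    # Prints one line per token, e.g. WORD(foo) at 0, LBRACKET(() at 3, ...
    # Note that '&&' and '<=' each come out as a single BINARYOPERATOR
    # token thanks to the two-character lookahead in nextToken().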