grammar_compiler_visitor.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. """
  2. Author Daniel Riegelhaupt
  3. Date October 2014
  4. A visitor that takes a tree returned by the parser parsing a grammar
  5. and returns a structure that the parser can use to parse files written in that grammar
  6. """
  7. from hutnparser import * #we import the parser for constant, and the Tree Class
  8. from time import time
  9. def dicToStr(dic):
  10. text = ""
  11. keys = dic.keys()
  12. last = None
  13. if len(keys) > 0:
  14. last = keys[-1]
  15. for key in dic.keys():
  16. text += "'" + key + "': "
  17. val = dic[key]
  18. if isinstance(val, dict):
  19. text += " { " + dicToStr(val) +" }, \n"
  20. else:
  21. text += str(val)
  22. if key != last:
  23. text += ", "
  24. return text
  25. #some constants rule names
  26. START_RULE = "start"
  27. GRAMMAR_RULE = "grammar"
  28. PROD_RULE = "production_rule"
  29. RULE_DEF = "rule_definition"
  30. RULE_NAME = "rule_name"
  31. RULE_RHS = "rule_right_hand_side"
  32. TOKEN_DEF = "token_definition"
  33. TOKEN_COLLECTION = "token_collection"
  34. TOKEN_SUB_COLLECTION = "token_sub_collection"
  35. TOKEN_NAME= "token_name"
  36. TOKEN_VALUE = "token_value"
  37. KEYWORD_TYPE = "keywords"
  38. MESSAGE = "message"
  39. MODIFIER = "modifier"
  40. IMPL_MOD = "IMPLICIT_MOD" #token name not a rule name
  41. REMOVE = "remove"
  42. #for rhs rules
  43. CARD_RULE = "cardinality"
  44. MINUS= "MINUS" #token name not a rule name
  45. INT = "INT" #token name not a rule name
  46. OPER = "OPER"
  47. OR = "OR"
  48. LPAR = "LPAR"
  49. RPAR = "RPAR"
  50. #parser constants
  51. TOKEN_TYPE = Parser.Constants.Token
  52. PROD_TYPE= Parser.Constants.Production
  53. class Visitor(object):
  54. """
  55. The way the data structure is: a token is a tree with a tail containing a single string
  56. so it isn't enough to just check for isInstance
  57. """
  58. def isTree(self, item):
  59. ret = False
  60. if isinstance(item, Tree):
  61. if len(item.tail) > 1:
  62. ret = True
  63. elif len(item.tail) == 1 and isinstance(item.tail[0], Tree):
  64. ret = True
  65. return ret
  66. def isToken(self,item):
  67. ret = False
  68. if isinstance(item, Tree):
  69. if len(item.tail) == 1 and isinstance(item.tail[0], basestring):
  70. ret = True
  71. return ret
  72. def getTokenValue(self, item):
  73. #in a rule like Place : place_name ....,
  74. # place_name : LOWER_LETTER
  75. #and item is place_name than the value is at the bottom of place_name -> LOWER_LETTER -> value
  76. #USE ONLY WHEN SURE YOU ARE EXPECTING A TOKEN VALUE
  77. while item and isinstance(item, Tree):
  78. item = item.tail[0]
  79. return str(item)
  80. class GrammarCompilerVisitor(Visitor):
  81. def __init__(self):
  82. self.tokens = {}
  83. self.rules = {}
  84. self.keywords = {}
  85. self.implicit = []
  86. def visit(self, tree):
  87. if self.isTree(tree):
  88. if tree.head == START_RULE:
  89. for item in tree.tail:
  90. if self.isTree(item) and item.head == GRAMMAR_RULE:
  91. self.visitGrammar(item)
  92. #elif self.isTree(item, Tree) and item.head == MAPPER_RULE:
  93. # self.visitMapper(item)
  94. # TODO or maybe in a complete separate visitor depending on how complex this one gets
  95. self.addImplicit()
  96. #print "rules: "
  97. #print dicToStr(self.rules)
  98. return {'rules': self.rules, 'tokens': self.tokens}
  99. def visitGrammar(self,tree):
  100. if self.isTree(tree):
  101. for child in tree.tail:
  102. if self.isTree(child):
  103. rule = child.head
  104. if rule == PROD_RULE: #a grammar consists of prod rules and a prod rule can be a rule or token
  105. self.visitGrammar(child)
  106. elif rule == RULE_DEF: #top level rule definition
  107. self.visitRule(child)
  108. elif rule == TOKEN_COLLECTION: #part of th grammar where tokens are defined as a def or collection
  109. self.visitTokens(child)
  110. else:
  111. print 'Encountered unexpected rule type in grammar rule. type: ', rule
  112. def visitRule(self,tree):
  113. if self.isTree(tree):
  114. name = ''
  115. body = ''
  116. msg = ''
  117. rm = False
  118. for child in tree.tail:
  119. if self.isTree(child):
  120. rule = child.head
  121. if rule == RULE_NAME:
  122. name = self.getTokenValue(child)
  123. # print name
  124. elif rule == RULE_RHS:
  125. body = self.createRHS(child)
  126. elif rule == MESSAGE: #part of th grammar where tokens are defined as a def or collection
  127. #TODO this will not work for the moment due to having an extra parent
  128. #the child 0 is @Msg or Message
  129. #child 1 is a token of type REGEXP and his tail contains the value
  130. msg = self.getTokenValue(child.tail[1])
  131. elif rule == REMOVE:
  132. rm = True
  133. else:
  134. print 'Encountered unexpected rule type in rule definition. type: ', rule
  135. if name and body:
  136. mesg = msg
  137. if not mesg: #if there was no message
  138. msg = name #than the message is the name of the rule
  139. self.addRule(name, body, mesg, rm)
  140. def createRHS(self, tree):
  141. body, boolean = self.innerRHS(tree)
  142. return body
  143. def innerRHS(self, tree):
  144. """
  145. a | b| c in the grammar wil return will return
  146. a tree that if simply traverse directly wil returned the structure [ | , a [ |, b, c]]
  147. we want it to return [|a,b,c]
  148. hence the unpacking
  149. on the other hand if the user explicitly puts a | ( b| c)
  150. we do not allow unpacking so as to return the structure the user expects
  151. (even though they are equivalent and the above is simpler )
  152. hence a bool that says we are not allowed to unpack
  153. """
  154. #reqUnpack = False #expecting to unpack
  155. allowUnpack = True #allow unpacking in upperlevel of recursion
  156. operator = '.' #we always assume it is a sequence we can change it later if we are wrong
  157. rhs = []
  158. for item in tree.tail:
  159. head = item.head
  160. if self.isTree(item):
  161. if head == TOKEN_NAME: # a reference to a token
  162. tok = '$' + self.getTokenValue(item)
  163. rhs.append(tok)
  164. elif head == TOKEN_VALUE: #an anonymous token, written directly in the rule
  165. tok = self.getTokenValue(item)
  166. rhs.append(tok[1:-1])
  167. elif head == RULE_NAME: #a reference to a rule
  168. rule = '@' + self.getTokenValue(item)
  169. rhs.append(rule)
  170. elif head == CARD_RULE: #there is a cardinality operator
  171. #TODO for the moment this has the same bug as message: instead of being at the same level as other child nodes
  172. #this has an extra parent with the same name as its true parent namel rule_right_hand_side
  173. #(rule_definition for message)
  174. allowUnpack = False
  175. operator = '#'
  176. extra = '('
  177. for child in item.tail:
  178. if child.head == MINUS:
  179. extra += '-'
  180. if child.head == INT:
  181. extra += self.getTokenValue(child) + ')'
  182. operator += extra
  183. elif head == RULE_RHS: # another rhs rule
  184. r, u = self.innerRHS(item)
  185. if u == True:
  186. #if operator == '|':
  187. # print "INNER RULE: | begin replaced by: " , r[0]
  188. #operator = r[0]
  189. if r[0] == '|' and operator == '.':
  190. operator = '|'
  191. for item in r[1:]:
  192. rhs.append(item)
  193. else:
  194. rhs.append(r)
  195. else:
  196. print 'Encountered unexpected rule type in tree RHS with head: ', item.head
  197. elif self.isToken(item):
  198. #print "TOKEN INNER in rule:", tree.head, "with name", item.head, " value", self.getTokenValue(item)
  199. head = item.head
  200. if head == OPER:
  201. operator = self.getTokenValue(item)
  202. allowUnpack = False
  203. elif head == OR:
  204. operator = '|'
  205. #reqUnpack = True
  206. elif head == LPAR:
  207. allowUnpack = False #whatever is here belongs togheter and can't be unpacked at a higher level
  208. elif head == RPAR:
  209. pass #the case is here because it is legal but doesn't require any other action than LPAR
  210. else:
  211. print 'Encountered unexpected Token in RHS of kind: ', head
  212. if operator == '.' or operator == '|':
  213. rhs = [operator] + rhs
  214. elif not operator.startswith('#'): #so * + or ?
  215. if len(rhs) == 1:
  216. rhs = [operator] + rhs
  217. else:
  218. rhs = [operator] + ['.' , rhs ]
  219. else:
  220. #TODO uncomment this when bug about parent level is fixed for card (and message)
  221. zero = str(rhs[0])
  222. if zero.startswith('@') or zero.startswith('$'):
  223. zero = zero[1:]
  224. rhs[0] = operator + zero
  225. return rhs , allowUnpack
  226. def addImplicit(self):
  227. if self.implicit and self.rules.has_key('start'):
  228. t = str(time())
  229. t = t.replace('.','')[:-5]
  230. name = 'implicit_autogenerated_' + t
  231. #name + random number to avoid any conflicts with possible names
  232. impl= ['|'] + self.implicit
  233. body = [ '*', impl ]
  234. msg = "Automatically generated 'Implict' rule"
  235. #we add it to the rules
  236. self.addRule(name, body, msg)
  237. self.rules['start']['interleave'] = ['?', '@' + name]
  238. def visitTokens(self,tree):
  239. if self.isTree(tree):
  240. for child in tree.tail:
  241. if self.isTree(child):
  242. rule = child.head
  243. if rule == TOKEN_DEF:
  244. self.fillTokenInfo(child)
  245. elif rule == TOKEN_SUB_COLLECTION:
  246. #a collection is 0 type 1: 2 name 3 { 4 and further token_defs last }
  247. colType = self.getTokenValue(child.tail[0])
  248. colName = self.getTokenValue(child.tail[2])
  249. for item in child.tail[4:]:
  250. if self.isTree(item) and item.head == TOKEN_DEF:
  251. self.fillTokenInfo(item,colType, colName)
  252. else: #token_collection_content is the parent of both token def and token sub collection
  253. self.visitTokens(child)
  254. def fillTokenInfo(self,tree, colType = None, colName = ''):
  255. name = ''
  256. val = ''
  257. msg = ''
  258. rm = False
  259. for item in tree.tail:
  260. if self.isTree(item):
  261. head = item.head
  262. if head == TOKEN_NAME:
  263. #roken name contains a child of type token uppercase and this child contains the actual value
  264. name = self.getTokenValue(item)
  265. elif head == TOKEN_VALUE:
  266. val = self.getTokenValue(item)
  267. elif head == MESSAGE:
  268. #the child 0 is @Msg or Message
  269. #child 1 is a token of type REGEXP and his tail contains the value
  270. msg = self.getTokenValue(item.tail[1])
  271. elif head == MODIFIER:
  272. #the tail is the token, the head gives us the name we don't need the actual value
  273. #especially since the actual value might change
  274. if item.tail[0].head == IMPL_MOD:
  275. self.implicit.append( '$' + name)
  276. #else:
  277. #pass
  278. #TODO if there are multiple modifers do something
  279. elif head == REMOVE:
  280. rm = True
  281. if name and val:
  282. mesg = msg
  283. if not mesg: #if there was no message
  284. msg = val #than the message is the value of the token
  285. self.addToken(name, val, mesg, rm)
  286. def addToken(self, name, value, msg, hidden = False):
  287. msg = str(msg[1:-1])
  288. value = str(value[1:-1])
  289. val = {'type': TOKEN_TYPE, 'reg': value, 'errortext': msg }
  290. if (hidden == True):
  291. val['hidden'] = True
  292. self.tokens[name] = val
  293. def addRule(self, name, rhs, msg, hidden = False):
  294. msg = str(msg[1:-1])
  295. val = {'type': PROD_TYPE, 'body': rhs, 'errortext': msg }
  296. if (hidden == True):
  297. val['hidden'] = True
  298. self.rules[name] = val