grammar_compiler_visitor.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. """
  2. Author Daniel Riegelhaupt
  3. Date October 2014
  4. A visitor that takes a tree returned by the parser parsing a grammar
  5. and returns a structure that the parser can use to parse files written in that grammar
  6. """
  7. from interface.HUTN.hutn_compiler.hutnparser import * #we import the parser for constant, and the Tree Class
  8. from time import time
  9. def dicToStr(dic):
  10. text = ""
  11. keys = dic.keys()
  12. last = None
  13. if len(keys) > 0:
  14. last = keys[-1]
  15. for key in dic.keys():
  16. text += "'" + key + "': "
  17. val = dic[key]
  18. if isinstance(val, dict):
  19. text += " { " + dicToStr(val) +" }, \n"
  20. else:
  21. text += str(val)
  22. if key != last:
  23. text += ", "
  24. return text
  25. #some constants rule names
  26. START_RULE = "start"
  27. GRAMMAR_RULE = "grammar"
  28. PROD_RULE = "production_rule"
  29. RULE_DEF = "rule_definition"
  30. RULE_NAME = "rule_name"
  31. RULE_RHS = "rule_right_hand_side"
  32. TOKEN_DEF = "token_definition"
  33. TOKEN_COLLECTION = "token_collection"
  34. TOKEN_SUB_COLLECTION = "token_sub_collection"
  35. TOKEN_NAME= "token_name"
  36. TOKEN_VALUE = "token_value"
  37. KEYWORD_TYPE = "keywords"
  38. MESSAGE = "message"
  39. MODIFIER = "modifier"
  40. IMPL_MOD = "IMPLICIT_MOD" #token name not a rule name
  41. REMOVE = "remove"
  42. #for rhs rules
  43. CARD_RULE = "cardinality"
  44. MINUS= "MINUS" #token name not a rule name
  45. INT = "INT" #token name not a rule name
  46. OPER = "OPER"
  47. OR = "OR"
  48. LPAR = "LPAR"
  49. RPAR = "RPAR"
  50. #parser constants
  51. TOKEN_TYPE = Parser.Constants.Token
  52. PROD_TYPE= Parser.Constants.Production
  53. class Visitor(object):
  54. """
  55. The way the data structure is: a token is a tree with a tail containing a single string
  56. so it isn't enough to just check for isInstance
  57. """
  58. def isTree(self, item):
  59. ret = False
  60. if isinstance(item, Tree):
  61. if len(item.tail) > 1:
  62. ret = True
  63. elif len(item.tail) == 1 and isinstance(item.tail[0], Tree):
  64. ret = True
  65. return ret
  66. def isToken(self,item):
  67. ret = False
  68. if isinstance(item, Tree):
  69. try:
  70. if len(item.tail) == 1 and isinstance(item.tail[0], basestring):
  71. ret = True
  72. except NameError:
  73. if len(item.tail) == 1 and isinstance(item.tail[0], str):
  74. ret = True
  75. return ret
  76. def getTokenValue(self, item):
  77. #in a rule like Place : place_name ....,
  78. # place_name : LOWER_LETTER
  79. #and item is place_name than the value is at the bottom of place_name -> LOWER_LETTER -> value
  80. #USE ONLY WHEN SURE YOU ARE EXPECTING A TOKEN VALUE
  81. while item and isinstance(item, Tree):
  82. item = item.tail[0]
  83. return str(item)
  84. class GrammarCompilerVisitor(Visitor):
  85. def __init__(self):
  86. self.tokens = {}
  87. self.rules = {}
  88. self.keywords = {}
  89. self.implicit = []
  90. def visit(self, tree):
  91. if self.isTree(tree):
  92. if tree.head == START_RULE:
  93. for item in tree.tail:
  94. if self.isTree(item) and item.head == GRAMMAR_RULE:
  95. self.visitGrammar(item)
  96. #elif self.isTree(item, Tree) and item.head == MAPPER_RULE:
  97. # self.visitMapper(item)
  98. # TODO or maybe in a complete separate visitor depending on how complex this one gets
  99. self.addImplicit()
  100. #print "rules: "
  101. #print dicToStr(self.rules)
  102. return {'rules': self.rules, 'tokens': self.tokens}
  103. def visitGrammar(self,tree):
  104. if self.isTree(tree):
  105. for child in tree.tail:
  106. if self.isTree(child):
  107. rule = child.head
  108. if rule == PROD_RULE: #a grammar consists of prod rules and a prod rule can be a rule or token
  109. self.visitGrammar(child)
  110. elif rule == RULE_DEF: #top level rule definition
  111. self.visitRule(child)
  112. elif rule == TOKEN_COLLECTION: #part of th grammar where tokens are defined as a def or collection
  113. self.visitTokens(child)
  114. else:
  115. print('Encountered unexpected rule type in grammar rule. type: ' + str(rule))
  116. def visitRule(self,tree):
  117. if self.isTree(tree):
  118. name = ''
  119. body = ''
  120. msg = ''
  121. rm = False
  122. for child in tree.tail:
  123. if self.isTree(child):
  124. rule = child.head
  125. if rule == RULE_NAME:
  126. name = self.getTokenValue(child)
  127. # print name
  128. elif rule == RULE_RHS:
  129. body = self.createRHS(child)
  130. elif rule == MESSAGE: #part of th grammar where tokens are defined as a def or collection
  131. #TODO this will not work for the moment due to having an extra parent
  132. #the child 0 is @Msg or Message
  133. #child 1 is a token of type REGEXP and his tail contains the value
  134. msg = self.getTokenValue(child.tail[1])
  135. elif rule == REMOVE:
  136. rm = True
  137. else:
  138. print('Encountered unexpected rule type in rule definition. type: ' + str(rule))
  139. if name and body:
  140. mesg = msg
  141. if not mesg: #if there was no message
  142. msg = name #than the message is the name of the rule
  143. self.addRule(name, body, mesg, rm)
  144. def createRHS(self, tree):
  145. body, boolean = self.innerRHS(tree)
  146. return body
  147. def innerRHS(self, tree):
  148. """
  149. a | b| c in the grammar wil return will return
  150. a tree that if simply traverse directly wil returned the structure [ | , a [ |, b, c]]
  151. we want it to return [|a,b,c]
  152. hence the unpacking
  153. on the other hand if the user explicitly puts a | ( b| c)
  154. we do not allow unpacking so as to return the structure the user expects
  155. (even though they are equivalent and the above is simpler )
  156. hence a bool that says we are not allowed to unpack
  157. """
  158. #reqUnpack = False #expecting to unpack
  159. allowUnpack = True #allow unpacking in upperlevel of recursion
  160. operator = '.' #we always assume it is a sequence we can change it later if we are wrong
  161. rhs = []
  162. for item in tree.tail:
  163. head = item.head
  164. if self.isTree(item):
  165. if head == TOKEN_NAME: # a reference to a token
  166. tok = '$' + self.getTokenValue(item)
  167. rhs.append(tok)
  168. elif head == TOKEN_VALUE: #an anonymous token, written directly in the rule
  169. tok = self.getTokenValue(item)
  170. rhs.append(tok[1:-1])
  171. elif head == RULE_NAME: #a reference to a rule
  172. rule = '@' + self.getTokenValue(item)
  173. rhs.append(rule)
  174. elif head == CARD_RULE: #there is a cardinality operator
  175. #TODO for the moment this has the same bug as message: instead of being at the same level as other child nodes
  176. #this has an extra parent with the same name as its true parent namel rule_right_hand_side
  177. #(rule_definition for message)
  178. allowUnpack = False
  179. operator = '#'
  180. extra = '('
  181. for child in item.tail:
  182. if child.head == MINUS:
  183. extra += '-'
  184. if child.head == INT:
  185. extra += self.getTokenValue(child) + ')'
  186. operator += extra
  187. elif head == RULE_RHS: # another rhs rule
  188. r, u = self.innerRHS(item)
  189. if u == True:
  190. #if operator == '|':
  191. # print "INNER RULE: | begin replaced by: " , r[0]
  192. #operator = r[0]
  193. if r[0] == '|' and operator == '.':
  194. operator = '|'
  195. for item in r[1:]:
  196. rhs.append(item)
  197. else:
  198. rhs.append(r)
  199. else:
  200. print('Encountered unexpected rule type in tree RHS with head: ' + str(item.head))
  201. elif self.isToken(item):
  202. #print "TOKEN INNER in rule:", tree.head, "with name", item.head, " value", self.getTokenValue(item)
  203. head = item.head
  204. if head == OPER:
  205. operator = self.getTokenValue(item)
  206. allowUnpack = False
  207. elif head == OR:
  208. operator = '|'
  209. #reqUnpack = True
  210. elif head == LPAR:
  211. allowUnpack = False #whatever is here belongs togheter and can't be unpacked at a higher level
  212. elif head == RPAR:
  213. pass #the case is here because it is legal but doesn't require any other action than LPAR
  214. else:
  215. print('Encountered unexpected Token in RHS of kind: ' + str(head))
  216. if operator == '.' or operator == '|':
  217. rhs = [operator] + rhs
  218. elif not operator.startswith('#'): #so * + or ?
  219. if len(rhs) == 1:
  220. rhs = [operator] + rhs
  221. else:
  222. rhs = [operator] + ['.' , rhs ]
  223. else:
  224. #TODO uncomment this when bug about parent level is fixed for card (and message)
  225. zero = str(rhs[0])
  226. if zero.startswith('@') or zero.startswith('$'):
  227. zero = zero[1:]
  228. rhs[0] = operator + zero
  229. return rhs , allowUnpack
  230. def addImplicit(self):
  231. if self.implicit and 'start' in self.rules:
  232. t = str(time())
  233. t = t.replace('.','')[:-5]
  234. name = 'implicit_autogenerated_' + t
  235. #name + random number to avoid any conflicts with possible names
  236. impl= ['|'] + self.implicit
  237. body = [ '*', impl ]
  238. msg = "Automatically generated 'Implicit' rule"
  239. #we add it to the rules
  240. self.addRule(name, body, msg)
  241. self.rules['start']['interleave'] = ['?', '@' + name]
  242. def visitTokens(self,tree):
  243. if self.isTree(tree):
  244. for child in tree.tail:
  245. if self.isTree(child):
  246. rule = child.head
  247. if rule == TOKEN_DEF:
  248. self.fillTokenInfo(child)
  249. elif rule == TOKEN_SUB_COLLECTION:
  250. #a collection is 0 type 1: 2 name 3 { 4 and further token_defs last }
  251. colType = self.getTokenValue(child.tail[0])
  252. colName = self.getTokenValue(child.tail[2])
  253. for item in child.tail[4:]:
  254. if self.isTree(item) and item.head == TOKEN_DEF:
  255. self.fillTokenInfo(item,colType, colName)
  256. else: #token_collection_content is the parent of both token def and token sub collection
  257. self.visitTokens(child)
  258. def fillTokenInfo(self,tree, colType = None, colName = ''):
  259. name = ''
  260. val = ''
  261. msg = ''
  262. rm = False
  263. for item in tree.tail:
  264. if self.isTree(item):
  265. head = item.head
  266. if head == TOKEN_NAME:
  267. #roken name contains a child of type token uppercase and this child contains the actual value
  268. name = self.getTokenValue(item)
  269. elif head == TOKEN_VALUE:
  270. val = self.getTokenValue(item)
  271. elif head == MESSAGE:
  272. #the child 0 is @Msg or Message
  273. #child 1 is a token of type REGEXP and his tail contains the value
  274. msg = self.getTokenValue(item.tail[1])
  275. elif head == MODIFIER:
  276. #the tail is the token, the head gives us the name we don't need the actual value
  277. #especially since the actual value might change
  278. if item.tail[0].head == IMPL_MOD:
  279. self.implicit.append( '$' + name)
  280. #else:
  281. #pass
  282. #TODO if there are multiple modifers do something
  283. elif head == REMOVE:
  284. rm = True
  285. if name and val:
  286. mesg = msg
  287. if not mesg: #if there was no message
  288. msg = val #than the message is the value of the token
  289. self.addToken(name, val, mesg, rm)
  290. def addToken(self, name, value, msg, hidden = False):
  291. msg = str(msg[1:-1])
  292. value = str(value[1:-1])
  293. val = {'type': TOKEN_TYPE, 'reg': value, 'errortext': msg }
  294. if (hidden == True):
  295. val['hidden'] = True
  296. self.tokens[name] = val
  297. def addRule(self, name, rhs, msg, hidden = False):
  298. msg = str(msg[1:-1])
  299. val = {'type': PROD_TYPE, 'body': rhs, 'errortext': msg }
  300. if (hidden == True):
  301. val['hidden'] = True
  302. self.rules[name] = val