- """
- Author Daniel Riegelhaupt
- Date October 2014
- A visitor that takes a tree returned by the parser parsing a grammar
- and returns a structure that the parser can use to parse files written in that grammar
- """
- from hutnparser import * #we import the parser for constant, and the Tree Class
- from time import time
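
# Typical driving code, as a rough sketch only: the exact Parser/Tree API lives in
# hutnparser, and the constructor arguments shown here are assumptions rather than the
# real signature. The sketch only illustrates that visit() returns the
# {'rules': ..., 'tokens': ...} structure built below.
#
#   tree = Parser(grammar_text).parse()               # hypothetical call: parse the grammar definition
#   structure = GrammarCompilerVisitor().visit(tree)  # -> {'rules': {...}, 'tokens': {...}}
#   # 'structure' is what the parser then uses to parse files written in that grammar.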


def dicToStr(dic):
    text = ""
    keys = dic.keys()
    last = None
    if len(keys) > 0:
        last = keys[-1]
    for key in keys:
        text += "'" + key + "': "
        val = dic[key]
        if isinstance(val, dict):
            text += " { " + dicToStr(val) + " }, \n"
        else:
            text += str(val)
        if key != last:
            text += ", "
    return text
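
# Illustrative example of the helper above (key order may vary, since this is a plain
# Python 2 dict): dicToStr({'a': 1, 'b': {'c': 2}}) produces roughly
#   'a': 1, 'b':  { 'c': 2 },
# i.e. a loosely formatted dump intended only for debugging output.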


# rule name constants
START_RULE = "start"
GRAMMAR_RULE = "grammar"
PROD_RULE = "production_rule"
RULE_DEF = "rule_definition"
RULE_NAME = "rule_name"
RULE_RHS = "rule_right_hand_side"
TOKEN_DEF = "token_definition"
TOKEN_COLLECTION = "token_collection"
TOKEN_SUB_COLLECTION = "token_sub_collection"
TOKEN_NAME = "token_name"
TOKEN_VALUE = "token_value"
KEYWORD_TYPE = "keywords"
MESSAGE = "message"
MODIFIER = "modifier"
IMPL_MOD = "IMPLICIT_MOD"  # token name, not a rule name
REMOVE = "remove"

# constants for RHS rules
CARD_RULE = "cardinality"
MINUS = "MINUS"  # token name, not a rule name
INT = "INT"  # token name, not a rule name
OPER = "OPER"
OR = "OR"
LPAR = "LPAR"
RPAR = "RPAR"

# parser constants
TOKEN_TYPE = Parser.Constants.Token
PROD_TYPE = Parser.Constants.Production


class Visitor(object):
    """
    In the parser's data structure a token is also a Tree, just one whose tail contains a
    single string, so checking isinstance(item, Tree) alone is not enough to tell full
    subtrees and tokens apart.
    """

    def isTree(self, item):
        ret = False
        if isinstance(item, Tree):
            if len(item.tail) > 1:
                ret = True
            elif len(item.tail) == 1 and isinstance(item.tail[0], Tree):
                ret = True
        return ret

    def isToken(self, item):
        ret = False
        if isinstance(item, Tree):
            if len(item.tail) == 1 and isinstance(item.tail[0], basestring):
                ret = True
        return ret

    def getTokenValue(self, item):
        # In a rule like Place : place_name ..., with place_name : LOWER_LETTER,
        # if item is place_name then the value sits at the bottom of
        # place_name -> LOWER_LETTER -> value.
        # USE ONLY WHEN YOU ARE SURE YOU ARE EXPECTING A TOKEN VALUE.
        while item and isinstance(item, Tree):
            item = item.tail[0]
        return str(item)
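
# Shape examples for the checks above (illustrative; they assume the Tree class from
# hutnparser exposes 'head' and 'tail' and can be built as Tree(head, tail), which is an
# assumption about its constructor):
#   Tree('rule_name', [Tree('LOWER_LETTER', ['place'])])   -> isTree() is True (tail holds a Tree)
#   Tree('LOWER_LETTER', ['place'])                        -> isToken() is True (tail holds one string)
#   getTokenValue(Tree('rule_name', [Tree('LOWER_LETTER', ['place'])])) == 'place'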


class GrammarCompilerVisitor(Visitor):

    def __init__(self):
        self.tokens = {}
        self.rules = {}
        self.keywords = {}
        self.implicit = []

    def visit(self, tree):
        if self.isTree(tree):
            if tree.head == START_RULE:
                for item in tree.tail:
                    if self.isTree(item) and item.head == GRAMMAR_RULE:
                        self.visitGrammar(item)
                    #elif self.isTree(item) and item.head == MAPPER_RULE:
                    #    self.visitMapper(item)
                    # TODO or maybe in a completely separate visitor, depending on how complex this one gets
                self.addImplicit()
                #print "rules: "
                #print dicToStr(self.rules)
        return {'rules': self.rules, 'tokens': self.tokens}

    def visitGrammar(self, tree):
        if self.isTree(tree):
            for child in tree.tail:
                if self.isTree(child):
                    rule = child.head
                    if rule == PROD_RULE:  # a grammar consists of production rules, and a production rule can be a rule or a token
                        self.visitGrammar(child)
                    elif rule == RULE_DEF:  # top level rule definition
                        self.visitRule(child)
                    elif rule == TOKEN_COLLECTION:  # the part of the grammar where tokens are defined, either as a single definition or as a collection
                        self.visitTokens(child)
                    else:
                        print 'Encountered unexpected rule type in grammar rule. type: ', rule

    def visitRule(self, tree):
        if self.isTree(tree):
            name = ''
            body = ''
            msg = ''
            rm = False
            for child in tree.tail:
                if self.isTree(child):
                    rule = child.head
                    if rule == RULE_NAME:
                        name = self.getTokenValue(child)
                        #print name
                    elif rule == RULE_RHS:
                        body = self.createRHS(child)
                    elif rule == MESSAGE:
                        # TODO this will not work for the moment due to having an extra parent
                        # child 0 is @Msg or Message
                        # child 1 is a token of type REGEXP and its tail contains the value
                        msg = self.getTokenValue(child.tail[1])
                    elif rule == REMOVE:
                        rm = True
                    else:
                        print 'Encountered unexpected rule type in rule definition. type: ', rule
            if name and body:
                mesg = msg
                if not mesg:  # if there was no message
                    mesg = name  # then the message is the name of the rule
                self.addRule(name, body, mesg, rm)

    def createRHS(self, tree):
        body, boolean = self.innerRHS(tree)
        return body

    def innerRHS(self, tree):
        """
        For 'a | b | c' the grammar returns a tree that, traversed directly, yields the
        structure ['|', a, ['|', b, c]]; we want it to return ['|', a, b, c], hence the
        unpacking. On the other hand, if the user explicitly writes 'a | (b | c)' we do
        not unpack, so that the structure the user expects is returned (even though the
        two forms are equivalent and the flat one is simpler). Hence the boolean that
        says whether unpacking is allowed.
        """
        #reqUnpack = False #expecting to unpack
        allowUnpack = True  # allow unpacking in the upper level of the recursion
        operator = '.'  # we always assume it is a sequence; we can change it later if we are wrong
        rhs = []
        for item in tree.tail:
            head = item.head
            if self.isTree(item):
                if head == TOKEN_NAME:  # a reference to a token
                    tok = '$' + self.getTokenValue(item)
                    rhs.append(tok)
                elif head == TOKEN_VALUE:  # an anonymous token, written directly in the rule
                    tok = self.getTokenValue(item)
                    rhs.append(tok[1:-1])
                elif head == RULE_NAME:  # a reference to a rule
                    rule = '@' + self.getTokenValue(item)
                    rhs.append(rule)
                elif head == CARD_RULE:  # there is a cardinality operator
                    # TODO for the moment this has the same bug as message: instead of being at the same
                    # level as the other child nodes, it has an extra parent with the same name as its
                    # true parent, namely rule_right_hand_side (rule_definition in the message case)
                    allowUnpack = False
                    operator = '#'
                    extra = '('
                    for child in item.tail:
                        if child.head == MINUS:
                            extra += '-'
                        if child.head == INT:
                            extra += self.getTokenValue(child) + ')'
                    operator += extra
                elif head == RULE_RHS:  # another rhs rule
                    r, u = self.innerRHS(item)
                    if u == True:
                        #if operator == '|':
                        #    print "INNER RULE: | being replaced by: ", r[0]
                        #operator = r[0]
                        if r[0] == '|' and operator == '.':
                            operator = '|'
                        for sub in r[1:]:
                            rhs.append(sub)
                    else:
                        rhs.append(r)
                else:
                    print 'Encountered unexpected rule type in tree RHS with head: ', item.head
            elif self.isToken(item):
                #print "TOKEN INNER in rule:", tree.head, "with name", item.head, " value", self.getTokenValue(item)
                head = item.head
                if head == OPER:
                    operator = self.getTokenValue(item)
                    allowUnpack = False
                elif head == OR:
                    operator = '|'
                    #reqUnpack = True
                elif head == LPAR:
                    allowUnpack = False  # whatever is inside belongs together and can't be unpacked at a higher level
                elif head == RPAR:
                    pass  # legal, but requires no action beyond what LPAR already did
                else:
                    print 'Encountered unexpected Token in RHS of kind: ', head

        if operator == '.' or operator == '|':
            rhs = [operator] + rhs
        elif not operator.startswith('#'):  # so * + or ?
            if len(rhs) == 1:
                rhs = [operator] + rhs
            else:
                rhs = [operator] + ['.', rhs]
        else:
            # TODO uncomment this when the bug about the parent level is fixed for cardinality (and message)
            zero = str(rhs[0])
            if zero.startswith('@') or zero.startswith('$'):
                zero = zero[1:]
            rhs[0] = operator + zero
        return rhs, allowUnpack
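
    # A few results of innerRHS, traced from the branches above (illustrative only;
    # '@' marks a rule reference, '$' a token reference):
    #   a token B followed by a rule other            -> ['.', '$B', '@other']   (default sequence)
    #   the same two items separated by OR            -> ['|', '$B', '@other']
    #   a single rule other under an OPER token '*'   -> ['*', '@other']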

    def addImplicit(self):
        if self.implicit and 'start' in self.rules:
            t = str(time())
            t = t.replace('.', '')[:-5]
            name = 'implicit_autogenerated_' + t
            # the name gets a timestamp-based suffix to avoid conflicts with possible user-defined names
            impl = ['|'] + self.implicit
            body = ['*', impl]
            msg = "Automatically generated 'Implicit' rule"
            # we add it to the rules
            self.addRule(name, body, msg)
            self.rules['start']['interleave'] = ['?', '@' + name]
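
    # For instance, if tokens WS and COMMENT carried the implicit modifier (hypothetical
    # token names), the result is roughly:
    #   self.rules['implicit_autogenerated_<timestamp>']['body'] == ['*', ['|', '$WS', '$COMMENT']]
    #   self.rules['start']['interleave'] == ['?', '@implicit_autogenerated_<timestamp>']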

    def visitTokens(self, tree):
        if self.isTree(tree):
            for child in tree.tail:
                if self.isTree(child):
                    rule = child.head
                    if rule == TOKEN_DEF:
                        self.fillTokenInfo(child)
                    elif rule == TOKEN_SUB_COLLECTION:
                        # a collection is: child 0 the type, 1 ':', 2 the name, 3 '{',
                        # 4 and further the token_defs, and the last child '}'
                        colType = self.getTokenValue(child.tail[0])
                        colName = self.getTokenValue(child.tail[2])
                        for item in child.tail[4:]:
                            if self.isTree(item) and item.head == TOKEN_DEF:
                                self.fillTokenInfo(item, colType, colName)
                    else:  # token_collection_content is the parent of both token def and token sub collection
                        self.visitTokens(child)

    def fillTokenInfo(self, tree, colType=None, colName=''):
        name = ''
        val = ''
        msg = ''
        rm = False
        for item in tree.tail:
            if self.isTree(item):
                head = item.head
                if head == TOKEN_NAME:
                    # token name contains a child of type token uppercase, and this child contains the actual value
                    name = self.getTokenValue(item)
                elif head == TOKEN_VALUE:
                    val = self.getTokenValue(item)
                elif head == MESSAGE:
                    # child 0 is @Msg or Message
                    # child 1 is a token of type REGEXP and its tail contains the value
                    msg = self.getTokenValue(item.tail[1])
                elif head == MODIFIER:
                    # the tail is the token; the head gives us the name, we don't need the actual value,
                    # especially since the actual value might change
                    if item.tail[0].head == IMPL_MOD:
                        self.implicit.append('$' + name)
                    #else:
                    #    pass
                    # TODO if there are multiple modifiers, do something
                elif head == REMOVE:
                    rm = True
        if name and val:
            mesg = msg
            if not mesg:  # if there was no message
                mesg = val  # then the message is the value of the token
            self.addToken(name, val, mesg, rm)

    def addToken(self, name, value, msg, hidden=False):
        msg = str(msg[1:-1])
        value = str(value[1:-1])
        val = {'type': TOKEN_TYPE, 'reg': value, 'errortext': msg}
        if hidden:
            val['hidden'] = True
        self.tokens[name] = val
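
    # Illustrative call, with the value and message still carrying the surrounding quotes
    # they arrive with from the grammar (which the [1:-1] slices strip):
    #   addToken('LOWER_LETTER', "'[a-z]'", "'a lowercase letter'") stores
    #   self.tokens['LOWER_LETTER'] == {'type': TOKEN_TYPE, 'reg': '[a-z]', 'errortext': 'a lowercase letter'}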

    def addRule(self, name, rhs, msg, hidden=False):
        msg = str(msg[1:-1])
        val = {'type': PROD_TYPE, 'body': rhs, 'errortext': msg}
        if hidden:
            val['hidden'] = True
        self.rules[name] = val
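
    # And, similarly illustrative, addRule('place', ['.', '$LOWER_LETTER'], "'a place'") stores
    #   self.rules['place'] == {'type': PROD_TYPE, 'body': ['.', '$LOWER_LETTER'], 'errortext': 'a place'}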
|