""" Author Daniel Riegelhaupt Date October 2014 A visitor that takes a tree returned by the parser parsing a grammar and returns a structure that the parser can use to parse files written in that grammar """ from hutnparser import * #we import the parser for constant, and the Tree Class from time import time def dicToStr(dic): text = "" keys = dic.keys() last = None if len(keys) > 0: last = keys[-1] for key in dic.keys(): text += "'" + key + "': " val = dic[key] if isinstance(val, dict): text += " { " + dicToStr(val) +" }, \n" else: text += str(val) if key != last: text += ", " return text #some constants rule names START_RULE = "start" GRAMMAR_RULE = "grammar" PROD_RULE = "production_rule" RULE_DEF = "rule_definition" RULE_NAME = "rule_name" RULE_RHS = "rule_right_hand_side" TOKEN_DEF = "token_definition" TOKEN_COLLECTION = "token_collection" TOKEN_SUB_COLLECTION = "token_sub_collection" TOKEN_NAME= "token_name" TOKEN_VALUE = "token_value" KEYWORD_TYPE = "keywords" MESSAGE = "message" MODIFIER = "modifier" IMPL_MOD = "IMPLICIT_MOD" #token name not a rule name REMOVE = "remove" #for rhs rules CARD_RULE = "cardinality" MINUS= "MINUS" #token name not a rule name INT = "INT" #token name not a rule name OPER = "OPER" OR = "OR" LPAR = "LPAR" RPAR = "RPAR" #parser constants TOKEN_TYPE = Parser.Constants.Token PROD_TYPE= Parser.Constants.Production class Visitor(object): """ The way the data structure is: a token is a tree with a tail containing a single string so it isn't enough to just check for isInstance """ def isTree(self, item): ret = False if isinstance(item, Tree): if len(item.tail) > 1: ret = True elif len(item.tail) == 1 and isinstance(item.tail[0], Tree): ret = True return ret def isToken(self,item): ret = False if isinstance(item, Tree): if len(item.tail) == 1 and isinstance(item.tail[0], basestring): ret = True return ret def getTokenValue(self, item): #in a rule like Place : place_name ...., # place_name : LOWER_LETTER #and item is place_name than the value is at the bottom of place_name -> LOWER_LETTER -> value #USE ONLY WHEN SURE YOU ARE EXPECTING A TOKEN VALUE while item and isinstance(item, Tree): item = item.tail[0] return str(item) class GrammarCompilerVisitor(Visitor): def __init__(self): self.tokens = {} self.rules = {} self.keywords = {} self.implicit = [] def visit(self, tree): if self.isTree(tree): if tree.head == START_RULE: for item in tree.tail: if self.isTree(item) and item.head == GRAMMAR_RULE: self.visitGrammar(item) #elif self.isTree(item, Tree) and item.head == MAPPER_RULE: # self.visitMapper(item) # TODO or maybe in a complete separate visitor depending on how complex this one gets self.addImplicit() #print "rules: " #print dicToStr(self.rules) return {'rules': self.rules, 'tokens': self.tokens} def visitGrammar(self,tree): if self.isTree(tree): for child in tree.tail: if self.isTree(child): rule = child.head if rule == PROD_RULE: #a grammar consists of prod rules and a prod rule can be a rule or token self.visitGrammar(child) elif rule == RULE_DEF: #top level rule definition self.visitRule(child) elif rule == TOKEN_COLLECTION: #part of th grammar where tokens are defined as a def or collection self.visitTokens(child) else: print 'Encountered unexpected rule type in grammar rule. type: ', rule def visitRule(self,tree): if self.isTree(tree): name = '' body = '' msg = '' rm = False for child in tree.tail: if self.isTree(child): rule = child.head if rule == RULE_NAME: name = self.getTokenValue(child) # print name elif rule == RULE_RHS: body = self.createRHS(child) elif rule == MESSAGE: #part of th grammar where tokens are defined as a def or collection #TODO this will not work for the moment due to having an extra parent #the child 0 is @Msg or Message #child 1 is a token of type REGEXP and his tail contains the value msg = self.getTokenValue(child.tail[1]) elif rule == REMOVE: rm = True else: print 'Encountered unexpected rule type in rule definition. type: ', rule if name and body: mesg = msg if not mesg: #if there was no message msg = name #than the message is the name of the rule self.addRule(name, body, mesg, rm) def createRHS(self, tree): body, boolean = self.innerRHS(tree) return body def innerRHS(self, tree): """ a | b| c in the grammar wil return will return a tree that if simply traverse directly wil returned the structure [ | , a [ |, b, c]] we want it to return [|a,b,c] hence the unpacking on the other hand if the user explicitly puts a | ( b| c) we do not allow unpacking so as to return the structure the user expects (even though they are equivalent and the above is simpler ) hence a bool that says we are not allowed to unpack """ #reqUnpack = False #expecting to unpack allowUnpack = True #allow unpacking in upperlevel of recursion operator = '.' #we always assume it is a sequence we can change it later if we are wrong rhs = [] for item in tree.tail: head = item.head if self.isTree(item): if head == TOKEN_NAME: # a reference to a token tok = '$' + self.getTokenValue(item) rhs.append(tok) elif head == TOKEN_VALUE: #an anonymous token, written directly in the rule tok = self.getTokenValue(item) rhs.append(tok[1:-1]) elif head == RULE_NAME: #a reference to a rule rule = '@' + self.getTokenValue(item) rhs.append(rule) elif head == CARD_RULE: #there is a cardinality operator #TODO for the moment this has the same bug as message: instead of being at the same level as other child nodes #this has an extra parent with the same name as its true parent namel rule_right_hand_side #(rule_definition for message) allowUnpack = False operator = '#' extra = '(' for child in item.tail: if child.head == MINUS: extra += '-' if child.head == INT: extra += self.getTokenValue(child) + ')' operator += extra elif head == RULE_RHS: # another rhs rule r, u = self.innerRHS(item) if u == True: #if operator == '|': # print "INNER RULE: | begin replaced by: " , r[0] #operator = r[0] if r[0] == '|' and operator == '.': operator = '|' for item in r[1:]: rhs.append(item) else: rhs.append(r) else: print 'Encountered unexpected rule type in tree RHS with head: ', item.head elif self.isToken(item): #print "TOKEN INNER in rule:", tree.head, "with name", item.head, " value", self.getTokenValue(item) head = item.head if head == OPER: operator = self.getTokenValue(item) allowUnpack = False elif head == OR: operator = '|' #reqUnpack = True elif head == LPAR: allowUnpack = False #whatever is here belongs togheter and can't be unpacked at a higher level elif head == RPAR: pass #the case is here because it is legal but doesn't require any other action than LPAR else: print 'Encountered unexpected Token in RHS of kind: ', head if operator == '.' or operator == '|': rhs = [operator] + rhs elif not operator.startswith('#'): #so * + or ? if len(rhs) == 1: rhs = [operator] + rhs else: rhs = [operator] + ['.' , rhs ] else: #TODO uncomment this when bug about parent level is fixed for card (and message) zero = str(rhs[0]) if zero.startswith('@') or zero.startswith('$'): zero = zero[1:] rhs[0] = operator + zero return rhs , allowUnpack def addImplicit(self): if self.implicit and self.rules.has_key('start'): t = str(time()) t = t.replace('.','')[:-5] name = 'implicit_autogenerated_' + t #name + random number to avoid any conflicts with possible names impl= ['|'] + self.implicit body = [ '*', impl ] msg = "Automatically generated 'Implicit' rule" #we add it to the rules self.addRule(name, body, msg) self.rules['start']['interleave'] = ['?', '@' + name] def visitTokens(self,tree): if self.isTree(tree): for child in tree.tail: if self.isTree(child): rule = child.head if rule == TOKEN_DEF: self.fillTokenInfo(child) elif rule == TOKEN_SUB_COLLECTION: #a collection is 0 type 1: 2 name 3 { 4 and further token_defs last } colType = self.getTokenValue(child.tail[0]) colName = self.getTokenValue(child.tail[2]) for item in child.tail[4:]: if self.isTree(item) and item.head == TOKEN_DEF: self.fillTokenInfo(item,colType, colName) else: #token_collection_content is the parent of both token def and token sub collection self.visitTokens(child) def fillTokenInfo(self,tree, colType = None, colName = ''): name = '' val = '' msg = '' rm = False for item in tree.tail: if self.isTree(item): head = item.head if head == TOKEN_NAME: #roken name contains a child of type token uppercase and this child contains the actual value name = self.getTokenValue(item) elif head == TOKEN_VALUE: val = self.getTokenValue(item) elif head == MESSAGE: #the child 0 is @Msg or Message #child 1 is a token of type REGEXP and his tail contains the value msg = self.getTokenValue(item.tail[1]) elif head == MODIFIER: #the tail is the token, the head gives us the name we don't need the actual value #especially since the actual value might change if item.tail[0].head == IMPL_MOD: self.implicit.append( '$' + name) #else: #pass #TODO if there are multiple modifers do something elif head == REMOVE: rm = True if name and val: mesg = msg if not mesg: #if there was no message msg = val #than the message is the value of the token self.addToken(name, val, mesg, rm) def addToken(self, name, value, msg, hidden = False): msg = str(msg[1:-1]) value = str(value[1:-1]) val = {'type': TOKEN_TYPE, 'reg': value, 'errortext': msg } if (hidden == True): val['hidden'] = True self.tokens[name] = val def addRule(self, name, rhs, msg, hidden = False): msg = str(msg[1:-1]) val = {'type': PROD_TYPE, 'body': rhs, 'errortext': msg } if (hidden == True): val['hidden'] = True self.rules[name] = val