# Source repository: jonathanvdc/modelverse (forked from yentl/modelverse)
"""
Author: Daniel Riegelhaupt
Date: October 2014

A visitor that takes the tree the parser produces when it parses a grammar definition
and returns a structure that the parser can use to parse files written in that grammar.
"""
from hutnparser import * #we  import the parser for constant, and the Tree Class
from time import time

def dicToStr(dic):
    """
    Render a (possibly nested) dictionary as text, for debugging output.

    Every entry is written as 'key': value. A nested dictionary is rendered
    recursively between " { " and " }, " followed by a newline; any other
    value is rendered with str() and followed by ", " unless its key is the
    last key of the dictionary.

    :param dic: the dictionary to render
    :return: the rendered text; the empty string for an empty dictionary
    """
    #copy the keys into a list: dict.keys() can only be indexed on Python 2
    keys = list(dic.keys())
    if not keys:
        return ""
    last = keys[-1]

    parts = []
    for key in keys:
        #%-formatting so that keys that are not strings are rendered as well
        parts.append("'%s': " % (key,))
        val = dic[key]
        if isinstance(val, dict):
            parts.append(" { " + dicToStr(val) + " }, \n")
        else:
            parts.append(str(val))
            if key != last:
                parts.append(", ")
    return "".join(parts)


#names of grammar rules; the visitor compares them with the head of the tree nodes
START_RULE = "start"
GRAMMAR_RULE = "grammar"
PROD_RULE = "production_rule"

RULE_DEF = "rule_definition"
RULE_NAME = "rule_name"
RULE_RHS = "rule_right_hand_side"

TOKEN_DEF = "token_definition"
TOKEN_COLLECTION = "token_collection"
TOKEN_SUB_COLLECTION = "token_sub_collection"
TOKEN_NAME= "token_name"
TOKEN_VALUE = "token_value"

KEYWORD_TYPE = "keywords"
MESSAGE = "message"

MODIFIER = "modifier"
IMPL_MOD = "IMPLICIT_MOD" #a token name, not a rule name
REMOVE = "remove"

#used while building the right hand side of a rule
CARD_RULE = "cardinality"
MINUS= "MINUS" #a token name, not a rule name
INT = "INT" #a token name, not a rule name

#heads that innerRHS checks for when it meets a token inside a right hand side
OPER = "OPER"
OR = "OR"
LPAR = "LPAR"
RPAR = "RPAR"

#parser constants, stored under the 'type' key of every compiled token and rule
TOKEN_TYPE = Parser.Constants.Token
PROD_TYPE=  Parser.Constants.Production

class Visitor(object):
    """
    Base class with helpers to tell inner nodes and tokens apart.

    Both are Tree instances: a token is a Tree whose tail holds exactly one
    string, so an isinstance check alone cannot distinguish the two.
    """
    def isTree(self, item):
        """
        Return True when item is a Tree that has more than one child,
        or exactly one child that is itself a Tree.
        """
        if not isinstance(item, Tree):
            return False
        children = item.tail
        if len(children) > 1:
            return True
        return len(children) == 1 and isinstance(children[0], Tree)

    def isToken(self, item):
        """
        Return True when item is a Tree whose tail is a single string.
        """
        return (isinstance(item, Tree)
                and len(item.tail) == 1
                and isinstance(item.tail[0], basestring))

    def getTokenValue(self, item):
        """
        Follow the first child of every nested Tree and return what is found
        at the bottom, converted with str().

        For a rule like  Place : place_name ...  with  place_name : LOWER_LETTER
        the value of place_name sits at place_name -> LOWER_LETTER -> value.
        USE ONLY WHEN SURE YOU ARE EXPECTING A TOKEN VALUE
        """
        node = item
        while True:
            #stop on anything falsy or anything that is not a Tree
            if not node or not isinstance(node, Tree):
                break
            node = node.tail[0]
        return str(node)


class GrammarCompilerVisitor(Visitor):
    """
    Walks the parse tree of a grammar definition and collects the token and
    rule definitions it contains.

    Attributes:
      tokens   -- token name -> {'type', 'reg', 'errortext'} plus 'hidden' when set
      rules    -- rule name -> {'type', 'body', 'errortext'} plus 'hidden' when set
      keywords -- initialised to an empty dictionary; never filled by this class
      implicit -- '$NAME' references to the tokens that carry the implicit modifier
    """
    def __init__(self):
        self.tokens = {}
        self.rules = {}
        self.keywords = {}
        self.implicit = []

    def visit(self, tree):
        """
        Entry point: compile the grammar found under the start rule of tree.

        :param tree: the root of the parse tree
        :return: a dictionary {'rules': self.rules, 'tokens': self.tokens}
        """
        if self.isTree(tree) and tree.head == START_RULE:
            for item in tree.tail:
                if self.isTree(item) and item.head == GRAMMAR_RULE:
                    self.visitGrammar(item)
                #TODO a mapper rule could be handled here, or maybe in a complete
                #separate visitor depending on how complex this one gets

        self.addImplicit()
        return {'rules': self.rules, 'tokens': self.tokens}

    def visitGrammar(self,tree):
        """
        Dispatch every child of a grammar (or production rule) node to the
        method that handles it; unknown rule types are reported on stdout.
        """
        if not self.isTree(tree):
            return
        for child in tree.tail:
            if not self.isTree(child):
                continue
            rule = child.head
            if rule == PROD_RULE: #a grammar consists of prod rules and a prod rule can be a rule or token
                self.visitGrammar(child)
            elif rule == RULE_DEF: #top level rule definition
                self.visitRule(child)
            elif rule == TOKEN_COLLECTION: #part of the grammar where tokens are defined as a def or collection
                self.visitTokens(child)
            else:
                #one formatted argument, so the output is the same on Python 2 and 3
                print('Encountered unexpected rule type in grammar rule. type:  %s' % (rule,))

    def visitRule(self,tree):
        """
        Read one rule definition and register it with addRule.

        The rule is only registered when both a name and a body were found.
        When the definition carries no message, the rule name is used as the
        error text.
        """
        if not self.isTree(tree):
            return
        name = ''
        body = ''
        msg = ''
        rm = False
        for child in tree.tail:
            if not self.isTree(child):
                continue
            rule = child.head
            if rule == RULE_NAME:
                name = self.getTokenValue(child)
            elif rule == RULE_RHS:
                body = self.createRHS(child)
            elif rule == MESSAGE:
                #TODO this will not work for the moment due to having an extra parent
                #the child 0 is @Msg or Message
                #child 1 is a token of type REGEXP and its tail contains the value
                msg = self.getTokenValue(child.tail[1])
            elif rule == REMOVE:
                rm = True
            else:
                print('Encountered unexpected rule type in rule definition. type:  %s' % (rule,))

        if name and body:
            if msg:
                #a message read from the tree is still wrapped in its delimiters
                self.addRule(name, body, msg, rm)
            else:
                #no message: fall back to the rule name, which has no delimiters to strip
                self.addRule(name, body, name, rm, quoted = False)

    def createRHS(self, tree):
        """
        Return the compiled body of a right hand side node.
        """
        body, _ = self.innerRHS(tree)
        return body

    def innerRHS(self, tree):
        """
        Compile a right hand side node into a nested list.

        a | b | c in the grammar gives a tree that, when simply traversed,
        would result in the structure [ |, a, [ |, b, c ] ]
        we want it to be [ |, a, b, c ], hence the unpacking of nested results.

        On the other hand, if the user explicitly writes a | ( b | c )
        we do not unpack, so as to return the structure the user expects
        (even though they are equivalent and the above is simpler),
        hence a boolean that says whether the caller is allowed to unpack.

        :param tree: a right hand side node
        :return: a tuple (body, allowUnpack)
        """
        allowUnpack = True #allow unpacking in the upper level of the recursion
        operator = '.' #we always assume it is a sequence, we can change it later if we are wrong
        rhs = []
        for item in tree.tail:
            if self.isTree(item):
                head = item.head
                if head == TOKEN_NAME: #a reference to a token
                    rhs.append('$' + self.getTokenValue(item))

                elif head == TOKEN_VALUE: #an anonymous token, written directly in the rule
                    tok = self.getTokenValue(item)
                    rhs.append(tok[1:-1]) #drop the first and last character (the delimiters)

                elif head == RULE_NAME: #a reference to a rule
                    rhs.append('@' + self.getTokenValue(item))

                elif head == CARD_RULE: #there is a cardinality operator
                    #TODO for the moment this has the same bug as message: instead of being at the same
                    #level as the other child nodes it has an extra parent with the same name as its
                    #true parent, namely rule_right_hand_side (rule_definition for message)
                    allowUnpack = False
                    operator = '#'
                    extra = '('
                    for child in item.tail:
                        if child.head == MINUS:
                            extra += '-'
                        if child.head == INT:
                            extra += self.getTokenValue(child) + ')'
                            operator += extra

                elif head == RULE_RHS: #another rhs rule
                    inner, unpack = self.innerRHS(item)
                    if unpack:
                        #inner[0] is the operator of the nested result, the rest are its operands
                        if inner[0] == '|' and operator == '.':
                            operator = '|'
                        rhs.extend(inner[1:])
                    else:
                        rhs.append(inner)
                else:
                    print('Encountered unexpected rule type in tree RHS with head:  %s' % (head,))
            elif self.isToken(item):
                head = item.head
                if head == OPER:
                    operator = self.getTokenValue(item)
                    allowUnpack = False
                elif head == OR:
                    operator = '|'
                elif head == LPAR:
                    allowUnpack = False #whatever is here belongs together and can't be unpacked at a higher level
                elif head == RPAR:
                    pass #the case is here because it is legal, but it requires no action beyond what LPAR did
                else:
                    print('Encountered unexpected Token in RHS of kind:  %s' % (head,))

        if operator == '.' or operator == '|':
            rhs = [operator] + rhs
        elif not operator.startswith('#'): #the value of an OPER token (so * + or ?)
            if len(rhs) == 1:
                rhs = [operator] + rhs
            else:
                rhs = [operator] + ['.' , rhs ]
        else:
            #cardinality: the operator is put in front of the first operand
            #TODO revisit when the bug about the parent level is fixed for card (and message)
            #NOTE(review): raises IndexError when no operand was collected
            zero = str(rhs[0])
            if zero.startswith('@') or zero.startswith('$'):
                zero = zero[1:]
            rhs[0] = operator + zero

        return rhs , allowUnpack

    def addImplicit(self):
        """
        When implicit tokens were collected and a 'start' rule exists, add a
        generated rule matching any number of them and reference it from the
        'interleave' entry of the start rule.
        """
        if self.implicit and 'start' in self.rules:
            #name + digits taken from the current time to avoid conflicts with possible names
            t = str(time()).replace('.','')[:-5]
            name = 'implicit_autogenerated_' + t

            impl = ['|'] + self.implicit
            body = [ '*', impl ]
            msg = "Automatically generated 'Implict' rule"

            #we add it to the rules; the message is plain text, so nothing must be stripped from it
            self.addRule(name, body, msg, quoted = False)
            self.rules['start']['interleave'] = ['?', '@' + name]

    def visitTokens(self,tree):
        """
        Collect every token definition found below a token collection node.
        """
        if not self.isTree(tree):
            return
        for child in tree.tail:
            if not self.isTree(child):
                continue
            rule = child.head
            if rule == TOKEN_DEF:
                self.fillTokenInfo(child)
            elif rule == TOKEN_SUB_COLLECTION:
                #a collection is 0 type 1: 2 name 3 {  4 and further token_defs last }
                colType = self.getTokenValue(child.tail[0])
                colName = self.getTokenValue(child.tail[2])
                for item in child.tail[4:]:
                    if self.isTree(item) and item.head == TOKEN_DEF:
                        self.fillTokenInfo(item, colType, colName)
            else: #token_collection_content is the parent of both token def and token sub collection
                self.visitTokens(child)

    def fillTokenInfo(self,tree, colType = None, colName = ''):
        """
        Read one token definition and register it with addToken.

        The token is only registered when both a name and a value were found.
        When the definition carries no message, the token value is used as the
        error text. colType and colName are accepted but not used here.
        """
        name = ''
        val = ''
        msg = ''
        rm = False
        for item in tree.tail:
            if not self.isTree(item):
                continue
            head = item.head
            if head == TOKEN_NAME:
                #token name contains a child of type token uppercase and this child contains the actual value
                name = self.getTokenValue(item)
            elif head == TOKEN_VALUE:
                val = self.getTokenValue(item)
            elif head == MESSAGE:
                #the child 0 is @Msg or Message
                #child 1 is a token of type REGEXP and its tail contains the value
                msg = self.getTokenValue(item.tail[1])
            elif head == MODIFIER:
                #the tail is the token, the head gives us the name; we don't need the actual value
                #especially since the actual value might change
                if item.tail[0].head == IMPL_MOD:
                    self.implicit.append( '$' + name)
                #TODO if there are multiple modifiers do something
            elif head == REMOVE:
                rm = True

        #register once, after all the children have been read
        if name and val:
            mesg = msg
            if not mesg: #if there was no message
                mesg = val #then the message is the value of the token
            self.addToken(name, val, mesg, rm)

    def addToken(self, name, value, msg, hidden = False):
        """
        Store a token definition in self.tokens under name.

        The first and last character of both value and msg are dropped
        before they are stored under 'reg' and 'errortext'.
        """
        msg = str(msg[1:-1])
        value = str(value[1:-1])
        val = {'type': TOKEN_TYPE, 'reg': value, 'errortext': msg }
        if (hidden == True):
            val['hidden'] = True
        self.tokens[name] = val

    def addRule(self, name, rhs, msg, hidden = False, quoted = True):
        """
        Store a rule definition in self.rules under name.

        :param name: the name of the rule
        :param rhs: the compiled body of the rule
        :param msg: the error text of the rule
        :param hidden: when True the entry also gets 'hidden': True
        :param quoted: when True (the default) the first and last character of
                       msg are dropped before it is stored; pass False for a
                       message that is not wrapped in delimiters
        """
        if quoted:
            msg = msg[1:-1]
        val = {'type': PROD_TYPE, 'body': rhs, 'errortext': str(msg) }
        if (hidden == True):
            val['hidden'] = True
        self.rules[name] = val