jonathanvdc
/
modelverse
forked from yentl/modelverse


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
							"""
Author: Daniel Riegelhaupt
Date: October 2014

The meta grammar needed by Bruno Barroca's parser to read other grammars.
Its layout is based on his data structure.
"""
#TODO check escape characters in errortext
class Grammar(object):
    """Meta grammar: the grammar used to read other grammar definitions.

    An instance exposes two dictionaries:

    ``tokens``
        Maps a token name to
        ``{'type': 'token', 'reg': <regular expression>, 'errortext': <text>}``.

    ``rules``
        Maps a rule name to
        ``{'type': 'prod', 'body': <nested list>, 'errortext': <text>}``.
        The ``start`` rule additionally carries an ``'interleave'`` entry.

    Inside a rule body, ``'$NAME'`` names a key of ``tokens`` and ``'@name'``
    names a key of ``rules``.  The first element of every (nested) body list
    is one of ``'.'``, ``'|'``, ``'*'`` or ``'?'`` -- presumably sequence,
    alternation, repetition and option; confirm against the parser that
    consumes this structure.
    """

    def __init__(self):
        """Build the ``tokens`` and ``rules`` tables of the meta grammar."""

        self.tokens = {
            #'<token name>' : { 'type': 'token', 'reg': '<token value>' 'errortext': '<human readable error text>'}

            # Identifiers and literals.
            "LOWER_CASE":  { 'type': 'token', 'reg': r'[a-z_][a-z_0-9]*', 'errortext': 'Lower case characters'},
            "UPPER_CASE": { 'type': 'token', 'reg': r'[A-Z_][A-Z_0-9]*', 'errortext': 'Upper case characters'},
            "REGEXP":  { 'type': 'token', 'reg': r'\'(.|\n)*?[^\\]\'', 'errortext': 'Regular expression'},
            # NOTE(review): '[0-9]*' also matches the empty string; '[0-9]+'
            # may have been intended -- confirm against the parser before changing.
            "INT":  { 'type': 'token', 'reg': r'[0-9]*', 'errortext': 'Integers'},

            # Modifiers, each with a long and a short spelling.
            "IMPLICIT_MOD": { 'type': 'token', 'reg': r'(@Implicit|@Impl)', 'errortext': '@Implicit or @Impl'},
            "MESSAGE_MOD": { 'type': 'token', 'reg': r'(@Message|@Msg)', 'errortext': '@Message or  @Msg'},
            "REMOVE_MOD": { 'type': 'token', 'reg': r'(@Remove|@Rm)', 'errortext': '@Remove or  @Rm'},

            # Section keywords.
            "TOKENS": { 'type': 'token', 'reg': r'tokens'  , 'errortext': 'tokens'},
            "KEYWORDS": { 'type': 'token', 'reg': r'keywords', 'errortext': 'keywords'},
            "GRAMMAR": { 'type': 'token', 'reg': r'grammar', 'errortext': 'grammar'},
            "MAPPER": { 'type': 'token', 'reg': r'mapper', 'errortext': 'mapper'},

            # Operators and punctuation.
            "OPER": { 'type': 'token', 'reg': r'[?*+]', 'errortext': '? or * or +'},
            "CARD": { 'type': 'token', 'reg': r'#', 'errortext': '#'},
            "MINUS": { 'type': 'token', 'reg': r'-', 'errortext': '-'},
            "OR": { 'type': 'token', 'reg': r'\|', 'errortext': '|' },
            "LPAR": { 'type': 'token', 'reg': r'\(', 'errortext': '(' },
            "RPAR": { 'type': 'token', 'reg': r'\)', 'errortext': ')' },
            "SEMICOL": { 'type': 'token', 'reg': r';', 'errortext': ';' },
            "COMMA": { 'type': 'token', 'reg': r',', 'errortext': ',' },
            "LCBR": { 'type': 'token', 'reg': r'\{', 'errortext':  '{'},
            "RCBR": { 'type': 'token', 'reg': r'\}', 'errortext': '}' },
            "LSBR": { 'type': 'token', 'reg': r'\[', 'errortext':  '['},
            "RSBR": { 'type': 'token', 'reg': r'\]', 'errortext': ']' },
            "DOT": { 'type': 'token', 'reg': r'\.', 'errortext':  '.'},
            "COLON": { 'type': 'token', 'reg': r':', 'errortext': ':'},

            # Layout tokens; these are the ones referenced by the 'implicit' rule.
            "NEWLINE": { 'type': 'token', 'reg': r'(\r?\n[\t ]*)+'  , 'errortext': 'New Line'},
            "WS": { 'type': 'token', 'reg': r'[\t \f]+'  , 'errortext': 'White space'},
            # The raw-string prefix must sit on the pattern, not on the key:
            # as a plain literal '\\[' collapses to the regex '\[' (a literal
            # opening bracket), so a backslash line continuation never matched.
            "LINE_CONT": { 'type': 'token', 'reg': r'\\[\t \f]*\r?\n', 'errortext': 'Line continuation'},
            "COMMENT": { 'type': 'token', 'reg': r'//[^\n]*', 'errortext': 'Comment'}
        }

        self.rules = {
            # Entry point: a grammar section optionally followed by a mapper
            # section.  'interleave' refers to the 'implicit' rule (layout
            # tokens) -- presumably allowed between any two elements; confirm
            # against the parser.
            'start':  {'type': 'prod', 'body':['.', '@grammar', ['?', '@mapper' ]] , 'errortext': 'Top level start',
                       'interleave': ['?','@implicit'] },

            'grammar':  {'type': 'prod', 'body': ['.', '$GRAMMAR', '$LCBR', ['*', '@production_rule'], '$RCBR' ],
                         'errortext': 'Grammar definition'},

            'production_rule':  {'type': 'prod', 'body': ['|', '@rule_definition',  '@token_collection' ],
                                 'errortext': 'Top level production rule definition'},

            'rule_definition':  {'type': 'prod', 'body':
                                ['.', '@rule_name', '$COLON', '@rule_right_hand_side', ['*',['|','@message','@remove']],'$SEMICOL' ],
                                'errortext': 'Production rule definition'},

            'rule_name':  {'type': 'prod', 'body': ['.' , '$LOWER_CASE'], 'errortext': 'Rule name' },

            # The last three alternatives start with a reference to this same
            # rule (left recursion): 'a | b', 'a b' and 'a <OPER>'.
            'rule_right_hand_side': { 'type': 'prod', 'body':  [ '|', '@token_name', '@token_value', '@rule_name',
                                                         ['.', '$LPAR',  '@rule_right_hand_side', '$RPAR',
                                                          ['?', '@cardinality']],
                                                         ['.', '@rule_right_hand_side', '$OR', '@rule_right_hand_side'],
                                                         ['.', '@rule_right_hand_side', '@rule_right_hand_side'],
                                                         ['.', '@rule_right_hand_side', '$OPER' ]],
                                    'errortext':  'Production rule right hand side'},

            # '#' optionally followed by '[' ['-'] INT ']'.
            'cardinality': { 'type': 'prod', 'body':  ['.', '$CARD', ['?', ['.','$LSBR', ['?', '$MINUS'], '$INT' ,'$RSBR']]],
                         'errortext': 'Cardinality'},

            'message': { 'type': 'prod', 'body':  ['.', '$MESSAGE_MOD', '@message_value'],
                         'errortext': 'Error message'},

            # Message text reuses the quoted REGEXP token.
            'message_value': { 'type': 'prod', 'body':  ['.','$REGEXP'], 'errortext': 'Error message value' },

            'token_collection': { 'type': 'prod', 'body': ['.', '$TOKENS', '$LCBR',
                                                           ['*',  [ '|', '@token_sub_collection', '@token_definition']],
                                                           '$RCBR' ],
                                'errortext': 'Top level token definition' },

            'token_sub_collection': { 'type': 'prod', 'body': [ '.', '@token_collection_category', '$COLON',
                                                      '@collection_name', '$LCBR',
                                                      ['*' ,'@token_definition'] ,'$RCBR'],
                                      'errortext': 'Token collection definition' },

            'token_collection_category': { 'type': 'prod', 'body': ['.', '$KEYWORDS'],
                                           'errortext':  'Token collection categories: keywords' },

            'collection_name': { 'type': 'prod', 'body': ['.', '$LOWER_CASE'] , 'errortext':  'Token collection name'},

            'token_definition': { 'type': 'prod', 'body': [ '.', '@token_name', '$COLON', '@token_value',
                                                            ['*',['|','@modifier','@message','@remove']], '$SEMICOL'],
                                  'errortext':  'Token definition' },

            'token_name': { 'type': 'prod', 'body': ['.', '$UPPER_CASE'] , 'errortext':  'Token name'},

            'token_value': { 'type': 'prod', 'body': ['.', '$REGEXP'] , 'errortext':  'Token value'},

            'modifier': { 'type': 'prod', 'body': ['.', '$IMPLICIT_MOD'] , 'errortext':  'Possible modifiers'},

            'remove': { 'type': 'prod', 'body': ['.', '$REMOVE_MOD'] , 'errortext':  'Possible modifiers'},

            'mapper': { 'type': 'prod', 'body': ['.', '$MAPPER','$LCBR', ['*', '@mapper_content'], '$RCBR'],
                        'errortext':  'Top level mapper definition'},

            # NOTE(review): 'TODO' is neither a '$TOKEN' nor an '@rule'
            # reference -- it looks like a placeholder for mapper content
            # that has not been specified yet.
            'mapper_content': { 'type': 'prod', 'body': ['.', 'TODO'], 'errortext':  'Content of the mapper' },

            'implicit': {'type': 'prod', 'body':['*', ['|', '$NEWLINE', '$WS', '$LINE_CONT', '$COMMENT']],
                         'errortext': 'Implicit'}
        }