
  1. """
  2. Author: Bruno Barroca
  3. Date: October 2014
  4. Description: A top down parser
  5. Modifications by Daniel Riegelhaupt:
  6. *removed test input
  7. *changed pos to startpos because in my humble opinion it makes more sense to have a tupple (startpos, endpos) than (pos, endpos)
  8. *aded parameters to init: tab_size, line_position, hide_implicit
  9. - see init comments for more info on all otions
  10. - line_postion will change startpos and end Pos to instance of class Postion. (changed december 2014)
  11. *Added anonymous terminals: tokens do not have to be defined as tokens but can be typed directly in rules
  12. *changed interleave function to be deep, and start on START
  13. *changed position returned in tree to be relative to line numbers instead of the absolute one
  14. - Did the same for partialresults returned on syntax error change this is error results too
  15. - TODO check efficiency on the previous point checking the whole text for every position might be slow
  16. *Changed usage , instead of Parser(input, grammar).pars() it is now Parser(grammar).parse(input)
  17. - Added a self.reset() method for fields that need to be initializes again when parsing a new input
  18. *Changed findFailure and generateErrorReports:
  19. * i need the the rule/token name as well not only the error text
  20. * hidden elements (like for example comments and newline ) are not included in error reports if hide_implicit is set to true
  21. * same for the interleave rule
  22. """

import re
from copy import deepcopy

from hutn_compiler.position import Position

line_cache = {}


class Tree(object):
    def __init__(self, head, tail, startpos, endpos, inputfile=None):
        self.head = head
        self.tail = tail
        self.startpos = startpos
        self.endpos = endpos
        self._tail = None
        self.inputfile = inputfile
        # IMPORTANT: self.replaced: replace_child defines self.replaced

    def is_rule(self):
        return self.head.islower()

    def is_token(self):
        return not self.is_rule()

    def get_tail(self):
        if self.is_rule():
            if not self._tail:
                self._tail = [t for t in self.get_raw_tail()
                              if not t.head.startswith("implicit_autogenerated_")]
            return self._tail
        else:
            return self.get_raw_tail()

    def get_raw_tail(self):
        return self.tail

    def get_text(self, with_implicit=False):
        parts = []
        if with_implicit:
            tail = Tree.get_raw_tail
        else:
            tail = Tree.get_tail

        def post_order(tree):
            for child in tail(tree):
                if hasattr(child, "replaced"):
                    child = child.replaced
                if isinstance(child, Tree):
                    post_order(child)
                else:
                    parts.append(child)

        post_order(self)
        return ''.join(parts)

    def get_child(self, name):
        for child in self.get_tail():
            if child.head == name:
                return child
        return None

    def get_children(self, name):
        children = []
        for child in self.get_tail():
            if child.head == name:
                children.append(child)
        return children

    def replace_child(self, old_child, new_child):
        new_child.replaced = old_child
        i = self.get_raw_tail().index(old_child)
        self.get_raw_tail()[i] = new_child
        i = self.get_tail().index(old_child)
        self.get_tail()[i] = new_child

    def get_tail_without(self, names):
        if self.is_rule():
            return [t for t in self.get_tail() if not t.head in names]
        else:
            return self.get_raw_tail()

    def __str__(self):
        return "(%s, %s) [%s]" % (
            self.head, str((self.startpos, self.endpos)),
            ", ".join([str(i) for i in self.get_raw_tail()]))

    def get_reference_line(self):
        return "%s:%s:%s-%s" % (self.inputfile, self.startpos["line"], self.startpos["column"], self.endpos["column"])

    def fix_tracability(self, inputfile):
        if self.inputfile is None:
            self.inputfile = inputfile
        for f in self.tail:
            if isinstance(f, Tree):
                f.fix_tracability(self.inputfile)

    def pretty_print(self, i=0):
        return "\t" * i + str(self.head) + "\n" + "\n".join(
            [("\t" * (i + 1) + st) if isinstance(st, str) else st.pretty_print(i + 1)
             for st in self.get_tail()])
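
# Illustrative sketch of the Tree API above (hand-built nodes, not produced by the parser):
#
#   name = Tree('NAME', ['foo'], 0, 3)      # uppercase head -> token node wrapping the matched text
#   decl = Tree('decl', [name], 0, 3)       # lowercase head -> rule node
#   decl.get_text()            == 'foo'
#   decl.get_child('NAME')     is name
#   decl.get_children('NAME')  == [name]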


class Parser(object):
    class Constants(object):
        Token = 'token'
        Production = 'prod'
        Success = 'success'
        Failure = 'failure'

    class LR(object):
        def __init__(self, seed, rulename, head, nextlr):
            self.seed = seed
            self.rule = rulename
            self.head = head
            self.next = nextlr

        def copy(self):
            return Parser.LR(self.seed, self.rule, self.head, self.next)

    class Head(object):
        def __init__(self, rulename, involved, evaluation):
            self.rule = rulename
            self.involved = involved
            self.evaluation = evaluation

    def __init__(self, grammar, **options):
        """
        Creates a Parser for the given grammar.
        :param grammar: an instance of the Grammar class
        :param options: the following options are supported:
            tab_size: default 1. The character width of a tab character.
            hide_implicit: default False. When True, implicit tokens are hidden from the returned parse tree and from error messages.
                Note that this option will not override rules or tokens whose hidden variable has already been set manually in the Grammar class.
            line_position: default False. When True, startpos and endpos in the parse tree are (line, column) Position objects instead of absolute position integers.
        """
        # Changed by Daniel: members that need to be initialized each time parse() is called have been moved to reset(),
        # which is called by the parse() method.
        self.rules = deepcopy(grammar.rules)
        self.tokens = deepcopy(grammar.tokens)
        self.implicitList = []  # added by Daniel, set in setHideImplicit so that we can review the implicit list for error messages
        self.implictRuleName = ""
        # options added by Daniel
        self.tabsize = int(options.pop('tab_size', 1))  # the character width of a tab
        self.hideImplicit = bool(options.pop('hide_implicit', False))
        # Whether to hide implicit tokens and rules from the returned parse tree.
        # Important note: this option will not override rules or tokens where the hidden variable has already been set manually.
        self.linePosition = bool(options.pop('line_position', False))
        # If True, the positions in the returned parse tree consist of a line and a column instead of the index in the input string.
        # preprocess must happen after the options (after hideImplicit has been set)
        self.preprocess()

    def reset(self):
        self.input = ""
        self.memotable = {}
        self.failure = {}
        self.lrstack = None
        self.heads = {}
        self.countcard = {}

    def preprocess(self):
        # for elem in self.rules.keys():  # Changed by Daniel: we only check start because it's global
        elem = 'start'
        if elem in self.rules.keys():
            if ('interleave' in self.rules[elem]):
                ilist = self.rules[elem]['interleave']
                self.setHideImplicit(ilist, self.hideImplicit)
                self.interleave(self.rules[elem], ilist)

    def setHideImplicit(self, ilist, hide=False):
        if ilist:
            # ilist = ['?', '@rulename']
            rulename = ilist[1][1:]
            self.implictRuleName = rulename  # used to hide it from error reports later
            if rulename in self.rules:
                self.rules[rulename]['hidden'] = hide
                body = self.rules[rulename]['body']
                # body = [*, [| ,,,,]]
                elems = body[1][1:]
                self.implicitList = elems
                for elem in elems:
                    l = None
                    error = ''
                    if elem[0] == '@':
                        l = self.rules
                        error = ' rule not found in grammar rules.'
                    elif elem[0] == '$':
                        l = self.tokens
                        error = ' token not found in grammar tokens.'
                    # else: in this case it is an anonymous token
                    if l:
                        name = elem[1:]
                        if name in l:
                            if not 'hidden' in l[name]:
                                # This method will not override anything the user has explicitly specified in the structure:
                                # if there is already a hidden value it is kept, even if it is not the same one.
                                # An example use case is whitespace vs comments:
                                # both can appear anywhere in the text and so are implicit in the grammar,
                                # but we don't want spaces in the tree while we do want the comments.
                                l[name]['hidden'] = hide
                        else:
                            raise Exception(name + error)
                    # else: anonymous tokens can't be ignored for the moment, unless we create an ignore list for them or something similar.
            else:
                raise Exception(rulename + ' rule not found in grammar rules.')

    def interleave(self, elem, ilist):
        # Quick and simple interleaving method; it will probably produce some double interleaving,
        # but this is as simple as I could make it without taking every single case into account.
        def quickInterLeave(lst, inter):
            newL = []
            newL.append(lst[0])
            isSeq = self.isSequence(lst[0])
            for item in lst[1:]:
                if (isinstance(item, list)):  # a sublist
                    newL.append(quickInterLeave(item, inter))
                else:
                    if (item[0] == '@'):  # rule
                        rulename = item[1:]
                        if rulename in self.rules:
                            rule = self.rules[rulename]
                            if not 'visited' in rule or rule['visited'] == False:
                                self.interleave(rule, inter)
                        else:
                            raise Exception(rulename + ' rule not found in grammar rules.')
                    # else: it is a token or an anonymous token; nothing special to do,
                    # just add it to the list, interleaved.
                    if isSeq:  # no need to complicate the data structure if the list is a sequence
                        if not newL[-1] == inter:
                            newL.append(inter)
                        newL.append(item)
                        newL.append(inter)
                    else:
                        newL.append(['.', inter, item, inter])
                    # This way, if the list is not a sequence, the meaning of the list does not change.
                    # Example: t1, t2 are tokens, i is an optional whitespace being interleaved.
                    #   [., t1, t2] -> [., i, t1, i, t2]
                    # The meaning stays the same: t1 and t2 both have to be found for the rule to apply, regardless of the whitespace.
                    #   [|, t1, t2] -> [|, i, t1, i, t2]
                    # The meaning changed: if i is encountered the alternative is satisfied.
                    # So instead we do -> [|, [., i, t1, i], [., i, t2, i]]
                    # Note that while inter has been added to the data structure 4 times, it will only match
                    # for one option, so it is not really duplicated.
                    # Another way of writing this would be [., inter, [|, t1, t2], inter], but that is easier said than
                    # done, especially for big (complex) data structures.
            return newL

        # The first thing we do is mark the item as visited; this avoids infinite loops due to recursion.
        elem['visited'] = True
        if (not 'body' in elem):
            return
        ls = elem['body']
        newbody = quickInterLeave(ls, ilist)
        elem['body'] = newbody

    def parse(self, text):
        self.reset()  # Changed by Daniel: the text is received as a parameter instead of once at init, so first we reset the fields
        self.input = text
        results = self.applyrule('@start', 0)
        if len(results) > 1:
            # Handle ambiguity
            from hutn_compiler.prettyprint_visitor import PrettyPrintVisitor
            for p in results:
                print("===================================")
                print("VISIT RESULT")
                print("===================================")
                visitor = PrettyPrintVisitor([])
                visitor.visit(p["tree"])
                print(visitor.dump())
            result = self.generateErrorReport()
        elif (results == [] or results[0]['endpos'] < len(self.input)):
            result = self.generateErrorReport()
            for elem in result['partialresults']:  # Added by Daniel: there was no post-processing on partial results and I need it
                if elem['tree']:  # with partial results the tree can be None
                    elem['tree'] = IgnorePostProcessor(self.rules, self.tokens).visit(elem['tree'])
                    if self.linePosition:
                        # elem['tree'].startpos = 0
                        # elem['tree'].endpos = 0
                        elem['tree'] = Parser.PositionPostProcessor(self.convertToLineColumn).visit(elem['tree'])  # Added by Daniel
        elif len(results) == 1:
            result = results[0]
            result.update({'status': Parser.Constants.Success})
            if result['tree'].head != 'start':
                result['tree'] = Tree('start', [result['tree']], result['tree'].startpos, result['tree'].endpos)
            result['tree'] = IgnorePostProcessor(self.rules, self.tokens).visit(result['tree'])
            if self.linePosition:  # Added by Daniel
                result['tree'] = Parser.PositionPostProcessor(self.convertToLineColumn).visit(result['tree'])
        return result

    def generate_line_cache(self):
        global line_cache
        if self in line_cache:
            return
        line_cache[self] = []
        lc = line_cache[self]
        line = 1
        column = 0
        for i in self.input:
            if i == "\n":
                line += 1
                column = 0
            elif i == "\t":
                column += self.tabsize
            else:
                column += 1
            lc.append((line, column))

    def convertToLineColumn(self, pos):
        global line_cache
        self.generate_line_cache()
        if pos < len(line_cache[self]):
            return {'line': line_cache[self][pos][0], 'column': line_cache[self][pos][1]}
        else:
            return {'line': line_cache[self][-1][0], 'column': line_cache[self][-1][1] + 1}
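
    # Worked example for the position cache above (assuming tab_size=1): for the input "ab\ncd"
    # generate_line_cache() yields [(1, 1), (1, 2), (2, 0), (2, 1), (2, 2)], so
    # convertToLineColumn(3) == {'line': 2, 'column': 1} (the 'c' on the second line), and any
    # position past the end of the input reuses the last entry with its column incremented by one.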

    def findlargerresultat(self, pos):
        endpos = pos
        result = None
        for key in self.memotable.keys():
            elem = self.memotable[key]
            if (elem == []):
                continue
            if (elem[0]['startpos'] == pos and endpos < elem[0]['endpos']):
                endpos = elem[0]['endpos']
                result = elem[0]
        return result

    def generateErrorReport(self):
        # consult the memotable and collect contiguous results until endpos
        endpos = len(self.input) - 1
        pos = 0
        elems = []
        while pos <= endpos:
            elem = self.findlargerresultat(pos)
            if (not elem or (elem and elem['endpos'] == pos)):
                break
            pos = elem['endpos']
            elems.append(elem)
        if (pos <= endpos):
            elems.append({'tree': None, 'startpos': pos, 'endpos': endpos})
        elem = self.getFirstBiggestSpan(elems)
        if elem is None:
            return {'status': Parser.Constants.Failure, 'line': 0, 'column': 0, 'text': "Empty input file", 'partialresults': [], 'grammarelements': None}
        reasons = self.findFailure(elem['startpos'], elem['endpos'])
        if (reasons == []):
            pos -= 1
        else:
            pos = reasons[0]['startpos']
        read = self.input[pos:pos + 1]
        linecolumn = self.convertToLineColumn(pos)
        message = 'Syntax error at line ' + str(linecolumn['line']) + ' and column ' + str(linecolumn['column']) + '. '
        keys = []
        if (not reasons == []):
            first = True
            for reason in reasons:
                if (first):
                    message += 'Expected ' + reason['text']
                    first = False
                else:
                    message += ' or ' + reason['text']
                keys.append(reason['key'])
            message += '. Instead read: ' + repr(read) + '.'
        else:
            message += 'Read: \'' + read + '\'.'
        return {'status': Parser.Constants.Failure, 'line': linecolumn['line'], 'column': linecolumn['column'],
                'text': message, 'partialresults': elems, 'grammarelements': keys}
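
    # Shape of the failure report returned above (illustrative values only):
    #   {'status': Parser.Constants.Failure, 'line': 3, 'column': 7,
    #    'text': "Syntax error at line 3 and column 7. Expected ... . Instead read: 'x'.",
    #    'partialresults': [...], 'grammarelements': ['NAME', ...]}
    # parse() later post-processes 'partialresults' the same way as a successful parse tree.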

    def getFirstBiggestSpan(self, elems):
        biggestspan = 0
        result = None
        for elem in elems:
            span = elem['endpos'] - elem['startpos']
            if (biggestspan < span):
                result = elem
                biggestspan = span
        return result

    def findFailure(self, pos, endpos):
        posreasons = []
        endposreasons = []
        # Changed by Daniel:
        # * I need the key as well for autocomplete, so instead of appending elem I return a new dictionary with elem and the key inside.
        # * Checks both conditions, for posreasons and endposreasons, in one for loop instead of two.
        # * Does not consider keys that are hidden.
        for key in self.failure.keys():
            # Keys start either with $ for tokens or @ for rules.
            # However, with the given metagrammar tokens are all caps and rules are all lowercase, so there can't be an overlap
            # and we can safely test both.
            if self.hideImplicit and\
                    (('$' + key in self.implicitList) or ('@' + key in self.implicitList) or (key == self.implictRuleName)):
                continue
            else:
                elem = self.failure[key]
                if (elem['startpos'] == pos and not elem['text'] == ''):
                    posreasons.append({'key': key, 'startpos': elem['startpos'], 'text': elem['text']})
                if (elem['startpos'] == endpos and not elem['text'] == ''):
                    endposreasons.append({'key': key, 'startpos': elem['startpos'], 'text': elem['text']})
        if (len(endposreasons) < len(posreasons)):
            return posreasons
        else:
            return endposreasons

    def setupLR(self, rule, elem):
        if (elem.head == None):
            elem.head = Parser.Head(rule, [], [])
        s = self.lrstack
        while s and not s.rule == elem.head.rule:
            s.head = elem.head
            if (not s.rule in elem.head.involved):
                elem.head.involved.append(s.rule)
            s = s.next

    def recall(self, rule, j):
        newresults = []
        if ((rule, j) in self.memotable):
            newresults = self.memotable[(rule, j)]
        h = None
        if (j in self.heads):
            h = self.heads[j]
        if (not h):
            return newresults
        if (newresults == [] and not rule in (h.involved + [h.rule])):
            return []  # [{'tree': [], 'startpos': j, 'endpos': j}]
        if (rule in h.evaluation):
            h.evaluation.remove(rule)
            newresults = self.eval(rule, j)
            self.memotable.update({(rule, j): newresults})
        return newresults

    def applyrule(self, rule, j):
        overallresults = []
        newresults = self.recall(rule, j)
        if (not newresults == []):
            memoresults = []
            for elem in newresults:
                if (isinstance(elem['tree'], Parser.LR)):
                    self.setupLR(rule, elem['tree'])
                    memoresults += elem['tree'].seed
                else:
                    overallresults.append(elem)
            if (not memoresults == []):
                self.memotable.update({(rule, j): memoresults})
                return memoresults
            return overallresults
        else:
            # lr = Parser.LR([], rule, None, deepcopy(self.lrstack))
            lr = Parser.LR([], rule, None, None if not self.lrstack else self.lrstack.copy())
            self.lrstack = lr
            self.memotable.update({(rule, j): [{'tree': lr, 'startpos': j, 'endpos': j}]})
            newresults = self.eval(rule, j)
            self.lrstack = self.lrstack.next
            memoresults = []
            if ((rule, j) in self.memotable):
                memoresults = self.memotable[(rule, j)]
            for melem in memoresults:
                if (isinstance(melem['tree'], Parser.LR) and melem['tree'].head):
                    melem['tree'].seed = newresults
                    r = self.lr_answer(rule, j, melem)
                    if (not r == []):
                        overallresults += r
            if (overallresults != []):  # prefer grown results
                return overallresults
            self.memotable.update({(rule, j): newresults})
            return newresults

    def lr_answer(self, rule, pos, melem):
        h = melem['tree'].head
        if (not h.rule == rule):
            return melem['tree'].seed
        else:
            melems = melem['tree'].seed
            result = []
            for melem_i in melems:
                if (not melem_i['tree'] == None):
                    result.append(melem_i)
            if (result == []):
                return []
            else:
                newresult = []
                for melem_i in result:
                    newresult.append(self.growLR(rule, pos, melem_i, h))
                return newresult

    def growLR(self, rule, pos, melem, head=None):
        self.heads.update({pos: head})
        while (True):
            overallresults = []
            head.evaluation = deepcopy(head.involved)
            newresults = self.eval(rule, pos)
            for elem in newresults:
                if (elem['endpos'] > melem['endpos']):
                    melem = elem
                    overallresults.append(elem)
            if (overallresults == []):
                self.heads.update({pos: None})
                return melem
            self.memotable.update({(rule, pos): overallresults})
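
    # Note on the machinery above: recall/applyrule/lr_answer/growLR appear to follow the
    # seed-growing approach to left recursion in packrat parsers (cf. Warth et al.,
    # "Packrat Parsers Can Support Left Recursion"): a left-recursive application is first
    # seeded with an LR placeholder in the memotable, evaluated once, and then repeatedly
    # re-evaluated (growLR) as long as the match keeps getting longer; Head objects record
    # which rules are involved in the cycle so recall() knows when to re-evaluate instead of
    # trusting the memotable.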

    def eval(self, rulename, j):
        # Returns [{'tree': Tree(head=rulename, tail=[...], startpos=j, endpos=x), 'startpos': j, 'endpos': x}]
        # Raises Exception if there is no such token/rule
        if (rulename[0] == '@'):
            rulename = rulename[1:]
            if (not rulename in self.rules):
                raise Exception(rulename + ' rule not found in grammar rules.')
            rule = self.rules[rulename]
        elif (rulename[0] == '$'):
            rulename = rulename[1:]
            if (not rulename in self.tokens):
                raise Exception(rulename + ' token not found in grammar tokens.')
            rule = self.tokens[rulename]
        else:
            # raise Exception('Plain terminals not allowed inside grammar rules: ' + str(rulename))
            # We create an anonymous token rule.
            # We can write whatever we want as fake type, as long as it is not equal to the type of a production rule
            # or to that of a token.
            rule = {'type': 'anonymous_token'}
        if (self.isType(rule, Parser.Constants.Production)):
            newresults = []
            results = self.eval_body(rulename, rule['body'], j)
            for r in results:
                if (r['tree']):
                    head = r['tree'].head
                    if (head == '*' or head == '+' or head == '?' or head == '|' or head == '.'):
                        newr = {'tree': Tree(rulename, [r['tree']], r['startpos'], r['endpos']),
                                'startpos': r['startpos'], 'endpos': r['endpos']}
                        r = newr
                newresults.append(r)
        elif (self.isType(rule, Parser.Constants.Token)):
            newresults = self.term(rulename, j)
        else:  # Changed by Daniel: if it is not a production rule or a defined token, we try an anonymous token
            newresults = self.anonTerm(rulename, j)
        return newresults

    def eval_body(self, rulename, ls, j):
        # Delegates the task to sub-functions: alt, seq, opt, many, more, card
        # Returns the same list as eval
        # Raises Exception if the first element in the body is not in {'|', '.', '?', '*', '+', '#'}
        if (self.isAlternative(ls[0])):
            return self.alt(rulename, ls[1:], j)
        elif (self.isSequence(ls[0])):
            return self.seq(rulename, ls[1:], j)
        elif (self.isOptional(ls[0])):
            return self.opt(rulename, ls[1:], j)
        elif (self.isMany(ls[0])):
            return self.many(rulename, ls[1:], j)
        elif (self.isMore(ls[0])):
            return self.more(rulename, ls[1:], j)
        elif (self.isCard(ls[0])):
            return self.card(rulename, ls[0][1:], ls[1:], j)
        raise Exception('Unrecognized grammar expression: ' + str(ls[0]))

    def isSequence(self, operator):
        return operator == '.'

    def isAlternative(self, operator):
        return operator == '|'

    def isMany(self, operator):
        return operator == '*'

    def isCard(self, operator):
        return operator.startswith('#')

    def isMore(self, operator):
        return operator == '+'

    def isOptional(self, operator):
        return operator == '?'
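
    # Sketch of the body encoding dispatched on above (rule and token names are illustrative):
    #   ['.', '$NAME', '$COLON', '@type']       sequence
    #   ['|', '@int_type', '@string_type']      alternatives
    #   ['*', '$COMMA']                         zero or more
    #   ['+', '@stmt']                          one or more
    #   ['?', '$SEMICOLON']                     optional
    #   ['#(+1)indent', '@block']               cardinality (see card() below)
    # '$X' refers to a token, '@x' to a rule; any other string is an anonymous terminal.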

    def isType(self, rule, oftype):
        if (rule['type'] == oftype):
            return True

    def term(self, rulename, j):
        if (j >= len(self.input)):
            errortext = ''
            if (rulename in self.tokens and 'errortext' in self.tokens[rulename]):
                errortext = self.tokens[rulename]['errortext']
            self.failure.update({rulename: {'startpos': j, 'text': errortext}})
            return []
        rule = self.tokens[rulename]
        mobj = re.match(rule['reg'], self.input[j:])
        # Changed by Daniel: instead of re.match(reg) this did re.match(re.compile(reg).pattern).
        # This was done to avoid problems with backslashes: before this change, matching the character \ required [\\\\],
        # because writing only two backslashes would have to be r'[\\]', which can't be done directly in the grammar,
        # so it had to be in string form. This way [\\] is interpreted correctly instead of giving an error like it used to.
        if (not mobj):
            # this is a failure, nice to register!
            self.failure.update({rulename: {'startpos': j, 'text': self.tokens[rulename]['errortext']}})
            return []
        return [{'tree': Tree(rulename, [mobj.group()], j, j + mobj.end()), 'startpos': j, 'endpos': j + mobj.end()}]

    def anonTerm(self, term, j):
        """
        Changed by Daniel: added this whole method.
        Anonymous terminal, to allow direct terminals in rules
        (write 'Foo' directly instead of having to define a FOO token).
        """
        qt = '\''
        name = qt + term + qt
        if (j >= len(self.input)):
            self.failure.update({name: {'startpos': j, 'text': name}})
            return []
        mobj = re.match(term, self.input[j:])
        if (not mobj):
            # this is a failure, nice to register!
            self.failure.update({name: {'startpos': j, 'text': name}})
            return []
        return [{'tree': Tree(name, [mobj.group()], j, j + mobj.end()), 'startpos': j, 'endpos': j + mobj.end()}]
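
    # Anonymous terminal sketch: inside a rule body, a plain string that starts with neither '@'
    # nor '$' ends up here and is matched as a regular expression. For example, a (hypothetical)
    # body ['.', '@expr', '\\+', '@expr'] matches a literal '+' between two expressions without
    # declaring a PLUS token; the resulting node is headed by the quoted pattern ("'\\+'").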

    def many(self, rulename, ls, j):
        rule_i = ls[0]
        if (isinstance(rule_i, list)):
            results = self.eval_body('*', rule_i, j)
        else:
            results = self.applyrule(rule_i, j)
        if (results == []):
            return [{'tree': None, 'startpos': j, 'endpos': j}]
        seq = ['.'] + ls + [['*'] + ls]
        results = self.eval_body('*', seq, j)
        overall_results = []
        for r in results:
            if (r['tree']):
                if (len(r['tree'].tail) > 1):
                    left = r['tree'].tail[0]
                    right = r['tree'].tail[1].tail
                    r['tree'].tail = [left] + right
                overall_results.append(r)
        return overall_results

    def more(self, rulename, ls, j):
        rule_i = ls[0]
        if (isinstance(rule_i, list)):
            results = self.eval_body('+', rule_i, j)
        else:
            results = self.applyrule(rule_i, j)
        if (results == []):
            return []
        seq = ['.'] + ls + [['*'] + ls]
        results = self.eval_body('+', seq, j)
        overall_results = []
        for r in results:
            if (r['tree']):
                if (len(r['tree'].tail) > 1):
                    left = r['tree'].tail[0]
                    right = r['tree'].tail[1].tail
                    r['tree'].tail = [left] + right
                overall_results.append(r)
        return overall_results

    def opt(self, rulename, ls, j):
        if (j >= len(self.input)):
            errortext = ''
            if (rulename in self.rules and 'errortext' in self.rules[rulename]):
                errortext = self.rules[rulename]['errortext']
            else:
                for item in ls:
                    if ((not isinstance(item[1:], list)) and item[1:] in self.rules):
                        errortext = self.rules[item[1:]]['errortext']
            self.failure.update({rulename: {'startpos': j, 'text': errortext}})
            return [{'tree': None, 'startpos': j, 'endpos': j}]
        results = []
        rule_i = ls[0]
        if (isinstance(rule_i, list)):
            results = self.eval_body('?', rule_i, j)
        else:
            results = self.applyrule(rule_i, j)
        if (not results == []):
            return results
        # empty case
        return [{'tree': None, 'startpos': j, 'endpos': j}]

    def card(self, rulename, cardrule, ls, j):
        count = 0
        delta = 1
        # Cardinality forms: a#, a#(-1), #indent, #(-1)indent
        group = re.match(r'\((?P<delta>[-+]?\d+)\)(?P<rule>\S+)', cardrule)
        if (group):
            cardrule = group.group('rule')
            delta = int(group.group('delta'))
        if (not cardrule in self.countcard):
            count = delta
            self.countcard.update({cardrule: {j: count}})
        else:
            if not j in self.countcard[cardrule]:  # if we already know the count for j, then ignore it
                d = self.countcard[cardrule]
                lastcount = 0
                ## Performance bottleneck: linear scan over all positions before j
                # print("J: %s, d: %s" % (j, d))
                for i in range(0, j):
                    if i in d:
                        lastcount = d[i]
                # print("Result: " + str(lastcount))
                ## End of performance bottleneck
                new_lastcount = 0
                for i in sorted(d.keys()):
                    if i < j:
                        new_lastcount = d[i]
                    else:
                        break
                if lastcount != new_lastcount:
                    raise Exception("Inconsistent cardinality count for rule " + cardrule)
                ## End of new code (sanity check against the old computation)
                count = lastcount + delta
                d.update({j: count})
            else:
                count = self.countcard[cardrule][j]
        results = []
        rule_i = '@' + cardrule
        if (count == 0):
            results = [{'tree': None, 'startpos': j, 'endpos': j}]
        else:
            for i in range(0, count):
                if (results == []):
                    if (isinstance(rule_i, list)):
                        newresults = self.eval_body(rulename, rule_i, j)
                    else:
                        newresults = self.applyrule(rule_i, j)
                    if (newresults == []):
                        del self.countcard[cardrule][j]
                        return []
                    newresults = self.merge(rulename, newresults, {'startpos': j, 'endpos': j})
                else:
                    for elem_p in results:
                        if (isinstance(rule_i, list)):
                            newresults = self.eval_body(rulename, rule_i, elem_p['endpos'])
                        else:
                            newresults = self.applyrule(rule_i, elem_p['endpos'])
                        if (newresults == []):
                            del self.countcard[cardrule][j]
                            return []
                        newresults = self.merge(rulename, newresults, elem_p)
                results = newresults
        for rule_i in ls:
            for elem_p in results:
                if (isinstance(rule_i, list)):
                    newresults = self.eval_body(rulename, rule_i, elem_p['endpos'])
                else:
                    newresults = self.applyrule(rule_i, elem_p['endpos'])
                if (newresults == []):
                    del self.countcard[cardrule][j]
                    return []
                newresults = self.merge(rulename, newresults, elem_p)
            results = newresults
        del self.countcard[cardrule][j]
        return results
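
    # Cardinality sketch (as implemented above): for '#cardrule', '@cardrule' is matched `count`
    # times at position j, where count = (count recorded at the nearest earlier position) + delta;
    # delta comes from an optional '(+n)'/'(-n)' prefix and defaults to 1. The per-position counts
    # live in self.countcard and are removed again once the expansion at j has been resolved.
    # A typical (assumed) use is indentation-sensitive layout, where the count tracks nesting depth.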

    def seq(self, rulename, ls, j):
        results = []
        for rule_i in ls:
            if (results == []):
                if (isinstance(rule_i, list)):
                    newresults = self.eval_body('.', rule_i, j)
                else:
                    newresults = self.applyrule(rule_i, j)
                if (newresults == []):
                    return []
                newresults = self.merge('.', newresults, {'startpos': j, 'endpos': j})
            else:
                for elem_p in results:
                    if (isinstance(rule_i, list)):
                        newresults = self.eval_body('.', rule_i, elem_p['endpos'])
                    else:
                        newresults = self.applyrule(rule_i, elem_p['endpos'])
                    if (newresults == []):
                        return []
                    newresults = self.merge('.', newresults, elem_p)
            results = newresults
        return results

    def merge(self, rulename, newres, elem_p):
        # Brief: the tail of each new tree is prepended with the tail of the previous tree.
        # rulename: becomes the head of each tree in the returned list
        # newres: may contain more than one tree in case of the alt operator: 'x' ('a' | 'b') 'y'
        # Returns the same list as eval: [{'tree': Tree(head=rulename, tail=[...], startpos=j, endpos=x), 'startpos': j, 'endpos': x}]
        results = []
        for elem_n in newres:
            tail = []
            if ('tree' in elem_p and elem_p['tree']):
                tail += elem_p['tree'].tail
            if ('tree' in elem_n and elem_n['tree']):
                tail.append(elem_n['tree'])
            value = {'tree': Tree(rulename, tail, elem_p['startpos'], elem_n['endpos']),
                     'startpos': elem_p['startpos'], 'endpos': elem_n['endpos']}
            results += [value]
        return results
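
    # Merge example (illustrative): merging a previous partial result whose tree tail is [t1] with
    # a new result whose tree is t2 yields one result whose tree is
    # Tree(rulename, [t1, t2], <startpos of the old result>, <endpos of the new one>); when newres
    # holds several alternatives, one merged result is produced per alternative.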

    def alt(self, rulename, ls, j):
        # Evaluates all alternatives using eval_body or applyrule.
        # Returns the same list as eval: [{'tree': Tree(head=rulename, tail=[...], startpos=j, endpos=x), 'startpos': j, 'endpos': x}]
        overall_results = []
        for rule_i in ls:
            if (isinstance(rule_i, list)):
                newresults = self.eval_body('|', rule_i, j)
            else:
                newresults = self.applyrule(rule_i, j)
            overall_results += newresults
        return overall_results

    class PositionPostProcessor(object):
        """
        Changes the absolute position (the index in the parsed string) to a (line, column) position.
        Added by Daniel.

        Efficiency note: this might slow things down quite a bit, since a position has to be calculated for every node.
        1) An alternative would be to use the method only for the leaves and then traverse the tree bottom-up to create
           the interval from the leftmost and rightmost children of each subtree; but since that involves an extra tree
           traversal it might not help that much.
        2) Another possible improvement is to change the position-calculating method: create one that doesn't scan the
           whole text for newlines each time a position is calculated, but builds a table of them the first time.
           The line is then the index in the table of the newline closest to the given position, and the column is the
           difference between the position of that newline and the given position (maybe +/- 1, check that).
        If this method doesn't slow things down too much, ignore this.
        """
        def __init__(self, method):
            self.calcPosMethod = method

        def inner_visit(self, tree):
            startDic = self.calcPosMethod(tree.startpos)
            endDic = self.calcPosMethod(tree.endpos)
            tree.startpos = Position(startDic["line"], startDic["column"])
            tree.endpos = Position(endDic["line"], endDic["column"])
            for item in tree.tail:
                if (isinstance(item, Tree)):
                    self.inner_visit(item)

        def visit(self, tree):
            if tree:
                self.inner_visit(tree)
            return tree

    class DefaultPrinter(object):
        def __init__(self, output='console'):
            self.outputStream = ''
            self.output = output

        def inner_visit(self, tree):
            for item in tree.tail:
                if (isinstance(item, Tree)):
                    self.inner_visit(item)
                else:
                    self.outputStream += item

        def visit(self, tree):
            self.inner_visit(tree)
            if (self.output == 'console'):
                print(self.outputStream)

    class PrettyPrinter(object):
        def __init__(self, output='console'):
            self.outputStream = ''
            self.output = output
            self.tabcount = -1

        def tab(self):
            tabspace = ''
            for i in range(0, self.tabcount):
                tabspace += ' '
            return tabspace

        def inner_visit(self, tree):
            self.tabcount += 1
            self.outputStream += self.tab()
            self.outputStream += 'node ' + tree.head + ':\n'
            for item in tree.tail:
                if (isinstance(item, Tree)):
                    self.inner_visit(item)
                else:
                    self.tabcount += 1
                    self.outputStream += self.tab() + item + ' @' + str(tree.startpos) + ' to ' + str(tree.endpos) + ' \n'
                    self.tabcount -= 1
            self.tabcount -= 1

        def visit(self, tree):
            self.inner_visit(tree)
            if (self.output == 'console'):
                print(self.outputStream)


class IgnorePostProcessor(object):
    def __init__(self, rules, tokens):
        self.rules = rules
        self.tokens = tokens

    def inner_visit(self, tree):
        results = []
        if (isinstance(tree, Tree)):
            if (self.isHidden(tree.head)):
                for item in tree.tail:
                    ivlist = []
                    ivresult = self.inner_visit(item)
                    for elem in ivresult:
                        if (isinstance(elem, Tree)):
                            ivlist += [elem]
                    results += ivlist
            else:
                tlist = []
                for item in tree.tail:
                    tlist += self.inner_visit(item)
                tree.tail = tlist
                results += [tree]
            return results
        return [tree]

    def visit(self, tree):
        # start cannot be hidden
        tlist = []
        for item in tree.tail:
            tlist += self.inner_visit(item)
        tree.tail = tlist
        return tree

    def isHidden(self, head):
        if (head == '*' or head == '+' or head == '?' or head == '|' or head == '.'):
            return True
        if (head in self.rules):
            return 'hidden' in self.rules[head] and self.rules[head]['hidden']
        elif (head in self.tokens):  # Changed by Daniel: added the elif condition and return False otherwise, needed for anonymous tokens
            return 'hidden' in self.tokens[head] and self.tokens[head]['hidden']
        else:
            return False