gavo.grammars.freeregrammar

"""
A grammar based on repeated application of REs
"""

#c Copyright 2008-2019, the GAVO project
#c
#c This program is free software, covered by the GNU GPL. See the
#c COPYING file in the source distribution.


import re

from gavo import base
from gavo.grammars import common
from gavo.grammars import regrammar


# Matches (via .match) only when nothing but whitespace remains up to the
# end of the string.  RowIterator._iterRecords applies it to whatever is
# left in its buffer after the last record to tell harmless trailing
# whitespace from junk.
_onlyWhitespaceLeft = re.compile(r"\s*$")

class RowIterator(common.FileRowIterator):
    """A row iterator that cuts records out of the input file using the
    grammar's rowProduction and turns each of them into a dictionary
    using the grammar's parseRE.
    """
    # Size argument passed to each inputFile.read() call.
    chunkSize = 8192

    def _iterRecords(self):
        """Yield the matched text of each record found in self.inputFile.

        Input is pulled in pieces of chunkSize.  Whenever rowProduction
        does not match on what is currently buffered, more input is
        appended and matching is retried; hence a record may span several
        chunks.

        If the grammar's ignoreJunk is set, records are located with
        pattern.search and anything between them is skipped.  Otherwise
        pattern.match is used, i.e., each record has to start exactly
        where the previous one ended.

        self.curLine is advanced by the number of newlines consumed.

        Raises common.ParseError if ignoreJunk is false and something
        other than whitespace is left once the input is exhausted.
        """
        curPos, pending = 0, ""
        recPat = self.grammar.rowProduction

        if self.grammar.ignoreJunk:
            getNext = recPat.search
        else:
            getNext = recPat.match

        while True:
            mat = getNext(pending, curPos)
            if not mat:
                # No (complete) record in what we have: fetch more input.
                newStuff = self.inputFile.read(self.chunkSize)
                if not newStuff:
                    # file exhausted
                    break
                # Drop what has been consumed, keep the unmatched tail.
                pending = pending[curPos:]+newStuff
                curPos = 0
                continue

            # When searching, the match may start after curPos; the lines
            # of the skipped junk must be counted, too, or the locator
            # would lag behind.  With pattern.match, this span is empty.
            self.curLine += pending.count("\n", curPos, mat.start())

            res = mat.group()
            yield res
            curPos = mat.end()
            self.curLine += res.count("\n")

        leftover = pending[curPos:]
        if (not self.grammar.ignoreJunk
                and not _onlyWhitespaceLeft.match(leftover)):
            raise common.ParseError("Junk at end of file", self.getLocator(),
                leftover)

    def _iterRows(self):
        """Yield one dictionary per record delivered by _iterRecords.

        The dictionary maps the names of parseRE's named groups to the
        strings they matched; groups that did not take part in the match
        map to None.  If the grammar's stripTokens is set, surrounding
        whitespace is removed from all string values.

        Raises common.ParseError if parseRE does not match at the start
        of a record.
        """
        for rawRec in self._iterRecords():
            # Only the match itself is guarded: a failed match returns
            # None, and the attribute access on it raises AttributeError.
            try:
                res = self.grammar.parseRE.match(rawRec).groupdict()
            except AttributeError:
                raise base.ui.logOldExc(
                    common.ParseError("Malformed input, parseRE did not match.",
                        self.getLocator(), rawRec))

            if self.grammar.stripTokens:
                # Non-participating groups are None and cannot be stripped.
                res = dict(
                    (k, v.strip() if v is not None else v)
                    for k, v in res.items())
            yield res

    def getLocator(self):
        """Return a string naming the source token and the current line."""
        return "%s, line %d"%(self.sourceToken, self.curLine)

64 65

class FreeREGrammar(common.Grammar):
    r"""A grammar allowing "free" regular expressions to parse a document.

    Basically, you give a rowProduction to match individual records in the
    document.  All matches of rowProduction will then be matched with
    parseRE, which in turn must have named groups.  The dictionary from
    named groups to their matches makes up the input row.

    For writing the parseRE, we recommend writing an element, using a
    CDATA construct, and taking advantage of python's "verbose" regular
    expressions.  Here's an example::

        <parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
            ^query::(?P<query>.*)
            ^description::(?P<description>.*)\.\.
        ]]></parseRE>
    """
    # The docstring is a raw string because the example contains
    # backslashes that are not valid Python escape sequences.
    name_ = "freeREGrammar"

    # Pattern delimiting a record; the default matches one non-empty line
    # including its terminating newline.
    _rowProduction = regrammar.REAttribute("rowProduction",
        default=re.compile(r"(?m)^.+$\n"), description="RE matching a complete"
        " record.")
    # Pattern applied to each record; its named groups become the row keys.
    _parseRE = regrammar.REAttribute("parseRE", default=base.Undefined,
        description="RE containing named groups matching a record")
    _stripTokens = base.BooleanAttribute("stripTokens", default=False,
        description="Strip whitespace from result tokens?")
    _ignoreJunk = base.BooleanAttribute("ignoreJunk", default=False,
        description="Ignore everything outside of the row production")
    rowIterator = RowIterator

95

Source Code for Module gavo.grammars.freeregrammar