Package gavo :: Package grammars :: Module freeregrammar
[frames | no frames]

Source Code for Module gavo.grammars.freeregrammar

 1  """ 
 2  A grammar based on repeated application of REs 
 3  """ 
 4   
 5  #c Copyright 2008-2019, the GAVO project 
 6  #c 
 7  #c This program is free software, covered by the GNU GPL.  See the 
 8  #c COPYING file in the source distribution. 
 9   
10   
11  import re 
12   
13  from gavo import base 
14  from gavo.grammars import common 
15  from gavo.grammars import regrammar 
16   
17   
# Matches if nothing but (optional) whitespace is left up to the end of
# the string; the row iterator uses it to decide whether text remaining
# after the last record counts as junk.
_onlyWhitespaceLeft = re.compile(r"\s*$")
19   
class RowIterator(common.FileRowIterator):
    """A row iterator cutting records out of the input file with the
    grammar's rowProduction and turning each record into a dictionary
    using the named groups of the grammar's parseRE.
    """
    # size argument passed to each inputFile.read call
    chunkSize = 8192

    def _iterRecords(self):
        """Yield the text of each match of rowProduction in the input.

        The input is read in pieces of chunkSize.  With the grammar's
        ignoreJunk set, text between records is skipped (rowProduction is
        applied with search); otherwise each record must start where the
        previous one ended (match).

        self.curLine is advanced by the newlines of a record only after
        that record has been yielded, so getLocator still refers to the
        record while a consumer processes it.

        Raises common.ParseError if ignoreJunk is false and anything but
        whitespace remains after the last record.
        """
        recPat = self.grammar.rowProduction
        if self.grammar.ignoreJunk:
            getNext = recPat.search
        else:
            getNext = recPat.match

        pending, curPos, exhausted = "", 0, False

        while True:
            mat = getNext(pending, curPos)

            # A match is only usable if it consumed something (an empty
            # match would never advance curPos and loop forever) and if it
            # cannot change by reading on: a match touching the end of the
            # buffered text may be a record cut off at a chunk boundary.
            if (mat is not None
                    and mat.end() > mat.start()
                    and (exhausted or mat.end() < len(pending))):
                # Newlines in junk skipped by search belong to the input
                # too; count them so the locator does not drift.
                self.curLine += pending.count("\n", curPos, mat.start())
                res = mat.group()
                yield res
                curPos = mat.end()
                self.curLine += res.count("\n")
                continue

            if exhausted:
                break

            newStuff = self.inputFile.read(self.chunkSize)
            if newStuff:
                # drop what has been consumed, keep the unparsed rest
                pending = pending[curPos:] + newStuff
                curPos = 0
            else:
                # End of file: try once more, now accepting matches that
                # reach the end of the buffered text.
                exhausted = True

        leftover = pending[curPos:]
        if (not self.grammar.ignoreJunk
                and not _onlyWhitespaceLeft.match(leftover)):
            raise common.ParseError("Junk at end of file", self.getLocator(),
                leftover)

    def _iterRows(self):
        """Yield a dictionary mapping parseRE's group names to the
        matched strings for each record from _iterRecords.

        If the grammar's stripTokens is set, surrounding whitespace is
        removed from the values; groups that did not take part in the
        match stay None.

        Raises common.ParseError if parseRE does not match a record.
        """
        stripTokens = self.grammar.stripTokens

        for rawRec in self._iterRecords():
            # Keep the try block minimal: only a failed match (None has no
            # groupdict) must be reported as malformed input.
            try:
                res = self.grammar.parseRE.match(rawRec).groupdict()
            except AttributeError:
                raise base.ui.logOldExc(
                    common.ParseError("Malformed input, parseRE did not match.",
                        self.getLocator(), rawRec))

            if stripTokens:
                res = dict(
                    (key, value if value is None else value.strip())
                    for key, value in res.items())
            yield res

    def getLocator(self):
        """Return a string naming the source token and the current line."""
        return "%s, line %d"%(self.sourceToken, self.curLine)
64 65
class FreeREGrammar(common.Grammar):
    r"""A grammar allowing "free" regular expressions to parse a document.

    Basically, you give a rowProduction to match individual records in the
    document.  All matches of rowProduction will then be matched with
    parseRE, which in turn must have named groups.  The dictionary from
    named groups to their matches makes up the input row.

    For writing the parseRE, we recommend writing an element, using a
    CDATA construct, and taking advantage of python's "verbose" regular
    expressions.  Here's an example::

        <parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
            ^query::(?P<query>.*)
            ^description::(?P<description>.*)\.\.
        ]]></parseRE>
    """
    # NOTE: the docstring is a raw string because the example contains
    # backslashes; without the r prefix, "\." is an invalid escape sequence.
    name_ = "freeREGrammar"

    # RE cutting the input into records; the default takes each non-empty
    # line, including its terminating newline, as one record.
    _rowProduction = regrammar.REAttribute("rowProduction",
        default=re.compile(r"(?m)^.+$\n"), description="RE matching a complete"
        " record.")
    # RE applied to each record; it has no usable default (base.Undefined).
    _parseRE = regrammar.REAttribute("parseRE", default=base.Undefined,
        description="RE containing named groups matching a record")
    _stripTokens = base.BooleanAttribute("stripTokens", default=False,
        description="Strip whitespace from result tokens?")
    _ignoreJunk = base.BooleanAttribute("ignoreJunk", default=False,
        description="Ignore everything outside of the row production")

    # class producing the rows for this grammar
    rowIterator = RowIterator
95