Source code for gavo.grammars.freeregrammar

"""
A grammar based on repeated application of REs
"""

#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


import re

from gavo import base
from gavo import utils
from gavo.grammars import common
from gavo.grammars import regrammar


# Matches (from the start of the string it is applied to) when nothing but
# whitespace is left up to the end; RowIterator._iterRecords uses it to
# decide whether unconsumed input at the end of a file counts as junk.
_onlyWhitespaceLeft = re.compile(r"\s*$")

class RowIterator(common.FileRowIterator):
    """A row iterator cutting the input into records with the grammar's
    rowProduction and turning each record into a dictionary using the
    grammar's parseRE.

    The input file is read in pieces of chunkSize characters, so the whole
    document never needs to be in memory at once.
    """
    # number of characters requested from the input file per read
    chunkSize = 8192

    def _iterRecords(self):
        """Yield the text of each successive rowProduction match.

        With the grammar's ignoreJunk set, the pattern is searched for, so
        material between records is skipped; otherwise each record must
        start exactly where the previous one ended.

        self.curLine is advanced by the number of newlines in each yielded
        record.

        Raises utils.SourceParseError if ignoreJunk is false and something
        other than whitespace remains once the input is exhausted.
        """
        curPos, buffer = 0, ""
        recPat = self.grammar.rowProduction
        if self.grammar.ignoreJunk:
            getNext = recPat.search
        else:
            getNext = recPat.match

        while True:
            mat = getNext(buffer, curPos)
            if not mat:
                # no match in what we have buffered: fetch new stuff.
                newStuff = self.inputFile.read(self.chunkSize)
                if not newStuff:
                    # file exhausted
                    break
                # drop what was already consumed before appending
                buffer = buffer[curPos:]+newStuff
                curPos = 0
                continue

            # NOTE(review): a match is yielded as soon as it is found in
            # the buffered text; it looks like a rowProduction that can
            # match a prefix of a record might get cut at a chunk boundary
            # -- confirm that productions are expected to end on an
            # explicit terminator (as the default one does with \n).
            res = mat.group()
            yield res
            curPos = mat.end()
            self.curLine += res.count("\n")

        buffer = buffer[curPos:]
        if (not self.grammar.ignoreJunk
                and not _onlyWhitespaceLeft.match(buffer)):
            raise utils.SourceParseError("Junk at end of file",
                location=self.getLocator(),
                offending=buffer)

    def _iterRows(self):
        """Yield one dictionary per record, mapping the names of parseRE's
        named groups to the text they matched.

        Groups that did not take part in the match map to None.  With the
        grammar's stripTokens set, surrounding whitespace is removed from
        all matched values.

        Raises utils.SourceParseError if parseRE does not match a record.
        """
        parseRE = self.grammar.parseRE
        stripTokens = self.grammar.stripTokens

        for rawRec in self._iterRecords():
            # Only the match itself is guarded: a failed match returns None,
            # and the resulting AttributeError is what signals malformed
            # input.  Stripping and yielding stay outside so that unrelated
            # AttributeErrors are not misreported as a failed match.
            try:
                res = parseRE.match(rawRec).groupdict()
            except AttributeError:
                raise base.ui.logOldExc(
                    utils.SourceParseError(
                        "Malformed input, parseRE did not match.",
                        location=self.getLocator(),
                        offending=rawRec))

            if stripTokens:
                # optional groups that did not participate are None and
                # have nothing to strip
                res = {k: v if v is None else v.strip()
                    for k, v in res.items()}
            yield res

    def getLocator(self):
        """Return a string giving the source token and the current line
        for use in error messages.
        """
        return "%s, line %d"%(self.sourceToken, self.curLine)
# The docstring is a raw string because the example RE contains
# backslashes (\.) that are not valid escapes in an ordinary string literal.
class FreeREGrammar(common.Grammar):
    r"""A grammar allowing "free" regular expressions to parse a document.

    Basically, you give a rowProduction to match individual records in the
    document.  All matches of rowProduction will then be matched with
    parseRE, which in turn must have named groups.  The dictionary from
    named groups to their matches makes up the input row.

    For writing the parseRE, we recommend writing an element, using a
    CDATA construct, and taking advantage of python's "verbose" regular
    expressions.  Here's an example::

        <parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
            ^query::(?P<query>.*)
            ^description::(?P<description>.*)\.\.
        ]]></parseRE>
    """
    name_ = "freeREGrammar"

    # RE delimiting one record; the default takes each non-empty line
    # including its terminating newline.
    _rowProduction = regrammar.REAttribute("rowProduction",
        default=re.compile(r"(?m)^.+$\n"),
        description="RE matching a complete record.")

    # RE applied to each record; its named groups become the row's keys.
    # There is no usable default (base.Undefined).
    _parseRE = regrammar.REAttribute("parseRE",
        default=base.Undefined,
        description="RE containing named groups matching a record")

    # Whether the row iterator strips whitespace from the matched values.
    _stripTokens = base.BooleanAttribute("stripTokens",
        default=False,
        description="Strip whitespace from result tokens?")

    # Whether rowProduction is searched for (skipping material between
    # records) rather than matched at the current position.
    _ignoreJunk = base.BooleanAttribute("ignoreJunk",
        default=False,
        description="Ignore everything outside of the row production")

    rowIterator = RowIterator