1 """
2 A grammar based on repeated application of REs
3 """
4
5
6
7
8
9
10
11 import re
12
13 from gavo import base
14 from gavo.grammars import common
15 from gavo.grammars import regrammar
16
17
# Matches text that consists of nothing but (possibly zero) whitespace up to
# its end; used to decide whether input left over after the last record
# counts as junk.
_onlyWhitespaceLeft = re.compile(r"\s*$")
19
# Size argument passed to inputFile.read() each time the record scanner
# needs more input.
chunkSize = 8192
22
def _iterRecords(self):
    """Yield the raw text of each record found in the input file.

    The input is read in pieces of self.chunkSize and scanned with the
    grammar's rowProduction.  If the grammar's ignoreJunk is set, text
    between records is skipped (search semantics); otherwise each record
    must start exactly where the previous one ended (match semantics),
    and anything but whitespace left over at the end of the input raises
    common.ParseError.

    A match that reaches the end of the text read so far is only
    accepted once the input is exhausted; before that, more input is
    fetched and the match is retried, so the records found do not depend
    on where chunk boundaries happen to fall.

    self.curLine is advanced by the newlines of skipped junk before a
    record is yielded and by the newlines of the record itself only
    after the consumer resumes the generator.  While a record is being
    processed, curLine thus counts the lines preceding it.
    """
    grammar = self.grammar
    recPat = grammar.rowProduction
    getNext = recPat.search if grammar.ignoreJunk else recPat.match

    # pending holds input that has been read but not yet consumed;
    # curPos is the offset of the first unconsumed character in it.
    curPos, pending = 0, ""
    exhausted = False

    while True:
        mat = getNext(pending, curPos)

        # No match yet, or a match touching the end of what we have read:
        # the latter may just be the head of a longer record continuing
        # in the next chunk, so fetch more input before trusting it.
        if mat is None or (not exhausted and mat.end() == len(pending)):
            newStuff = self.inputFile.read(self.chunkSize)
            if newStuff:
                pending = pending[curPos:] + newStuff
                curPos = 0
                continue
            exhausted = True
            if mat is None:
                break

        # Junk skipped by search() contributes to the line count, too
        # (with match(), start() equals curPos and this adds nothing).
        self.curLine += pending.count("\n", curPos, mat.start())

        res = mat.group()
        yield res
        curPos = mat.end()
        self.curLine += res.count("\n")

    pending = pending[curPos:]
    if not grammar.ignoreJunk and not _onlyWhitespaceLeft.match(pending):
        raise common.ParseError("Junk at end of file", self.getLocator(),
            pending)
49
def _iterRows(self):
    """Yield one dictionary of parseRE's named groups per record.

    Every record delivered by self._iterRecords() is matched against the
    grammar's parseRE using match(), i.e., anchored at the start of the
    record.  If the grammar's stripTokens is set, surrounding whitespace
    is removed from every captured value; groups that did not take part
    in the match stay None.

    Raises common.ParseError if parseRE does not match a record.

    NOTE(review): the def line of this method was not part of the
    reviewed text; the name _iterRows is an assumption -- confirm
    against the base class.
    """
    parseRE = self.grammar.parseRE
    stripTokens = self.grammar.stripTokens

    for rawRec in self._iterRecords():
        # Only the match itself is guarded: a failed match returns None,
        # and None.groupdict() is the AttributeError we translate.
        # Errors raised further down must not be misreported as a
        # non-matching parseRE.
        try:
            res = parseRE.match(rawRec).groupdict()
        except AttributeError:
            raise base.ui.logOldExc(
                common.ParseError("Malformed input, parseRE did not match.",
                    self.getLocator(), rawRec))

        if stripTokens:
            # Optional groups that did not participate are None and
            # cannot be stripped.
            res = {key: value if value is None else value.strip()
                for key, value in res.items()}
        yield res
61
def getLocator(self):
    """Return a string naming the source token and the current line count.

    The result is what this iterator passes as the location when it
    raises a parse error.
    """
    position = (self.sourceToken, self.curLine)
    return "%s, line %d" % position
64
65
"""A grammar allowing "free" regular expressions to parse a document.

You give a rowProduction that matches the individual records in the
document.  Every match of rowProduction is then matched against
parseRE, which in turn must have named groups.  The dictionary mapping
the group names to what they matched makes up the input row.

For writing the parseRE, we recommend writing an element, using a
CDATA construct, and taking advantage of python's "verbose" regular
expressions.  Here's an example::

    <parseRE><![CDATA[(?xsm)^name::(?P<name>.*)
        ^query::(?P<query>.*)
        ^description::(?P<description>.*)\.\.
    ]]></parseRE>
"""
# Name under which this grammar is registered (presumably the element name
# used in resource descriptors -- confirm).
name_ = "freeREGrammar"

# RE delimiting a single record; the default treats every non-empty line,
# including its terminating newline, as one record.
_rowProduction = regrammar.REAttribute("rowProduction",
    default=re.compile(r"(?m)^.+$\n"), description="RE matching a complete"
    " record.")
# RE applied to each record; its named groups become the keys of the row.
# The default is base.Undefined (presumably this makes the attribute
# mandatory -- confirm).
_parseRE = regrammar.REAttribute("parseRE", default=base.Undefined,
    description="RE containing named groups matching a record")
# Whether whitespace around the captured values is removed; off by default.
_stripTokens = base.BooleanAttribute("stripTokens", default=False,
    description="Strip whitespace from result tokens?")
# Whether text not covered by rowProduction is skipped rather than treated
# as an error; off by default.
_ignoreJunk = base.BooleanAttribute("ignoreJunk", default=False,
    description="Ignore everything outside of the row production")
# The row iterator class that goes with this grammar.
rowIterator = RowIterator
95