1 """
A grammar splitting the input file into records and records into fields
using REs.
4 """
5
6
7
8
9
10
11
12 import re
13
14 from gavo import base
15 from gavo.grammars.common import (
16 Grammar, FileRowIterator, FileRowAttributes, REAttribute)
17
18
class REIterator(FileRowIterator):
    """A row iterator that splits its input using regular expressions.

    The input file is cut into raw records at matches of the grammar's
    ``recordSep``; each record is then cut into fields at matches of the
    grammar's ``fieldSep`` and combined with the grammar's ``names`` into
    a dictionary.

    NOTE(review): the ``class`` and ``def`` header lines were missing from
    the reviewed source.  ``REIterator``, ``_iterInRecords``, ``_makeRec``
    and ``getLocator`` are taken from their uses in this file; the base
    class ``FileRowIterator`` and the method name ``_iterRows`` are
    reconstructed -- confirm against the framework.
    """
    # Amount of data requested from the input file per read() call.
    chunkSize = 8192

    def _iterInRecords(self):
        """Yield the raw records of the input file as stripped strings.

        The first ``grammar.topIgnoredLines`` lines are skipped.  After
        that, the file is read in pieces of ``chunkSize`` and cut at
        matches of ``grammar.recordSep``.  If ``grammar.commentPat`` is
        set, everything it matches is removed from the buffer after each
        read.  Iteration ends at the end of the file or as soon as a
        record matches ``grammar.stopPat``.

        ``self.curLine`` is advanced by the number of skipped lines and by
        the newlines found in separators and records.
        """
        for _ in range(self.grammar.topIgnoredLines):
            self.inputFile.readline()
            self.curLine += 1

        curPos = 0
        splitPat = self.grammar.recordSep
        buf = ""
        while True:
            mat = splitPat.search(buf, curPos)
            if not mat:
                # No complete record left in the buffer: fetch more input.
                newStuff = self.inputFile.read(self.chunkSize)
                if not newStuff:
                    break
                # Drop what has already been delivered, keep the unfinished
                # tail and append the fresh data.
                buf = buf[curPos:]+newStuff
                curPos = 0
                if self.grammar.commentPat:
                    buf = self.grammar.commentPat.sub("", buf)
                continue

            self.curLine += mat.group().count("\n")
            res = buf[curPos:mat.start()]

            if self.grammar.stopPat and self.grammar.stopPat.match(res):
                return

            yield res.strip()
            curPos = mat.end()
            # Only counted after the yield, so the consumer still sees the
            # line count as of the start of the record it is processing.
            self.curLine += res.count("\n")

        # Whatever follows the last separator is a final record, unless it
        # is empty or matches the stop pattern.
        res = buf[curPos:].strip()
        if res and not (
                self.grammar.stopPat and self.grammar.stopPat.match(res)):
            yield res

    def _iterRows(self):
        """Yield one dictionary per record of the input file.

        Records for which ``_makeRec`` raises ``base.SkipThis`` are
        silently dropped.  The input file is closed when iteration ends,
        including when the consumer abandons the generator or a record
        fails to parse; the reference to the grammar is dropped once the
        input is exhausted.
        """
        try:
            for rawRec in self._iterInRecords():
                try:
                    res = self._makeRec(rawRec)
                except base.SkipThis:
                    continue
                yield res
        finally:
            # Without this, the file would stay open whenever the loop is
            # left through an exception or generator close.
            self.inputFile.close()
        self.grammar = None

    def _makeRec(self, inputLine):
        """Return a dictionary mapping the grammar's names to the fields
        of inputLine.

        If ``grammar.recordCleaner`` is set, inputLine must match it, and
        the groups of the match, joined by blanks, replace inputLine.
        The result is then split at ``grammar.fieldSep``.

        Raises ``base.SourceParseError`` if inputLine does not match the
        record cleaner or if, for a non-lax grammar, the number of fields
        differs from the number of names.  Raises ``base.SkipThis`` for
        records that are empty or consist only of whitespace.

        For lax grammars, surplus fields or surplus names are dropped
        by ``zip``.
        """
        if self.grammar.recordCleaner:
            cleanMat = self.grammar.recordCleaner.match(inputLine)
            if not cleanMat:
                raise base.SourceParseError(
                    "'%s' does not match cleaner"%inputLine,
                    source=str(self.sourceToken),
                    location=self.getLocator())
            # Groups that did not take part in the match are None and
            # would make join fail; leave them out.
            inputLine = " ".join(
                group for group in cleanMat.groups() if group is not None)

        if not inputLine.strip():
            raise base.SkipThis("Empty line")

        fields = self.grammar.fieldSep.split(inputLine)
        if not self.grammar.lax and len(fields)!=len(self.grammar.names):
            raise base.SourceParseError("%d fields found, expected %d"%(
                len(fields), len(self.grammar.names)),
                source=self.sourceToken,
                location=self.getLocator(),
                hint="reGrammars need the same number of input fields in each line,"
                " and that number has to match the number of tokens in the names"
                " attribute.  If that's not true for your input but it still"
                " makes sense, add lax='True' to your grammar.")
        return dict(zip(self.grammar.names, fields))

    def getLocator(self):
        """Return a string describing the current position in the input,
        based on ``self.curLine``.
        """
        return "line %d"%self.curLine
93
94
class REGrammar(Grammar, FileRowAttributes):
    """A grammar that builds rowdicts from records and fields specified
    via REs separating them.

    There is also a simple facility for "cleaning up" records.  This can be
    used to remove standard shell-like comments; use
    ``recordCleaner="(?:#.*)?(.*)"``.
    """
    # NOTE(review): the class header was missing from the reviewed source;
    # the class name and the base classes are reconstructed from the
    # module's imports -- confirm against the framework.
    name_ = "reGrammar"

    # Row iterator class that does the actual splitting for this grammar.
    rowIterator = REIterator

    _til = base.IntAttribute("topIgnoredLines", default=0, description=
        "Skip this many lines at the top of each source file.",
        copyable=True)
    _stopPat = REAttribute("stopPat", default=None,
        description="Stop parsing when a record *matches* this RE (this"
        " is for skipping non-data footers)",
        copyable=True)
    _recordSep = REAttribute("recordSep", default=re.compile("\n"),
        description="RE for separating two records in the source.",
        copyable=True)
    _fieldSep = REAttribute("fieldSep", default=re.compile(r"\s+"),
        description="RE for separating two fields in a record.",
        copyable=True)
    _commentPat = REAttribute("commentPat", default=None,
        description="RE inter-record material to be ignored (note: make this"
        " match the entire comment, or you'll get random mess from partly-matched"
        " comments.  Use '(?m)^#.*$' for beginning-of-line hash-comments.)",
        copyable=True)
    # The row iterator joins the groups of the match with blanks and goes
    # on parsing the result in place of the original record.
    _recordCleaner = REAttribute("recordCleaner", default=None,
        description="A regular expression matched against each record."
        " The matched groups in this RE are joined by blanks and used"
        " as the new record.  This can be used for simple cleaning jobs;"
        " However, records not matching recordCleaner are rejected.",
        copyable=True)
    _names = base.StringListAttribute("names", description=
        "Names for the parsed fields, in matching sequence.  You can"
        r" use macros here, e.g., \\colNames{someTable}.", expand=True,
        copyable=True)
    _lax = base.BooleanAttribute("lax", description="allow more or less"
        " fields in source records than there are names", default=False,
        copyable=True)
138