gavo.grammars.regrammar

1 """ 2 A grammar splitting the input file into lines and lines into records 3 using REs. 4 """ 5 6 #c Copyright 2008-2019, the GAVO project 7 #c 8 #c This program is free software, covered by the GNU GPL. See the 9 #c COPYING file in the source distribution. 10 11 12 import re 13 14 from gavo import base 15 from gavo.grammars.common import ( 16 Grammar, FileRowIterator, FileRowAttributes, REAttribute) 17 18

class REIterator(FileRowIterator):
    """A row iterator that cuts its input into records and fields using
    the regular expressions configured on its grammar.

    Input is pulled from ``self.inputFile`` in chunks of ``chunkSize``
    characters.  ``self.curLine`` is advanced while separators and records
    are consumed; it is what ``getLocator`` reports.
    """
    chunkSize = 8192

    def _iterInRecords(self):
        """Yield the whitespace-stripped raw records of the input file.

        The first ``topIgnoredLines`` lines are read and discarded.  After
        that, the input is split at matches of the grammar's ``recordSep``.
        If the grammar has a ``commentPat``, its matches are removed from
        the buffer every time a new chunk has been appended.  Iteration
        ends silently as soon as ``stopPat`` matches at the start of a
        record.
        """
        grammar = self.grammar
        for _ in range(grammar.topIgnoredLines):
            self.inputFile.readline()
            self.curLine += 1

        splitPat = grammar.recordSep
        commentPat = grammar.commentPat
        stopPat = grammar.stopPat

        buffer, curPos = "", 0
        while True:
            mat = splitPat.search(buffer, curPos)
            if not mat:
                # No separator in what we have buffered: fetch more input.
                newStuff = self.inputFile.read(self.chunkSize)
                if not newStuff:
                    # File exhausted; whatever is left is handled below.
                    break
                # Drop the part already handed out, then append the chunk.
                buffer = buffer[curPos:] + newStuff
                curPos = 0
                if commentPat:
                    # NOTE(review): this only sees what has been read so
                    # far, so a comment cut by a chunk boundary can only
                    # be matched partially -- confirm this is acceptable.
                    buffer = commentPat.sub("", buffer)
                continue

            # Newlines in the separator are counted before the record is
            # handed out, newlines within the record only afterwards.
            self.curLine += mat.group().count("\n")
            res = buffer[curPos:mat.start()]
            if stopPat and stopPat.match(res):
                return

            yield res.strip()
            curPos = mat.end()
            self.curLine += res.count("\n")

        # Hand out material after the last separator, if there is any.
        res = buffer[curPos:].strip()
        if res and not (stopPat and stopPat.match(res)):
            yield res

    def _iterRows(self):
        """Yield one rowdict per record, skipping records for which
        ``_makeRec`` raises ``base.SkipThis``.

        The input file is closed when iteration ends for whatever reason
        (exhaustion, a parse error, or the consumer closing the generator).
        The reference to the grammar is dropped only after the input has
        been processed completely.
        """
        try:
            for rawRec in self._iterInRecords():
                try:
                    res = self._makeRec(rawRec)
                except base.SkipThis:
                    continue
                yield res
        finally:
            # Also runs on errors and on generator close, so the file
            # handle is not leaked when parsing stops early.
            self.inputFile.close()
        self.grammar = None

    def _makeRec(self, inputLine):
        """Return a dict mapping the grammar's names to the fields of
        inputLine.

        If the grammar has a ``recordCleaner``, inputLine must match it and
        is replaced by the blank-joined groups of that match.  The result
        is split at matches of ``fieldSep``.

        Raises ``base.SourceParseError`` if the cleaner does not match or,
        unless the grammar is lax, if the number of fields differs from the
        number of names.  Raises ``base.SkipThis`` for records that are
        empty or consist of whitespace only.
        """
        grammar = self.grammar

        if grammar.recordCleaner:
            cleanMat = grammar.recordCleaner.match(inputLine)
            if not cleanMat:
                raise base.SourceParseError(
                    "'%s' does not match cleaner"%inputLine,
                    source=str(self.sourceToken),
                    location=self.getLocator())
            inputLine = " ".join(cleanMat.groups())

        if not inputLine.strip():
            raise base.SkipThis("Empty line")

        fields = grammar.fieldSep.split(inputLine)
        if not grammar.lax and len(fields) != len(grammar.names):
            raise base.SourceParseError("%d fields found, expected %d"%(
                len(fields), len(grammar.names)),
                source=self.sourceToken,
                location=self.getLocator(),
                hint="reGrammars need the same number of input fields in each line,"
                " and that number has to match the number of tokens in the names"
                " attribute. If that's not true for your input but it still"
                " makes sense, add lax='True' to your grammar.")
        # In lax mode, zip silently drops surplus names or surplus fields.
        return dict(zip(grammar.names, fields))

    def getLocator(self):
        """Return a human-readable position built from ``self.curLine``."""
        return "line %d"%self.curLine

93 94

class REGrammar(Grammar, FileRowAttributes):
    """A grammar that builds rowdicts from records and fields specified
    via REs separating them.

    There is also a simple facility for "cleaning up" records. This can be
    used to remove standard shell-like comments; use
    ``recordCleaner="(?:#.*)?(.*)"``.
    """
    name_ = "reGrammar"

    # Row iterator class that goes with this grammar.
    rowIterator = REIterator

    # Number of leading lines the row iterator discards before splitting.
    _til = base.IntAttribute("topIgnoredLines", default=0, description=
        "Skip this many lines at the top of each source file.",
        copyable=True)
    # Matched (anchored at the start) against each raw record; a match
    # ends parsing.
    _stopPat = REAttribute("stopPat", default=None,
        description="Stop parsing when a record *matches* this RE (this"
        " is for skipping non-data footers).",
        copyable=True)
    # Separator between records; by default, each line is a record.
    _recordSep = REAttribute("recordSep", default=re.compile("\n"),
        description="RE for separating two records in the source.",
        copyable=True)
    # Separator between the fields within one record.
    _fieldSep = REAttribute("fieldSep", default=re.compile(r"\s+"),
        description="RE for separating two fields in a record.",
        copyable=True)
    # Matches of this are removed from the input before it is split.
    _commentPat = REAttribute("commentPat", default=None,
        description="RE inter-record material to be ignored (note: make this"
        " match the entire comment, or you'll get random mess from partly-matched"
        " comments). Use '(?m)^#.*$' for beginning-of-line hash-comments.",
        copyable=True)
    # The blank-joined groups of the match replace the record before it
    # is split into fields.
    _recordCleaner = REAttribute("recordCleaner", default=None,
        description="A regular expression matched against each record."
        " The matched groups in this RE are joined by blanks and used"
        " as the new record. This can be used for simple cleaning jobs;"
        " However, records not matching recordCleaner are rejected.",
        copyable=True)
    # Keys of the generated rowdicts, in field order.
    _names = base.StringListAttribute("names", description=
        "Names for the parsed fields, in matching sequence. You can"
        r" use macros here, e.g., \\colNames{someTable}.", expand=True,
        copyable=True)
    # When true, the field count is not checked against the names.
    _lax = base.BooleanAttribute("lax", description="allow more or less"
        " fields in source records than there are names", default=False,
        copyable=True)

138

Source Code for Module gavo.grammars.regrammar