Package gavo :: Package utils :: Module plainxml
[frames | no frames]

Source Code for Module gavo.utils.plainxml

  1  """ 
  2  Some XML hacks. 
  3   
  4  StartEndHandler simplifies the creation of SAX parsers, intended for  
  5  client code or non-DC XML parsing. 
  6   
  7  iterparse is an elementtree-inspired thin expat layer; both VOTable 
  8  and base.structure parsing build on it. 
  9  """ 
 10   
 11  #c Copyright 2008-2019, the GAVO project 
 12  #c 
 13  #c This program is free software, covered by the GNU GPL.  See the 
 14  #c COPYING file in the source distribution. 
 15   
 16   
 17  import collections 
 18  import weakref 
 19  import xml.sax 
 20  from xml.parsers import expat 
 21  from xml.sax.handler import ContentHandler 
 22   
 23  from gavo.utils import excs 
 24  from gavo.utils import misctricks 
 25  from gavo.utils import texttricks 
class ErrorPosition(object):
    """A printable location (file name, line, column) for error messages.

    Construct it with file name, line number, and column.  Pass None for
    anything missing or unknown; such values are rendered as '?'.
    """
    fName = None

    def __init__(self, fName, line, column):
        self.fName = fName
        # Any false line value (None, 0, '') is shown as unknown ...
        self.line = line if line else '?'
        # ... whereas column 0 is legitimate, so only None is replaced.
        self.col = '?' if column is None else column

    def __str__(self):
        # Without a usable file name, only the (line, column) pair is shown.
        if not self.fName:
            return "(%s, %s)"%(self.line, self.col)
        return "%s, (%s, %s)"%(self.fName, self.line, self.col)
46
class iterparse(object):
    """iterates over start, data, and end events in source.

    Events are triples: ("start", name, attrs), ("end", name, None), and
    ("data", None, text).

    To keep things simple downstream, we swallow all namespace prefixes,
    if present.

    iterparse is constructed with a source (anything that can read(source))
    and optionally a custom error class.  This error class needs to
    have the message as the first argument.  Since expat error messages
    usually contain line number and column in them, no extra pos attribute
    is supported.

    Since the parser typically is far ahead of the events seen, we
    do our own bookkeeping by storing the parser position with each
    event.  The *end* of the construct that caused an event can
    be retrieved using pos.

    Once the input is exhausted (or close() has been called), every
    further next() call raises StopIteration; close() may be called any
    number of times.
    """
    chunkSize = 2**20
    "The number of bytes handed to expat from iterparse at one go."

    # Class-level default so that instances of subclasses that do not run
    # this __init__ still have the flag.
    _closed = False

    def __init__(self, source, parseErrorClass=excs.StructureError):
        self.source = source
        self.parseErrorClass = parseErrorClass

        # Work out a human-readable label for the input, used by pos.
        if hasattr(source, "name"):
            self.inputName = source.name
        elif hasattr(source, "getvalue"):
            self.inputName = "[%s]"%(
                texttricks.makeEllipsis(repr(source.getvalue())[1:-1], 30))
        else:
            self.inputName = repr(source)[:34]

        self.parser = expat.ParserCreate()
        self.parser.buffer_text = True
        self.lastLine, self.lastColumn = 1, 0
        # Python 2 pyexpat switch: hand unicode strings (rather than
        # UTF-8 encoded byte strings) to the handlers.
        self.parser.returns_unicode = True
        # Events produced by expat but not yet handed out, stored as
        # (event, position) pairs; position is None for data events and
        # for pushed-back events.
        self.evBuf = collections.deque()
        self.parser.StartElementHandler = self._startElement
        self.parser.EndElementHandler = self._endElement
        self.parser.CharacterDataHandler = self._characters

    def __iter__(self):
        return self

    def _startElement(self, name, attrs):
        """queues a start event for name (prefix stripped) with attrs.
        """
        self.evBuf.append(
            (("start", name.split(":")[-1], attrs),
            (self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber)))

    def _endElement(self, name):
        """queues an end event for name (prefix stripped).
        """
        self.evBuf.append((("end", name.split(":")[-1], None),
            (self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber)))

    def _characters(self, data):
        """queues a data event; no position is recorded for these.
        """
        self.evBuf.append((("data", None, data), None))

    def pushBack(self, type, name, payload):
        """re-queues an event so it is the next one returned by next().
        """
        self.evBuf.appendleft(((type, name, payload), None))

    def next(self):
        """returns the next (type, name, payload) event.

        Input is fed to expat in chunks of chunkSize until at least one
        event is available.  Raises StopIteration when the input is
        exhausted and parseErrorClass when expat reports an error.
        """
        # Never read or parse again once the parser has been finalised;
        # expat would otherwise complain about a finished parser and we
        # would report a bogus parse error instead of StopIteration.
        while not self.evBuf and not self._closed:
            try:
                nextChunk = self.source.read(self.chunkSize)
                if nextChunk:
                    self.parser.Parse(nextChunk)
                else:
                    self.close()
            except expat.ExpatError as ex:
                srcDesc = getattr(self.source, "name", "(internal source)")
                newEx = self.parseErrorClass(srcDesc+" "+str(ex))
                newEx.posInMsg = True  # see base.xmlstruct
                newEx.inFile = srcDesc
                raise misctricks.logOldExc(newEx)

        if not self.evBuf:
            raise StopIteration("End of Input")
        event, pos = self.evBuf.popleft()
        if pos is not None:
            self.lastLine, self.lastColumn = pos
        return event

    # Alias so the class also satisfies the Python 3 iterator protocol.
    __next__ = next

    def close(self):
        """finalises the parse and detaches our callbacks from the parser.

        Calling close() again is a no-op.  Events expat emits while
        finishing up are still queued.
        """
        if self._closed:
            return
        self._closed = True
        try:
            self.parser.Parse("", True)
        finally:
            # The handlers are bound methods of self; drop them even if
            # the final parse failed so parser and self do not keep each
            # other alive.
            self.parser.StartElementHandler =\
                self.parser.EndElementHandler = \
                self.parser.CharacterDataHandler = None

    @property
    def pos(self):
        """an ErrorPosition for the last positioned event handed out.
        """
        return ErrorPosition(self.inputName, self.lastLine, self.lastColumn)

    def getParseError(self, msg):
        """returns (without raising) a parseErrorClass instance for msg,
        prefixed with the current position.
        """
        res = self.parseErrorClass("At %s: %s"%(self.pos, msg))
        res.posInMsg = True  # see base.xmlstruct
        return res
146
class StartEndHandler(ContentHandler):
    """This class provides startElement, endElement and characters
    methods that translate events into method calls.

    When an opening tag is seen, we look for a _start_<element name>
    method and, if present, call it with the name and the attributes.
    When a closing tag is seen, we try to call _end_<element name> with
    name, attributes and contents.  If the _end_xxx method returns a
    string (or similar), this value will be added to the content of the
    enclosing element.

    If no specific method exists, _defaultStart(name, attrs) and
    _defaultEnd(name, attrs, contents) are used when defined.

    Rather than overriding __init__, you probably want to override
    the _initialize() method to create the data structures you want
    to fill from XML.

    StartEndHandlers clean element names from namespace prefixes, and
    they ignore them in every other way.  If you need namespaces, use
    a different interface.
    """
    # Types of _end_ handler results that are folded into the enclosing
    # element's content: basestring where it exists (Python 2), str
    # otherwise.
    try:
        _stringTypes = (basestring,)
    except NameError:
        _stringTypes = (str,)

    def __init__(self):
        ContentHandler.__init__(self)
        # _start_/_end_ methods are looked up on realHandler, a weak
        # proxy to this object.
        self.realHandler = weakref.proxy(self)
        # (name, attrs) pairs of the elements currently open.
        self.elementStack = []
        # One list of content fragments per open element; the bottom
        # list collects what the root element's end handler returns.
        self.contentsStack = [[]]
        self._initialize()

    def _initialize(self):
        """hook for subclasses, called at the end of __init__.
        """
        pass

    def processingInstruction(self, target, data):
        """adds the data of a processing instruction to the current
        element's content; target is ignored.
        """
        self.contentsStack[-1].append(data)

    def cleanupName(self, name):
        """returns name without namespace prefix and with dashes replaced
        by underscores, so it can be part of a method name.
        """
        return name.split(":")[-1].replace("-", "_")

    def startElementNS(self, namePair, qName, attrs):
        """forwards a namespace-aware start event to startElement.

        Attributes without a namespace keep their plain name, all others
        are keyed as {namespace}name.
        """
        newAttrs = {}
        for (ns, name), value in attrs.items():
            if ns is None:
                newAttrs[name] = value
            else:
                newAttrs["{%s}%s"%(ns, name)] = value
        self.startElement(namePair[1], newAttrs)

    def startElement(self, name, attrs):
        """opens a new content frame and dispatches to _start_<name> or,
        failing that, _defaultStart.
        """
        self.contentsStack.append([])
        name = self.cleanupName(name)
        self.elementStack.append((name, attrs))
        if hasattr(self.realHandler, "_start_%s"%name):
            getattr(self.realHandler, "_start_%s"%name)(name, attrs)
        elif hasattr(self, "_defaultStart"):
            self._defaultStart(name, attrs)

    def endElementNS(self, namePair, qName):
        """forwards a namespace-aware end event to endElement.
        """
        self.endElement(namePair[1])

    def endElement(self, name, suppress=False):
        """closes the current element and dispatches to _end_<name> or,
        failing that, _defaultEnd.

        A string returned by the handler is appended to the content of
        the enclosing element unless suppress is true.
        """
        contents = "".join(self.contentsStack.pop())
        name = self.cleanupName(name)
        _, attrs = self.elementStack.pop()
        res = None
        if hasattr(self.realHandler, "_end_%s"%name):
            res = getattr(self.realHandler,
                "_end_%s"%name)(name, attrs, contents)
        elif hasattr(self, "_defaultEnd"):
            res = self._defaultEnd(name, attrs, contents)
        if isinstance(res, self._stringTypes) and not suppress:
            self.contentsStack[-1].append(res)

    def characters(self, chars):
        """adds character data to the current element's content.
        """
        self.contentsStack[-1].append(chars)

    def getResult(self):
        """returns the first fragment collected at document level, i.e.,
        normally what the root element's end handler returned.
        """
        return self.contentsStack[0][0]

    def getParentTag(self, depth=1):
        """Returns the name of the parent element.

        This only works as written here in end handlers.  In start handlers,
        you have to pass depth=2 (since their tag already is on the stack).

        None is returned when there is no element at that depth, e.g.,
        when asking for the parent of the root element.
        """
        try:
            return self.elementStack[-depth][0]
        except IndexError:
            # Stack empty or shallower than depth: no such ancestor.
            return None

    def parse(self, stream):
        """parses XML from stream with this handler and returns self.
        """
        xml.sax.parse(stream, self)
        return self

    def parseString(self, string):
        """parses XML from string with this handler and returns self.
        """
        xml.sax.parseString(string, self)
        return self

    def getAttrsAsDict(self, attrs):
        """returns attrs as received from SAX as a dictionary.

        The main selling point is that any namespace prefixes are removed
        from the attribute names.  Attribute values are passed through
        untouched.  If two attributes only differ in their prefix, just
        one of them ends up in the result.
        """
        return dict((k.split(":")[-1], v) for k, v in attrs.items())

    def setDocumentLocator(self, locator):
        """keeps the locator passed in by the SAX parser in self.locator.
        """
        self.locator = locator
250
def traverseETree(eTree):
    """iterates the elements of an elementTree in postorder.

    Every element is yielded after all of its descendants; eTree itself
    comes last.
    """
    # Each entry pairs a node with the iterator over its not-yet-visited
    # children.
    pending = [(eTree, iter(eTree))]
    while pending:
        node, remaining = pending[-1]
        for child in remaining:
            # Descend into the next child before finishing this node.
            pending.append((child, iter(child)))
            break
        else:
            # No children left: the node itself is due now.
            pending.pop()
            yield node
259