1 """
2 Some XML hacks.
3
4 StartEndHandler simplifies the creation of SAX parsers, intended for
5 client code or non-DC XML parsing.
6
7 iterparse is an elementtree-inspired thin expat layer; both VOTable
8 and base.structure parsing builds on it.
9 """
10
11
12
13
14
15
16
17 import collections
18 import weakref
19 import xml.sax
20 from xml.parsers import expat
21 from xml.sax.handler import ContentHandler
22
23 from gavo.utils import excs
24 from gavo.utils import misctricks
25 from gavo.utils import texttricks
28 """A wrapper for an error position.
29
30 Construct it with file name, line number, and column. Use None
31 for missing or unknown values.
32 """
33 fName = None
34 - def __init__(self, fName, line, column):
35 self.line = line or '?'
36 self.col = column
37 if self.col is None:
38 self.col = '?'
39 self.fName = fName
40
42 if self.fName:
43 return "%s, (%s, %s)"%(self.fName, self.line, self.col)
44 else:
45 return "(%s, %s)"%(self.line, self.col)
46
49 """iterates over start, data, and end events in source.
50
51 To keep things simple downstream, we swallow all namespace prefixes,
52 if present.
53
54 iterparse is constructed with a source (anything with a read(size) method)
55 and optionally a custom error class. This error class needs to
56 have the message as the first argument. Since expat error messages
57 usually contain line number and column in them, no extra pos attribute
58 is supported.
59
60 Since the parser typically is far ahead of the events seen, we
61 do our own bookkeeping by storing the parser position with each
62 event. The *end* of the construct that caused an event can
63 be retrieved using pos.
64 """
65 chunkSize = 2**20
66 "The number of bytes handed to expat from iterparse at one go."
67
69 self.source = source
70 self.parseErrorClass = parseErrorClass
71
72 if hasattr(source, "name"):
73 self.inputName = source.name
74 elif hasattr(source, "getvalue"):
75 self.inputName = "[%s]"%(
76 texttricks.makeEllipsis(repr(source.getvalue())[1:-1], 30))
77 else:
78 self.inputName = repr(source)[:34]
79
80 self.parser = expat.ParserCreate()
81 self.parser.buffer_text = True
82 self.lastLine, self.lastColumn = 1, 0
83
84
85 self.parser.returns_unicode = True
86 self.evBuf = collections.deque()
87 self.parser.StartElementHandler = self._startElement
88 self.parser.EndElementHandler = self._endElement
89 self.parser.CharacterDataHandler = self._characters
90
93
95 self.evBuf.append(
96 (("start", name.split(":")[-1], attrs),
97 (self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber)))
98
100 self.evBuf.append((("end", name.split(":")[-1], None),
101 (self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber)))
102
104 self.evBuf.append((("data", None, data), None))
105
def pushBack(self, type, name, payload):
    """Puts an event back at the front of the event buffer.

    The event is stored as a (type, name, payload) triple without
    position information (None is used in place of a line/column pair).
    """
    event = (type, name, payload)
    # No parser position belongs to a pushed-back event.
    self.evBuf.appendleft((event, None))
108
110 while not self.evBuf:
111 try:
112 nextChunk = self.source.read(self.chunkSize)
113 if nextChunk:
114 self.parser.Parse(nextChunk)
115 else:
116 self.close()
117 break
118 except expat.ExpatError as ex:
119 srcDesc = getattr(self.source, "name", "(internal source)")
120 newEx = self.parseErrorClass(srcDesc+" "+str(ex))
121 newEx.posInMsg = True
122 newEx.inFile = srcDesc
123 raise misctricks.logOldExc(newEx)
124
125 if not self.evBuf:
126 raise StopIteration("End of Input")
127 event, pos = self.evBuf.popleft()
128 if pos is not None:
129 self.lastLine, self.lastColumn = pos
130 return event
131
133 self.parser.Parse("", True)
134 self.parser.StartElementHandler =\
135 self.parser.EndElementHandler = \
136 self.parser.CharacterDataHandler = None
137
138 @property
140 return ErrorPosition(self.inputName, self.lastLine, self.lastColumn)
141
143 res = self.parseErrorClass("At %s: %s"%(self.pos, msg))
144 res.posInMsg = True
145 return res
146
149 """This class provides startElement, endElement and characters
150 methods that translate events into method calls.
151
152 When an opening tag is seen, we look for a _start_<element name>
153 method and, if present, call it with the name and the attributes.
154 When a closing tag is seen, we try to call _end_<element name> with
155 name, attributes and contents. If the _end_xxx method returns a
156 string (or similar), this value will be added to the content of the
157 enclosing element.
158
159 Rather than overriding __init__, you probably want to override
160 the _initialize() method to create the data structures you want
161 to fill from XML.
162
163 StartEndHandlers clean element names from namespace prefixes, and
164 they ignore them in every other way. If you need namespaces, use
165 a different interface.
166 """
168 ContentHandler.__init__(self)
169 self.realHandler = weakref.proxy(self)
170 self.elementStack = []
171 self.contentsStack = [[]]
172 self._initialize()
173
176
179
182
184 newAttrs = {}
185 for ns, name in attrs.keys():
186 if ns is None:
187 newAttrs[name] = attrs[(ns, name)]
188 else:
189 newAttrs["{%s}%s"%(ns, name)] = attrs[(ns, name)]
190 self.startElement(namePair[1], newAttrs)
191
193 self.contentsStack.append([])
194 name = self.cleanupName(name)
195 self.elementStack.append((name, attrs))
196 if hasattr(self.realHandler, "_start_%s"%name):
197 getattr(self.realHandler, "_start_%s"%name)(name, attrs)
198 elif hasattr(self, "_defaultStart"):
199 self._defaultStart(name, attrs)
200
203
205 contents = "".join(self.contentsStack.pop())
206 name = self.cleanupName(name)
207 _, attrs = self.elementStack.pop()
208 res = None
209 if hasattr(self.realHandler, "_end_%s"%name):
210 res = getattr(self.realHandler,
211 "_end_%s"%name)(name, attrs, contents)
212 elif hasattr(self, "_defaultEnd"):
213 res = self._defaultEnd(name, attrs, contents)
214 if isinstance(res, basestring) and not suppress:
215 self.contentsStack[-1].append(res)
216
218 self.contentsStack[-1].append(chars)
219
221 return self.contentsStack[0][0]
222
224 """Returns the name of the parent element.
225
226 This only works as written here in end handlers. In start handlers,
227 you have to pass depth=2 (since their tag is already on the stack).
228 """
229 if self.elementStack:
230 return self.elementStack[-depth][0]
231
def parse(self, stream):
    """Runs xml.sax.parse over stream with self as the content handler.

    Returns self.
    """
    xml.sax.parse(stream, handler=self)
    return self
235
239
241 """returns attrs as received from SAX as a dictionary.
242
243 The main selling point is that any namespace prefixes are removed from
244 the attribute names. Any prefixes on attrs remain, though.
245 """
246 return dict((k.split(":")[-1], v) for k, v in attrs.items())
247
249 self.locator = locator
250
253 """iterates the elements of an elementTree in postorder.
254 """
255 for child in eTree:
256 for gc in traverseETree(child):
257 yield gc
258 yield eTree
259