1 """
2 Stream parsing of VOTables.
3
4 This module builds on a shallow wrapping of expat in utils.iterparse.
5 There is an "almost-tight" parsing loop in the parse function. It
6 builds a stanxml tree (mainly through the _processNodeDefault function).
7 """
8
9
10
11
12
13
14
15
16
17
18
19
20 from cStringIO import StringIO
21
22 from gavo import utils
23 from gavo.utils import ElementTree
24 from gavo.votable import common
25 from gavo.votable import model
26 from gavo.votable import tableparser
27
28
# Presumably the fallback for the watchset argument of the parsing
# functions (inferred from the name -- confirm against their signatures).
DEFAULT_WATCHSET = []


# Namespace URIs of VOTable versions 1.0 through 1.3.  The element and
# end-processor lookup tables in this module get one namespaced entry per
# URI and tag in addition to the bare tag name.
VOTABLE_NAMESPACES = [
    "http://www.ivoa.net/xml/VOTable/v1.0",
    "http://www.ivoa.net/xml/VOTable/v1.1",
    "http://www.ivoa.net/xml/VOTable/v1.2",
    "http://www.ivoa.net/xml/VOTable/v1.3",
]
38
39
41 """this is a sentinel element used when an element is not known
42 but robust parsing is requested.
43
44 These should not end up in a DOM, but if they do, they're silent.
45
46 They're designed to largely behave like stanxml Elements; it can't
47 autoconstruct, though.
48 """
51
54
57
60
63
66
67
# NOTE(review): the def line is missing from this listing; name and
# parameters are reconstructed from the call in the parsing loop and from
# the parallel signature of _processNodeWithContent.
def _processNodeDefault(text, child, parent):
    """the default node processor: append child to parent, return child.

    The parsing loop falls back to this processor for every tag that has
    no _end_XXXX processor, i.e., for elements that must not have text
    content.

    text -- the character data collected for the element, or None
    child -- the node that was just closed
    parent -- the node child is added to

    An AssertionError is raised if text contains anything but whitespace.
    """
    # Raise explicitly rather than through an assert statement: this check
    # guards against bad input documents and must not vanish under python -O.
    if text and text.strip():
        raise AssertionError(
            "Content '%s' in must-empty VOTable element %s"%(text, repr(child)))
    # indexing a node with another node is what adds the child
    parent[child]
    return child
75
76
def _processNodeWithContent(text, child, parent):
    """the node processor for nodes with text content.

    text -- the character data collected for the element, or None
    child -- the node that was just closed
    parent -- the node child is added to

    Text that is not just whitespace is added to child; child is then
    added to parent and returned.
    """
    if text and text.strip():
        # indexing a node with something is what adds that something
        child[text]
    parent[child]
    return child
84
85
# End processors for elements whose character data must be kept.  Binding
# a module-level name of the form _end_<TAG> is all it takes to register a
# processor for <TAG>: the end-processor table is built by scanning the
# module globals for that prefix.
_end_DESCRIPTION = _processNodeWithContent
_end_INFO = _processNodeWithContent
_end_MODEL = _processNodeWithContent
_end_URL = _processNodeWithContent
_end_LITERAL = _processNodeWithContent
_end_NAME = _processNodeWithContent


# (_end_LITERAL used to be bound a second time in this group; the
# redundant assignment has been dropped.)
_end_STREAM = _processNodeWithContent
_end_TD = _processNodeWithContent
_end_IDREF = _processNodeWithContent
98
99
103
104
# NOTE(review): the def line is missing from this listing; the name is
# taken from the CachedGetter registration following the function, and the
# body uses no parameters.
def _computeEndProcessorsImpl():
    """returns a dictionary of tag names to end processors.

    Every module-level name of the form _end_XXXX is taken to be the
    processor for the element XXXX.  Each processor is entered once under
    the bare tag name and once per item of VOTABLE_NAMESPACES under
    "<namespace>:<tag>".
    """
    res = {}
    # items() instead of the Python-2-only iteritems(); taking a list
    # snapshot means the loop does not depend on the module namespace
    # staying unchanged while we iterate.
    for name, processor in list(globals().items()):
        if not name.startswith("_end_"):
            continue
        elName = name[len("_end_"):]
        res[elName] = processor
        for ns in VOTABLE_NAMESPACES:
            res["%s:%s"%(ns, elName)] = processor
    return res
119
# Public accessor for the end-processor table; going by its name,
# utils.CachedGetter presumably computes the table on first use and keeps
# the result -- confirm in gavo.utils.
computeEndProcessors = utils.CachedGetter(_computeEndProcessorsImpl)
121
122
# NOTE(review): the def line is missing from this listing; the name is
# taken from the CachedGetter registration following the function, and the
# body uses no parameters.
def _computeElementsImpl():
    """returns a dictionary of tag names to the model.VOTable elements
    building them.

    All public attributes of model.VOTable are present once under their
    bare name and once per item of VOTABLE_NAMESPACES, the latter under
    ElementTree.QName(namespace, name) keys.
    """
    # NOTE(review): the namespaced keys here are QName objects, whereas the
    # end-processor table uses "<namespace>:<tag>" strings; confirm which
    # form the tags delivered by the parser actually take.
    res = {}
    for name in dir(model.VOTable):
        if name.startswith("_"):
            continue
        elementClass = getattr(model.VOTable, name)
        res[name] = elementClass
        for ns in VOTABLE_NAMESPACES:
            res[ElementTree.QName(ns, name)] = elementClass
    return res
136
# Public accessor for the tag-to-element table; going by its name,
# utils.CachedGetter presumably computes the table on first use and keeps
# the result -- confirm in gavo.utils.
computeElements = utils.CachedGetter(_computeElementsImpl)
138
139
# NOTE(review): the def line is missing from this listing; name and
# parameter order are reconstructed from the call in the parsing loop and
# from the names the body uses.
def _cleanAttributes(attrDict, element, raiseOnInvalid):
    """returns a sanitised version of attrDict for element.

    We force attribute keys to be byte strings (since they're being used
    as keyword arguments), and we drop everything that's namespace related
    -- it's not necessary for VOTables and people mess it up anyway.
    Dashes in attribute names are turned into underscores.

    Also, we complain about or filter out attributes that element
    cannot deal with, i.e., those for which element has no _a_<name>
    attribute: if raiseOnInvalid is true, a KeyError carrying the
    attribute name is raised for the first one encountered; otherwise such
    attributes are silently dropped.
    """
    cleaned = {}
    # items() rather than the Python-2-only iteritems()
    for key, value in attrDict.items():
        if ":" in key or key=="xmlns":
            # namespace declaration or namespaced attribute
            continue
        key = str(key.replace("-", "_"))
        if hasattr(element, "_a_"+key):
            cleaned[key] = value
        elif raiseOnInvalid:
            raise KeyError(key)
    return cleaned
162
163
# NOTE(review): the def line is missing from this listing.  Name and
# parameter order follow the call in the string-parsing wrapper at the end
# of the module; the default values are an assumption -- confirm against
# the authoritative source before relying on them.
def parse(inFile, watchset=DEFAULT_WATCHSET, raiseOnInvalid=True):
    """returns an iterator yielding items of interest.

    inFile is something that supports read(bytes).

    watchset is a sequence of element classes of VOTable you want yielded
    once they are complete.  You may want to see INFO or PARAM of certain
    protocols.  Independently of watchset, a tableparser.Rows object is
    yielded whenever a DATA element is opened.

    With raiseOnInvalid, unknown tags and unsupported attributes raise the
    parse error obtained from the event iterator.  Without it, unknown
    elements are kept out of the tree and unsupported attributes are
    dropped.

    Elements yielded because of watchset have an idmap attribute, a
    dictionary mapping the ID attribute values seen so far to their
    elements.
    """
    watchset = set(watchset)
    idmap = {}
    processors = computeEndProcessors()
    elements = computeElements()
    # the None at the bottom is what the root element receives as its parent
    elementStack = [None]
    iterator = utils.iterparse(inFile, common.VOTableParseError)
    content = []

    for evType, tag, payload in iterator:
        if evType=="data":
            content.append(payload)

        elif evType=="start":
            # Element open: pick the class building the new node...
            if tag in elements:
                element = elements[tag]()
            elif raiseOnInvalid:
                raise iterator.getParseError("Unknown tag: %s"%tag)
            else:
                element = IGNORE()

            if payload:
                try:
                    payload = _cleanAttributes(payload, element, raiseOnInvalid)
                except KeyError as msg:
                    raise iterator.getParseError("Attribute %s invalid on %s"%(
                        str(msg), element.name_))
            # ...push the node on the stack...
            elementStack.append(element(**payload))

            # ...discard character data collected before this tag...
            content = []

            # ...register the node in the id map if it has an ID...
            elId = payload.get("ID")
            if elId is not None:
                idmap[elId] = elementStack[-1]

            # ...and hand out a row source for DATA.  elementStack[-2] is
            # the element enclosing DATA (presumably the TABLE); Rows gets
            # the live event iterator, presumably to pull the row data
            # from it itself.
            if tag=="DATA":
                yield tableparser.Rows(elementStack[-2], iterator)

        elif evType=="end":
            # Element close: collect the text content...
            if content:
                text = "".join(content)
                content = []
            else:
                text = None

            # ...find the processor for the node type...
            nodeProc = processors.get(tag, _processNodeDefault)
            preChild = elementStack.pop()
            if not isinstance(preChild, IGNORE):
                # ...call it with the node and its future parent...
                child = nodeProc(text, preChild, elementStack[-1])

                # ...and yield the node if the caller asked for its class.
                if child is not None and child.__class__ in watchset:
                    child.idmap = idmap
                    yield child

        else:
            # the loop only knows data, start, and end events
            assert False
239
240
249
250
252 """returns an iterator yielding pairs of (table definition, row iterator).
253
254 string contains a VOTable literal.
255 """
256 return parse(StringIO(string), watchset, raiseOnInvalid)
257