1 """
2 Stream parsing of VOTables.
3
4 This module builds on a shallow wrapping of expat in utils.iterparse.
There is an "almost-tight" parsing loop in the parse function.  It
builds a stanxml tree (mainly through the _processNodeDefault function).
7 """
8
9
10
11
12
13
14
15
16
17
18
19
20 from cStringIO import StringIO
21
22 from gavo import utils
23 from gavo.utils import ElementTree
24 from gavo.votable import common
25 from gavo.votable import model
26 from gavo.votable import tableparser
27
28
# Watch set naming no element classes at all.
# NOTE(review): presumably the default for parse's watchset argument; the
# signature is not visible here -- confirm.  parse only ever reads the
# watchset (it copies it into a set), so sharing this list is harmless as
# long as nobody mutates it.
DEFAULT_WATCHSET = []


# Namespace URIs of the VOTable versions recognised.  The tag lookup
# tables of this module get one entry per namespace and element name, plus
# one entry without a namespace.
VOTABLE_NAMESPACES = [
    "http://www.ivoa.net/xml/VOTable/v1.0",
    "http://www.ivoa.net/xml/VOTable/v1.1",
    "http://www.ivoa.net/xml/VOTable/v1.2",
    "http://www.ivoa.net/xml/VOTable/v1.3",
]
38
39
41 """this is a sentinel element used when an element is not known
42 but robust parsing is requested.
43
44 These should not end up in a DOM, but if they do, they're silent.
45
They're designed to largely behave like stanxml Elements; they can't
autoconstruct, though.
48 """
51
54
57
60
63
66
67
69 """the default node processor: Append child to parent, return child.
70 """
71 assert not (text and text.strip())
72 parent[child]
73 return child
74
75
76 -def _processNodeWithContent(text, child, parent):
77 """the node processor for nodes with text content.
78 """
79 if text and text.strip():
80 child[text]
81 parent[child]
82 return child
83
84
# End processors for elements with text content.  Every module-level name
# starting with _end_ is collected into the tag -> end processor table;
# the rest of the name is the tag the processor handles.  Tags without an
# entry here are handled by _processNodeDefault.
_end_DESCRIPTION = _processNodeWithContent
_end_INFO = _processNodeWithContent


_end_STREAM = _processNodeWithContent
_end_TD = _processNodeWithContent
91
92
96
97
99 """returns a dictionary of tag names to end processors.
100
Each processor defined as _end_XXXX gets one entry for each namespace
we're likely to encounter, plus one non-namespaced entry.
103 """
104 res, globs = {}, globals()
105 for n, v in globs.iteritems():
106 if n.startswith("_end_"):
107 elName = n[5:]
108 res[elName] = v
109 for ns in VOTABLE_NAMESPACES:
110 res["%s:%s"%(ns, elName)] = v
111 return res
112
# Accessor for the tag -> end processor table; parse calls it without
# arguments.
# NOTE(review): utils.CachedGetter presumably runs
# _computeEndProcessorsImpl on the first call only and hands out the same
# dictionary afterwards -- it is defined outside this file; confirm.
computeEndProcessors = utils.CachedGetter(_computeEndProcessorsImpl)
114
115
"""returns a dictionary of tag names to stanxml elements building them.
118
119 All elements are present for each VOTABLE_NAMESPACE, plus once non-namespaced.
120 """
121 res = {}
122 for n in dir(model.VOTable):
123 if not n.startswith("_"):
124 val = getattr(model.VOTable, n)
125 res[n] = val
126 for ns in VOTABLE_NAMESPACES:
127 res[ElementTree.QName(ns, n)] = val
128 return res
129
# Accessor for the tag -> element class table; parse calls it without
# arguments.
# NOTE(review): utils.CachedGetter presumably runs _computeElementsImpl on
# the first call only and hands out the same dictionary afterwards -- it
# is defined outside this file; confirm.
computeElements = utils.CachedGetter(_computeElementsImpl)
131
132
134 """returns a sanitised version of attDict for element.
135
136 We force attribute keys to be byte strings (since they're being used
137 as keyword arguments), and we drop everything that's namespace related
138 -- it's not necessary for VOTables and people mess it up anyway.
139
140 Also, we complain about or filter out attributes that element
141 cannot deal with.
142 """
143 cleaned = {}
144 for key, value in attrDict.iteritems():
145 if ":" in key or key=="xmlns":
146 continue
147 key = str(key.replace("-", "_"))
148 if not hasattr(element, "_a_"+key):
149 if raiseOnInvalid:
150 raise KeyError(key)
151 else:
152 continue
153 cleaned[key] = value
154 return cleaned
155
156
158 """returns an iterator yielding items of interest.
159
inFile is something that supports read(bytes)
161
162 watchset is a sequence of items of VOTable you want yielded. By
163 default, that's just VOTable.TABLE. You may want to see INFO
164 or PARAM of certain protocols.
165 """
166
167
168 watchset = set(watchset)
169 idmap = {}
170 processors = computeEndProcessors()
171 elements = computeElements()
172 elementStack = [None]
173 iterator = utils.iterparse(inFile, common.VOTableParseError)
174 content = []
175
176 for type, tag, payload in iterator:
177 if type=="data":
178 content.append(payload)
179
180 elif type=="start":
181
182 if tag not in elements:
183 if raiseOnInvalid:
184 raise iterator.getParseError("Unknown tag: %s"%tag)
185 else:
186 element = IGNORE()
187 else:
188 element = elements[tag]()
189
190 if payload:
191 try:
192 payload = _cleanAttributes(payload, element, raiseOnInvalid)
193 except KeyError, msg:
194 raise iterator.getParseError("Attribute %s invalid on %s"%(
195 str(msg), element.name_))
196 elementStack.append(element(**payload))
197
198
199 content = []
200
201
202 elId = payload.get("ID")
203 if elId is not None:
204 idmap[elId] = elementStack[-1]
205
206
207 if tag=="DATA":
208 yield tableparser.Rows(elementStack[-2], iterator)
209
210 elif type=="end":
211
212 if content:
213 text = "".join(content)
214 content = []
215 else:
216 text = None
217
218
219 nodeProc = processors.get(tag, _processNodeDefault)
220 preChild = elementStack.pop()
221 if not isinstance(preChild, IGNORE):
222
223 child = nodeProc(text, preChild, elementStack[-1])
224
225
226 if child is not None and child.__class__ in watchset:
227 child.idmap = idmap
228 yield child
229
230 else:
231 assert False
232
233
242
243
245 """returns an iterator yielding pairs of (table definition, row iterator).
246
247 string contains a VOTable literal.
248 """
249 return parse(StringIO(string), watchset, raiseOnInvalid)
250