Package gavo :: Package votable :: Module parser
[frames] | no frames]

Source Code for Module gavo.votable.parser

  1  """ 
  2  Stream parsing of VOTables. 
  3   
  4  This module builds on a shallow wrapping of expat in utils.iterparse. 
  5  There is an "almost-tight" parsing loop in the parse method.  It 
  6  builds an xmlstan tree (mainly through the _processNodeDefault method). 
  7  """ 
  8   
  9  #c Copyright 2008-2016, the GAVO project 
 10  #c 
 11  #c This program is free software, covered by the GNU GPL.  See the 
 12  #c COPYING file in the source distribution. 
 13   
 14   
 15  # To fiddle with the nodes as they are generated, define an 
 16  # _end_ELEMENTNAME method.  If you do this, you will have to do 
 17  # any adding of children to parents yourself (it happens in  
 18  # _processNodeDefault, which is called when no custom handler is 
 19  # present). 
 20  from cStringIO import StringIO 
 21   
 22  from gavo import utils 
 23  from gavo.utils import ElementTree 
 24  from gavo.votable import common 
 25  from gavo.votable import model 
 26  from gavo.votable import tableparser 
 27   
 28   
 29  DEFAULT_WATCHSET = [] 
 30   
 31  # We treat all VOTable versions as equal. 
 32  VOTABLE_NAMESPACES = [ 
 33          "http://www.ivoa.net/xml/VOTable/v1.0", 
 34          "http://www.ivoa.net/xml/VOTable/v1.1", 
 35          "http://www.ivoa.net/xml/VOTable/v1.2", 
 36          "http://www.ivoa.net/xml/VOTable/v1.3", 
 37  ] 
 38   
 39   
class IGNORE(object):
    """A sentinel standing in for elements the parser does not know.

    parse creates one of these for an unknown tag when robust parsing
    (raiseOnInvalid false) is requested.

    Instances should not end up in a DOM, but if they do, they stay
    silent: they report themselves as empty and to be skipped.

    The interface imitates stanxml Elements as far as the parser needs
    it; there is no autoconstruction, though.
    """

    def __init__(self):
        # nothing to set up; a sentinel carries no state.
        pass

    def __call__(self, **kwargs):
        # stanxml elements take attributes via a call; here they are
        # simply discarded.
        return self

    def __getitem__(self, item):
        # children are dropped; note that this returns None.
        return None

    def isEmpty(self):
        return True

    def shouldBeSkipped(self):
        return True

    def apply(self, func):
        # func is never called.
        return None
66 67
68 -def _processNodeDefault(text, child, parent):
69 """the default node processor: Append child to parent, return child. 70 """ 71 assert not (text and text.strip()) 72 parent[child] 73 return child
74 75
76 -def _processNodeWithContent(text, child, parent):
77 """the node processor for nodes with text content. 78 """ 79 if text and text.strip(): 80 child[text] # Attention: mixed content not supported 81 parent[child] 82 return child
83 84 85 _end_DESCRIPTION = _processNodeWithContent 86 _end_INFO = _processNodeWithContent 87 # STREAMs and TABLEDATA should ordinarily be processed by the table 88 # iterator, so this really is only interesting for special applications: 89 _end_STREAM = _processNodeWithContent 90 _end_TD = _processNodeWithContent 91 92
93 -def _end_VOTABLE(text, child, parent):
94 # VOTABLEs have no useful parents. 95 return child
96 97
def _computeEndProcessorsImpl():
    """returns a dictionary mapping tag names to end processors.

    Every module-level name of the form _end_XXXX contributes one entry
    under the bare element name XXXX and one entry "<namespace>:XXXX" for
    each item of VOTABLE_NAMESPACES.

    NOTE(review): the namespaced keys here are plain "ns:tag" strings,
    whereas _computeElementsImpl uses ElementTree.QName objects -- confirm
    which form utils.iterparse actually delivers.
    """
    prefix = "_end_"
    handlers = {}
    # work on a snapshot of the module namespace.
    for name, handler in list(globals().items()):
        if not name.startswith(prefix):
            continue
        elName = name[len(prefix):]
        handlers[elName] = handler
        handlers.update(
            ("%s:%s"%(ns, elName), handler) for ns in VOTABLE_NAMESPACES)
    return handlers


# The table is wrapped in a utils.CachedGetter; parse calls this with
# no arguments to obtain it.
computeEndProcessors = utils.CachedGetter(_computeEndProcessorsImpl)
def _computeElementsImpl():
    """returns a dictionary mapping tag names to the xmlstan elements
    building them.

    Every attribute of model.VOTable whose name does not start with an
    underscore is entered once under its plain name and once under
    ElementTree.QName(ns, name) for each ns in VOTABLE_NAMESPACES.
    """
    publicNames = [name for name in dir(model.VOTable)
        if not name.startswith("_")]

    byTag = {}
    for name in publicNames:
        factory = getattr(model.VOTable, name)
        byTag[name] = factory
        for ns in VOTABLE_NAMESPACES:
            byTag[ElementTree.QName(ns, name)] = factory
    return byTag


# The table is wrapped in a utils.CachedGetter; parse calls this with
# no arguments to obtain it.
computeElements = utils.CachedGetter(_computeElementsImpl)
133 -def _cleanAttributes(attrDict, element, raiseOnInvalid):
134 """returns a sanitised version of attDict for element. 135 136 We force attribute keys to be byte strings (since they're being used 137 as keyword arguments), and we drop everything that's namespace related 138 -- it's not necessary for VOTables and people mess it up anyway. 139 140 Also, we complain about or filter out attributes that element 141 cannot deal with. 142 """ 143 cleaned = {} 144 for key, value in attrDict.iteritems(): 145 if ":" in key or key=="xmlns": 146 continue 147 key = str(key.replace("-", "_")) 148 if not hasattr(element, "_a_"+key): 149 if raiseOnInvalid: 150 raise KeyError(key) 151 else: 152 continue 153 cleaned[key] = value 154 return cleaned
155 156
def parse(inFile, watchset=DEFAULT_WATCHSET, raiseOnInvalid=True):
    """returns an iterator yielding items of interest.

    inFile is something that supports read(bytes); it is handed to
    utils.iterparse.

    watchset is a sequence of VOTable element classes you want yielded;
    elements whose class is in it are yielded when they are closed, with
    the id map built so far attached in their idmap attribute.  The
    default is DEFAULT_WATCHSET, which is empty in this module.  You may
    want to see INFO or PARAM of certain protocols.

    Regardless of watchset, a tableparser.Rows instance is yielded
    whenever a DATA element is opened.

    With raiseOnInvalid true, unknown tags and attributes raise a parse
    error obtained from the iterator.  Otherwise, unknown elements are
    replaced by IGNORE instances and unknown attributes are dropped.
    """
    # This parser has gotten a bit too fat.  Maybe move the whole thing
    # to a class?  All this isn't terribly critical to performance...
    watched = set(watchset)
    idmap = {}
    endHandlers = computeEndProcessors()
    constructors = computeElements()
    stack = [None]  # None is VOTABLE's parent
    events = utils.iterparse(inFile, common.VOTableParseError)
    textParts = []

    for evType, tag, payload in events:
        if evType=="data":
            textParts.append(payload)
            continue

        if evType=="start":
            # Element open: work out what to build...
            if tag in constructors:
                node = constructors[tag]()
            elif raiseOnInvalid:
                raise events.getParseError("Unknown tag: %s"%tag)
            else:
                node = IGNORE()

            # ...sanitise attributes, if there are any...
            if payload:
                try:
                    payload = _cleanAttributes(payload, node, raiseOnInvalid)
                except KeyError as msg:
                    raise events.getParseError("Attribute %s invalid on %s"%(
                        str(msg), node.name_))

            # ...push the new node on the stack...
            stack.append(node(**payload))

            # ...prepare for new content,...
            textParts = []

            # ...add the node to the id map if it has an ID...
            elId = payload.get("ID")
            if elId is not None:
                idmap[elId] = stack[-1]

            # ...and pass control to special iterator if DATA is coming in;
            # it receives the element enclosing DATA and shares our event
            # iterator.
            if tag=="DATA":
                yield tableparser.Rows(stack[-2], events)
            continue

        if evType=="end":
            # Element close: process text content...
            if textParts:
                text = "".join(textParts)
            else:
                text = None
            textParts = []

            # ...see if we have any special processing to do for the node
            # type...
            handler = endHandlers.get(tag, _processNodeDefault)
            closed = stack.pop()
            if isinstance(closed, IGNORE):
                # placeholders for unknown elements are dropped silently.
                continue

            # ...call handler with the current node and its future parent...
            result = handler(text, closed, stack[-1])

            # ...and let user do something with the element if she ordered it.
            if result is not None and result.__class__ in watched:
                result.idmap = idmap
                yield result
            continue

        # only data, start, and end events are handled here.
        assert False
232 233
def readRaw(inFile):
    """returns a V.VOTABLE instance with filled-in data for the input from
    inFile.

    This runs parse watching TABLE and VOTABLE; for every
    tableparser.Rows item coming by, all rows are pulled and stored in
    the rows attribute of its tableDefinition.  What is returned is the
    last item parse yielded.  If parse yields nothing at all, the return
    statement fails because the loop variable was never bound.
    """
    watched = [model.VOTable.TABLE, model.VOTable.VOTABLE]
    for item in parse(inFile, watched):
        if not isinstance(item, tableparser.Rows):
            continue
        item.tableDefinition.rows = list(item)
    return item
242 243
def parseString(string, watchset=DEFAULT_WATCHSET, raiseOnInvalid=True):
    """returns the iterator parse produces for a VOTable literal.

    string contains the VOTable literal; it is wrapped into a StringIO
    and passed to parse together with watchset and raiseOnInvalid, which
    work as they do there.
    """
    source = StringIO(string)
    return parse(source, watchset, raiseOnInvalid)
250