Package gavo :: Package protocols :: Module oaiclient
[frames] | no frames]

Source Code for Module gavo.protocols.oaiclient

  1  """ 
  2  A simple client of OAI-http. 
  3   
  4  This includes both some high-level functions and rudimentary parsers 
  5  that can serve as bases for more specialized parsers. 
  6  """ 
  7   
  8  #c Copyright 2008-2019, the GAVO project 
  9  #c 
 10  #c This program is free software, covered by the GNU GPL.  See the 
 11  #c COPYING file in the source distribution. 
 12   
 13   
 14  import cPickle as pickle 
 15  import hashlib 
 16  import os 
 17  import re 
 18  import urllib 
 19  from cStringIO import StringIO 
 20  from xml import sax 
 21  from xml.sax import saxutils 
 22   
 23  from gavo import base 
 24  from gavo import svcs 
 25  from gavo import utils 
 26   
 27   
class FailedQuery(Exception):
    """is raised when an OAI-PMH query could not be completed.

    Next to the message, instances carry the OAI error code and the
    content of the error element in their code and value attributes;
    both default to "?" when not given.
    """

    def __init__(self, msg, code="?", value="?"):
        super(FailedQuery, self).__init__(msg)
        self.code = code
        self.value = value
32 33
class NoRecordsMatch(Exception):
    """is raised when the remote side reports the OAI error code
    noRecordsMatch.
    """
36 37
class PrefixIsTaken(Exception):
    """is raised when a namespace prefix is already bound to a different
    namespace URI.
    """
# Canonical prefixes, i.e., essentially fixed prefixes for certain
# namespaces.  This is all an ugly nightmare, but this is what you
# get for having namespace prefixes in attributes.
class CanonicalPrefixes(object):
    """a self-persisting dictionary of the prefixes we use in our
    OAI interface.

    CanonicalPrefixes objects are constructed with the name of a
    pickle file containing a list of (prefix, uri) pairs.

    This reproduces some code from stanxml.NSRegistry, but we want that
    stuff as instance method here, not as class method.
    """

    def __init__(self, pickleName):
        self.pickleName = pickleName
        # prefix -> namespace URI
        self._registry = {}
        # namespace URI -> prefix
        self._reverseRegistry = {}
        self._loadData()

    def registerPrefix(self, prefix, ns, save=True):
        """binds prefix to the namespace URI ns.

        Re-registering an existing (prefix, ns) pair is a no-op.  If prefix
        is already bound to another namespace, PrefixIsTaken is raised; if
        ns already has a different prefix, a ValueError is raised.  In both
        cases, the registry is left unchanged.

        Unless save is False, the registry is written to the pickle
        after a successful registration.
        """
        if prefix in self._registry:
            if ns != self._registry[prefix]:
                raise PrefixIsTaken(prefix)
            return

        # Validate *before* touching either mapping so a rejected
        # registration cannot leave forward and reverse maps out of sync.
        if (ns in self._reverseRegistry
                and self._reverseRegistry[ns] != prefix):
            raise ValueError("Namespace %s already has prefix %s, will"
                " not clobber with %s"%(
                    ns, self._reverseRegistry[ns], prefix))

        self._registry[prefix] = ns
        self._reverseRegistry[ns] = prefix
        if save:
            self._saveData()

    def registerPrefixOrMakeUp(self, prefix, ns):
        """registers prefix for ns or, if prefix is already taken, makes
        up a new prefix for the namespace URI ns.

        An empty or None prefix (as SAX reports for default namespace
        declarations) cannot serve as a canonical prefix; in that case,
        a prefix based on "ns" is made up.
        """
        if not prefix:
            prefix = "ns"

        try:
            self.registerPrefix(prefix, ns)
            return
        except PrefixIsTaken:
            pass

        # append increasing numbers until we hit a free prefix
        uniquer = 0
        while True:
            try:
                self.registerPrefix(prefix+str(uniquer), ns)
            except PrefixIsTaken:
                uniquer += 1
            else:
                break

    def getPrefixForNS(self, ns):
        """returns the canonical prefix for the namespace URI ns.

        For unknown namespaces, a NotFoundError is raised.
        """
        try:
            return self._reverseRegistry[ns]
        except KeyError:
            # NOTE(review): this raises svcs.NotFoundError while
            # getNSForPrefix raises base.NotFoundError -- confirm that
            # these are the same class.
            raise svcs.NotFoundError(ns, "XML namespace",
                "registry of XML namespaces.")

    def haveNS(self, ns):
        """returns true if the namespace URI ns has a canonical prefix.
        """
        return ns in self._reverseRegistry

    def getNSForPrefix(self, prefix):
        """returns the namespace URI bound to prefix.

        For unknown prefixes, a base.NotFoundError is raised.
        """
        try:
            return self._registry[prefix]
        except KeyError:
            raise base.NotFoundError(prefix, "XML namespace prefix",
                "registry of prefixes.")

    def iterNS(self):
        """iterates over (prefix, namespace URI) pairs.
        """
        return self._registry.iteritems()

    def _fillFromPairs(self, pairs):
        """fills the instance from a list of prefix, uri pairs.

        Pairs is what is stored in the pickle.
        """
        for prefix, uri in pairs:
            self.registerPrefix(prefix, uri, save=False)

    def _bootstrap(self):
        """sets up our canonical prefixes from DaCHS' (stanxml) namespace
        registry.
        """
        from gavo import api #noflake: hope most prefixes are registred after that
        from gavo.utils import stanxml
        self._fillFromPairs(stanxml.NSRegistry._registry.iteritems())
        self._saveData()

    def _loadData(self):
        """fills the registry from the pickle, bootstrapping a new one
        if the pickle cannot be read.
        """
        # NOTE: unpickling can execute arbitrary code; the pickle file
        # must only ever be writable by trusted parties.
        try:
            # pickles are binary data; always open them in binary mode
            with open(self.pickleName, "rb") as f:
                pairs = pickle.load(f)
        except IOError:  # most likely, the file does not exist yet
            base.ui.notifyWarning("Starting new canonical prefixes")
            self._bootstrap()
        else:
            self._fillFromPairs(pairs)

    def _saveData(self):
        """writes the current registry to the pickle.

        Failure to write only results in a warning.
        """
        toPersist = sorted(self._registry.items())
        tmpName = self.pickleName+".tmp"
        try:
            # write to a temporary file and rename it into place so
            # readers never see a half-written pickle
            with open(tmpName, "wb") as f:
                pickle.dump(toPersist, f)
            os.rename(tmpName, self.pickleName)
        except (IOError, OSError) as msg:
            # os.rename raises OSError, open/write raise IOError
            base.ui.notifyWarning("Could not persist canonical prefixes: %s"%
                msg)
145 146
def getCanonicalPrefixes():
    """returns a CanonicalPrefixes instance persisting to
    rrOaiPrefixes.pickle in the configured cacheDir.
    """
    pickleName = os.path.join(
        base.getConfig("cacheDir"), "rrOaiPrefixes.pickle")
    return CanonicalPrefixes(pickleName)
150 151
class OAIErrorMixin(object):
    """a mixin for parsers that turns OAI error elements into exceptions.
    """

    def _end_error(self, name, attrs, content):
        """raises NoRecordsMatch for the noRecordsMatch error code and
        FailedQuery for everything else.
        """
        code = attrs["code"]
        if code=="noRecordsMatch":
            raise NoRecordsMatch()
        message = "Registry bailed with code %s, value %s"%(code, content)
        raise FailedQuery(message, code, content)
158 159
class IdParser(utils.StartEndHandler, OAIErrorMixin):
    """A parser for simple OAI-PMH headers.

    Each header element opens a new dictionary in the recs attribute;
    the identifier ends up under the "id" key, the datestamp under "date".
    """
    # remains None unless the response carries a non-empty resumptionToken
    resumptionToken = None

    def __init__(self, initRecs=None):
        utils.StartEndHandler.__init__(self)
        # when a list is passed in, records are appended to it in place
        self.recs = [] if initRecs is None else initRecs

    def getResult(self):
        """returns the list of record dictionaries.
        """
        return self.recs

    def _start_header(self, name, attrs):
        self.recs.append({})

    def _end_identifier(self, name, attrs, content):
        self.recs[-1]["id"] = content.strip()

    def _end_datestamp(self, name, attrs, content):
        try:
            parsed = utils.parseISODT(content)
        except ValueError:
            # a broken date should not make the whole parse fail
            parsed = None
        self.recs[-1]["date"] = parsed

    def _end_resumptionToken(self, name, attrs, content):
        # whitespace-only tokens are ignored
        if not content.strip():
            return
        self.resumptionToken = content
192 193
class RecordParser(IdParser, OAIErrorMixin):
    """A simple parser for ivo_vor records.

    Only a handful of the most salient items is extracted into the
    record dictionaries; extend as needed.
    """

    def _end_title(self, name, attrs, content):
        # only the title directly below Resource is kept
        if self.getParentTag()!="Resource":
            return
        self.recs[-1][name] = content

    def _end_email(self, name, attrs, content):
        if self.getParentTag()!="contact":
            return
        self.recs[-1]["contact.email"] = content

    def _end_name(self, name, attrs, content):
        # only creator names are collected
        if self.getParentTag()!="creator":
            return
        names = self.recs[-1].setdefault(name, [])
        names.append(content)

    def _end_subject(self, name, attrs, content):
        subjects = self.recs[-1].setdefault(name, [])
        subjects.append(content)

    def _handleContentChild(self, name, attrs, content):
        # stores text children of content under their element name
        if self.getParentTag()!="content":
            return
        self.recs[-1][name] = content

    _end_description = _handleContentChild
    _end_source = _handleContentChild
    _end_referenceURL = _handleContentChild

    def _end_datestamp(self, name, attrs, content):
        # overrides the IdParser implementation with a no-op: the date
        # is taken from the Resource element instead
        pass

    # NOTE(review): unlike the other handlers, this name has no underscore
    # after "start"; presumably it is therefore never dispatched -- confirm
    # against utils.StartEndHandler whether that is intended.
    def _startResource(self, name, attrs):
        self.recs.append({})

    def _end_Resource(self, name, attrs, content):
        updated = attrs["updated"]
        self.recs[-1]["date"] = utils.parseISODT(updated)

    def _end_accessURL(self, name, attrs, content):
        urls = self.recs[-1].setdefault(name, [])
        urls.append(content)
234 235
class OAIRecordsParser(sax.ContentHandler, OAIErrorMixin):
    """a SAX ContentHandler generating tuples of some record-level metadata
    and pre-formatted XML of simple implementation of the OAI interface.

    canonicalPrefixes is a CanonicalPrefixes instance; if it is not
    passed in, the result of getCanonicalPrefixes() is used.

    The result (see getResult) is a list of (role, record) pairs, where
    record is a dictionary with the keys ivoid, updated, and oaixml.

    Note that we *require* that records actually carry ivo_vor metadata.

    The parser must be run with namespace processing enabled, as it
    only implements startElementNS and endElementNS.
    """
    # attribute names the values of which should be disambiguated to
    # reduce the likelihood of clashes when ids are reused between documents.
    # (see _normalizeAttrs)
    _referringAttributeNames = set(["id", "ref",
        "coord_system_id"])

    # remains None unless the response has a non-empty resumptionToken
    resumptionToken = None

    def __init__(self, canonicalPrefixes=None):
        self.canonicalPrefixes = canonicalPrefixes or getCanonicalPrefixes()
        sax.ContentHandler.__init__(self)
        self.buffer = None
        self.writer = None
        self.curXML = None
        self.rowdicts = []
        self.prefixMap = {}
        self.prefixesToTranslate = {}

        # Per-record state; _start_oai_record resets it for every record.
        # It is initialised here so that events outside of a record
        # (e.g., a stray setSpec) do not raise AttributeErrors.
        self._lastChars = []
        self._errorOccurred = False
        self._isDeleted = False
        self.ivoid, self.updated = None, None
        self.metadataSeen = False
        self.oaiSets = set()

    def startPrefixMapping(self, prefix, uri):
        self.prefixMap.setdefault(prefix, []).append(uri)

        # Here, we make sure we find a globally unique prefix for every
        # namespace URI.  canonicalPrefixes makes sure this unique prefix
        # is persistent and later available to the OAI interface
        if not self.canonicalPrefixes.haveNS(uri):
            self.canonicalPrefixes.registerPrefixOrMakeUp(prefix, uri)

        canonPrefix = self.canonicalPrefixes.getPrefixForNS(uri)
        if prefix!=canonPrefix or prefix in self.prefixesToTranslate:
            self.prefixesToTranslate.setdefault(prefix, []).append(canonPrefix)

    def endPrefixMapping(self, prefix):
        self.prefixMap[prefix].pop()
        if prefix in self.prefixesToTranslate:
            self.prefixesToTranslate[prefix].pop()
            if not self.prefixesToTranslate[prefix]:
                del self.prefixesToTranslate[prefix]

    def startElementNS(self, namePair, ignored, attrs):
        ns, name = namePair
        if ns is not None:
            name = self.canonicalPrefixes.getPrefixForNS(ns)+":"+name
        if attrs:
            attrs = self._normalizeAttrs(attrs)

        if name in self.startHandlers:
            self.startHandlers[name](self, name, attrs)

        # while we are within a record, everything is copied to the writer
        if self.writer:
            self.writer.startElement(name, attrs)

        self._lastChars = []

    def endElementNS(self, namePair, name):
        ns, name = namePair
        if ns is not None:
            name = self.canonicalPrefixes.getPrefixForNS(ns)+":"+name
        if self.writer:
            self.writer.endElement(name)
        if name in self.endHandlers:
            self.endHandlers[name](self, name)

    def characters(self, stuff):
        if self.writer:
            self.writer.characters(stuff)
        # Hack, see _getLastContent
        self._lastChars.append(stuff)

    def normalizeNamespace(self, name):
        """fixes the namespace prefix of name if necessary.

        name must be a qualified name, i.e., contain exactly one colon.

        "normalize" here means make sure the prefix matches our canonical prefix
        and change it to the canonical one if necessary.
        """
        prefix, localName = name.split(":")
        if prefix not in self.prefixesToTranslate:
            return name
        return self.prefixesToTranslate[prefix][-1]+":"+localName

    def _normalizeAttrs(self, attrs):
        """fixes attribute name and attribute value namespaces if necessary.

        It also always checks for xsi:type and fixes namespaced attribute
        values as necessary.

        What is returned is a plain dictionary mapping (prefixed) attribute
        names to values.

        See also normalizeNamespace.
        """
        newAttrs = {}
        for ns, name in attrs.keys():
            value = attrs[(ns, name)]
            if ns is None:
                newName = name
            else:
                newName = self.canonicalPrefixes.getPrefixForNS(ns)+":"+name

            if newName=="xsi:type":
                if ":" in value:
                    value = self.normalizeNamespace(value)

            # to uniqueify id/ref-pairs, append an md5-digest of the ivoid
            # to selected ids.  This isn't guaranteed to always work, but
            # if someone is devious enough to cause collisions here, they
            # deserve no better.
            # Without an ivoid (i.e., outside of a record or before its
            # identifier), there is nothing to disambiguate with.  The ivoid
            # is encoded explicitly so non-ASCII identifiers don't fail.
            if (newName in self._referringAttributeNames
                    and self.ivoid is not None):
                value = value+hashlib.md5(
                    self.ivoid.encode("utf-8")).hexdigest()

            newAttrs[newName] = value

        return newAttrs

    def _getLastContent(self):
        """returns the entire character content since the last XML event.
        """
        return "".join(self._lastChars)

    def notifyError(self, err):
        self._errorOccurred = True

    def shipout(self, role, record):
        """adds (role, record) to the result unless the current record
        is unusable.
        """
        # records for which no identifier was seen are dropped
        if self.ivoid is None:
            return
        # see our docstring: we require ivo_vor metadata
        if not self.metadataSeen:
            return
        if self._errorOccurred:
            return

        # _start_oai_header sets _isDeleted
        if self._isDeleted:
            return
        self.rowdicts.append((role, record))

    def _start_oai_header(self, name, attrs):
        self._isDeleted = attrs.get("status", "").lower()=="deleted"

    def _start_oai_record(self, name, attrs):
        self._errorOccurred = False
        # reset here, too, so a record without a header does not inherit
        # the deleted status of the preceding record
        self._isDeleted = False
        self.curXML = StringIO()
        self.writer = saxutils.XMLGenerator(self.curXML, "utf-8")
        self.writer.startDocument()
        # NOTE(review): updated is never set to anything but None in this
        # class, so shipped records always carry updated=None -- confirm
        # whether it should be taken from ri:Resource's attributes.
        self.ivoid, self.updated = None, None
        self.metadataSeen = False
        self.oaiSets = set()

    def _start_ri_Resource(self, name, attrs):
        self.metadataSeen = True

    def _end_oai_record(self, name):
        if self.writer is not None:
            self.writer.endDocument()
            # yeah, we decode the serialized result right away; it's easier
            # to store character streams in the DB the way I'm doing things.
            oaixml = self.curXML.getvalue().decode("utf-8")
            # unfortunately, XMLGenerator insists on adding an XML declaration,
            # which I can't have here.  I remove it manually
            if oaixml.startswith("<?xml"):
                oaixml = oaixml[oaixml.index("?>")+2:]
            self.shipout("oairecs", {
                "ivoid": self.ivoid,
                "updated": self.updated,
                "oaixml": oaixml})
        self.writer = None
        self.curXML = None

    def _end_oai_setSpec(self, name):
        self.oaiSets.add(self._getLastContent())

    def _end_oai_identifier(self, name):
        self.ivoid = self._getLastContent().lower().strip()

    def _end_oai_resumptionToken(self, name):
        # An empty resumptionToken element does not ask for another
        # request, so it is ignored here, just as in IdParser.
        token = self._getLastContent()
        if token.strip():
            self.resumptionToken = token

    def _start_oai_error(self, name, attrs):
        self._errorAttrs = attrs

    def _end_oai_error(self, name):
        self._end_error(name, self._errorAttrs, self._getLastContent())

    def getResult(self):
        """returns the list of (role, record) pairs shipped out so far.
        """
        return self.rowdicts

    # handlers are looked up by canonically prefixed element name and
    # called as plain functions with the parser as first argument
    startHandlers = {
        "oai:record": _start_oai_record,
        "oai:header": _start_oai_header,
        "ri:Resource": _start_ri_Resource,
        "oai:error": _start_oai_error,
    }
    endHandlers = {
        "oai:record": _end_oai_record,
        "oai:setSpec": _end_oai_setSpec,
        "oai:resumptionToken": _end_oai_resumptionToken,
        "oai:identifier": _end_oai_identifier,
        "oai:error": _end_oai_error,
    }
442 443
class ServerProperties(object):
    """A container for what an OAI-PMH server gives in response to
    identify.

    Scalar properties default to None on the class; the list-valued
    ones (adminEmails, compressions, descriptions) are fresh lists on
    each instance.
    """
    repositoryName = None
    baseURL = None
    protocolVersion = None
    adminEmails = ()
    earliestDatestamp = None
    deletedRecord = None
    granularity = None
    compressions = ()

    def __init__(self):
        self.adminEmails, self.compressions, self.descriptions = [], [], []

    def set(self, name, value):
        """sets the property name to value.
        """
        setattr(self, name, value)

    def add(self, name, value):
        """appends value to the list-valued property name.
        """
        target = getattr(self, name)
        target.append(value)
468 469
class IdentifyParser(utils.StartEndHandler, OAIErrorMixin):
    """A parser for the result of the identify operation.

    Parsing fills a ServerProperties instance, which is available in the
    serverProperties attribute once an Identify element has been seen.
    """
    resumptionToken = None

    def getResult(self):
        """returns the ServerProperties instance built while parsing.
        """
        return self.serverProperties

    def _start_Identify(self, name, attrs):
        self.serverProperties = ServerProperties()

    def _endListThing(self, name, attrs, content):
        # repeatable elements go into the list named like the element
        # with an s appended
        listName = name+"s"
        self.serverProperties.add(listName, content.strip())

    _end_adminEmail = _endListThing
    _end_compression = _endListThing

    def _endStringThing(self, name, attrs, content):
        # non-repeatable elements become attributes named like the element
        self.serverProperties.set(name, content.strip())

    _end_repositoryName = _endStringThing
    _end_baseURL = _endStringThing
    _end_protocolVersion = _endStringThing
    _end_granularity = _endStringThing
    _end_deletedRecord = _endStringThing
    _end_earliestDatestamp = _endStringThing
496 497
class OAIQuery(object):
    """A container for queries to OAI interfaces.

    Construct it with the oai endpoint and the OAI verb, plus some optional
    query attributes.  If you want to retain or access the raw responses
    of the server, pass a contentCallback function -- it will be called
    with a byte string containing the payload of the server response if
    it was parsed successfully.  Error responses cannot be obtained in
    this way.

    The OAIQuery is constructed with OAI-PMH parameters (verb, startDate,
    endDate, set, metadataPrefix; see the OAI-PMH docs for what they mean,
    only verb is mandatory).  In addition, you can pass granularity,
    which determines how startDate and endDate are serialised: with
    "YYYY-MM-DD" (the default), only the date is sent, with anything
    else a full UTC timestamp.
    """
    startDate = None
    endDate = None
    set = None
    registry = None
    metadataPrefix = None

    # maxRecords is mainly used in test_oai; that's why there's no
    # constructor parameter for it
    maxRecords = None

    # a timeout on HTTP operations
    timeout = 100

    # granularity values selecting date-only from/until arguments;
    # "YY-MM-DD" is what this class used to test for and is kept for
    # callers that might pass it.
    _dayGranularities = frozenset(["YYYY-MM-DD", "YY-MM-DD"])

    def __init__(self, registry, verb, startDate=None, endDate=None, set=None,
            metadataPrefix="ivo_vor", identifier=None, contentCallback=None,
            granularity=None):
        self.registry = registry
        self.verb, self.set = verb, set
        self.startDate, self.endDate = startDate, endDate
        self.identifier = identifier
        self.metadataPrefix = metadataPrefix
        self.contentCallback = contentCallback
        self.granularity = granularity
        if not self.granularity:
            self.granularity = "YYYY-MM-DD"

    def getKWs(self, **moreArgs):
        """returns a dictionary containing query keywords for OAI interfaces
        from what's specified on the command line.

        moreArgs are added to the keywords.  If they contain a
        resumptionToken, only that and the verb are returned.
        """
        kws = {"verb": self.verb}
        if self.metadataPrefix:
            kws["metadataPrefix"] = self.metadataPrefix
        kws.update(moreArgs)

        # The default granularity is "YYYY-MM-DD"; this must select the
        # date-only format, as repositories with day granularity will not
        # accept full timestamps.
        if self.granularity in self._dayGranularities:
            dateFormat = "%Y-%m-%d"
        else:
            dateFormat = "%Y-%m-%dT%H:%M:%SZ"
        if self.startDate:
            kws["from"] = self.startDate.strftime(dateFormat)
        if self.endDate:
            kws["until"] = self.endDate.strftime(dateFormat)

        if self.set:
            kws["set"] = self.set
        if self.maxRecords:
            kws["maxRecords"] = str(self.maxRecords)

        if self.identifier:
            kws["identifier"] = self.identifier

        # when resuming, only the token and the verb are sent
        if "resumptionToken" in kws:
            kws = {"resumptionToken": kws["resumptionToken"],
                "verb": kws["verb"]}
        return kws

    def doHTTP(self, **moreArgs):
        """returns the result of parsing the current query plus
        moreArgs to the current registry.

        The result is returned as a string.
        """
        srcURL = self.registry.rstrip("?"
            )+"?"+self._getOpQS(**self.getKWs(**moreArgs))
        base.ui.notifyInfo("OAI query %s"%srcURL)
        f = utils.urlopenRemote(srcURL, timeout=self.timeout)
        try:
            return f.read()
        finally:
            # release the connection even when reading fails
            f.close()

    def _getOpQS(self, **args):
        """returns a properly quoted HTTP query part from its (keyword) arguments.
        """
        # we don't use urllib.urlencode to not encode empty values like a=&b=val
        return "&".join("%s=%s"%(k, urllib.quote(v))
            for k, v in args.iteritems() if v)

    def _parseResponse(self, payload, handler):
        """feeds payload to handler using a namespace-aware SAX parser and
        passes it on to contentCallback if parsing succeeded.
        """
        xmlReader = sax.make_parser()
        xmlReader.setFeature(sax.handler.feature_namespaces, True)
        xmlReader.setContentHandler(handler)
        xmlReader.parse(StringIO(payload))
        if self.contentCallback:
            self.contentCallback(payload)

    def talkOAI(self, parserClass):
        """processes an OAI dialogue for verb using the IdParser-derived
        parserClass.

        This returns whatever parserClass' getResult returns, or an empty
        list if the server says no records match.  An empty first response
        raises a FailedQuery.

        If the server returns resumption tokens, further requests are made
        until the list is complete; for these, parserClass is constructed
        with the result so far.
        """
        res = self.doHTTP(verb=self.verb)
        if not res.strip():
            # empty reply is not admissable XML here
            raise FailedQuery("Empty HTTP response")

        handler = parserClass()
        try:
            self._parseResponse(res, handler)
        except NoRecordsMatch:
            return []
        oaiResult = handler.getResult()

        # An empty token ends the list just like a missing one.
        # Follow-up pages are parsed exactly like the first one (i.e.,
        # namespace-aware) so the handler sees consistent events.
        while handler.resumptionToken:
            resumptionToken = handler.resumptionToken
            handler = parserClass(oaiResult)
            try:
                res = self.doHTTP(verb=self.verb,
                    resumptionToken=resumptionToken)
                self._parseResponse(res, handler)
            except NoRecordsMatch:
                break

        return oaiResult
626 627
def getIdentifiers(registry, startDate=None, endDate=None, set=None,
        granularity=None):
    """returns a list of "short" records for what's in the registry specified
    by args.

    registry is the URL of the OAI-PMH endpoint; startDate, endDate, set,
    and granularity are passed on to OAIQuery.  The records are
    dictionaries as produced by IdParser.
    """
    # granularity must be handed through (as getRecords does), or the
    # caller's choice would silently be ignored.
    q = OAIQuery(registry, verb="ListIdentifiers", startDate=startDate,
        endDate=endDate, set=set, granularity=granularity)
    return q.talkOAI(IdParser)
636 637
def getRecords(registry, startDate=None, endDate=None, set=None,
        granularity=None):
    """returns a list of "long" records for what's in the registry specified
    by args.

    registry is the URL of the OAI-PMH endpoint; the remaining arguments
    are passed on to OAIQuery.  The records are dictionaries as produced
    by RecordParser.
    """
    query = OAIQuery(
        registry,
        verb="ListRecords",
        startDate=startDate,
        endDate=endDate,
        set=set,
        granularity=granularity)
    return query.talkOAI(RecordParser)
649 650
def _addCanonicalNSDecls(xmlLiteral):
    r"""adds XML namespace declarations for namespace prefixes we
    suspect in xmlLiteral.

    This is an ugly hack based on REs necessary because in the OAIRecordsParser
    we discard the namespace declarations.  It won't work with CDATA
    sections, and it'll make a hash of things if namespace declarations are
    already present.  However, for the use case of making the mutilated
    resource records coming out of the OAIRecordsParser valid, it will just
    do.

    Without an XML schema and a full parse (which of course is impossible
    without the necessary declarations), this is, really, not possible.  But
    the whole idea of canonical namespace prefixes is a mess, and so we
    hack along; in particular, we accept any string of the form \w+: within
    what looks like an XML tag as a namespace.  Oh my.

    The declarations are added to the first element of xmlLiteral, sorted
    by prefix.  Prefixes unknown to the canonical prefixes make
    getNSForPrefix raise its not found error.
    """
    prefixesUsed = set()
    for tagMatch in re.finditer(r"<[^>]+>", xmlLiteral):
        prefixesUsed.update(
            re.findall(r"([a-zA-Z_]\w*):[a-zA-Z_]", tagMatch.group()))

    cp = getCanonicalPrefixes()
    # sorted so the output does not depend on set iteration order
    nsDecls = " ".join('xmlns:%s=%s'%(
            pref, utils.escapeAttrVal(cp.getNSForPrefix(pref)))
        for pref in sorted(prefixesUsed))

    # A function is used as replacement so that backslashes in namespace
    # URIs are not interpreted as escapes of a replacement template.
    return re.sub(r"<([\w:-]+)",
        lambda mat: mat.group(0)+" "+nsDecls, xmlLiteral, 1)
678 679
def getRecord(registry, identifier):
    """returns the XML form of an OAI-PMH record for identifier from
    the OAI-PMH endpoint at URL registry.

    The record is parsed with OAIRecordsParser, which rewrites namespace
    prefixes to the canonical ones; the matching declarations are then
    added back.  As a consequence, evil registry records could be broken
    by us.
    """
    query = OAIQuery(registry, verb="GetRecord", identifier=identifier)
    rows = query.talkOAI(OAIRecordsParser)
    # only the first (role, record) pair is of interest
    dest, row = rows[0]
    assert dest=='oairecs'
    return _addCanonicalNSDecls(row["oaixml"])
693 694
def parseRecord(recordXML):
    """returns some main properties from an XML-encoded VOResource record.

    recordXML can be an OAI-PMH response or just a naked record.  Only
    the first record found in recordXML is returned.

    The result is a dictionary as produced by RecordParser.
    """
    parser = RecordParser()
    sax.parseString(recordXML, parser)
    firstRecord = parser.recs[0]
    return firstRecord
706 707
def getServerProperties(registry):
    """returns a ServerProperties instance for registry.

    This runs an Identify query against the OAI-PMH endpoint registry.
    In particular, the granularity attribute of the result tells you
    which granularity argument actually matches the registry.
    """
    identifyQuery = OAIQuery(
        registry, verb="Identify", metadataPrefix=None)
    return identifyQuery.talkOAI(IdentifyParser)
716