1 """
2 A simple client of OAI-http.
3
4 This includes both some high-level functions and rudimentary parsers
5 that can serve as bases for more specialized parsers.
6 """
7
8
9
10
11
12
13
14 import cPickle as pickle
15 import hashlib
16 import os
17 import re
18 import urllib
19 from cStringIO import StringIO
20 from xml import sax
21 from xml.sax import saxutils
22
23 from gavo import base
24 from gavo import svcs
25 from gavo import utils
26
27
29 - def __init__(self, msg, code="?", value="?"):
32
33
36
37
40
41
42
43
44
46 """a self-persisting dictionary of the prefixes we use in our
47 OAI interface.
48
49 CanonicalPrefixes objects are constructed with the name of a
50 pickle file containing a list of (prefix, uri) pairs.
51
52 This reproduces some code from stanxml.NSRegistry, but we want that
53 stuff as instance method here, not as class method.
54 """
60
73
75 """registers prefix for ns or, if prefix is already taken, makes
76 up a new prefix for the namespace URI ns.
77 """
78 try:
79 self.registerPrefix(prefix, ns)
80 except PrefixIsTaken:
81 origPrefix, uniquer = prefix, 0
82 while True:
83 try:
84 prefix = origPrefix+str(uniquer)
85 self.registerPrefix(prefix, ns)
86 except PrefixIsTaken:
87 uniquer += 1
88 else:
89 break
90
97
100
102 try:
103 return self._registry[prefix]
104 except KeyError:
105 raise base.NotFoundError(prefix, "XML namespace prefix",
106 "registry of prefixes.")
107
110
112 """fills the instance from a list of prefix, uri pairs.
113
114 Pairs is what is stored in the pickle.
115 """
116 for prefix, uri in pairs:
117 self.registerPrefix(prefix, uri, save=False)
118
127
129 try:
130 with open(self.pickleName) as f:
131 self._fillFromPairs(pickle.load(f))
132 except IOError:
133 base.ui.notifyWarning("Starting new canonical prefixes")
134 self._bootstrap()
135
137 toPersist = list(sorted(self._registry.iteritems()))
138 try:
139 with open(self.pickleName+".tmp", "w") as f:
140 pickle.dump(toPersist, f)
141 os.rename(self.pickleName+".tmp", self.pickleName)
142 except IOError as msg:
143 base.ui.notifyWarning("Could not persist canonical prefixes: %s"%
144 msg)
145
146
150
151
158
159
160 -class IdParser(utils.StartEndHandler, OAIErrorMixin):
161 """A parser for simple OAI-PMH headers.
162
163 Records end up as a list of dictionaries in the recs attribute.
164 """
165 resumptionToken = None
166
173
176
178 self.recs[-1]["id"] = content.strip()
179
181 try:
182 self.recs[-1]["date"] = utils.parseISODT(content)
183 except ValueError:
184 self.recs[-1]["date"] = None
185
188
192
193
195 """A simple parser for ivo_vor records.
196
197 This only pulls out a number of the most salient items; more will
198 probably follow as needed.
199 """
203
207
211
214
def _handleContentChild(self, name, attrs, content):
    """stores content under name in the current record, provided the
    element just closed is a direct child of a content element.

    Elements with any other parent are ignored; attrs is not used.
    """
    if self.getParentTag() != "content":
        return
    currentRecord = self.recs[-1]
    currentRecord[name] = content
218
219 _end_description = _end_source = _end_referenceURL = \
220 _handleContentChild
221
225
228
231
234
235
237 """a SAX ContentHandler generating tuples of some record-level metadata
238 and pre-formatted XML of simple implementation of the OAI interface.
239
240 canonicalPrefixes is a CanonicalPrefixesInstance built from
241 res/canonicalPrefixes.pickle
242
243 Note that we *require* that records actually carry ivo_vor metadata.
244 """
245
246
247
248 _referringAttributeNames = set(["id", "ref",
249 "coord_system_id"])
250
251 resumptionToken = None
252
def __init__(self, canonicalPrefixes=None):
    """sets up an empty parser state.

    canonicalPrefixes is the prefix registry to use; if it is missing
    (or false), the result of getCanonicalPrefixes() is used instead.
    """
    if not canonicalPrefixes:
        canonicalPrefixes = getCanonicalPrefixes()
    self.canonicalPrefixes = canonicalPrefixes
    sax.ContentHandler.__init__(self)

    # nothing is being written until a record is opened
    self.buffer = None
    self.writer = None
    # accumulated results
    self.rowdicts = []
    # per-prefix stacks maintained while namespace mappings are in scope
    self.prefixMap = {}
    self.prefixesToTranslate = {}
261
263 self.prefixMap.setdefault(prefix, []).append(uri)
264
265
266
267
268 if not self.canonicalPrefixes.haveNS(uri):
269 self.canonicalPrefixes.registerPrefixOrMakeUp(prefix, uri)
270
271 canonPrefix = self.canonicalPrefixes.getPrefixForNS(uri)
272 if prefix!=canonPrefix or prefix in self.prefixesToTranslate:
273 self.prefixesToTranslate.setdefault(prefix, []).append(canonPrefix)
274
276 self.prefixMap[prefix].pop()
277 if prefix in self.prefixesToTranslate:
278 self.prefixesToTranslate[prefix].pop()
279 if not self.prefixesToTranslate[prefix]:
280 del self.prefixesToTranslate[prefix]
281
296
305
307 if self.writer:
308 self.writer.characters(stuff)
309
310 self._lastChars.append(stuff)
311
313 """fixes the namespace prefix of name if necessary.
314
315 name must be a qualified name, i.e., contain exactly one colon.
316
317 "normalize" here means make sure the prefix matches our canonical prefix
318 and change it to the canonical one if necessary.
319 """
320 prefix, base = name.split(":")
321 if prefix not in self.prefixesToTranslate:
322 return name
323 return self.prefixesToTranslate[prefix][-1]+":"+base
324
326 """fixes attribute name and attribute value namespaces if necessary.
327
328 It also always checks for xsi:type and fixes namespaced attribute
329 values as necessary.
330
331 See also normalizeNamespace.
332 """
333 newAttrs = {}
334 for ns, name in attrs.keys():
335 value = attrs[(ns, name)]
336 if ns is None:
337 newName = name
338 else:
339 newName = self.canonicalPrefixes.getPrefixForNS(ns)+":"+name
340
341 if newName=="xsi:type":
342 if ":" in value:
343 value = self.normalizeNamespace(value)
344
345
346
347
348
349 if newName in self._referringAttributeNames:
350 value = value+hashlib.md5(self.ivoid).hexdigest()
351
352 newAttrs[newName] = value
353
354 return newAttrs
355
def _getLastContent(self):
    """returns, as a single string, all character chunks collected in
    self._lastChars.
    """
    collectedChunks = self._lastChars
    return "".join(collectedChunks)
360
362 self._errorOccurred = True
363
365
366 if self.ivoid is None:
367 return
368
369 if not self.metadataSeen:
370 return
371 if self._errorOccurred:
372 return
373
374
375 if self._isDeleted:
376 return
377 self.rowdicts.append((role, record))
378
380 self._isDeleted = attrs.get("status", "").lower()=="deleted"
381
383 self._errorOccurred = False
384 self.curXML = StringIO()
385 self.writer = saxutils.XMLGenerator(self.curXML, "utf-8")
386 self.writer.startDocument()
387 self.ivoid, self.updated = None, None
388 self.metadataSeen = False
389 self.oaiSets = set()
390
392 self.metadataSeen = True
393
395 if self.writer is not None:
396 self.writer.endDocument()
397
398
399 oaixml = self.curXML.getvalue().decode("utf-8")
400
401
402 if oaixml.startswith("<?xml"):
403 oaixml = oaixml[oaixml.index("?>")+2:]
404 self.shipout("oairecs", {
405 "ivoid": self.ivoid,
406 "updated": self.updated,
407 "oaixml": oaixml})
408 self.writer = None
409 self.curXML = None
410
412 self.oaiSets.add(self._getLastContent())
413
415 self.ivoid = self._getLastContent().lower().strip()
416
419
421 self._errorAttrs = attrs
422
424 self._end_error(name, self._errorAttrs, self._getLastContent())
425
428
429 startHandlers = {
430 "oai:record": _start_oai_record,
431 "oai:header": _start_oai_header,
432 "ri:Resource": _start_ri_Resource,
433 "oai:error": _start_oai_error,
434 }
435 endHandlers = {
436 "oai:record": _end_oai_record,
437 "oai:setSpec": _end_oai_setSpec,
438 "oai:resumptionToken": _end_oai_resumptionToken,
439 "oai:identifier": _end_oai_identifier,
440 "oai:error": _end_oai_error,
441 }
442
443
468
469
471 """A parser for the result of the identify operation.
472
473 The result (an instance of ServerProperties) is in the serverProperties
474 attribute.
475 """
476 resumptionToken = None
477
479 return self.serverProperties
480
483
486
487 _end_adminEmail = _end_compression \
488 = _endListThing
489
492
493 _end_repositoryName = _end_baseURL = _end_protocolVersion \
494 = _end_granularity = _end_deletedRecord = _end_earliestDatestamp \
495 = _end_repositoryName = _endStringThing
496
497
499 """A container for queries to OAI interfaces.
500
501 Construct it with the oai endpoint and the OAI verb, plus some optional
502 query attributes. If you want to retain or access the raw responses
503 of the server, pass a contentCallback function -- it will be called
504 with a byte string containing the payload of the server response if
505 it was parsed successfully. Error responses cannot be obtained in
506 this way.
507
508 The OAIQuery is constructed with OAI-PMH parameters (verb, startDate,
509 endDate, set, metadataPrefix; see the OAI-PMH docs for what they mean,
510 only verb is mandatory). In addition, you can pass granularity,
511 which is the granularity
512 """
513 startDate = None
514 endDate = None
515 set = None
516 registry = None
517 metadataPrefix = None
518
519
520
521 maxRecords = None
522
523
524 timeout = 100
525
526 - def __init__(self, registry, verb, startDate=None, endDate=None, set=None,
527 metadataPrefix="ivo_vor", identifier=None, contentCallback=None,
528 granularity=None):
538
def getKWs(self, **moreArgs):
    """returns a dictionary containing the query keywords for an OAI-PMH
    request built from this query's attributes.

    The keywords are taken from the verb, metadataPrefix, startDate,
    endDate, set, maxRecords, and identifier attributes; attributes with
    false values are left out.  moreArgs is merged in right after verb
    and metadataPrefix, so from, until, set, maxRecords, and identifier
    computed from the attributes win over same-named items in moreArgs.

    startDate and endDate are formatted with day granularity if
    self.granularity is "YYYY-MM-DD" (the value OAI-PMH defines for day
    granularity) or "YY-MM-DD" (the spelling this method has recognised
    so far, kept for backward compatibility).  For any other value,
    seconds granularity is used.

    If a resumptionToken ends up in the keywords, only resumptionToken
    and verb are returned.
    """
    kws = {"verb": self.verb}
    if self.metadataPrefix:
        kws["metadataPrefix"] = self.metadataPrefix
    kws.update(moreArgs)

    if self.granularity in ("YYYY-MM-DD", "YY-MM-DD"):
        dateFormat = "%Y-%m-%d"
    else:
        dateFormat = "%Y-%m-%dT%H:%M:%SZ"
    if self.startDate:
        kws["from"] = self.startDate.strftime(dateFormat)
    if self.endDate:
        kws["until"] = self.endDate.strftime(dateFormat)

    if self.set:
        kws["set"] = self.set
    if self.maxRecords:
        kws["maxRecords"] = str(self.maxRecords)

    if self.identifier:
        kws["identifier"] = self.identifier

    # a resumption request carries nothing but the token and the verb
    if "resumptionToken" in kws:
        kws = {"resumptionToken": kws["resumptionToken"],
            "verb": kws["verb"]}
    return kws
569
def doHTTP(self, **moreArgs):
    """returns the raw response of the current registry to the current
    query plus moreArgs.

    The request URL is self.registry (with trailing question marks
    removed), a question mark, and the query string self._getOpQS builds
    from self.getKWs(**moreArgs).  It is fetched through
    utils.urlopenRemote using self.timeout.

    The result is whatever reading the response yields; no parsing takes
    place here.  Exceptions raised while opening or reading propagate
    to the caller.
    """
    srcURL = self.registry.rstrip("?"
        )+"?"+self._getOpQS(**self.getKWs(**moreArgs))
    base.ui.notifyInfo("OAI query %s"%srcURL)
    f = utils.urlopenRemote(srcURL, timeout=self.timeout)
    try:
        return f.read()
    finally:
        # release the connection even when reading fails (e.g., on a
        # timeout in the middle of the response)
        f.close()
583
585 """returns a properly quoted HTTP query part from its (keyword) arguments.
586 """
587
588 qString = "&".join("%s=%s"%(k, urllib.quote(v))
589 for k, v in args.iteritems() if v)
590 return "%s"%(qString)
591
593 """processes an OAI dialogue for verb using the IdParser-derived
594 parserClass.
595 """
596 res = self.doHTTP(verb=self.verb)
597 if not res.strip():
598
599 raise FailedQuery("Empty HTTP response")
600
601 handler = parserClass()
602 try:
603 xmlReader = sax.make_parser()
604 xmlReader.setFeature(sax.handler.feature_namespaces, True)
605 xmlReader.setContentHandler(handler)
606 xmlReader.parse(StringIO(res))
607 if self.contentCallback:
608 self.contentCallback(res)
609 except NoRecordsMatch:
610 return []
611 oaiResult = handler.getResult()
612
613 while handler.resumptionToken is not None:
614 resumptionToken = handler.resumptionToken
615 handler = parserClass(oaiResult)
616 try:
617 res = self.doHTTP(verb=self.verb,
618 resumptionToken=resumptionToken)
619 sax.parseString(res, handler)
620 if self.contentCallback:
621 self.contentCallback(res)
622 except NoRecordsMatch:
623 break
624
625 return oaiResult
626
627
628 -def getIdentifiers(registry, startDate=None, endDate=None, set=None,
629 granularity=None):
636
637
638 -def getRecords(registry, startDate=None, endDate=None, set=None,
639 granularity=None):
649
650
652 """adds XML namespace declarations for namespace prefixes we
653 suspect in xmlLiteral.
654
655 This is an ugly hack based on REs necessary because in the OAIRecordsParser
656 we discard the namespace declarations. It won't work with CDATA
657 sections, and it'll make a hash of things if namespace declarations are
658 already present. However, for the use case of making the mutilated
659 resource records coming out of the OAIRecordsParser valid, it will just
660 do.
661
662 Without an XML schema and a full parse (which of course is impossible
663 without the necessary declarations), this is, really, not possible. But
664 the whole idea of canonical namespace prefixes is a mess, and so we
665 hack along; in particular, we accept any string of the form \w+: within
666 what looks like an XML tag as a namespace. Oh my.
667 """
668 prefixesUsed = set()
669 for elementContent in re.finditer("<[^>]+>", xmlLiteral):
670 prefixesUsed |= set(re.findall("([a-zA-Z_]\w*):[a-zA-Z_]",
671 elementContent.group()))
672
673 cp = getCanonicalPrefixes()
674 nsDecls = " ".join('xmlns:%s=%s'%(
675 pref, utils.escapeAttrVal(cp.getNSForPrefix(pref)))
676 for pref in prefixesUsed)
677 return re.sub("<([\w:-]+)", r"<\1 "+nsDecls, xmlLiteral, 1)
678
679
681 """returns the XML form of an OAI-PMH record for identifier from
682 the OAI-PMH endpoint at URL registry.
683
684 This uses the OAIRecordsParser which enforces canonical prefixes,
685 and the function will add their declarations as necessary. This also means
686 that evil registry records could be broken by us.
687 """
688 q = OAIQuery(registry, verb="GetRecord", identifier=identifier)
689 res = q.talkOAI(OAIRecordsParser)
690 dest, row = res[0]
691 assert dest=='oairecs'
692 return _addCanonicalNSDecls(row["oaixml"])
693
694
696 """returns some main properties from an XML-encoded VOResource record.
697
698 recordXML can be an OAI-PMH response or just a naked record. If multiple
699 records are contained in recordXML, only the first will be returned.
700
701 What's coming back is a dictionary as produced by RecordParser.
702 """
703 handler = RecordParser()
704 sax.parseString(recordXML, handler)
705 return handler.recs[0]
706
707
709 """returns a ServerProperties instance for registry.
710
711 In particular, you can retrieve the granularity argument that
712 actually matches the registry from the result's granularity attribute.
713 """
714 q = OAIQuery(registry, verb="Identify", metadataPrefix=None)
715 return q.talkOAI(IdentifyParser)
716