Source code for pyvo.dal.query

# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
A module for walking through the query response of VO data access layer
(DAL) queries and general VOTable-based datasets.

Most data queries in the VO return a table as a result, usually
formatted as a VOTable.  Each row of the table describes a single
physical or virtual dataset which can be retrieved.  For uniformity,
datasets are described via standard metadata defined by a data model
specific to the type of data being queried.  The fields of the data
model are identified most generally by their VOClient alias as defined
in this interface, or at a lower level by the Utype or UCD of the
specific standard and version of the standard being queried.  While
the data model differs depending upon the type of data being queried,
the form of the query response is the same for all classes of data,
allowing a common query response interface to be used.

An exception to this occurs when querying an astronomical catalog or
other externally defined table.  In this case there is no VO defined
standard data model.  Usually the field names are used to uniquely
identify table columns.
"""
__all__ = ["DALService", "DALQuery", "DALResults", "Record"]

import os
import shutil
import re
import requests
from collections.abc import Mapping

import collections

from warnings import warn

from astropy.table import Table, QTable
from astropy.io.votable import parse as votableparse
from astropy.io.votable.ucd import parse_ucd
from astropy.utils.exceptions import AstropyDeprecationWarning

from .mimetype import mime_object_maker
from .exceptions import (DALFormatError, DALServiceError, DALQueryError,
                         DALOverflowWarning)

from .. import samp

from ..utils.decorators import stream_decode_content
from ..utils.http import use_session


class DALService:
    """
    an abstract base class representing a DAL service located at a
    particular endpoint.
    """

    def __init__(self, baseurl, *, session=None, capability_description=None):
        """
        instantiate the service, connecting it to a base URL

        Parameters
        ----------
        baseurl : str
            the base URL that should be used for forming queries to the
            service.
        session : object
            optional session to use for network requests
        capability_description : str, optional
            the description of the service.
        """
        self._baseurl = baseurl
        self._capability_description = capability_description
        self._session = use_session(session)

    @property
    def baseurl(self):
        """
        the base URL identifying the location of the service and where
        queries are submitted (read-only)
        """
        return self._baseurl

    @property
    def capability_description(self):
        """
        The service description.
        """
        return self._capability_description

    def __repr__(self) -> str:
        return (f"{type(self).__name__}(baseurl : '{self.baseurl}',"
                f" description : '{self.capability_description}')")
    def search(self, **keywords):
        """
        send a search query to this service.

        This implementation has no knowledge of the type of service being
        queried.  The query parameters are given as arbitrary keywords which
        will be assumed to be understood by the service (i.e. there is no
        argument checking).  The response is a generic DALResults object.

        Raises
        ------
        DALServiceError
            for errors connecting to or communicating with the service
        DALQueryError
            for errors either in the input query syntax or other user errors
            detected by the service
        DALFormatError
            for errors parsing the VOTable response
        """
        q = self.create_query(**keywords)
        return q.execute()
    def create_query(self, **keywords):
        """
        create a query object that constraints can be added to and then
        executed.

        Returns
        -------
        DALQuery
            a generic query object
        """
        q = DALQuery(self.baseurl, session=self._session, **keywords)
        return q
    def describe(self):
        """
        describe the general information about the DAL service
        """
        print('DAL Service at {}'.format(self.baseurl))
class DALQuery(dict):
    """
    a class for preparing a query to a particular service.

    Query constraints are added via its service type-specific methods.  The
    various execute() functions will submit the query and return the
    results.

    The base URL for the query can be changed via the baseurl property.

    A session can also optionally be passed in that will be used for network
    transactions made by this object to remote services.
    """

    _ex = None

    def __init__(self, baseurl, *, session=None, **keywords):
        """
        initialize the query object with a baseurl
        """
        if isinstance(baseurl, bytes):
            baseurl = baseurl.decode("utf-8")

        self._baseurl = baseurl.rstrip("?")
        self._session = use_session(session)

        self.update({key.upper(): value for key, value in keywords.items()})

    @property
    def baseurl(self):
        """
        the base URL that this query will be sent to when one of the execute
        functions is called.
        """
        return self._baseurl
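    # Illustrative sketch (hypothetical base URL): because DALQuery is a
    # dict, keyword constraints are stored as uppercase parameters and can
    # be inspected or modified before execution; a trailing "?" on the base
    # URL is stripped.
    #
    #     >>> q = DALQuery("https://example.org/vo/query?", maxrec=10)
    #     >>> q.baseurl
    #     'https://example.org/vo/query'
    #     >>> dict(q)
    #     {'MAXREC': 10}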
    def execute(self):
        """
        submit the query and return the results as a Results subclass
        instance

        Raises
        ------
        DALServiceError
            for errors connecting to or communicating with the service
        DALQueryError
            for errors either in the input query syntax or other user errors
            detected by the service
        DALFormatError
            for errors parsing the VOTable response
        """
        return DALResults(self.execute_votable(), url=self.queryurl,
                          session=self._session)
    def execute_raw(self):
        """
        submit the query and return the raw response as a string.

        No exceptions are raised here because non-2xx responses might still
        contain payload.  They can be raised later by calling
        ``raise_if_error``.
        """
        f = self.execute_stream()

        out = None
        try:
            out = f.read()
        finally:
            f.close()

        return out
    @stream_decode_content
    def execute_stream(self, *, post=False):
        """
        Submit the query and return the raw response as a file stream.

        No exceptions are raised here because non-2xx responses might still
        contain payload.  They can be raised later by calling
        ``raise_if_error``.
        """
        response = self.submit(post=post)

        try:
            response.raise_for_status()
        except requests.RequestException as ex:
            # save for later use
            self._ex = ex
        finally:
            return response.raw
    def submit(self, *, post=False):
        """
        does the actual request
        """
        url = self.queryurl
        params = {k: v for k, v in self.items()}

        if post:
            response = self._session.post(
                url, data=params, stream=True, allow_redirects=True)
        else:
            response = self._session.get(
                url, params=params, stream=True, allow_redirects=True)
        return response
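    # Illustrative sketch: the same query can be submitted via GET (the
    # default) or POST; the constraints stored in this dict are sent as
    # request parameters either way.  The URL is a hypothetical placeholder.
    #
    #     >>> q = DALQuery("https://example.org/vo/query", VERB=3)
    #     >>> response = q.submit()           # GET with params={'VERB': 3}
    #     >>> response = q.submit(post=True)  # POST with data={'VERB': 3}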
    def execute_votable(self, *, post=False):
        """
        Submit the query and return the results as an AstroPy votable
        instance.

        As this is the level where qualified error messages are available,
        they are raised here instead of in the underlying execute_stream.

        Returns
        -------
        astropy.io.votable.tree.VOTableFile
            an Astropy votable object

        Raises
        ------
        DALServiceError
            for errors connecting to or communicating with the service
        DALFormatError
            for errors parsing the VOTable response

        See Also
        --------
        astropy.io.votable
        DALServiceError
        DALFormatError
        DALQueryError
        """
        try:
            return votableparse(self.execute_stream(post=post).read)
        except Exception as e:
            self.raise_if_error()
            raise DALFormatError(e, self.queryurl)
    def raise_if_error(self):
        """
        Raise if there was an error on the HTTP level.
        """
        if self._ex:
            e = self._ex
            raise DALServiceError.from_except(e, self.queryurl)
    @property
    def queryurl(self):
        """
        The URL that encodes the current query.  This is the URL that the
        execute functions will use if called next.
        """
        return self.baseurl
class DALResults:
    """
    Results from a DAL query.  It provides random access to records in the
    response.  Alternatively, it can provide results via a Cursor (compliant
    with the Python Database API) or an iterable.
    """

    @classmethod
    @stream_decode_content
    def _from_result_url(cls, result_url, session):
        return session.get(result_url, stream=True).raw
    @classmethod
    def from_result_url(cls, result_url, *, session=None):
        """
        Create a result object from a url.

        Uses the optional session to make the request.
        """
        session = use_session(session)
        return cls(
            votableparse(cls._from_result_url(result_url, session).read),
            url=result_url,
            session=session)
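    # Illustrative sketch (hypothetical URL): a DALResults instance can also
    # be built directly from the URL of a stored VOTable response rather
    # than from a freshly executed query.
    #
    #     >>> results = DALResults.from_result_url(
    #     ...     "https://example.org/vo/results.votable")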
    def __init__(self, votable, *, url=None, session=None):
        """
        initialize the cursor.  This constructor is not typically called
        directly by applications; rather, an instance is obtained from
        calling a DALQuery's execute().

        Parameters
        ----------
        votable : str
            the service response parsed into an
            astropy.io.votable.tree.VOTableFile instance.
        url : str
            the URL that produced the response
        session : object
            optional session to use for network requests

        Raises
        ------
        DALFormatError
            if the response VOTable does not contain a response table

        See Also
        --------
        DALFormatError
        """
        self._votable = votable
        self._url = url
        self._session = use_session(session)

        self._status = self._findstatus(votable)
        if self._status[0].lower() not in ("ok", "overflow"):
            raise DALQueryError(self._status[1], self._status[0], url)

        if self._status[0].lower() == "overflow":
            warn("Partial result set. Potential causes: MAXREC, async "
                 "storage space, etc.",
                 category=DALOverflowWarning)

        self._resultstable = self._findresultstable(votable)
        if not self._resultstable:
            raise DALFormatError(
                reason="VOTable response missing results table", url=url)

        self._fldnames = tuple(
            field.name for field in self._resultstable.fields)

        if not self._fldnames:
            raise DALFormatError(
                reason="response table missing column descriptions.",
                url=url)

        self._infos = self._findinfos(votable)

    def _findresultstable(self, votable):
        # this can be overridden to specialize for a particular DAL protocol
        res = self._findresultsresource(votable)
        if not res or len(res.tables) < 1:
            return None
        return res.tables[0]

    def _findresultsresource(self, votable):
        # this can be overridden to specialize for a particular DAL protocol
        if len(votable.resources) < 1:
            return None
        for res in votable.resources:
            if res.type.lower() == "results":
                return res
        return votable.resources[0]

    def _findstatus(self, votable):
        # this can be overridden to specialize for a particular DAL protocol

        # look first in the result resource
        res = self._findresultsresource(votable)
        if res:
            # should be a RESOURCE/INFO
            info = self._findstatusinfo(res.infos)
            if info:
                return (info.value, info.content)

            # if not there, check inside first table
            if len(res.tables) > 0:
                info = self._findstatusinfo(res.tables[0].infos)
                if info:
                    return (info.value, info.content)

        # otherwise, look just below the root element
        info = self._findstatusinfo(votable.infos)
        if info:
            return (info.value, info.content)

        # assume it's okay
        return ("OK", "QUERY_STATUS not specified")

    def _findstatusinfo(self, infos):
        # this can be overridden to specialize for a particular DAL protocol
        status = None
        # return the last status to catch potential overflow or error sent
        # after the results.
        for info in infos:
            if info.name.lower() == 'query_status':
                status = info
        return status

    def _findinfos(self, votable):
        # this can be overridden to specialize for a particular DAL protocol
        infos = {}
        res = self._findresultsresource(votable)
        for info in res.infos:
            infos[info.name] = info.value
        for info in votable.infos:
            infos[info.name] = info.value
        return infos

    def __repr__(self):
        return f"<DALResults{repr(self.to_table())[1:]}"

    @property
    def queryurl(self):
        """
        the URL query that produced these results.  None is returned if
        unknown.
        """
        return self._url

    @property
    def votable(self):
        """
        The complete votable XML Document
        `astropy.io.votable.tree.VOTableFile`
        """
        return self._votable

    @property
    def resultstable(self):
        """
        The votable XML element
        `astropy.io.votable.tree.TableElement`
        """
        return self._resultstable
    def to_table(self):
        """
        Returns an astropy Table object.

        Returns
        -------
        `astropy.table.Table`
        """
        return self.resultstable.to_table(use_names_over_ids=True)
    def to_qtable(self):
        """
        Returns an astropy QTable object containing quantities instead of
        simple values.

        Returns
        -------
        `astropy.table.QTable`
        """
        return QTable(self.resultstable.to_table(use_names_over_ids=True))
    @property
    def table(self):
        warn(AstropyDeprecationWarning(
            'Using the table property is deprecated. '
            'Please use to_table() instead.'
        ))
        return self.to_table()

    def __len__(self):
        """
        return the record count
        """
        return len(self.resultstable.array)

    def __getitem__(self, indx):
        """
        if indx is a string, r[indx] will return the column with the name
        indx; if indx is an integer, r[indx] will return the indx-th record.
        """
        if isinstance(indx, int):
            return self.getrecord(indx)
        elif isinstance(indx, tuple):
            return self.getvalue(*indx)
        else:
            return self.getcolumn(indx)

    @property
    def fieldnames(self):
        """
        return the names of the columns.  These are the names that are used
        to access values from the dictionaries returned by getrecord().
        They correspond to the column names.
        """
        return self._fldnames

    @property
    def fielddescs(self):
        """
        return the full metadata of the columns as a list of Field
        instances, a simple object with attributes corresponding to the
        VOTable FIELD attributes, namely: name, id, type, ucd, utype,
        arraysize, description
        """
        return self.resultstable.fields

    @property
    def status(self):
        """
        The query status as a 2-element tuple,
        e.g. ('OK', "Everything's fine")
        """
        return self._status
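    # Illustrative sketch of the indexing semantics described above
    # (the column name is hypothetical):
    #
    #     >>> results[0]             # first Record
    #     >>> results['obs_id']      # numpy array of the 'obs_id' column
    #     >>> results['obs_id', 0]   # single value: column 'obs_id', row 0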
    def fieldname_with_ucd(self, ucd):
        """
        return the field name that has a given UCD value or None if the UCD
        is not found.
        """
        search_ucds = set(parse_ucd(ucd, has_colon=True))

        for field in (field for field in self.fielddescs if field.ucd):
            field_ucds = set(parse_ucd(field.ucd, has_colon=True))

            if search_ucds & field_ucds:
                return field.name

        return None
    def fieldname_with_utype(self, utype):
        """
        return the field name that has a given UType value or None if the
        UType is not found.
        """
        try:
            iterchain = (
                self.getdesc(fieldname) for fieldname in self.fieldnames)
            iterchain = (
                field for field in iterchain if field.utype == utype)
            return next(iterchain).name
        except StopIteration:
            return None
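    # Illustrative sketch: UCD- and utype-based lookups return the matching
    # column name (or None), which can then be used for ordinary column
    # access.  The UCD value is only an example; availability depends on the
    # service.
    #
    #     >>> name = results.fieldname_with_ucd('pos.eq.ra')
    #     >>> if name is not None:
    #     ...     ra_values = results.getcolumn(name)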
    def getcolumn(self, name):
        """
        return a numpy array containing the values for the column with the
        given name
        """
        try:
            if name not in self.fieldnames:
                name = self.resultstable.get_field_by_id(name).name
            return self.resultstable.array[name]
        except KeyError:
            raise KeyError("No such column: {}".format(name))
    def getrecord(self, index):
        """
        return a representation of a result record that follows dictionary
        semantics.  The keys of the dictionary are those returned by this
        instance's fieldnames attribute.  The returned record may have
        additional accessor methods for getting at standard DAL response
        metadata (e.g. ra, dec).

        Parameters
        ----------
        index : int
            the integer index of the desired record where 0 returns the
            first record

        Returns
        -------
        Record
            a dictionary-like wrapper containing the result record metadata.

        Raises
        ------
        IndexError
            if index is negative or equal to or larger than the number of
            rows in the result table.

        See Also
        --------
        Record
        """
        return Record(self, index, session=self._session)
    def getvalue(self, name, index):
        """
        return the value of a record attribute--a value from a column and
        row.

        Parameters
        ----------
        name : str
            the name of the attribute (column)
        index : int
            the zero-based index of the record

        Raises
        ------
        IndexError
            if index is negative or equal to or larger than the number of
            rows in the result table.
        KeyError
            if name is not a recognized column name
        """
        return self.getrecord(index)[name]
    def getdesc(self, name):
        """
        return the field description for the record attribute (column) with
        the given name

        Parameters
        ----------
        name : str
            the name of the attribute (column)

        Returns
        -------
        object
            with attributes (name, id, datatype, unit, ucd, utype,
            arraysize) which describe the column
        """
        if name not in self._fldnames:
            raise KeyError(name)
        return self.resultstable.get_field_by_id_or_name(name)
    def __iter__(self):
        """
        return a python iterable for stepping through the records in this
        result
        """
        pos = 0

        while True:
            try:
                out = self.getrecord(pos)
            except IndexError:
                break

            yield out
            pos += 1
    def broadcast_samp(self, *, client_name=None):
        """
        Broadcast the table to ``client_name`` via SAMP
        """
        with samp.connection() as conn:
            samp.send_table_to(
                conn, self.to_table(),
                client_name=client_name, name=self.queryurl)
    def cursor(self):
        """
        return a cursor that is compliant with the Python Database API's
        :class:`.Cursor` interface.  See PEP 249 for details.
        """
        from .dbapi2 import Cursor
        return Cursor(self)
class Record(Mapping):
    """
    one record from a DAL query result.  The column values are accessible
    as dictionary items.  It also provides special added functions for
    accessing the dataset the record corresponds to.  Subclasses may provide
    additional functions for access to service type-specific data.
    """

    def __init__(self, results, index, *, session=None):
        self._results = results
        self._index = index
        self._session = use_session(session)
        self._mapping = collections.OrderedDict(
            zip(
                results.fieldnames,
                results.resultstable.array.data[index]
            )
        )

    def __getitem__(self, key):
        try:
            if key not in self._mapping:
                key = self._results.resultstable.get_field_by_id(key).name
            return self._mapping[key]
        except KeyError:
            raise KeyError("No such column: {}".format(key))

    def __iter__(self):
        return iter(self._mapping)

    def __len__(self):
        return len(self._mapping)

    def __repr__(self):
        return repr(tuple(self.values()))
    def get(self, key, default=None, decode=False):
        """
        This method mimics the dict get method and adds a decode parameter
        to allow decoding of binary strings.
        """
        out = self._mapping.get(key, default)
        if decode and isinstance(out, bytes):
            out = out.decode('ascii')
        return out
    def getbyucd(self, ucd, default=None, decode=False):
        """
        return the column with the given ucd.
        """
        return self.get(
            self._results.fieldname_with_ucd(ucd), default, decode)
    def getbyutype(self, utype, default=None, decode=False):
        """
        return the column with the given utype.

        Raises
        ------
        KeyError
            if there is no column with the given utype.
        """
        return self.get(
            self._results.fieldname_with_utype(utype), default, decode)
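    # Illustrative sketch: per-record access by UCD or utype mirrors the
    # column-level lookups on DALResults (the UCD below is only an example):
    #
    #     >>> rec = results[0]
    #     >>> ra = rec.getbyucd('pos.eq.ra')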
    def getdataformat(self):
        """
        return the mimetype of the dataset described by this record.
        """
        return self.getbyucd('meta.code.mime', decode=True)
    def getdataurl(self):
        """
        return the URL contained in the access URL column which can be used
        to retrieve the dataset described by this record.  None is returned
        if no such column exists.
        """
        for fieldname in self._results.fieldnames:
            field = self._results.getdesc(fieldname)
            if (field.utype and "access.reference" in field.utype.lower()) or (
                field.ucd
                and "meta.dataset" in field.ucd
                and "meta.ref.url" in field.ucd
            ):
                out = self[fieldname]
                if isinstance(out, bytes):
                    out = out.decode('utf-8')
                return out
        return None
    def getdataobj(self):
        """
        return the appropriate data object suitable for the data content
        behind this record.
        """
        return mime_object_maker(self.getdataurl(), self.getdataformat())
    @stream_decode_content
    def getdataset(self, timeout=None):
        """
        Get the dataset described by this record from the server.

        Parameters
        ----------
        timeout : float
            the time in seconds to allow for a successful connection with
            the server before failing with an IOError (specifically,
            socket.timeout) exception

        Returns
        -------
        A file-like object which may be read to retrieve the referenced
        dataset.

        Raises
        ------
        KeyError
            if no dataset access URL is included in the record
        URLError
            if the dataset access URL is invalid (note: subclass of IOError)
        HTTPError
            if an HTTP error occurs while accessing the dataset (note:
            subclass of IOError)
        socket.timeout
            if the timeout is exceeded before a connection is established.
            (note: subclass of IOError)
        IOError
            if some other error occurs while establishing the data stream.
        """
        url = self.getdataurl()
        if not url:
            raise KeyError("no dataset access URL recognized in record")

        if timeout:
            response = self._session.get(url, stream=True, timeout=timeout)
        else:
            response = self._session.get(url, stream=True)

        try:
            response.raise_for_status()
        except requests.RequestException as ex:
            raise DALServiceError.from_except(ex, url)

        return response.raw
    def cachedataset(self, *, filename=None, dir=".", timeout=None,
                     bufsize=None):
        """
        retrieve the dataset described by this record and write it out to a
        file with the given name.  If the file already exists, it will be
        overwritten.

        Parameters
        ----------
        filename : str
            the name of the file to write the dataset to.  If the value
            represents a relative path, it will be taken to be relative to
            the value of the ``dir`` parameter.  If None, a default name is
            attempted based on the record title and format.
        dir : str
            the directory to write the file into.  This value will be
            ignored if filename is an absolute path.
        timeout : int
            the time in seconds to allow for a successful connection with
            the server before failing with an IOError (specifically,
            socket.timeout) exception
        bufsize : int
            a buffer size in bytes for copying the data to disk
            (default: 0.5 MB)
        """
        if not bufsize:
            bufsize = 524288

        if not filename:
            filename = self.make_dataset_filename(dir=dir)

        inp = self.getdataset(timeout)
        try:
            with open(filename, 'wb') as out:
                # copy in chunks of the requested buffer size
                shutil.copyfileobj(inp, out, bufsize)
        finally:
            inp.close()
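    # Illustrative sketch: a typical retrieval loop over a result set, using
    # the access URL column when present (the directory name below is
    # hypothetical):
    #
    #     >>> for rec in results:
    #     ...     if rec.getdataurl() is not None:
    #     ...         rec.cachedataset(dir="./downloads")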
    _dsname_no = 0  # used by make_dataset_filename
    def make_dataset_filename(self, *, dir=".", base=None, ext=None):
        """
        create a viable pathname in a given directory for saving the dataset
        available via getdataset().  The pathname that is returned is
        guaranteed not to already exist (under single-threaded conditions).

        This implementation will first try combining the base name with the
        file extension (with a dot).  If this file already exists in the
        directory, a name that appends an integer suffix ("-#") to the base
        before joining with the extension will be tried.  The integer will
        be incremented until a non-existent filename is created.

        Parameters
        ----------
        dir : str
            the directory to save the dataset under.  It will be created if
            it does not already exist.
        base : str
            a basename to use as the base of the filename.  If None, the
            result of ``suggest_dataset_basename()`` will be used.
        ext : str
            the filename extension to use.  If None, the result of
            ``suggest_extension()`` will be used.
        """
        if not dir:
            raise ValueError(
                "make_dataset_filename(): no dir parameter provided")
        if not os.path.exists(dir):
            os.mkdir(dir)
        if not os.path.isdir(dir):
            raise ValueError("{}: not a directory".format(dir))

        if not base:
            base = self.suggest_dataset_basename()
        if not ext:
            ext = self.suggest_extension(default="dat")

        # be efficient when writing a bunch of files into the same directory
        # in succession
        n = self._dsname_no

        def mkpath(i):
            return os.path.join(dir, "{}-{}.{}".format(base, i, ext))

        if n > 0:
            # find the last file written of the form, base-n.ext
            while n > 0 and not os.path.exists(mkpath(n)):
                n -= 1
            if n > 0:
                n += 1
        if n == 0:
            # never wrote a file of form, base-n.ext; try base.ext
            path = os.path.join(dir, "{}.{}".format(base, ext))
            if not os.path.exists(path):
                return path
            n += 1
        # find next available name
        while os.path.exists(mkpath(n)):
            n += 1

        self._dsname_no = n
        return mkpath(n)
    def suggest_dataset_basename(self):
        """
        return a default base filename that the dataset available via
        ``getdataset()`` can be saved as.  This function is specialized for
        the particular service type this record originates from so that it
        can be used by ``cachedataset()`` via ``make_dataset_filename()``.
        """
        # abstract; specialized for the different service types
        return "dataset"
    def suggest_extension(self, *, default=None):
        """
        returns a recommended filename extension for the dataset described
        by this record.  Typically, this would look at the column describing
        the format and choose an extension accordingly.  This function is
        specialized for the particular service type this record originates
        from so that it can be used by ``cachedataset()`` via
        ``make_dataset_filename()``.
        """
        # abstract; specialized for the different service types
        return default
class Iter:
    def __init__(self, res):
        self.resultset = res
        self.pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        try:
            out = self.resultset.getrecord(self.pos)
            self.pos += 1
            return out
        except IndexError:
            raise StopIteration()

    next = __next__


class Upload:
    """
    This class represents a DALI Upload as described in
    http://www.ivoa.net/documents/DALI/20161101/PR-DALI-1.1-20161101.html#tth_sEc3.4.5
    """

    def __init__(self, name, content):
        """
        Initialise the Upload object with the given parameters

        Parameters
        ----------
        name : str
            Table name for use in queries
        content : object
            If it is a file-like object, a string pointing to a local file,
            a `DALResults` object or an astropy table, `is_inline` will be
            true and it will expose a file-like object under `fileobj`.

            Otherwise it exposes a URI under `uri`.
        """
        try:
            self._is_file = os.path.isfile(content)
        except Exception:
            self._is_file = False
        self._is_fileobj = hasattr(content, "read")
        self._is_table = isinstance(content, Table)
        self._is_resultset = isinstance(content, DALResults)

        self._inline = any((
            self._is_file,
            self._is_fileobj,
            self._is_table,
            self._is_resultset,
        ))

        self._name = name
        self._content = content

    @property
    def is_inline(self):
        """
        True if the upload can be inlined
        """
        return self._inline

    @property
    def name(self):
        return self._name

    def fileobj(self):
        """
        A file-like object for a local resource

        Raises
        ------
        ValueError
            if there is no valid local resource
        """
        if not self.is_inline:
            raise ValueError(
                "Upload {name} doesn't refer to a local resource".format(
                    name=self.name))

        # astropy table
        if isinstance(self._content, Table):
            from io import BytesIO
            fileobj = BytesIO()

            self._content.write(output=fileobj, format="votable")
            fileobj.seek(0)

            return fileobj
        elif isinstance(self._content, DALResults):
            from io import BytesIO
            fileobj = BytesIO()

            table = self._content.to_table()
            table.write(output=fileobj, format="votable")
            fileobj.seek(0)

            return fileobj

        fileobj = self._content

        try:
            fileobj = open(self._content)
        finally:
            return fileobj

    def uri(self):
        """
        The URI pointing to the result
        """
        # TODO: use an async job base class instead of hasattr for inspection
        if hasattr(self._content, "result_uri"):
            self._content.raise_if_error()
            uri = self._content.result_uri
        else:
            uri = str(self._content)

        return uri

    def query_part(self):
        """
        The query part for use in DALI requests
        """
        if self.is_inline:
            value = "{name},param:{name}"
        else:
            value = "{name},{uri}"

        return value.format(name=self.name, uri=self.uri())


class UploadList(list):
    """
    This class extends the native python list with utility functions for
    upload handling
    """

    @classmethod
    def fromdict(cls, dct):
        """
        Constructs an upload list from a dictionary with table_name: content
        """
        return cls(Upload(key, value) for key, value in dct.items())

    def param(self):
        """
        Returns a string suitable for use in UPLOAD parameters
        """
        return ";".join(upload.query_part() for upload in self)


_image_mt_re = re.compile(r'^image/(\w+)')
_text_mt_re = re.compile(r'^text/(\w+)')
_votable_mt_re = re.compile(r'^(\w+)/(x-)?votable(\+\w+)?')
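# Illustrative sketch (the table name and file path are hypothetical):
# building the UPLOAD parameter for a DALI request from a local VOTable file
# and a remote URI.  Local, file-like, Table, and DALResults contents are
# inlined ("name,param:name"); anything else is passed by reference.
#
#     >>> uploads = UploadList.fromdict({
#     ...     "mine": "catalog.xml",
#     ...     "remote": "https://example.org/table.votable",
#     ... })
#     >>> uploads.param()
#     'mine,param:mine;remote,https://example.org/table.votable'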