Source code for gavo.imp.astropyucd

# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
This file contains routines to verify the correctness of UCD strings.

MD, 2021: I'm forking this here to develop it a bit.  The main changes
ought to go back to astropy when this has stabilised.
"""


# STDLIB
import re

# LOCAL
# from astropy.utils import data


from gavo import base


__all__ = ['parse_ucd', 'check_ucd']


class UCDWords:
    """
    Manages a list of acceptable UCD words.

    Works by reading in a data file exactly as provided by IVOA.  This
    file resides in data/ucd1p-words.txt.
    """

    def __init__(self):
        self._primary = set()
        self._secondary = set()
        self._descriptions = {}
        self._capitalization = {}

# astropy
#        with data.get_pkg_data_fileobj(
#                "data/ucd1p-words.txt", encoding='ascii') as fd:
# gavo
        with base.openDistFile("data/ucd-words.txt") as fd:
            for line in fd.readlines():
                if line.startswith("#"):
                    continue
                type, name, descr = [
                    x.strip() for x in line.split('|')]
                name_lower = name.lower()
                if type in 'QPEVC':
                    self._primary.add(name_lower)
                if type in 'QSEVC':
                    self._secondary.add(name_lower)
                self._descriptions[name_lower] = descr
                self._capitalization[name_lower] = name

    def is_primary(self, name):
        """
        Returns True if *name* is a valid primary name.
        """
        return name.lower() in self._primary

    def is_secondary(self, name):
        """
        Returns True if *name* is a valid secondary name.
        """
        return name.lower() in self._secondary

    def get_description(self, name):
        """
        Returns the official English description of the given UCD
        *name*.
        """
        return self._descriptions[name.lower()]

    def normalize_capitalization(self, name):
        """
        Returns the standard capitalization form of the given name.
        """
        return self._capitalization[name.lower()]


_ucd_singleton = None


[docs]def parse_ucd(ucd, check_controlled_vocabulary=True, has_colon=False): """ Parse the UCD into its component parts. Parameters ---------- ucd : str The UCD string check_controlled_vocabulary : bool, optional If `True`, then each word in the UCD will be verified against the UCD1+ controlled vocabulary, (as required by the VOTable specification version 1.2), otherwise not. has_colon : bool, optional If `True`, the UCD may contain a colon (as defined in earlier versions of the standard). Returns ------- parts : list The result is a list of tuples of the form: (*namespace*, *word*) If no namespace was explicitly specified, *namespace* will be returned as ``'ivoa'`` (i.e., the default namespace). Raises ------ ValueError : *ucd* is invalid """ global _ucd_singleton if _ucd_singleton is None: _ucd_singleton = UCDWords() if has_colon: m = re.search(r'[^A-Za-z0-9_.:;\-]', ucd) else: m = re.search(r'[^A-Za-z0-9_.;\-]', ucd) if m is not None: raise ValueError(f"UCD has invalid character '{m.group(0)}' in '{ucd}'") word_component_re = r'[A-Za-z0-9][A-Za-z0-9\-_]*' word_re = fr'{word_component_re}(\.{word_component_re})*' parts = ucd.split(';') words = [] for i, word in enumerate(parts): colon_count = word.count(':') if colon_count == 1: ns, word = word.split(':', 1) if not re.match(word_component_re, ns): raise ValueError(f"Invalid namespace '{ns}'") ns = ns.lower() elif colon_count > 1: raise ValueError(f"Too many colons in '{word}'") else: ns = 'ivoa' if not re.match(word_re, word): raise ValueError(f"Invalid word '{word}'") if ns == 'ivoa' and check_controlled_vocabulary: if i == 0: if not _ucd_singleton.is_primary(word): if _ucd_singleton.is_secondary(word): raise ValueError( f"Secondary word '{word}' is not valid as a primary word") else: raise ValueError(f"Unknown word '{word}'") else: if not _ucd_singleton.is_secondary(word): if _ucd_singleton.is_primary(word): raise ValueError( f"Primary word '{word}' is not valid as a secondary word") else: raise ValueError(f"Unknown word '{word}'") try: normalized_word = _ucd_singleton.normalize_capitalization(word) except KeyError: normalized_word = word words.append((ns, normalized_word)) return words
[docs]def check_ucd(ucd, check_controlled_vocabulary=False, has_colon=False): """ Returns False if *ucd* is not a valid `unified content descriptor`_. Parameters ---------- ucd : str The UCD string check_controlled_vocabulary : bool, optional If `True`, then each word in the UCD will be verified against the UCD1+ controlled vocabulary, (as required by the VOTable specification version 1.2), otherwise not. has_colon : bool, optional If `True`, the UCD may contain a colon (as defined in earlier versions of the standard). Returns ------- valid : bool """ if ucd is None: return True try: parse_ucd(ucd, check_controlled_vocabulary=check_controlled_vocabulary, has_colon=has_colon) except ValueError: return False return True