Source code for gavo.protocols.linetap

Support code for linetap.

This, in particular, contains bindings to the InChI library.
Use getILib() to obtain the bindings, and see the _InChIBindings
docstrings on what you can all.  It probably helps to be aware of
although the binding is substantially different.

This binding probably leaks memory left and right; I have not given
this a great deal of thought so far, as I don't expect InChI manipulations
will be common in long-running code.  If that's overly optimistic, then
we'll need a round of leak hunting.

#c Copyright 2008-2023, the GAVO project <>
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.

import contextlib
import ctypes
import functools
import re
import warnings
from ctypes.util import find_library

from gavo import base
from gavo import utils

[docs]class InChIError(utils.Error): """is raised when a libinchi function indicates an error has occurred. This is usually rather cryptic but ought to return a numeric code that can be decoded using the libinchi API document. """
_EL_DATA_RE = re.compile(r'\{\s*"([A-Za-z]+)",' # Element name r"\s*(\d+)," # average mass r"\s*(\d+)," # norm mass r"\s*([\d.]+)," # avg mass r"\s*(METAL2?|0)\s*," # type r"\s*(\d+)," # electronegativity r"\s*(\d)," # "No H" (no clue:-) r"\s*([0-9,{}]+)" r"\}") def _parseInchiElData(line): """returns a dictionary of parsed items for a line in data/element_data.txt. These are C structure literals, but I blindly kill things with REs. """ mat = _EL_DATA_RE.match(line) return { "name":, "avg weight": int(, "weight": int(, "exact weight": float(, "type":, "el. neg": int(, # TODO: parse out valences, perhaps do something with "No H" }
[docs]@functools.lru_cache(None) def getElementData(): """returns a dictionary from element names to a dictionary of element properties. """ elementData = {} with base.openDistFile("data/element_data.txt") as f: for lineNumber, line in utils.iterSimpleText(f): el = _parseInchiElData(line) elementData[el["name"]] = el return elementData
# The following values should really be parsed from /usr/include/inchi_api.h # Maximum bonds per atom MAXVAL = 20 ATOM_EL_LEN = 6 class _inchi_Atom(ctypes.Structure): _fields_ = [ ("x", ctypes.c_double), ("y", ctypes.c_double), ("z", ctypes.c_double), ("neighbor", ctypes.c_short*MAXVAL), ("bond_type", ctypes.c_byte*MAXVAL), ("bond_stereo", ctypes.c_char*MAXVAL), ("elname", ctypes.c_char*ATOM_EL_LEN), ("num_bonds", ctypes.c_short), ("num_iso_H", ctypes.c_byte*4), ("isotopic_mass", ctypes.c_short), ("radical", ctypes.c_char), ("charge", ctypes.c_byte)] def connect(self, otherAtomIndex, bond_type): """Adds a connection to otherAtomIndex. User code will probably rather use _inchi_Input's connect method, as it usually cannot know otherAtomIndex. """ if self.num_bonds==MAXVAL: raise InChIError("Too many connections") self.neighbor[self.num_bonds] = otherAtomIndex self.bond_type[self.num_bonds] = bond_type self.num_bonds += 1 def getElementInfo(self): res = getElementData()[utils.debytify(self.elname)] if self.isotopic_mass: res = res.copy() if self.isotopic_mass>1000: res["weight"] += self.isotopic_mass-10000 else: res["weight"] = self.isotopic_mass return res class _inchi_Stereo0D(ctypes.Structure): _fields_ = [ ("neighbor", ctypes.c_short*4), ("central_atom", ctypes.c_short), ("type", ctypes.c_char), ("parity", ctypes.c_char), ] class _MoleculeMixin: """a mixin for structs having atoms. """ def iterAtoms(self): for atomIndex in range(self.num_atoms): yield self.atom[atomIndex] def getTotalWeight(self): """returns the total molecular weight of the substance. This takes isotopy into account. """ totalWeight = 0 for atom in self.iterAtoms(): totalWeight += atom.getElementInfo()["weight"] for hIso in range(1, 4): totalWeight += atom.num_iso_H[hIso]*hIso # no idea what the implicit H-s should really count as. totalWeight += atom.num_iso_H[0] return totalWeight def getTotalCharge(self): """returns the total charge of all compounds. """ return sum(atom.charge for atom in self.iterAtoms()) class _inchi_Input(ctypes.Structure, _MoleculeMixin): _fields_ = [ ("atom", ctypes.POINTER(_inchi_Atom)), ("stereo0D", ctypes.POINTER(_inchi_Stereo0D)), ("szOptions", ctypes.c_char_p), ("num_atoms", ctypes.c_int), ("num_stereo0D", ctypes.c_int),] def connect(self, atomIndex1, atomIndex2, bond_type=1): """add a bond between the atom at index1 and index2. """ if (not 0<=atomIndex1<self.num_atoms or not 0<=atomIndex2<self.num_atoms): raise InChIError("Atom indexes must be < {} here".format( self.num_atoms)) self.atom[atomIndex1].connect(atomIndex2, bond_type) self.atom[atomIndex2].connect(atomIndex1, bond_type) class _inchi_OutputStruct(ctypes.Structure, _MoleculeMixin): _fields_ = [ ("atom", ctypes.POINTER(_inchi_Atom)), ("stereo0D", ctypes.POINTER(_inchi_Stereo0D)), ("num_atoms", ctypes.c_short), ("num_stereo0D", ctypes.c_short), ("szMessage", ctypes.c_char_p), ("szLog", ctypes.c_char_p), ("WarningFlags", ctypes.c_long*4)] class _inchi_Output(ctypes.Structure): _fields_ = [ ("szInChI", ctypes.c_char_p), ("szAuxInfo", ctypes.c_char_p), ("szMessage", ctypes.c_char_p), ("szLog", ctypes.c_char_p)] class _inchi_InputINCHI(ctypes.Structure): _fields_ = [ ("szInChI", ctypes.c_char_p), ("szOptions", ctypes.c_char_p), ] class _InChIBinding: """A class encapsulating our binding to libinchi. This is intended to be a singleton; users ought to obtain it through the getInChI function below. This will raise a ReportableError if the inchi C library cannot be found. """ _libname = "inchi" def __init__(self): self.libPath = find_library(self._libname) if self.libPath is None: raise base.ReportableError("No libinchi found.", hint="On Debian systems, install libinchi1 to get it.") self.lib = ctypes.CDLL(self.libPath) for wrappedName, argtypes in [ ("CheckINCHI", [ctypes.c_char_p, ctypes.c_int]), ("GetStdINCHIKeyFromStdINCHI", [ctypes.c_char_p, ctypes.c_char_p]), ("GetINCHIKeyFromINCHI", [ctypes.c_char_p, ctypes.c_int, ctypes.c_int, ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]), ("GetINCHI", [ctypes.POINTER(_inchi_Input), ctypes.POINTER(_inchi_Output)]), ("GetStdINCHI", [ctypes.POINTER(_inchi_Input), ctypes.POINTER(_inchi_Output)]), ("GetINCHIfromINCHI", [ctypes.POINTER(_inchi_InputINCHI), ctypes.POINTER(_inchi_Output)]), ("GetStructFromINCHI", [ctypes.POINTER(_inchi_InputINCHI), ctypes.POINTER(_inchi_OutputStruct)]), ("FreeStdINCHI", [ctypes.POINTER(_inchi_Output)]), ("FreeStructFromINCHI", [ctypes.POINTER(_inchi_OutputStruct)]), ]: setattr(self, "_"+wrappedName, getattr(self.lib, wrappedName)) getattr(self, "_"+wrappedName).argtypes = argtypes def checkInChI(self, inchi, strict=1): """returns 0 if inchi is ok, something else (cf. API docs) otherwise. """ return self._CheckINCHI(utils.bytify(inchi), ctypes.c_int(strict)) def getInChIKey(self, inChI): """returns an InChIKey for a standard InChI. """ buf = ctypes.create_string_buffer(28) retval = self._GetINCHIKeyFromINCHI( utils.bytify(inChI), 0, 0, buf, None, None) if retval!=0: raise InChIError(f"Could not generate InChIKey: {retval}") return utils.debytify(buf.value) def getInput(self, atoms): """returns an _inchi_Input for a list of inchi_Atoms. This is what you need to call to generate an InChI(Key). Essentially, you create atoms using getAtom, put them into a list, pass them to getInput and then call connect(index1, index2) to make the molecule connections. """ atomsArray = (_inchi_Atom*len(atoms))(*atoms) return _inchi_Input( atom=atomsArray, stereo0D=None, szOptions=b"-DoNotAddH", num_atoms=len(atoms), num_stereo0D=0) def getAtom(self, elname, isotopic_mass=0, charge=0): """returns a new atom. Only pass isotopic_mass if you actually care about isotopism. """ neighbor = (ctypes.c_short*MAXVAL)(*([0]*MAXVAL)) bond_type = (ctypes.c_byte*MAXVAL)(*([0]*MAXVAL)) return _inchi_Atom( x=0, y=0, z=0, neighbor=neighbor, bond_type=bond_type, # (i.e., single, double... binding) bond_stereo=b"", # (let's see when we want it) elname=utils.bytify(elname), num_bonds=0, num_iso_H=(ctypes.c_byte*4)(0, 0, 0, 0), isotopic_mass=isotopic_mass, radical=0, charge=ctypes.c_byte(charge)) def _handleAndFree(self, retval, output, input=None): try: self._interpretICReturnCode(retval, output, input) return utils.debytify(output.szInChI) finally: self._FreeStdINCHI(output) _errcodes = { 0: "Success", 2: "No InChi created", 3: "Severe error (out of memory?)", 4: "Unknown error", 5: "Previous call has not returned yet", -1: "No structural data provided"} def _interpretICReturnCode(self, retval, output, input=None): if retval==1: if input is not None: warnings.warn("Source for the following warning: "+input) warnings.warn("InChI warning: "+utils.debytify(output.szMessage)) elif retval!=0: raise InChIError(utils.debytify(output.szMessage or self._errcodes.get(retval, "(Unclear problem)")), hint=utils.debytify(output.szLog)) def getInChI(self, inchiInput, rawInput=None): """returns the InChI of the molecule described through inchiInput as a string. """ output = _inchi_Output() res = self._GetStdINCHI(inchiInput, output) return self._handleAndFree(res, output, rawInput) def normalizeInChI(self, inChI): """returns a string inChI in normalised form. """ output = _inchi_Output() res = self._GetINCHIfromINCHI( _inchi_InputINCHI(utils.bytify(inChI)), output) return self._handleAndFree(res, output) _dcodeAtom = re.compile(r"(?P<iso>\d+)?(?P<at>[A-Z][a-z]?)(?P<ch>[-+]+)?$") _dcodeConn = re.compile(r"(\d+)([-=#])(\d+)$") _dcodeMult = {'-': 1, '=': 2, '#': 3} def _freeStructAfterUse(self, struct): """returns a context manager returning struct and freeing it once the controlled block is through. """ def _(): try: yield struct finally: self._FreeStructFromINCHI(struct) return contextlib.contextmanager(_)() def parseInChI(self, inchi): """returns a context manager for a parsed inchi from a string inchi. The result is probably not fun to work with, but it has getTotalWeight and getTotalCharge methods that are handy for filling LineTAP tables with incoming inchis. This returns a context manager so the stuff allocated by libinchi gets freed again. So, you'll use this like:: with ilib.parseInChI("InChI=1S...") as molecule: (do something) """ struct = _inchi_OutputStruct() input = _inchi_InputINCHI(utils.bytify(inchi)) retval = self._GetStructFromINCHI(input, struct) self._interpretICReturnCode(retval, struct, inchi) return self._freeStructAfterUse(struct) def getInChIFromCode(self, code): """returns a pair on (input, InChI) from a rather simple heuristic code. The input part may be useful if you want to manipulate it further; it's an _inchi_Input with iterAtoms and getTotalWeight methods. This consists of whitespace-separated atoms or connections in arbitrary order. An atom is [isotope]Atom[charge], where isotope is a number and charge a number of + or -. A connection is index(-=#)index, where index enumerates the atom, 1-based and -,=,# is a single, double, or triple bind. Yeah, I know it sucks there's *another* molecule input format, but, really, the chemistry people drove me crazy. Examples for this are found in tests/ """ atoms, connections = [], [] for tok in code.split(): mat = self._dcodeConn.match(tok) if mat: connections.append((self._dcodeMult[], int(, int( continue mat = self._dcodeAtom.match(tok) if mat: pars = mat.groupdict("") charge = 0 if pars["ch"]: charge = len(pars["ch"])*{"+": 1, "-": -1}[pars["ch"][0]] atoms.append(self.getAtom("at"), int("iso") or "0"), charge)) continue raise ValueError(f"Invalid molecule code token: '{tok}'") mol = self.getInput(atoms) for bond_type, index1, index2 in connections: mol.connect(index1-1, index2-1, bond_type) return mol, self.getInChI(mol, code)
[docs]@functools.lru_cache(None) def getILib(): """returns a shallow binding to the InChI library. This will always return the same object if called multiple times. It will raise a ReportableError if the library is not available. """ return _InChIBinding()