# Source code for gavo.utils.stanxml

"""
A stan-like model for building namespaced XML trees.

The main reason for this module is that much of the VO's XML mess is based
on XML schema and thus has namespaced attributes.  This single design
decision ruins the entire XML design.  To retain some remnant of
sanity, I treat the prefixes themselves as namespaces and maintain
a single central registry from prefixes to namespaces in this module.

Then, the elements only use these prefixes, and this module makes sure
that during serialization the instance document's root element contains
the namespace mapping (and the schema locations) required.
"""

#c Copyright 2008-2025, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


import io
import types
# This is utils.ElementTree, which is what DaCHS-internal code
# should use to avoid multiple etrees having to interact:
from xml.etree import ElementTree #noflake: re-export

from gavo.utils import autonode
from gavo.utils import excs
from gavo.utils import misctricks
from gavo.utils import texttricks

from gavo.utils.dachstypes import (Any, BinaryIO, Callable, Dict,
	FrozenSet, IO, Iterator, List, Optional, Set, Tuple, Type, TypeVar, Union)

# note on type annotation: mypy 1.0.1 doesn't cope with attributes
# introduced by metaclasses.  Most of the type ignore pragmas in here
# work around that deficiency and can perhaps be dropped with a
# metaclass-aware mypy.


class Error(Exception):
	"""The base class of the exceptions defined in this module.
	"""

class ChildNotAllowed(Error):
	"""is raised by Element.bailIfBadChild when a child is not among the
	children admitted by the parent's _childSequence.
	"""


# An XML declaration as bytes.  Note that xmlwrite below writes its own
# (single-quoted) declaration literal rather than using this constant.
XML_HEADER = b'<?xml version="1.0" encoding="utf-8"?>'


class _Autoconstructor(autonode.AutoNodeType):
	"""The metaclass of stanxml Elements.

	It combines two things: autonode's constructor machinery driven by
	_a_<attrname> class attributes, and stan-like notation, in which
	indexing an element *class* instantiates it and hands the index
	on to the fresh instance.

	For classes defining a non-None _childSequence, the set of its members
	is kept in _allowedChildren; all other classes get a _childSequence
	of None.
	"""
	def __init__(cls, name: str, bases: Tuple[type, ...], dict: Dict):
		autonode.AutoNodeType.__init__(cls, name, bases, dict)
		childSequence = getattr(cls, "_childSequence", None)
		if childSequence is None:
			cls._childSequence: Optional[List[str]] = None
		else:
			cls._allowedChildren = set(childSequence)

	def __getitem__(cls, items: List[Any]) -> autonode.AutoNode:
		instance = cls()
		return instance[items]


class Stub:
	"""A sentinel class for embedding objects not yet existing into
	stanxml trees.

	These have a single opaque object and need to be dealt with by the
	user.  One example of how these can be used is the ColRefs in stc to
	utype conversion.

	Stubs are equal to each other if their handles (the dest attributes)
	compare equal.

	The methods below give Stubs just enough of Element's interface
	that they can sit among an Element's children.
	"""
	name_ = "stub"
	text_ = None

	def __init__(self, dest: str) -> None:
		self.dest = dest

	def __repr__(self) -> str:
		return "%s(%s)"%(self.__class__.__name__, repr(self.dest))

	def __eq__(self, other: Any) -> bool:
		# The Stub class itself serves as the marker for "other has no dest"
		return self.dest==getattr(other, "dest", Stub)
	
	def __ne__(self, other: Any) -> bool:
		return not self==other

	def __hash__(self) -> int:
		# consistent with __eq__: equal dests give equal hashes
		return hash(self.dest)

	def isEmpty(self) -> bool:
		"""returns False: stubs are never considered empty.
		"""
		return False

	def shouldBeSkipped(self) -> bool:
		"""returns False: stubs are never skipped.
		"""
		return False

	def getChildDict(self) -> Dict:
		"""returns an empty dictionary, as stubs have no children.
		"""
		return {}

	def iterAttNames(self) -> Iterator:
		"""iterates over nothing, as stubs have no attributes.
		"""
		yield from ()

	def apply(self, func: Callable) -> None:
		"""does nothing.

		Stubs don't have what Element.apply needs, so we don't even pretend.
		"""
		return


class Element(metaclass=_Autoconstructor):
	"""An element for serialization into XML.

	This is loosely modelled after t.w.template stan.

	Don't add to the children attribute directly, use addChild or (more
	usually) __getitem__.

	Elements have attributes and children.  The attributes are defined,
	complete with defaults, in _a_<name> attributes as in AutoNodes.
	Attributes are checked.

	Children are not usually checked, but you can set a _childSequence
	attribute containing a list of (unqualified) element names.  These
	children will be emitted in the sequence given.
	
	When deriving from Elements, you may need attribute names that are not
	python identifiers (e.g., with dashes in them).  In that case, define
	an attribute _name_a_<att> and point it to any string you want as the
	attribute.

	When serializing these, empty elements (i.e. those having an empty text and
	having no non-empty children) are usually discarded.  If you need such an
	element (e.g., for attributes), set _mayBeEmpty to True.

	Since insane XSD mandates that local elements must not be qualified when
	elementFormDefault is unqualified, you need to set _local=True on
	such local elements to suppress the namespace prefix.  Attribute names
	are never qualified here.  If you need qualified attributes, you'll
	have to use attribute name translation.

	The content of the DOM may be anything recognized by addChild.
	In particular, you can give objects a serializeToXMLStan method returning
	strings or an Element to make them good DOM citizens.

	Elements cannot harbor mixed content (or rather, there is only
	one piece of text).
	"""
	name_ = None
	_a_id = None
	_prefix = ""
	_additionalPrefixes: FrozenSet[str] = frozenset()
	_mayBeEmpty = False
	_local = False
	_stringifyContent = False

	_localAttrs: Union[Tuple, List[Tuple[str, str]]] = ()

	# should probably do this in the elements needing it (quite a lot of them
	# do, however...)
	_name_a_xsi_type = "xsi:type"

	# see _setupNode below for __init__

	def __getitem__(self, children: Any) -> "Element":
		"""adds children (anything addChild accepts) and returns self.
		"""
		self.addChild(children)
		return self

	def __call__(self, **kw: Any) -> "Element":
		"""sets the attributes given as keyword arguments and returns self.

		Only attributes already present on the node can be set; for
		anything else, the getattr below raises an AttributeError.
		"""
		for k, v in kw.items():
			# Only allow setting attributes already present
			getattr(self, k)
			setattr(self, k, v)
		return self

	def __iter__(self):
		# Without this, python would fall back to iterating via __getitem__,
		# which here *adds* children.
		raise NotImplementedError("Element instances are not iterable.")

	def __bool__(self) -> bool:
		# NOTE(review): this makes *empty* elements truthy and non-empty
		# ones falsy, the opposite of python's container convention.
		# Left alone since callers may depend on it -- confirm the intent.
		return self.isEmpty()

	def _setupNodeNext(self, cls) -> None:
		"""calls the _setupNode of the class following cls in the MRO, if
		there is one.
		"""
		try:
			pc = super(cls, self)._setupNode  #type: ignore
		except AttributeError:
			pass
		else:
			pc()

	def _setupNode(self) -> None:
		"""initialises the per-instance state (children, text, emptiness
		cache) and defaults name_ to the class name.
		"""
		self._isEmptyCache: Optional[bool] = None
		self._children: List[Any] = []
		self.text_ = ""
		if self.name_ is None:
			self.name_ = self.__class__.__name__.split(".")[-1]
		self._setupNodeNext(Element)

	def _makeAttrDict(self) -> Dict[str, str]:
		"""returns a dictionary mapping XML attribute names to values.

		Node attributes that are None are left out, all others are
		stringified.  Local attributes (see addAttribute) are added last
		and hence override node attributes of the same name.
		"""
		res = {}
		for name, attName in self.iterAttNames():
			value = getattr(self, name, None)
			if value is not None:
				res[attName] = str(value)

		for name, value in self._localAttrs:
			res[name] = value

		return res

	def _iterChildrenInSequence(self) -> Iterator["Element"]:
		"""iterates over the children in the order given by _childSequence.

		Children with names not in _childSequence are not returned.
		"""
		cDict = self.getChildDict()
		for cName in self._childSequence:  #type: ignore
			yield from cDict.get(cName, ())

	def bailIfBadChild(self, child: Any) -> None:
		"""raises a ChildNotAllowed exception if this node restricts its
		children and child is not among the ones admitted.

		A child is admitted if either its name_ or its type is
		in _childSequence.  Nodes without a _childSequence accept anything.
		"""
		if (self._childSequence is not None  # type: ignore
				and getattr(child, "name_", None) not in self._allowedChildren # type: ignore
				and type(child) not in self._allowedChildren):  # type: ignore
			raise ChildNotAllowed("No %s children in %s"%(
				getattr(child, "name_", "text"), self.name_))

	def deepcopy(self):
		"""returns a deep copy of self.

		The copy is constructed from the current attribute dictionary;
		Element children are copied recursively, other children (e.g., Stubs)
		are shared between original and copy.
		"""
		# NOTE(review): text_ is not among the children iterated here and
		# does not appear to be carried over -- confirm that this is intended.
		copy = self.__class__(**self._makeAttrDict())
		for child in self.iterChildren():
			if isinstance(child, Element):
				copy.addChild(child.deepcopy())
			else:
				copy.addChild(child)
		return copy
	
	# t.w.template compatibility
	clone = deepcopy

	def addChild(self, child: Any) -> None:
		"""adds child to the list of children.

		Child may be an Element, a string, or a list or tuple of Elements and
		strings.  Finally, child may be None, in which case nothing will be
		added.

		Beyond that, objects with a serializeToXMLStan method, generators,
		Element classes (which are instantiated), exceptions, and bytes
		(assumed to be utf-8) are accepted, and so is anything else when
		_stringifyContent is true.  A string child *replaces* the current
		text rather than being appended to it.

		Anything not recognised raises an Error.
		"""
		self._isEmptyCache = None
		if child is None:
			pass

		elif hasattr(child, "serializeToXMLStan"):
			self.addChild(child.serializeToXMLStan())

		elif isinstance(child, str):
			self.bailIfBadChild(child)
			self.text_ = child

		elif isinstance(child, (Element, Stub)):
			self.bailIfBadChild(child)
			self._children.append(child)

		elif isinstance(child, (list, tuple, types.GeneratorType)):
			for c in child:
				self.addChild(c)

		elif isinstance(child, _Autoconstructor):
			self.addChild(child())

		elif self._stringifyContent:
			self.addChild(str(child))

		elif isinstance(child, Exception):
			self.addChild("EXCEPTION:\n%s: %s"%(
				child.__class__.__name__,
				str(child)))

		elif isinstance(child, bytes):
			# we assume our standard encoding of utf-8 and an actual string;
			# not much else would make sense within a stan tree
			self.addChild(child.decode("utf-8"))

		else:
			raise Error("%s element %s cannot be added to %s node"%(
				type(child), repr(child), self.name_))

	def isEmpty(self) -> bool:
		"""returns true if the current node has no non-empty children and no
		non-whitespace text content.

		The result is cached; the cache is only reset by addChild on this
		very node, so content added to a child after this was first
		called is not noticed here.
		"""
		if self._isEmptyCache is None:
			hasText = bool(self.text_.strip())
			self._isEmptyCache = (not hasText
				and all(c.shouldBeSkipped() for c in self._children))

		return self._isEmptyCache

	def shouldBeSkipped(self) -> bool:
		"""returns true if the current node should *not* be part of an output.

		That is the case if it is empty and _mayBeEmpty is false.
		An empty element is one that has only empty children and no
		non-whitespace text content.
		"""
		if self._mayBeEmpty:
			return False
		return self.isEmpty()

	def iterAttNames(self) -> Iterator[Tuple[str, str]]:
		"""iterates over the defined attribute names of this node.
		
		Each element returned is a pair of the node attribute name and the
		xml name (which may be translated via _name_a_<att>).
		"""
		for name, default in self._nodeAttrs:  # type: ignore
			publicName = getattr(self, "_name_a_"+name, name)
			yield name, publicName

	def addAttribute(self, attName: str, attValue: str) -> None:
		"""adds attName, attValue as a local attribute/value pair.

		This is for when you have, if you will, "non-namespace" attributes
		(most of the time, xml namespace declarations).

		If you set the same name multiple times, the last value set will win.
		"""
		if self._localAttrs==():
			# don't touch the (immutable) class-level default, make a
			# per-instance list instead.
			self._localAttrs = [(attName, attValue)]
		else:
			self._localAttrs.append((attName, attValue))  # type: ignore

	def iterChildrenOfType(self, type: Type["SomeElement"]
			) -> Iterator["SomeElement"]:
		"""iterates over all children having type.
		"""
		for c in self._children:
			if isinstance(c, type):
				yield c

	def iterChildren(self) -> Iterator[Any]:
		"""iterates over the children in the order they were added.
		"""
		return iter(self._children)

	def getChildDict(self) -> Dict[str, Any]:
		"""returns a dictionary mapping element names to lists of the
		children with that name.
		"""
		cDict: Dict[str, List] = {}
		for c in self._children:
			cDict.setdefault(c.name_, []).append(c)
		return cDict
	
	def iterChildrenWithName(self, elName: str
			) -> Iterator[Any]:
		"""iterates over children whose element name is elName.

		This always does a linear search through the children and hence
		may be slow.
		"""
		for c in self._children:
			if c.name_==elName:
				yield c
		
	def _getChildIter(self) -> Iterator[Any]:
		"""returns an iterator over the children in serialisation order.

		That is _childSequence order if a _childSequence is defined and
		insertion order otherwise.
		"""
		if self._childSequence is None:  # type: ignore
			return iter(self._children)
		else:
			return self._iterChildrenInSequence()

	def apply(self, func: Callable) -> Optional[Any]:
		"""calls func(node, text, attrs, childIter).

		This is a building block for tree traversals.

		For nodes that should be skipped, func is not called and None is
		returned.  Otherwise, whatever func returns is returned.  Exceptions
		other than stanxml Errors are reported through a UI event giving
		the node context before being re-raised.
		"""
		try:
			if self.shouldBeSkipped():
				return None
			attrs = self._makeAttrDict()
			return func(
				self, self.text_, attrs, self._getChildIter())
		except Error:
			raise
		except Exception:
			misctricks.sendUIEvent("Info",
				"Internal failure while building XML; context is"
				" %s node with children %s"%(
					self.name_,
					texttricks.makeEllipsis(repr(self._children), 60)))
			raise

	def render(self,
			prefixForEmpty: Optional[str] = None,
			includeSchemaLocation: bool = True,
			xmlDecl: bool = False,
			prolog: str = "") -> bytes:
		"""returns the serialisation of this element and its children
		as bytes.

		The arguments are passed on to xmlwrite.
		"""
		f = io.BytesIO()
		xmlwrite(self,
			f,
			prefixForEmpty=prefixForEmpty,
			xmlDecl=xmlDecl,
			includeSchemaLocation=includeSchemaLocation,
			prolog=prolog)
		return f.getvalue()


# Type variable for Element and its subclasses; it ties the class passed to
# Element.iterChildrenOfType to the type of the children it yields.
SomeElement = TypeVar("SomeElement", bound=Element)


class NSRegistry:
	"""A container for a registry of namespace prefixes to namespaces.

	This is used to have fixed namespace prefixes (IMHO the only way
	to have namespaced attribute values and retain sanity).  The
	class is never instantiated.  It is used through the module-level
	method registerPrefix.

	All state lives in class-level dictionaries and hence is shared
	process-wide.
	"""
	# prefix -> namespace URI
	_registry: Dict[str, str] = {}
	# namespace URI -> prefix
	_reverseRegistry: Dict[str, str] = {}
	# prefix -> schema location (or None)
	_schemaLocations: Dict[str, Optional[str]] = {}

	@classmethod
	def registerPrefix(cls,
			prefix: str,
			ns: str,
			schemaLocation: Optional[str]) -> None:
		"""binds prefix to the namespace URI ns.

		schemaLocation, if not None, is what is written into
		xsi:schemaLocation for ns.

		Re-registering a prefix with the same namespace is allowed (and
		updates the schema location); trying to bind it to a different
		namespace raises a ValueError.
		"""
		if prefix in cls._registry and ns!=cls._registry[prefix]:
			# report the namespace the prefix is already bound to, not the
			# one that was just rejected.
			raise ValueError("Prefix %s is already allocated for namespace %s"%
				(prefix, cls._registry[prefix]))
		cls._registry[prefix] = ns
		cls._reverseRegistry[ns] = prefix
		cls._schemaLocations[prefix] = schemaLocation

	@classmethod
	def getPrefixForNS(cls, ns:str) -> str:
		"""returns the prefix registered for the namespace URI ns.

		A NotFoundError is raised for unknown namespaces.
		"""
		try:
			return cls._reverseRegistry[ns]
		except KeyError:
			raise excs.NotFoundError(ns, "XML namespace",
				"registry of XML namespaces.", hint="The registry is filled"
				" by modules as they are imported -- maybe you need to import"
				" the right module?")

	@classmethod
	def getNSForPrefix(cls, prefix:str) -> str:
		"""returns the namespace URI registered for prefix.

		A NotFoundError is raised for unknown prefixes.
		"""
		try:
			return cls._registry[prefix]
		except KeyError:
			raise excs.NotFoundError(prefix, "XML namespace prefix",
				"registry of prefixes.", hint="The registry is filled"
				" by modules as they are imported -- maybe you need to import"
				" the right module?")
	
	@classmethod
	def _iterNSAttrs(cls,
			prefixes: Set[str],
			prefixForEmpty: Optional[str],
			includeSchemaLocation: bool
			) -> Iterator[Tuple[str, str]]:
		"""iterates over pairs of (attrName, attrVal) for declaring
		prefixes.

		The xmlns:<prefix> declarations come sorted by prefix, followed
		by the default namespace declaration (if prefixForEmpty is given)
		and, if there are any schema locations to declare, xsi:schemaLocation
		(preceded by a declaration of xsi if prefixes doesn't have it).

		prefixes is not changed.  Unregistered prefixes raise a KeyError.
		"""
		# null prefixes are ignored here; prefixForEmpty, if non-null, gives
		# the prefix the namespace would normally be bound to.
		declared = sorted(pref for pref in prefixes if pref!="")

		schemaLocations = []
		for pref in declared:
			yield "xmlns:%s"%pref, cls._registry[pref]
			if includeSchemaLocation and cls._schemaLocations[pref]:
				schemaLocations.append("%s %s"%(
					cls._registry[pref],
					cls._schemaLocations[pref]))

		if prefixForEmpty:
			yield "xmlns", cls._registry[prefixForEmpty]

		if schemaLocations:
			if "xsi" not in prefixes:
				yield "xmlns:xsi", cls._registry["xsi"]
			yield "xsi:schemaLocation", " ".join(schemaLocations)

	@classmethod
	def addNamespaceDeclarations(cls,
			root: "Element",
			prefixes: Set[str],
			prefixForEmpty: Optional[str] = None,
			includeSchemaLocation: bool = True) -> None:
		"""adds xmlns declarations for prefixes to the stanxml node root.

		With stanxml and the global-prefix scheme, xmlns declarations
		only come at the root element; thus, root should indeed be root
		rather than some random element.
		"""
		for attName, attVal in cls._iterNSAttrs(
				prefixes, prefixForEmpty, includeSchemaLocation):
			root.addAttribute(attName, attVal)

	@classmethod
	def getPrefixInfo(cls, prefix: str) -> Tuple[str, Optional[str]]:
		"""returns a pair of namespace URI and schema location for prefix.

		Unknown prefixes raise a KeyError.
		"""
		return (cls._registry[prefix], cls._schemaLocations[prefix])

	@classmethod
	def getSchemaForNS(cls, ns: str) -> Optional[str]:
		"""returns the schema location registered for the namespace URI ns.

		A NotFoundError is raised for unknown namespaces.
		"""
		try:
			return cls._schemaLocations[cls._reverseRegistry[ns]]
		except KeyError:
			raise excs.NotFoundError(ns, "XML namespace",
				"registry of XML namespaces.", hint="The registry is filled"
				" by modules as they are imported -- maybe you need to import"
				" the right module?")


# Module-level shortcuts to the central registry's class methods.
registerPrefix = NSRegistry.registerPrefix
getPrefixInfo = NSRegistry.getPrefixInfo

def schemaURL(xsdName: str) -> str:
	"""returns the URL to the local mirror of the schema xsdName.

	This is used by the various xmlstan clients to make schemaLocations.
	xsdName is appended to the mirror's base URL unchanged.
	"""
	return "http://vo.ari.uni-heidelberg.de/docs/schemata/"+xsdName


# xsi is registered (without a schema location) right here because
# NSRegistry._iterNSAttrs looks it up whenever it emits xsi:schemaLocation.
registerPrefix("xsi","http://www.w3.org/2001/XMLSchema-instance",  None)
# convenience for _additionalPrefixes of elements needing the xsi prefix
# (and no others) in their attributes.
xsiPrefix = frozenset(["xsi"])


class NillableMixin:
	"""An Element mixin making the element XSD nillable.

	This element will automatically have an xsi:nil="true" attribute
	on empty elements (rather than leave them out entirely).

	This overrides apply, so the mixin must be before the base class in
	the inheritance list.
	"""
# typing TODO: Mixins are nasty; we would need to define a Protocol,
# which seems over the top here.  Let's see if they come up with something
# smarter and meanwhile sprinkle ignore-s.
	_mayBeEmpty = True

	def apply(self, func: Callable) -> Any:
		"""calls func(node, text, attrs, childIter) as Element.apply does,
		except that for nodes without text, an xsi:nil="true" attribute
		is added and func sees neither text nor children.
		"""
		if self.text_:  # type: ignore
			return Element.apply(self, func) # type: ignore
		else:
			attrs = self._makeAttrDict()  # type: ignore
			attrs["xsi:nil"] = "true"
			# |= on the frozenset rebinds the name on the instance, so the
			# class-level default is not touched.
			self._additionalPrefixes |= {"xsi"}  # type: ignore
			return func(self, "", attrs, ())

	def isEmpty(self) -> bool:
		"""returns False always, so nillable elements are never dropped.
		"""
		return False


def escapePCDATA(val: str) -> str:
	"""returns val with &, <, and > replaced by their XML entities.

	NUL characters are replaced with the literal string "&x00;".
	"""
	# The ampersand must be replaced first, or the ampersands of the
	# other replacements would be escaped again.
	# NOTE(review): "&x00;" is not a defined entity (and XML 1.0 cannot
	# represent NUL at all); kept as is since existing output may be
	# relied upon -- confirm what is wanted here.
	return (val
		).replace("&", "&amp;"
		).replace('<', '&lt;'
		).replace('>', '&gt;'
		).replace("\0", "&x00;")


def escapeAttrVal(val: str) -> str:
	"""returns val as a double-quoted XML attribute value literal.

	On top of what escapePCDATA does, double quotes are replaced
	by &quot;.  The result includes the enclosing quotes.
	"""
	return '"%s"'%(escapePCDATA(val).replace('"', '&quot;'))


class WriteBuffer:
	"""a very simple write buffer, collecting byte strings and handing them
	through to an actual file in suitably large chunks.

	This ought to help make our XML writing to twisted requests a bit
	more wire-efficient.

	The raw argument of course can't be Any, it needs to be a
	binary-writable thing.  I'm not giving a proper type because
	what's here most of the time is a twisted web Request.  The
	incomplete implementation of an io writer is also why this
	class exists in the first place.

	bufSize is the number of bytes that have to accumulate before
	they are passed on to raw.
	"""
	def __init__(self, raw: Any, bufSize: int=8192):
		self.raw, self.bufSize = raw, bufSize
		self.curSize = 0
		self.buffer: List[bytes] = []
	
	def write(self, data):
		"""queues data (bytes) for writing, flushing when at least
		bufSize bytes have been collected.
		"""
		self.curSize += len(data)
		self.buffer.append(data)
		if self.curSize>=self.bufSize:
			self.flush()
	
	def flush(self):
		"""writes everything collected so far to raw in a single write and
		empties the buffer.
		"""
		self.raw.write(b"".join(self.buffer))
		self.curSize, self.buffer = 0, []


def _makeVisitor(
		outputFile: Union[IO, WriteBuffer],
		prefixForEmpty: Optional[str]) -> Callable:
	"""returns a function serialising nodes to outputFile.

	The function returned has the signature Element.apply wants from its
	callbacks.  Elements with the prefix prefixForEmpty are written
	without a prefix.
	"""

	def emit(markup: str) -> None:
		outputFile.write(texttricks.bytify(markup))

	def visit(
			node: Any,
			text: str,
			attrs: Dict[str, str],
			childIter: Iterator) -> None:
		# attributes come sorted so the output is reproducible
		serializedAttrs = sorted(
			"%s=%s"%(attName, escapeAttrVal(attValue))
			for attName, attValue in attrs.items())
		tagMaterial = "".join(" "+item for item in serializedAttrs)

		fixedMaterial = getattr(node, "_fixedTagMaterial", None)
		if fixedMaterial:
			tagMaterial = tagMaterial+" "+fixedMaterial

		unqualified = (not node._prefix
			or node._local
			or node._prefix==prefixForEmpty)
		if unqualified:
			tagName = node.name_
		else:
			tagName = "%s:%s"%(node._prefix, node.name_)

		if node.isEmpty():
			# empty nodes are either self-closing or not written at all
			if node._mayBeEmpty:
				emit("<%s%s/>"%(tagName, tagMaterial))
			return

		emit("<%s%s>"%(tagName, tagMaterial))
		try:
			if text:
				emit(escapePCDATA(text))

			for child in childIter:
				# children having a write method serialise themselves
				if hasattr(child, "write"):
					child.write(outputFile)
				else:
					child.apply(visit)
		except Exception as ex:
			if hasattr(node, "writeErrorElement"):
				node.writeErrorElement(outputFile, ex)
			raise
		finally:
			# the closing tag is written even on failure
			emit("</%s>"%tagName)

	return visit


def xmlwrite(
	root: Element,
	outputFile: BinaryIO,
	prefixForEmpty: Optional[str] = None,
	nsRegistry: Type[NSRegistry] = NSRegistry,
	xmlDecl: bool = True,
	includeSchemaLocation: bool = True,
	prolog: Optional[str] = None) -> None:
	"""writes an xmlstan tree starting at root to outputFile.

	prefixForEmpty is the registered prefix of the namespace that should
	become the default namespace; elements with that prefix are written
	without a prefix.

	outputFile must be opened in binary mode.

	With xmlDecl, an XML declaration is written first; prolog, if given,
	is written between the declaration and the root element.  Set
	includeSchemaLocation to False to suppress xsi:schemaLocation.

	Unless root has a _fixedTagMaterial, the namespace declarations
	are added to root as local attributes, i.e., root is modified.
	"""
	# since namespaces only enter here through prefixes, I just need to
	# figure out which ones are used.
	prefixesUsed: Set[str] = set()

	def collectPrefixes(
			node: Element,
			text: str,
			attrs: Dict[str, str],
			childIter:  Iterator[Any],
			prefixesUsed: Set[str] = prefixesUsed) -> None:
		prefixesUsed |= node._additionalPrefixes
		prefixesUsed.add(node._prefix)
		for child in childIter:
			child.apply(collectPrefixes)

	# put a buffer between us and the output; if we don't twisted
	# will produce one little chunk per tag in http chunked encoding.
	buffered = WriteBuffer(outputFile)

	root.apply(collectPrefixes)
	# An incredibly nasty hack for VOTable generation; we need a better
	# way to handle with the 1.1/1.2 namespaces: Root may declare it
	# handles all NS declarations itself.  Die, die, die.
	if getattr(root, "_fixedTagMaterial", None) is None:
		nsRegistry.addNamespaceDeclarations(root, prefixesUsed, prefixForEmpty,
			includeSchemaLocation)

	if xmlDecl:
		buffered.write(b"<?xml version='1.0' encoding='utf-8'?>\n")

	if prolog:
		buffered.write(texttricks.bytify(prolog)+b"\n")

	try:
		root.apply(_makeVisitor(buffered, prefixForEmpty))
	finally:
		# make sure whatever was serialised reaches outputFile even
		# when serialisation fails halfway through.
		buffered.flush()


def xmlrender(
	tree: "Element",
	prolog: str = "",
	prefixForEmpty: Optional[Any] = None) -> bytes:
	"""returns tree in serialized form.

	tree should be a stanxml.Element, in which case bytes are returned.
	Anything else will be phased out; for now, str and (utf-8 encoded)
	bytes trees are still accepted, and for these, a str is returned.
	Other kinds of tree raise a ValueError.

	If prolog is given, it must be a string that will be prepended to the
	serialization of tree.  You can use this for xml declarations or
	stylesheet processing instructions.
	"""
	# Historically, we have accepted all kind of mess in here.  In the
	# end, we only want to retain option 1 of the following selection.
	# TODO: nuke the rest
	res: Any
	if hasattr(tree, "render"):
		res = tree.render(prefixForEmpty=prefixForEmpty)
	elif isinstance(tree, bytes):
		res = tree.decode("utf-8")
	elif isinstance(tree, str):
		res = tree
	else:
		raise ValueError("Cannot render %s"%repr(tree))

	if prolog:
		if isinstance(res, str):
			# legacy str results need a str prolog; adding bytes to them
			# would raise a TypeError.
			if isinstance(prolog, bytes):
				prolog = prolog.decode("utf-8")
			res = prolog+res
		else:
			res = texttricks.bytify(prolog)+res
	return res
# Source code for gavo.utils.stanxml
#
# gavo
#
# Navigation
#
# Related Topics