Source code for gavo.grammars.xmlgrammar

"""
A grammar for generic XML documents.
"""

#c Copyright 2008-2023, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


from lxml import etree

from gavo import base
from gavo.grammars.common import Grammar, RowIterator

def iterEventsCounting(inputFile, normalizeWhitespace):
	"""wraps etree.iterparse so [ct] elements are appended to element names
	when they are repeated.

	This is a generator yielding (path, content) pairs.  For each element,
	one pair is produced when the element is closed; path consists of
	the slash-joined element names from the root down, content is the
	concatenation of the text nodes that are immediate children of the
	element (or None if there is no such text).  After that, one pair
	path/@name, value is produced for each attribute of the element.

	As to the counting: The first occurrence of an element name within a
	parent is left alone, the second occurrence gets [0] appended, the third
	[1], and so on.

	inputFile is whatever etree.iterparse accepts as a source.  If
	normalizeWhitespace is true, blank text is removed by the parser, runs
	of whitespace in content are collapsed to single blanks, and content
	consisting of whitespace only becomes None.

	This currently takes some pains to strip namespaces from element names,
	which probably just uglify the keys in almost all applications I can see
	for this.  Attribute names are passed on as delivered by lxml.
	"""
	# names of the elements from the root to the one currently open
	curPath = []
	# one dict per open element (plus one for the document level), mapping
	# names of children seen so far to the index the next repetition gets
	seenTags = [{}]

	for action, elem in etree.iterparse(
			inputFile,
			events=("start", "end"),
			remove_blank_text=normalizeWhitespace):
		curTag = elem.tag
		if curTag.startswith('{'):
			# lxml gives namespaced names as {uri}local; only keep local
			curTag = curTag[curTag.index('}')+1:]

		if action=="start":
			siblings = seenTags[-1]
			if curTag in siblings:
				curPath.append("%s[%d]"%(curTag, siblings[curTag]))
				siblings[curTag] += 1
			else:
				curPath.append(curTag)
				siblings[curTag] = 0
			seenTags.append({})

		elif action=="end":
			# The text nodes directly below elem are its text plus the tails
			# of all its child nodes.  We pick up the tails here rather than
			# when the children are closed because (a) comments and processing
			# instructions do not produce start/end events, so text following
			# them would otherwise be lost, and (b) when the parent is closed,
			# the tails of all its children are certain to be complete.
			parts = [elem.text or ""]
			parts.extend(child.tail or "" for child in elem)
			content = "".join(parts) or None
			if content and normalizeWhitespace:
				content = " ".join(content.split()) or None

			basePath = "/".join(curPath)
			yield basePath, content

			for key, value in elem.items():
				yield basePath+"/@"+key, value

			curPath.pop()
			seenTags.pop()


class XMLRowIterator(RowIterator):
	"""an iterator for XMLGrammars.

	It produces a single row per source, a dictionary built from the
	(path, content) pairs iterEventsCounting yields for the document.
	"""
	def _iterRows(self):
		"""yields one dictionary mapping paths to content for the document
		in self.sourceToken.

		If sourceToken has a read attribute, it is parsed as is and left
		open.  Otherwise, it is passed to open() in binary mode, and the
		resulting file is closed when the generator finishes.

		Syntax errors reported by lxml are re-raised as base.SourceParseError
		with the message, position, and file name lxml gives.
		"""
		if hasattr(self.sourceToken, "read"):
			f, keepopen = self.sourceToken, True
		else:
			f, keepopen = open(self.sourceToken, "rb"), False

		try:
			# dict() drains the event generator, so the whole document
			# is parsed before the row is handed out.
			yield dict(iterEventsCounting(
				f,
				self.grammar.normalizeWhitespace))
		except etree.LxmlSyntaxError as ex:
			# chain explicitly so the lxml error stays attached as the cause
			raise base.SourceParseError(
				ex.msg,
				location=ex.position,
				source=ex.filename) from ex
		finally:
			# only close what we have opened ourselves
			if not keepopen:
				f.close()


class XMLGrammar(Grammar):
	"""A grammar parsing from generic XML files.

	Use this grammar to parse from generic XML files.  For now, one rawdict
	per document is returned (later extensions might let you define elements
	that will yield rows).

	The keys are xpaths (e.g., root/element or root/element/@attr), the values
	the (joined) text nodes that are immediate children of the element.

	When elements are repeated within an element, [ct] is appended to the path
	element (e.g., root/element[0]).  The first occurrence of an element
	is left unmarked, the second occurrence gets [0], the third [1],
	and so on.

	For now, this grammar ignores namespaces.

	Because most of the keys are not valid python identifiers, you cannot
	use the @key syntax when mapping this.  Use vars[key] instead (or
	<map key="dest" source="path"/>).

	Do not use this for VOTables; use VOTableGrammar instead.
	"""
	name_ = "xmlGrammar"

	_nsw = base.BooleanAttribute("normalizeWhitespace",
		description="By default, the parser will return whitespace-only"
			" content as None and will turn internal whitespace to a single"
			" blank.  Set this to False to preserve whitespace as present"
			" in the document.",
		default=True,
		copyable=True)
	rowIterator = XMLRowIterator
Source code for gavo.grammars.xmlgrammar

gavo

Navigation

Related Topics