# Source code for gavo.grammars.xmlgrammar

"""
A grammar for generic XML documents.
"""

#c Copyright 2008-2022, the GAVO project <>
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.

from lxml import etree

from gavo import base
from gavo.grammars.common import Grammar, RowIterator

def iterEventsCounting(inputFile, normalizeWhitespace):
    """yields (path, content) pairs for the elements and attributes of an
    XML document.

    This wraps etree.iterparse (start and end events only) and builds
    slash-separated paths from the element names.  The first time a
    name occurs within a parent it is used as-is; further occurrences
    have a counter appended (name[0], name[1], ...).

    For each element, a pair (path, content) is produced when the element
    ends, immediately followed by one (path/@attname, value) pair per
    attribute.  Content is the element's own text joined with the tails of
    its children; empty content is returned as None.

    Namespace prefixes in Clark notation ({uri}name) are removed from
    element names, as they would probably just uglify the keys in almost
    all applications.  Attribute names are used as delivered by the parser;
    no namespace stripping is done on them.

    Arguments:

    * inputFile -- passed on to etree.iterparse unchanged.
    * normalizeWhitespace -- when true, the parser is asked to remove
      blank text, runs of whitespace in content are replaced by a single
      blank, and content consisting only of whitespace becomes None.
    """
    # names of the currently open elements, outermost first
    pathSegments = []
    # one dict per nesting level, mapping child names already seen to the
    # counter the next repetition will get; the bottom entry serves
    # the document level.
    siblingCounts = [{}]
    # one list of text fragments per open element; the bottom entry only
    # ever receives the root element's tail and is never yielded.
    textFrames = [[]]

    events = etree.iterparse(
        inputFile,
        events=("start", "end"),
        remove_blank_text=normalizeWhitespace)

    for action, elem in events:
        tagName = elem.tag
        if tagName.startswith('{'):
            # drop the {namespace-uri} part
            nsEnd = tagName.index('}')
            tagName = tagName[nsEnd+1:]

        if action=="start":
            countsHere = siblingCounts[-1]
            if tagName in countsHere:
                pathSegments.append(
                    "%s[%d]"%(tagName, countsHere[tagName]))
                countsHere[tagName] += 1
            else:
                pathSegments.append(tagName)
                countsHere[tagName] = 0
            # open fresh bookkeeping for the children of this element
            siblingCounts.append({})
            textFrames.append([])
            continue

        if action!="end":
            continue

        # The frame so far only holds the tails of the children; the
        # element's own text precedes all of them in the document.
        fragments = textFrames.pop()
        if elem.text is not None:
            fragments.insert(0, elem.text)

        content = "".join(fragments) or None
        if content and normalizeWhitespace:
            content = " ".join(content.split()) or None

        elementPath = "/".join(pathSegments)
        yield elementPath, content
        for attName, attValue in elem.items():
            yield "/@".join((elementPath, attName)), attValue

        # text following this element belongs to its parent's content
        if elem.tail is not None:
            textFrames[-1].append(elem.tail)

        pathSegments.pop()
        siblingCounts.pop()
class XMLRowIterator(RowIterator):
    """an iterator for XMLGrammars.

    It produces exactly one row per source: a dictionary made from
    all the (path, content) pairs iterEventsCounting yields for the
    document.
    """
    def _iterRows(self):
        """yields a single dictionary for the whole source.

        A sourceToken having a read attribute is parsed as-is and left
        open; anything else is passed to open() in binary mode and
        closed again when the generator finishes.

        lxml syntax errors are re-raised as base.SourceParseError.
        """
        source = self.sourceToken
        # we only close what we have opened ourselves
        openedHere = not hasattr(source, "read")
        if openedHere:
            source = open(source, "rb")

        try:
            yield dict(
                iterEventsCounting(
                    source,
                    self.grammar.normalizeWhitespace))
        except etree.LxmlSyntaxError as ex:
            raise base.SourceParseError(
                ex.msg,
                location=ex.position,
                source=ex.filename)
        finally:
            if openedHere:
                source.close()
class XMLGrammar(Grammar):
    """A grammar parsing from generic XML files.

    Use this grammar to parse from generic XML files.  For now, one rawdict
    per document is returned (later extensions might let you define
    elements that will yield rows).

    The keys are xpaths (e.g., root/element or root/element/@attr), the
    values the (joined) text nodes that are immediate children of the
    element.  When elements are repeated within an element, [ct] is
    appended to the path element: the first occurrence keeps the plain
    name, the second becomes root/element[0], the third root/element[1],
    and so on.

    For now, this grammar ignores namespaces.

    Because most of the keys are not valid python identifiers, you cannot
    use the @key syntax when mapping this.  Use vars[key] instead (or
    <map key="dest" source="path"/>).

    Do not use this for VOTables; use VOTableGrammar instead.
    """
    name_ = "xmlGrammar"

    _nsw = base.BooleanAttribute(
        "normalizeWhitespace",
        description=(
            "By default, the parser will return whitespace-only content as None"
            " and will turn internal whitespace to a single blank. Set this to"
            " False to preserve whitespace as present in the document."),
        default=True,
        copyable=True)

    rowIterator = XMLRowIterator