gavo.user.rdmanipulator

1 """ 2 Helpers for manipulating serialised RDs. 3 4 The problem here is that RDs typically are formatted with lots of love, 5 also within elements -- e.g., like this: 6 7 <column name="bla" type="text" 8 ucd="foo.bar" 9 description="A long text carefully 10 broken at the right place" 11 /> 12 13 There's no way one can coax a normal XML parser into giving events that'd 14 allow us to preserve this formatting. Hence, when manipulating 15 RD sources, I need something less sophisticated -- the dump XML parser 16 implemented here. 17 18 Note that this will accept non-well-formed documents; don't use this except 19 for the limited purpose of editing supposedly well-formed documents. 20 """ 21 22 #c Copyright 2008-2019, the GAVO project 23 #c 24 #c This program is free software, covered by the GNU GPL. See the 25 #c COPYING file in the source distribution. 26 27 28 import os 29 30 from gavo import base 31 from gavo import rscdesc 32 from gavo import utils 33 from gavo.user import info 34 from pyparsing import (CharsNotIn, Forward, Literal, Optional, 35 ParseResults, QuotedString, SkipTo, 36 StringEnd, White, Word, ZeroOrMore, alphas, alphanums) 37 38

def flatten(arg):
    """returns the string serialisation of a (possibly edited) parse tree.

    Strings are passed through unchanged, lists and pyparsing ParseResults
    are flattened item by item and concatenated.  Anything else is expected
    to bring along a flatten() method of its own, the result of which
    is returned.
    """
    if isinstance(arg, basestring):
        return arg

    if not isinstance(arg, (list, ParseResults)):
        # foreign objects (e.g., newly inserted elements) serialise themselves
        return arg.flatten()

    pieces = [flatten(item) for item in arg]
    return "".join(pieces)

48 49

50 -def _nodify(s, p, t):

51 # a parse action to keep pyparsing from flattenting out things into 52 # a single list 53 return [t.asList()]

54 55

class Attribute(list):
    """a sentinel for XML attributes.

    Instances are the list of tokens making up the attribute as handed
    over in t: the attribute name first, the still-quoted value literal last,
    and the equals sign (possibly surrounded by whitespace tokens) in
    between.

    The name attribute has the attribute name, value the value literal
    with the enclosing quote characters removed.
    """
    def __init__(self, t):
        list.__init__(self, t)
        # The value literal is always the last token.  Do not use a fixed
        # index here: there may or may not be whitespace tokens around the
        # equals sign, so the position of the value varies.
        self.name, self.value = t[0], t[-1][1:-1]

62 63

def getAttribute(parseResult, name):
    """returns the first Attribute instance called name in parseResult.

    parseResult is iterated over; items that are not Attribute instances
    are skipped.  A KeyError is raised if no attribute matches.
    """
    candidates = (item for item in parseResult
        if isinstance(item, Attribute) and item.name==name)
    for found in candidates:
        return found

    raise KeyError("No attribute %s in %s"%(name, flatten(parseResult)))

74 75

class NewElement(object):
    """an element to be inserted into a parsed xml tree.

    It is constructed with the element name and its (unescaped) text
    content; flatten() produces the serialised form.
    """
    def __init__(self, elementName, textContent):
        self.elementName = elementName
        self.textContent = textContent

    def flatten(self):
        """returns opening tag, escaped text content, and closing tag
        as a single string.
        """
        template = "<%s>%s</%s>"
        return template%(
            self.elementName,
            utils.escapePCDATA(self.textContent),
            self.elementName)

87 88

class Element(list):
    """a sentinel for XML elements.

    These are constructed with lists of the type [tag,...]; the opening (or
    empty) tag is always item 0.  The element name is taken from the
    second token of that tag and is available in the name attribute.
    """
    def __init__(self, t):
        list.__init__(self, t)
        self.name = t[0][1]

    def append(self, newChild):
        """inserts newChild as a child of this element.

        For elements with content, newChild goes in front of the
        item preceding the closing tag (typically, that is the whitespace
        indenting the closing tag).  For elements consisting of just
        an opening and a closing tag, newChild goes between the two.
        An empty tag is turned into an opening/closing pair around newChild.
        """
        if len(self)>=3:
            # our last element will in general be a closing element,
            # the one before it the content we want to stay last.
            self[-2:-2] = [newChild]

        elif len(self)==2:
            # just opening and closing tag; a -2 index would put newChild
            # in front of the opening tag here.
            self[1:1] = [newChild]

        elif len(self)==1:
            # empty tag: open it up and provide a closing tag.
            openingTag = list(self[0])
            if openingTag and openingTag[-1]=="/>":
                openingTag[-1] = ">"
            self[:] = [openingTag, newChild, ["<", "/", self.name, ">"]]

        else:
            list.append(self, newChild)

    def getAttribute(self, name):
        """returns the Attribute element with name within self.

        Only the tag in item 0 is inspected.  If no such attribute exists,
        a KeyError is raised.
        """
        return getAttribute(self[0], name)

    def findElement(self, name):
        """returns the first element called name somewhere within the xml
        grammar-parsed parseResult

        This is a depth-first search, and it will return None if there
        is no such element.
        """
        for el in self:
            if not isinstance(el, Element):
                continue
            if el.name==name:
                return el

            res = el.findElement(name)
            if res is not None:
                return res

        return None

    def countElements(self, name):
        """returns the number of name elements that are direct children
        of self.
        """
        return sum(1 for el in self
            if isinstance(el, Element) and el.name==name)

136 137

def getXMLGrammar(manipulator):
    """returns a dictionary mapping symbol names to the pyparsing
    expressions of a simple, formatting-preserving XML grammar.

    manipulator must have _openElement and _feedElement methods; they
    are attached as parse actions to tag openers and complete elements,
    respectively.

    The returned dictionary is built from this function's local namespace;
    hence, the local names below are part of the interface (e.g.,
    "document" is the start symbol).  Don't rename them.
    """
    # NOTE(review): presumably pyparsingWhitechars sets pyparsing's default
    # whitespace characters while the grammar is built -- confirm in utils.
    with utils.pyparsingWhitechars("\r"):
        name = Word(alphas+"_:", alphanums+".:_-")
        opener = Literal("<")
        closer = Literal(">")
        # keep the quotes on the literals so they can be written back verbatim
        value = (QuotedString(quoteChar="'", multiline=True, unquoteResults=False)
            | QuotedString(quoteChar='"', multiline=True, unquoteResults=False))
        attribute = (name
            + Optional(White())
            + Literal("=")
            + Optional(White())
            + value)
        tagOpener = (opener
            + name
            + ZeroOrMore(White() + attribute)
            + Optional(White()))

        openingTag = (tagOpener
            + closer)
        closingTag = (opener
            + Literal("/")
            + name
            + Optional(White())
            + closer)
        emptyTag = (tagOpener
            + Optional(White())
            + Literal("/>"))

        processingInstruction = (opener
            + Literal("?")
            + SkipTo("?>", include=True))
        comment = (opener
            + Literal("!--")
            + SkipTo("-->", include=True))
        cdataSection = (opener
            + Literal("![CDATA[")
            + SkipTo("]]>", include=True))

        nonTagStuff = CharsNotIn("<", min=1)

        docItem = Forward()
        element = (
            (openingTag + ZeroOrMore(docItem) + closingTag)
            | emptyTag)
        docItem << (element
            | processingInstruction
            | comment
            | cdataSection
            | nonTagStuff)

        document = (ZeroOrMore(Optional(White()) + docItem)
            + Optional(White()) + StringEnd())
        document.parseWithTabs()

        element.addParseAction(manipulator._feedElement)
        tagOpener.addParseAction(manipulator._openElement)
        attribute.addParseAction(lambda s,p,t: [Attribute(t)])
        openingTag.addParseAction(_nodify)
        closingTag.addParseAction(_nodify)
        emptyTag.addParseAction(_nodify)

        # manipulator must not end up in the symbol dictionary (and it
        # has no leaveWhitespace)
        del manipulator
        # iterate over a snapshot: the loop variable is a local itself
        for el in list(locals().values()):
            # this *really* shouldn't be necessary
            el.leaveWhitespace()
        del el

    return locals()

207 208

def processXML(document, manipulator):
    """runs manipulator over the XML in document and returns the result.

    document is a string containing the XML (preferably a unicode
    string); what comes back is the serialised XML after manipulator
    had its chance to edit the elements.

    manipulator must be an instance of a class derived from Manipulator
    below.  The grammar relies on the private methods defined there,
    so merely providing startElement and gotElement is not enough.
    """
    syms = getXMLGrammar(manipulator)
    # for grammar debugging, try:
    # from gavo.adql import grammar; grammar.enableDebug(syms)
    startSymbol = syms["document"]
    parsed = utils.pyparseString(startSymbol, document)
    return flatten(parsed)

224 225

class Manipulator(object):
    """a base class for processXML manipulators.

    Instances of derived classes are what you pass to processXML.  If you
    override the constructor, up-call it without arguments.

    To edit a document, override gotElement(parsedElement).  It is called
    with an entire element -- opening tag, content, and closing tag,
    or just an empty tag.  The element name is in second position of the
    first item, and the attributes are only barely parsed out (improve
    the parsing if you need more convenient access to them).  Whatever
    gotElement returns ends up in the document.

    startElement(parsedOpener) works analogously for tag openers.  You
    will, however, *not* receive startElements for empty elements, so
    this is mainly intended for bookkeeping.

    hasParent(tagName) tells whether a tagName element is among the
    elements currently open.
    """
    def __init__(self):
        # names of the elements currently open, outermost first
        self.tagStack = []

    def _openElement(self, s, p, parsedOpener):
        # parse action for tag openers: remember the tag name, then
        # let startElement have a look.
        tagName = parsedOpener[1]
        self.tagStack.append(tagName)
        return self.startElement(parsedOpener)

    def hasParent(self, name):
        return self.tagStack.count(name)>0

    def _feedElement(self, s, p, parsedElement):
        # parse action for complete elements: forget the tag name and
        # hand the element, wrapped as an Element, to gotElement.
        self.tagStack.pop()
        wrapped = Element(parsedElement)
        return [self.gotElement(wrapped)]

    def startElement(self, parsedOpener):
        # default: leave the opener alone
        return parsedOpener

    def gotElement(self, parsedElement):
        # default: leave the element alone
        return parsedElement

271 272

class NROWS:
    """a singleton sentinel used as the key for the row count within
    the per-table entries of tableTriggers.

    The class itself is used; it is never instantiated.
    """

276

class _ValuesChanger(Manipulator):
    """a manipulator fiddling in values limits as returned by
    iterLimitsForTable.

    It is constructed with an iterable of (kind, payload) pairs, where
    kind is one of "limits" (payload: table id, column name, min, max),
    "nrows" (payload: table id, row count), or "coverage" (payload:
    reserved, axis name, literal).

    Note again: this implementation just supports a single coverage
    element per RD.  We'll have to change limits contents when
    there can reasonably be more.
    """
    def __init__(self, limits):
        # tableTriggers: table id -> {column name or NROWS -> new value(s)}
        # coverageItems: axis name -> literal to put into coverage
        self.tableTriggers, self.coverageItems = {}, {}
        # state for the table currently being parsed; both are None outside
        # of tables we have something for.
        self.curColumns = None
        self.curNRows = None

        for kind, payload in limits:
            if kind=="limits":
                tableName, columnName, minVal, maxVal = payload
                self.tableTriggers.setdefault(tableName, {})[
                    columnName] = (minVal, maxVal)

            elif kind=="coverage":
                _reserved, axis, value = payload
                self.coverageItems[axis] = value

            elif kind=="nrows":
                tableName, nrows = payload
                self.tableTriggers.setdefault(tableName, {})[
                    NROWS] = nrows

            else:
                assert False, "Unknown limits item kind: %r"%(kind,)

        Manipulator.__init__(self)

    def startElement(self, parsedTag):
        """notes the triggers applying to a table when one is opened.

        Tables without an id attribute leave the current state alone.
        """
        if parsedTag[1]=="table":
            try:
                tableName = getAttribute(parsedTag, "id").value
            except KeyError:
                pass
            else:
                self.curColumns = self.tableTriggers.get(tableName)
                self.curNRows = (self.curColumns or {}).get(NROWS)
        return parsedTag

    def _fixValues(self, parsedElement, limits):
        """updates the min and max attributes of the first values element
        within parsedElement from the (min, max) pair limits.

        Limits that are None and attributes missing in the source
        are left alone.
        """
        values = parsedElement.findElement("values")
        for attName, val in zip(["min", "max"], limits):
            if val is not None:
                try:
                    # the value literal is the last token of the attribute;
                    # its index varies with the whitespace around the
                    # equals sign.
                    values.getAttribute(attName)[-1] = utils.escapeAttrVal(
                        str(val))
                except (AttributeError, KeyError):
                    # user didn't put this limit into RD; let's assume for
                    # a reason (AttributeError: there's no values element
                    # at all)
                    pass

    def _fixCoverage(self, coverageElement):
        """fills the coverage items into coverageElement.

        Axis elements not yet present are appended; for existing ones,
        the content is replaced.  A ReportableError is raised if there
        is more than one direct child for an axis.
        """
        for axisName in self.coverageItems:
            newContent = self.coverageItems[axisName]
            destEl = coverageElement.findElement(axisName)

            if destEl is None:
                coverageElement.append(
                    NewElement(axisName, newContent))
                continue

            if coverageElement.countElements(axisName)!=1:
                raise base.ReportableError("Cannot replace coverage for"
                    " axis '%s': unsupported previous content."%axisName,
                    hint="DaCHS will only replace coverage if there is"
                    " just one element for an axis.  If you want DaCHS"
                    " to update the coverage on this axis, delete any"
                    " previous elements for this axis.")

            if len(destEl)==1:
                # empty element
                elName = destEl.pop()[1]
                destEl[:] = [
                    ['<', elName, '>'],
                    newContent,
                    ['</', elName, '>'],]
            elif len(destEl)==2:
                # opening and closing tag, insert text (as one item;
                # assigning the bare string would spread out its characters)
                destEl[1:1] = [newContent]
            elif len(destEl)==3:
                # element with content, replace previous content
                destEl[1] = newContent
            else:
                assert False

    def _updateNRows(self, nRowsElement):
        """changes nRowsElement to postgres' current estimate of the table
        size, if available.
        """
        if self.curNRows:
            assert len(nRowsElement)==3
            nRowsElement[1] = str(self.curNRows)

    def gotElement(self, parsedElement):
        """dispatches complete elements to the fixers above.
        """
        if self.curColumns is not None:
            if parsedElement.name=="column":
                # the column is identified by name or, failing that,
                # by original; the first attribute present wins.
                for attrName in ["name", "original"]:
                    try:
                        colName = parsedElement.getAttribute(attrName).value
                        if colName in self.curColumns:
                            self._fixValues(parsedElement,
                                self.curColumns[colName])
                    except KeyError:
                        continue
                    break

        if parsedElement.name=="table":
            # tables don't nest in DaCHS, so all per-table state can go;
            # nrows elements of the table have been seen by now.
            self.curColumns = None
            self.curNRows = None

        elif parsedElement.name=="coverage":
            self._fixCoverage(parsedElement)

        elif parsedElement.name=="nrows":
            self._updateNRows(parsedElement)

        return parsedElement

389 390

def iterCoverageItems(updater):
    """yields coverage items for inclusion in RDs.

    Items are pairs of "coverage" and a tuple (reserved, axis, literal),
    where axis is one of "spatial", "temporal", or "spectral".  An axis
    is only considered if updater.parent has a non-None value for it and
    a source table is available (the axis-specific one or, as a fallback,
    updater.sourceTable).

    NOTE: so far, there can only be one coverage item, so "fill this
    into axis x of coverage" is sufficient.  If and when there are more
    coverage items, this needs re-thinking; the "reserved" value in
    the tuples is there for that purpose (presumably, it will become
    the index of the coverage element or some better identity).
    """
    if updater.parent.spatial is not None:
        spaceSource = updater.spaceTable or updater.sourceTable
        if spaceSource:
            moc = info.getMOCForStdTable(spaceSource, updater.mocOrder)
            yield "coverage", ("reserved", "spatial", moc.asASCII())

    if updater.parent.temporal is not None:
        timeSource = updater.timeTable or updater.sourceTable
        if timeSource:
            timeLimits = info.iterScalarLimits(
                timeSource, info.getTimeLimitsColumnNames)
            for timeRange in timeLimits:
                yield "coverage", ("reserved", "temporal", str(timeRange))

    if updater.parent.spectral is not None:
        spectralSource = updater.spectralTable or updater.sourceTable
        if spectralSource:
            spectralLimits = info.iterScalarLimits(
                spectralSource, info.getSpectralLimitsColumnNames)
            for spectralRange in spectralLimits:
                yield "coverage", ("reserved", "spectral", str(spectralRange))

424 425

def iterLimitsForTable(tableDef):
    """yields the values to fill in into tableDef.

    Nothing is yielded if the table doesn't exist in the database.
    Otherwise, the first item is ("nrows", (table-id, row estimate)),
    followed by a ("limits", (table-id, column-name, min, max)) item for
    every column that has annotations after info.annotateDBTable ran.

    The other thing that *could* come back (but currently only does for
    iterLimitsForRD) is ("coverage", (reserved, axis, literal)); see
    iterCoverageItems for details.
    """
    with base.AdhocQuerier() as querier:
        if querier.getTableType(tableDef.getQName()) is None:
            return
        yield "nrows", (
            tableDef.id,
            querier.getRowEstimate(tableDef.getQName()))

    info.annotateDBTable(tableDef, extended=False, requireValues=True)
    for column in tableDef:
        if not column.annotations:
            continue
        lower = column.annotations["min"]
        upper = column.annotations["max"]
        yield "limits", (tableDef.id, column.name, lower, upper)

447 448

def iterLimitsForRD(rd):
    """yields the values to fill in for an entire RD.

    These are the items from iterLimitsForTable for all on-disk tables
    of rd, followed by the coverage items if rd has a coverage with
    an updater.  Tables raising a ReportableError are skipped with
    an error notification.
    """
    for tableDef in rd.tables:
        if not tableDef.onDisk:
            continue

        try:
            for item in iterLimitsForTable(tableDef):
                yield item
        except base.ReportableError as msg:
            base.ui.notifyError(
                "Skipping %s: %s"%(tableDef.id, utils.safe_str(msg)))

    hasUpdater = rd.coverage and rd.coverage.updater
    if hasUpdater:
        for covItem in iterCoverageItems(rd.coverage.updater):
            yield covItem

465 466

def getChangedRD(rdId, limits):
    """returns a string corresponding to the RD with rdId with limits applied.

    limits is an iterable of (kind, payload) items as yielded by
    iterLimitsForTable or iterLimitsForRD.  We assume the values elements
    already exist in the RD source.
    """
    _, f = rscdesc.getRDInputStream(rdId)
    try:
        content = f.read()
    finally:
        # don't leak the stream if reading fails
        f.close()
    return processXML(content, _ValuesChanger(limits))

477 478

def parseCmdLine():
    """returns the parsed command line arguments.

    The only argument is the positional itemId, the reference to the
    table or RD to update.
    """
    import argparse

    description = ("Updates existing values min/max items in a referenced"
        " table or RD.")
    itemHelp = ("Cross-RD reference of a table or"
        " RD to update, as in ds/q or ds/q#mytable; only RDs in inputsDir"
        " can be updated.")

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("itemId", help=itemHelp)
    return parser.parse_args()

489 490

def main():
    """updates the limits in the RD source referenced on the command line.

    The item referenced can be a table definition (only that table is
    updated) or an RD (all on-disk tables and the coverage are updated).
    The changed RD text replaces the RD file in inputsDir.
    """
    from gavo import api

    args = parseCmdLine()
    item = api.getReferencedElement(args.itemId)

    if isinstance(item, api.TableDef):
        rd, changes = item.rd, iterLimitsForTable(item)
    elif isinstance(item, api.RD):
        rd, changes = item, iterLimitsForRD(item)
    else:
        raise base.ReportableError(
            "%s references neither an RD nor a table definition"%args.itemId)

    newText = getChangedRD(rd.sourceId, changes)
    inputsDir = api.getConfig("inputsDir")
    destFName = os.path.join(inputsDir, rd.sourceId+".rd")
    with utils.safeReplaced(destFName) as f:
        f.write(newText)

514

Source Code for Module gavo.user.rdmanipulator