1 """
2 Formatting, text manipulation, string constants, and such.
3 """
4
5
6
7
8
9
10
11 import datetime
12 import math
13 import os
14 import random
15 import re
16 import string
17 import time
18 from email import utils as emailutils
19
20 from gavo.utils import codetricks
21 from gavo.utils import misctricks
22 from gavo.utils.excs import Error, SourceParseError
23
# Source text (not compiled) of a regular expression matching a float
# literal with optional sign, fraction, and exponent; embed it into larger
# patterns as needed.
floatRE = r"[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?"
# yyyy-mm-dd, anchored at the end; use with match() to anchor the start, too.
dateRE = re.compile(r"\d\d\d\d-\d\d-\d\d$")
# yyyy-mm-ddThh:mm:ss with an optional trailing Z.
datetimeRE = re.compile(r"\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ?$")
# ASCII identifiers: a letter or underscore followed by letters, digits,
# or underscores.
identifierPattern = re.compile(r"[A-Za-z_][A-Za-z0-9_]*$")
# strftime/strptime formats for ISO timestamps with and without the Z marker.
isoTimestampFmt = "%Y-%m-%dT%H:%M:%SZ"
isoTimestampFmtNoTZ = "%Y-%m-%dT%H:%M:%S"
# An entity reference like &amp; or &#x41;.  The quantifier must be *inside*
# the group so that group(1) is the complete entity name; with the
# quantifier outside, group(1) would only hold the name's last character.
entityrefPat = re.compile(r"&([^;]+);")
# Two to five lowercase letters followed by :// -- a cheap URL heuristic.
looksLikeURLPat = re.compile(r"[a-z]{2,5}://")
32
33
34
35
# File names made up exclusively of characters considered harmless in URLs.
# The character class admits the ASCII ranges "," through ":" (which
# includes "-", ".", "/", and the digits), "=", "@" through "Z", "_",
# "a" through "z", "{", "}", "~", and a literal "-".
_SAFE_FILENAME = re.compile(
    r"[,-:=@-Z_a-z{}~-]+$")
37
# The five entities predefined by XML, mapping entity name to the character
# it stands for.
xmlEntities = dict(
    lt="<",
    gt=">",
    amp="&",
    apos="'",
    quot='"')
58
61 """returns aStr cropped to maxLen if necessary.
62
63 Cropped strings are returned with an ellipsis marker.
64 """
65 if len(aStr)>maxLen:
66 return aStr[:maxLen-3]+"..."
67 return aStr
68
71 """returns aStr shortened to maxLen by dropping prefixes if necessary.
72
73 Cropped strings are returned with an ellipsis marker.
74 >>> makeLeftEllipsis("0123456789"*2, 11)
75 '...23456789'
76 """
77 if len(aStr)>maxLen:
78 return "..."+aStr[-maxLen+3:]
79 return aStr
80
83 """returns a string hopefully representative for a source token.
84
85 These are, in particular, passed around withing rsc.makeData. Usually,
86 these are (potentially long) strings, but now and then they can be
87 other things with appallingly long reprs. When DaCHS messages
88 need to refer to such sources, this function is used to come up
89 with representative strings.
90 """
91 if isinstance(sourceToken, basestring):
92 return makeLeftEllipsis(sourceToken)
93 else:
94 return makeEllipsis(repr(sourceToken), maxLen=160)
95
99 """returns the file stem of a file path.
100
101 The base name is what remains if you take the base name and split off
102 extensions. The extension here starts with the last dot in the file name,
103 except up to one of some common compression extensions (.gz, .xz, .bz2,
104 .Z, .z) is stripped off the end if present before determining the extension.
105
106 >>> getFileStem("/foo/bar/baz.x.y")
107 'baz.x'
108 >>> getFileStem("/foo/bar/baz.x.gz")
109 'baz'
110 >>> getFileStem("/foo/bar/baz")
111 'baz'
112 """
113 for ext in [".gz", ".xz", ".bz2", ".Z", ".z"]:
114 if fPath.endswith(ext):
115 fPath = fPath[:-len(ext)]
116 break
117 return os.path.splitext(os.path.basename(fPath))[0]
118
150
154 """returns rest if fullPath has the form rootPath/rest and raises an
155 exception otherwise.
156
157 Pass ``liberalChars=False`` to make this raise a ValueError when
158 URL-dangerous characters (blanks, amperands, pluses, non-ASCII, and
159 similar) are present in the result. This is mainly for products.
160 """
161 if not fullPath.startswith(rootPath):
162 raise ValueError(
163 "Full path %s does not start with resource root %s"%(fullPath, rootPath))
164 res = fullPath[len(rootPath):].lstrip("/")
165 if not liberalChars and not _SAFE_FILENAME.match(res):
166 raise ValueError("File path '%s' contains characters known to"
167 " the DaCHS authors to be hazardous in URLs. Please defuse the name"
168 " before using it for published names (or see howDoI)."%res)
169 return res
170
173 """joins relPath to rootPath and makes sure the result really is
174 in rootPath.
175 """
176 relPath = relPath.lstrip("/")
177 fullPath = os.path.realpath(os.path.join(rootPath, relPath))
178 if not fullPath.startswith(rootPath):
179 raise ValueError(
180 "Full path %s does not start with resource root %s"%(fullPath, rootPath))
181 if not os.path.exists(fullPath):
182 raise ValueError(
183 "Invalid path %s. This should not happend."%(fullPath))
184 return fullPath
185
188 """returns code with all whitespace from governingLine removed from
189 every line and newIndent prepended to every line.
190
191 governingLine lets you select a line different from the first one
192 for the determination of the leading white space. Lines before that
193 line are left alone.
194
195 >>> fixIndentation(" foo\\n bar", "")
196 'foo\\nbar'
197 >>> fixIndentation(" foo\\n bar", " ")
198 ' foo\\n bar'
199 >>> fixIndentation(" foo\\n bar\\n baz", "", 1)
200 'foo\\nbar\\n baz'
201 >>> fixIndentation(" foo\\nbar", "")
202 Traceback (most recent call last):
203 Error: Bad indent in line 'bar'
204 """
205 codeLines = [line for line in code.split("\n")]
206 reserved, codeLines = codeLines[:governingLine], codeLines[governingLine:]
207 while codeLines:
208 if codeLines[0].strip():
209 firstIndent = re.match("^\s*", codeLines[0]).group()
210 break
211 else:
212 reserved.append(codeLines.pop(0))
213 if codeLines:
214 fixedLines = []
215 for line in codeLines:
216 if not line.strip():
217 fixedLines.append(newIndent)
218 else:
219 if line[:len(firstIndent)]!=firstIndent:
220 raise Error("Bad indent in line %s"%repr(line))
221 fixedLines.append(newIndent+line[len(firstIndent):])
222 else:
223 fixedLines = codeLines
224 reserved = [newIndent+l.lstrip() for l in reserved]
225 return "\n".join(reserved+fixedLines)
226
230 """helps parsePercentExpression.
231 """
232 parts = re.split(r"(%\w)", format)
233 newReParts = []
234 for ind, p in enumerate(parts):
235 if p.startswith("%"):
236
237 if ind+2<len(parts) and parts[ind+1]=="":
238 if p[1] in "HMS":
239 newReParts.append("(?P<%s>..)"%p[1])
240 else:
241 raise ValueError(
242 "At %s: conversions with no intervening literal not supported."% p)
243 else:
244 newReParts.append("(?P<%s>.*?)"%p[1])
245 else:
246 newReParts.append(re.escape(p))
247 return re.compile("".join(newReParts)+"$")
248
251 """returns a dictionary of parts in the %-template format.
252
253 format is a template with %<conv> conversions, no modifiers are
254 allowed. Each conversion is allowed to contain zero or more characters
255 matched stingily. Successive conversions without intervening literals
256 aren't really supported. There's a hack for strptime-type times, though:
257 H, M, and S just eat two characters each if there's no seperator.
258
259 This is really only meant as a quick hack to support times like 25:33.
260
261 >>> r=parsePercentExpression("12,xy:33,","%a:%b,%c"); r["a"], r["b"], r["c"]
262 ('12,xy', '33', '')
263 >>> sorted(parsePercentExpression("2357-x", "%H%M-%u").items())
264 [('H', '23'), ('M', '57'), ('u', 'x')]
265 >>> r = parsePercentExpression("12,13,14", "%a:%b,%c")
266 Traceback (most recent call last):
267 ValueError: '12,13,14' cannot be parsed using format '%a:%b,%c'
268 """
269 mat = _getREForPercentExpression(format).match(literal)
270 if not mat:
271 raise ValueError("'%s' cannot be parsed using format '%s'"%(
272 literal, format))
273 return mat.groupdict()
274
277 """returns a name mapping dictionary from a list of assignments.
278
279 This is the preferred form of communicating a mapping from external names
280 to field names in records to macros -- in a string that contains
281 ":"-seprated pairs seperated by whitespace, like "a:b b:c", where
282 the incoming names are leading, the desired names are trailing.
283
284 If you need defaults to kick in when the incoming data is None, try
285 _parseDestWithDefault in the client function.
286
287 This function parses a dictionary mapping original names to desired names.
288
289 >>> parseAssignments("a:b b:c")
290 {'a': 'b', 'b': 'c'}
291 """
292 return dict([(lead, trail) for lead, trail in
293 [litPair.split(":") for litPair in assignments.split()]])
294
298 """returns the time angle (h m s.decimals) as a float in degrees.
299
300 >>> "%3.8f"%hmsToDeg("22 23 23.3")
301 '335.84708333'
302 >>> "%3.8f"%hmsToDeg("22:23:23.3", ":")
303 '335.84708333'
304 >>> "%3.8f"%hmsToDeg("222323.3", "")
305 '335.84708333'
306 >>> hmsToDeg("junk")
307 Traceback (most recent call last):
308 ValueError: Invalid time with sepChar None: 'junk'
309 """
310 hms = hms.strip()
311 try:
312 if sepChar=="":
313 parts = hms[:2], hms[2:4], hms[4:]
314 else:
315 parts = hms.split(sepChar)
316 if len(parts)==3:
317 hours, minutes, seconds = parts
318 elif len(parts)==2:
319 hours, minutes = parts
320 seconds = 0
321 else:
322 raise ValueError("Too many parts")
323 timeSeconds = int(hours)*3600+float(minutes)*60+float(seconds or "0")
324 except ValueError:
325 raise ValueError("Invalid time with sepChar %s: %s"%(
326 repr(sepChar), repr(hms)))
327 return timeSeconds/3600/24*360
328
332 """returns the degree minutes seconds-specified dmsAngle as a
333 float in degrees.
334
335 >>> "%3.8f"%dmsToDeg("45 30.6")
336 '45.51000000'
337 >>> "%3.8f"%dmsToDeg("45:30.6", ":")
338 '45.51000000'
339 >>> "%3.8f"%dmsToDeg("-45 30 7.6")
340 '-45.50211111'
341 >>> dmsToDeg("junk")
342 Traceback (most recent call last):
343 ValueError: Invalid dms value with sepChar None: 'junk'
344 """
345 dmsAngle = dmsAngle.strip()
346 sign = 1
347 if dmsAngle.startswith("+"):
348 dmsAngle = dmsAngle[1:].strip()
349 elif dmsAngle.startswith("-"):
350 sign, dmsAngle = -1, dmsAngle[1:].strip()
351 try:
352 if sepChar=="":
353 parts = dmsAngle[:2], dmsAngle[2:4], dmsAngle[4:]
354 else:
355 parts = dmsAngle.split(sepChar)
356 if len(parts)==3:
357 deg, min, sec = parts
358 elif len(parts)==2:
359 deg, min = parts
360 sec = 0
361 else:
362 raise ValueError("Invalid # of parts")
363 arcSecs = sign*(int(deg)*3600+float(min)*60+float(sec or 0))
364 except ValueError:
365 raise misctricks.logOldExc(
366 ValueError("Invalid dms value with sepChar %s: %s"%(
367 repr(sepChar), repr(dmsAngle))))
368 return arcSecs/3600
369
372 """returns the time angle fracHours given in decimal hours in degrees.
373 """
374 return float(fracHours)*360./24.
375
376
def degToHms(deg, sepChar=" ", secondFracs=3):
    """converts a float angle in degrees to a time angle (hh mm ss.mmm).

    deg is the angle in degrees; negative angles yield a leading minus sign.
    sepChar is put between the hours, minutes, and seconds fields.
    secondFracs is the number of decimal places of the seconds field; for
    values below 1, the seconds are given as an integer.

    The seconds are rounded to the requested precision *before* the value
    is split into fields, so a carry propagates into minutes and hours
    instead of showing up as a seconds field of 60.  Hours are not wrapped
    at 24.

    >>> degToHms(0)
    '00 00 00.000'
    >>> degToHms(122.056, secondFracs=1)
    '08 08 13.4'
    >>> degToHms(-0.056, secondFracs=0)
    '-00 00 13'
    >>> degToHms(-1.056, secondFracs=0)
    '-00 04 13'
    >>> degToHms(359.2222, secondFracs=4, sepChar=":")
    '23:56:53.3280'
    >>> degToHms(14.9999999)
    '01 00 00.000'
    >>> "%.4f"%hmsToDeg(degToHms(256.25, secondFracs=9))
    '256.2500'
    """
    sign = ""
    if deg<0:
        sign = "-"
        deg = -deg

    precision = max(secondFracs, 0)
    # 360 degrees correspond to 24 hours of 3600 seconds each
    totalSeconds = round(deg/360.*24*3600, precision)
    minutes, seconds = divmod(totalSeconds, 60)
    hours, minutes = divmod(minutes, 60)

    # two integer digits, plus (if there are fractional digits at all)
    # the decimal point and the fractional digits.
    if precision:
        width = precision+3
    else:
        width = 2
    return sign+sepChar.join(["%02d"%hours, "%02d"%minutes,
        "%0*.*f"%(width, precision, seconds)])
403
404
def degToDms(deg, sepChar=" ", secondFracs=2):
    """converts a float angle in degrees to a sexagesimal string.

    deg is the angle in degrees; the result always carries an explicit
    sign.  sepChar is put between the degrees, arcminutes, and arcseconds
    fields.  secondFracs is the number of decimal places of the arcseconds
    field; for values below 1, the arcseconds are given as an integer.

    The arcseconds are rounded to the requested precision *before* the
    value is split into fields, so a carry propagates into arcminutes and
    degrees instead of showing up as an arcseconds field of 60.

    >>> degToDms(0)
    '+0 00 00.00'
    >>> degToDms(-0.25)
    '-0 15 00.00'
    >>> degToDms(-23.50, secondFracs=4)
    '-23 30 00.0000'
    >>> degToDms(0.9999999)
    '+1 00 00.00'
    >>> "%.4f"%dmsToDeg(degToDms(-25.6835, sepChar=":"), sepChar=":")
    '-25.6835'
    """
    sign = "+"
    if deg<0:
        sign = "-"
        deg = -deg

    precision = max(secondFracs, 0)
    totalArcsecs = round(deg*3600, precision)
    minutes, seconds = divmod(totalArcsecs, 60)
    degs, minutes = divmod(minutes, 60)

    # two integer digits, plus (if there are fractional digits at all)
    # the decimal point and the fractional digits.
    if precision:
        width = precision+3
    else:
        width = 2
    return sepChar.join(["%s%d"%(sign, degs), "%02d"%minutes,
        "%0*.*f"%(width, precision, seconds)])
427
430 """returns a UTC datetime object in the format requried by http.
431
432 This may crap when you fuzz with the locale. In general, when handling
433 "real" times within the DC, prefer unix timestamps over datetimes and
434 use the other *RFC2616 functions.
435 """
436 return dt.strftime('%a, %d %b %Y %H:%M:%S GMT')
437
440 """returns seconds since unix epoch representing UTC from the HTTP-compatible
441 time specification s.
442 """
443 parts = emailutils.parsedate_tz(s)
444 return emailutils.mktime_tz(parts)
445
446
447
def _d(y, m, d, days=(0,31,59,90,120,151,181,212,243,273,304,334,365)):
    """returns an integer day number for the calendar date y-m-d.

    Only differences between these numbers are meaningful; timegm
    subtracts the number for 1970-01-01.  days holds the number of days
    before each month in a non-leap year.

    Floor division is used on purpose: with true division (python 3, or
    ``from __future__ import division``), the year term would contribute
    fractional days.

    NOTE(review): the year term counts 1461 days per four years, i.e., it
    assumes every fourth year is a leap year; that looks exact only for
    1901 through early 2100 -- confirm if other dates matter.
    """
    isLeapYear = (not y % 4) and bool(y % 100 or not y % 400)
    # days[] is for non-leap years; from March on, add the leap day.
    if m > 2 and isLeapYear:
        leapDay = 1
    else:
        leapDay = 0
    return ((y - 1901)*1461)//4 + days[m-1] + d + leapDay


def timegm(tm, epoch=_d(1970,1,1)):
    """returns seconds since the unix epoch for the time tuple tm,
    interpreted as UTC.

    tm is a sequence starting with year, month, day, hour, minute, second;
    any further items are ignored.  epoch is the day number of the epoch
    as computed by _d and normally should not be passed.
    """
    year, month, day, h, m, s = tm[:6]
    return (_d(year, month, day) - epoch)*86400 + h*3600 + m*60 + s
455
463
464
# Pattern for ISO-like date/time literals: a date with optional dashes,
# optionally followed by a time of day with optional colons, optional
# fractional seconds, and optional UTC markers (Z and/or +00:00).
_isoDTRE = re.compile(
    # calendar date
    r"(?P<year>\d\d\d\d)-?(?P<month>\d\d)-?(?P<day>\d\d)"
    # optional time part, introduced by T or a blank
    r"(?:[T ]"
    r"(?P<hour>\d\d):?(?P<minute>\d\d):?(?P<seconds>\d\d)"
    r"(?P<secFracs>\.\d*)?"
    r"Z?(\+00:00)?"
    r")?$")
472 """returns a datetime object for a ISO time literal.
473
474 There's no real timezone support yet, but we accept and ignore various
475 ways of specifying UTC.
476
477 >>> parseISODT("1998-12-14")
478 datetime.datetime(1998, 12, 14, 0, 0)
479 >>> parseISODT("1998-12-14T13:30:12")
480 datetime.datetime(1998, 12, 14, 13, 30, 12)
481 >>> parseISODT("1998-12-14T13:30:12Z")
482 datetime.datetime(1998, 12, 14, 13, 30, 12)
483 >>> parseISODT("1998-12-14T13:30:12.224Z")
484 datetime.datetime(1998, 12, 14, 13, 30, 12, 224000)
485 >>> parseISODT("19981214T133012Z")
486 datetime.datetime(1998, 12, 14, 13, 30, 12)
487 >>> parseISODT("19981214T133012+00:00")
488 datetime.datetime(1998, 12, 14, 13, 30, 12)
489 >>> parseISODT("junk")
490 Traceback (most recent call last):
491 ValueError: Bad ISO datetime literal: junk (required format: yyyy-mm-ddThh:mm:ssZ)
492 """
493 literal = literal.rstrip("Z")
494 mat = _isoDTRE.match(literal.strip())
495 if not mat:
496 raise ValueError("Bad ISO datetime literal: %s"
497 " (required format: yyyy-mm-ddThh:mm:ssZ)"%literal)
498 parts = mat.groupdict()
499 if parts["hour"] is None:
500 parts["hour"] = parts["minute"] = parts["seconds"] = 0
501 if parts["secFracs"] is None:
502 parts["secFracs"] = 0
503 else:
504 parts["secFracs"] = "0"+parts["secFracs"]
505 return datetime.datetime(int(parts["year"]), int(parts["month"]),
506 int(parts["day"]), int(parts["hour"]), int(parts["minute"]),
507 int(parts["seconds"]), int(float(parts["secFracs"])*1000000))
508
509
# strptime formats tried, in this order, when parsing date/time literals.
_SUPPORTED_DT_FORMATS = [
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d",
]
516 if literal is None or isinstance(literal, datetime.datetime):
517 return literal
518 if literal.endswith("Z"):
519 literal = literal[:-1]
520
521 literal = literal.split(".")[0]
522 for format in _SUPPORTED_DT_FORMATS:
523 try:
524 return datetime.datetime(
525 *time.strptime(literal, format)[:6])
526 except ValueError:
527 pass
528 return parseISODT(literal)
529
532 if literal is None or isinstance(literal, datetime.date):
533 return literal
534 return datetime.date(*time.strptime(literal, '%Y-%m-%d')[:3])
535
538 if literal is None or isinstance(literal, datetime.time):
539 return literal
540
541 return datetime.time(*time.strptime(literal, '%H:%M:%S')[3:6])
542
545 """returns a datetime instance rounded to whole seconds.
546
547 This also recklessly clears any time zone marker. So, don't pass
548 in anything with a meaningful time zone.
549 """
550 if dt.microsecond>500000:
551 return dt.replace(microsecond=0, tzinfo=None
552 )+datetime.timedelta(seconds=1)
553 else:
554 return dt.replace(microsecond=0, tzinfo=None)
555
579
582 """is a name mapper fed from a simple text file.
583
584 The text file format simply is:
585
586 <target-id> "TAB" <src-id>{whitespace <src-id>}
587
588 src-ids have to be encoded quoted-printable when they contain whitespace
589 or other "bad" characters ("="!). You can have #-comments and empty
590 lines.
591 """
    def __init__(self, src, missingOk=False):
        """sets up the mapper from the mapping file src.

        All work is delegated to _parseSrc, which receives src and
        missingOk unchanged.

        NOTE(review): presumably missingOk=True makes an unreadable src
        result in an empty mapping rather than an IOError -- confirm
        against _parseSrc.
        """
        self._parseSrc(src, missingOk)
594
596 return name in self.namesDict
597
599 self.namesDict = {}
600 try:
601 f = open(src)
602 except IOError:
603 if not missingOk:
604 raise
605 else:
606 return
607 try:
608 for ln in f:
609 if ln.startswith("#") or not ln.strip():
610 continue
611 ob, names = re.split("\t+", ln)
612 for name in names.lower().split():
613 self.namesDict[name.decode("quoted-printable")] = ob
614 except ValueError:
615 raise misctricks.logOldExc(ValueError(
616 "Syntax error in %s: Line %s not understood."%(src, repr(ln))))
617 f.close()
618
621
622
# Entity names (without the ampersand and the semicolon) that can be
# resolved by name, mapped to their replacement characters.
_STANDARD_ENTITIES = dict([
    ("lt", "<"),
    ("gt", ">"),
    ("amp", "&"),
    ("apos", "'"),
    ("quot", '"'),
])
633 entRef = matob.group(1)
634 if entRef in _STANDARD_ENTITIES:
635 return _STANDARD_ENTITIES[entRef]
636 elif entRef.startswith("#x"):
637 return unichr(int(entRef[2:], 16))
638 elif entRef.startswith("#"):
639 return unichr(int(entRef[1:]))
640 else:
641 raise ValueError("Unknown entity reference: &%s;"%entRef)
642
645 return entityrefPat.sub(_decodeEntityref, unicodeString)
646
649 """returns s with exactly one trailing slash.
650 """
651 return s.rstrip("/")+"/"
652
655 """helps iterSimpleText.
656 """
657 for (lineNumber, curLine) in enumerate(f):
658 curLine = curLine.strip()
659 if curLine and not curLine.startswith("#"):
660 yield (lineNumber+1), curLine
661
662
@codetricks.document
def iterSimpleText(f):
    """iterates over ``(physLineNumber, line)`` in f with some usual
    conventions for simple data files.

    You should use this function to read from simple configuration and/or
    table files that don't warrant a full-blown grammar/rowmaker combo.
    The intended use is somewhat like this::

        with open(rd.getAbsPath("res/mymeta")) as f:
            for lineNumber, content in iterSimpleText(f):
                try:
                    ...
                except Exception as exc:
                    sys.stderr.write("Bad input line %s: %s"%(lineNumber, exc))

    The grammar rules are, specifically:

    * leading and trailing whitespace is stripped
    * empty lines are ignored
    * lines beginning with a hash are ignored
    * lines ending with a backslash are joined with the following line;
      to have intervening whitespace, have a blank in front of the backslash.

    For joined lines, the line number reported is the one of the last
    physical line that went into the logical line.  A SourceParseError
    is raised if the input ends while a continuation is pending.
    """
    lines = _iterSimpleTextNoContinuation(f)
    for lineNumber, curLine in lines:
        # pull in follow-up lines as long as the current one asks for
        # continuation; the for loop resumes after the lines consumed here.
        while curLine.endswith("\\"):
            try:
                lineNumber, newStuff = next(lines)
            except StopIteration:
                raise SourceParseError("File ends with a backslash",
                    location="line %d"%lineNumber)
            curLine = curLine[:-1]+newStuff

        yield lineNumber, curLine
703
704
# The alphabet random strings are drawn from.  string.ascii_letters is used
# rather than string.letters since the latter is locale-dependent (and
# hence may contain non-ASCII characters) and no longer exists in python 3.
_RANDOM_STRING_OK_CHARS = string.ascii_letters+string.digits+"_.,"
708 """returns a random string of harmless printable characters.
709 """
710 return "".join(
711 random.choice(_RANDOM_STRING_OK_CHARS) for c in range(length))
712
715 if isinstance(val, str):
716 return val
717 elif isinstance(val, unicode):
718 return val.encode("ascii", "ignore")
719 else:
720 return str(val)
721
724 """parses an RFC 2616 accept header and returns a dict mapping media
725 type patterns to their (unparsed) parameters.
726
727 If aString is None, an empty dict is returned
728
729 If we ever want to do fancy things with http content negotiation, this
730 will be further wrapped to provide something implementing the complex
731 RFC 2616 rules; this primitive interface really is intended for telling
732 apart browsers (which accept text/html) from other clients (which
733 hopefully do not) at this point.
734
735 >>> sorted(parseAccept("text/html, text/*; q=0.2; level=3").items())
736 [('text/*', 'q=0.2; level=3'), ('text/html', '')]
737 >>> parseAccept(None)
738 {}
739 """
740 res = {}
741 if aString is not None:
742 for item in aString.split(","):
743 if ";" in item:
744 key, params = item.split(";", 1)
745 else:
746 key, params = item, ""
747 res[key.strip()] = params.strip()
748
749 return res
750
755
756
# Only when this module is executed as a script (rather than imported),
# call _test().  _test is expected to be defined earlier in this module;
# presumably it runs the doctests embedded in the docstrings -- confirm.
if __name__=="__main__":
    _test()
759