Source code for gavo.utils.texttricks

"""
Formatting, text manipulation, string constants, and such.
"""

#c Copyright 2008-2025, the GAVO project <gavo@ari.uni-heidelberg.de>
#c
#c This program is free software, covered by the GNU GPL.  See the
#c COPYING file in the source distribution.


import calendar
import datetime
import functools
import os
import pathlib
import quopri
import random
import re
import string
import time
from email import utils as emailutils
from email import message as emailmessage

from gavo.utils.dachstypes import (Any, Filename, Generator,
	Optional, SourceToken, StrToStrMap, TextIO, Tuple, Union)

from gavo.imp import angle_formats
from gavo.utils import codetricks
from gavo.utils import misctricks
from gavo.utils.excs import SourceParseError, StructureError

floatRE = r"[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?"
dateRE = re.compile(r"\d\d\d\d-\d\d-\d\d$")
datetimeRE = re.compile(r"\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ?$")
identifierPattern = re.compile(r"[A-Za-z_][A-Za-z0-9_]*$")
isoTimestampFmt = "%Y-%m-%dT%H:%M:%SZ"
isoTimestampFmtNoTZ = "%Y-%m-%dT%H:%M:%S"
entityrefPat = re.compile(r"&([^;])+;")
looksLikeURLPat = re.compile(r"[a-z]{2,5}://")


# file names that don't cause headaches in URLs and are otherwise reasonably
# benign (so, let's disallow shell metachars while we're at it).
_SAFE_FILENAME = re.compile("[,-:=@-Z_a-z{}~-]+$")

xmlEntities = {
		'lt': '<',
		'gt': '>',
		'amp': '&',
		'apos': "'",
		'quot': '"',
}


[docs]def formatSize(val: float, sf: int=1) -> str:
	"""returns a human-friendly representation of a file size.
	"""
	if val<1e3:
		return "%d Bytes"%int(val)
	elif val<1e6:
		return "%.*fkiB"%(sf, val/1024.)
	elif val<1e9:
		return "%.*fMiB"%(sf, val/1024./1024.)
	else:
		return "%.*fGiB"%(sf, val/1024./1024./1024)


[docs]def makeEllipsis(aStr: str, maxLen: int=60, ellChars: str="...") -> str:
	"""returns aStr cropped to maxLen if necessary.

	Cropped strings are returned with an ellipsis marker.
	"""
	aStr = re.sub(r"\s+", " ", aStr)
	if len(aStr)>maxLen:
		return aStr[:maxLen-len(ellChars)]+ellChars
	return aStr


[docs]def makeLeftEllipsis(aStr: str, maxLen: int=60):
	"""returns aStr shortened to maxLen by dropping prefixes if necessary.

	Cropped strings are returned with an ellipsis marker.

	>>> makeLeftEllipsis("0123456789"*2, 11)
	'...23456789'
	"""
	aStr = re.sub(r"\s+", " ", aStr)
	if len(aStr)>maxLen:
		return "..."+aStr[-maxLen+3:]
	return aStr


[docs]def makeSourceEllipsis(sourceToken: SourceToken) -> str:
	"""returns a string hopefully representative for a source token.

	These are, in particular, passed around within rsc.makeData.  Usually,
	these are (potentially long) strings, but now and then they can be
	other things with appallingly long reprs.  When DaCHS messages
	need to refer to such sources, this function is used to come up
	with representative strings.
	"""
	if isinstance(sourceToken, str):
		return makeLeftEllipsis(sourceToken)
	else:
		return makeEllipsis(repr(sourceToken), maxLen=160)


[docs]@codetricks.document
def getFileStem(fPath: str):
	"""returns the file stem of a file path.

	The base name is what remains if you take the base name and split off
	extensions.  The extension here starts with the last dot in the file name,
	except up to one of some common compression extensions (.gz, .xz, .bz2,
	.Z, .z) is stripped off the end if present before determining the extension.

	>>> getFileStem("/foo/bar/baz.x.y")
	'baz.x'
	>>> getFileStem("/foo/bar/baz.x.gz")
	'baz'
	>>> getFileStem("/foo/bar/baz")
	'baz'
	"""
	for ext in [".gz", ".xz", ".bz2", ".Z", ".z"]:
		if fPath.endswith(ext):
			fPath = fPath[:-len(ext)]
			break
	return os.path.splitext(os.path.basename(fPath))[0]
	

[docs]def formatSimpleTable(
		data: list[list[Any]],
		stringify: bool=True,
		titles: Optional[list[str]]=None) -> str:
	"""returns a string containing a text representation of tabular data.

	All columns of data are simply stringified, then the longest member
	determines the width of the text column.  The behaviour if data
	does not contain rows of equal length is unspecified; data must
	contain at least one row.

	If you have serialised the values in data yourself, pass stringify=False.

	If you pass titles, it must be a sequence of strings; they are then
	used as table headers; the shorter of data[0] and titles will determine
	the number of columns displayed.
	"""
	if stringify:
		data = [[str(v) for v in row] for row in data]

	if not data:
		return ""

	colWidthes = [max(len(row[colInd]) for row in data)
		for colInd in range(len(data[0]))]
	if titles is not None:
		colWidthes = [max(len(t), l) for t, l in zip(titles, colWidthes)]
	
	fmtStr = "  ".join("%%%ds"%w for w in colWidthes)
	table = "\n".join(fmtStr%tuple(row) for row in data)
	if titles is not None:
		table = fmtStr%tuple(titles)+"\n\n"+table
	return table


[docs]@codetricks.document
def getRelativePath(
		fullPath: Filename,
		rootPath: Filename,
		liberalChars: bool=True) -> Filename:
	"""returns rest if fullPath has the form rootPath/rest and raises a
	ValueError otherwise.

	This accepts either strings or pathlib.Path-s and returns an object of the
	type of fullPath (pathlib functionality since 2.9.3).

	Pass ``liberalChars=False`` to make this raise a ValueError when
	URL-dangerous characters (blanks, amperands, pluses, non-ASCII, and
	similar) are present in the result.  This is mainly for products.
	"""
	if isinstance(fullPath, pathlib.PurePath):
		res: Filename = fullPath.relative_to(rootPath)
	else:
		if not fullPath.startswith(str(rootPath)):
			raise ValueError(f"{fullPath} does not start with {rootPath}")
		res = os.path.relpath(fullPath, str(rootPath))
		# backwards compatibility hack
		if res==".":
			res = ""

	if not liberalChars and not _SAFE_FILENAME.match(str(res)):
		raise ValueError("File path '%s' contains characters known to"
			" the DaCHS authors to be hazardous in URLs.  Please defuse the name"
			" before using it for published names (or see howDoI)."%res)

	return res


[docs]def resolvePath(rootPath: str, relPath: str) -> str:
	"""joins relPath to rootPath and makes sure the result really is
	in rootPath.
	"""
	relPath = relPath.lstrip("/")
	fullPath = os.path.realpath(os.path.join(rootPath, relPath))
	if not fullPath.startswith(rootPath):
		raise ValueError(
			"Full path %s does not start with resource root %s"%(fullPath, rootPath))
	if not os.path.exists(fullPath):
		raise ValueError(
			"Invalid path %s. This should not happen."%(fullPath))
	return fullPath


[docs]def fixIndentation(code: str, newIndent: str, governingLine: int=0) -> str:
	"""returns code with all whitespace from governingLine removed from
	every line and newIndent prepended to every line.

	governingLine lets you select a line different from the first one
	for the determination of the leading white space.  Lines before that
	line are left alone.

	>>> fixIndentation("  foo\\n  bar", "")
	'foo\\nbar'
	>>> fixIndentation("  foo\\n   bar", " ")
	' foo\\n  bar'
	>>> fixIndentation("  foo\\n   bar\\n    baz", "", 1)
	'foo\\nbar\\n baz'
	>>> fixIndentation("  foo\\nbar", "")
	Traceback (most recent call last):
	gavo.utils.excs.StructureError: Bad indent in line 'bar'
	"""
	codeLines = [line for line in code.split("\n")]
	reserved, codeLines = codeLines[:governingLine], codeLines[governingLine:]
	while codeLines:
		if codeLines[0].strip():
			if mat := re.match(r"^\s*", codeLines[0]):
				firstIndent = mat.group()
				break
		else:
			reserved.append(codeLines.pop(0))
	if codeLines:
		fixedLines = []
		for line in codeLines:
			if not line.strip():
				fixedLines.append(newIndent)
			else:
				if line[:len(firstIndent)]!=firstIndent:
					raise StructureError("Bad indent in line %s"%repr(line))
				fixedLines.append(newIndent+line[len(firstIndent):])
	else:
		fixedLines = codeLines
	reserved = [newIndent+l.lstrip() for l in reserved]
	return "\n".join(reserved+fixedLines)


@functools.lru_cache()
def _getREForPercentExpression(format: str) -> re.Pattern:
	"""helps parsePercentExpression.
	"""
	parts = re.split(r"(%\w)", format)
	newReParts = []
	for ind, p in enumerate(parts):
		if p.startswith("%"):
			# the time-parsing hack explained in the docstring:
			if ind+2<len(parts) and parts[ind+1]=="":
				if p[1] in "HMS":
					newReParts.append("(?P<%s>..)"%p[1])
				else:
					raise ValueError(
						"At %s: conversions with no intervening literal not supported."% p)
			else:
				newReParts.append("(?P<%s>.*?)"%p[1])
		else:
			newReParts.append(re.escape(p))
	return re.compile("".join(newReParts)+"$")


[docs]def parsePercentExpression(literal: str, format: str) -> dict:
	"""returns a dictionary of parts in the %-template format.

	format is a template with %<conv> conversions, no modifiers are
	allowed.  Each conversion is allowed to contain zero or more characters
	matched stingily.  Successive conversions without intervening literals
	aren't really supported.  There's a hack for strptime-type times, though:
	H, M, and S just eat two characters each if there's no separator.
	
	This is really only meant as a quick hack to support times like 25:33.

	>>> r=parsePercentExpression("12,xy:33,","%a:%b,%c"); r["a"], r["b"], r["c"]
	('12,xy', '33', '')
	>>> sorted(parsePercentExpression("2357-x", "%H%M-%u").items())
	[('H', '23'), ('M', '57'), ('u', 'x')]
	>>> r = parsePercentExpression("12,13,14", "%a:%b,%c")
	Traceback (most recent call last):
	ValueError: '12,13,14' cannot be parsed using format '%a:%b,%c'
	"""
	mat = _getREForPercentExpression(format).match(literal)
	if not mat:
		raise ValueError("'%s' cannot be parsed using format '%s'"%(
			literal, format))
	return mat.groupdict()


[docs]def parseAssignments(assignments: str) -> StrToStrMap:
	"""returns a name mapping dictionary from a list of assignments.

	This is the preferred form of communicating a mapping from external names
	to field names in records to macros -- in a string that contains
	":"-seprated pairs separated by whitespace, like "a:b  b:c", where
	the incoming names are leading, the desired names are trailing.

	If you need defaults to kick in when the incoming data is None, try
	_parseDestWithDefault in the client function.

	This function parses a dictionary mapping original names to desired names.

	>>> parseAssignments("a:b  b:c")
	{'a': 'b', 'b': 'c'}
	"""
	return dict([(lead, trail) for lead, trail in
		[litPair.split(":") for litPair in assignments.split()]])


[docs]@codetricks.document
def hmsToDeg(hms: str, sepChar: Optional[str]=None) -> float:
	"""returns the time angle (h m s.decimals) as a float in degrees.

	>>> "%3.8f"%hmsToDeg("22 23 23.3")
	'335.84708333'
	>>> "%3.8f"%hmsToDeg("22:23:23.3", ":")
	'335.84708333'
	>>> "%3.8f"%hmsToDeg("222323.3", "")
	'335.84708333'
	>>> hmsToDeg("junk")
	Traceback (most recent call last):
	ValueError: Invalid time with sepChar None: 'junk'
	"""
	hms = hms.strip()
	try:
		if sepChar=="":
			parts = [hms[:2], hms[2:4], hms[4:]]
		else:
			parts = hms.split(sepChar)
		if len(parts)==3:
			hours, minutes, seconds = parts
		elif len(parts)==2:
			hours, minutes = parts
			seconds = "0"
		else:
			raise ValueError("Too many parts")
		timeSeconds = int(hours)*3600+float(minutes)*60+float(seconds or "0")
	except ValueError:
		raise ValueError("Invalid time with sepChar %s: %s"%(
			repr(sepChar), repr(hms)))
	return timeSeconds/3600/24*360


[docs]@codetricks.document
def dmsToDeg(dmsAngle: str, sepChar: Optional[str]=None) -> float:
	"""returns the degree minutes seconds-specified dmsAngle as a
	float in degrees.

	>>> "%3.8f"%dmsToDeg("45 30.6")
	'45.51000000'
	>>> "%3.8f"%dmsToDeg("45:30.6", ":")
	'45.51000000'
	>>> "%3.8f"%dmsToDeg("-45 30 7.6")
	'-45.50211111'
	>>> dmsToDeg("junk")
	Traceback (most recent call last):
	ValueError: Invalid dms value with sepChar None: 'junk'
	"""
	dmsAngle = dmsAngle.strip()
	sign = 1
	if dmsAngle.startswith("+"):
		dmsAngle = dmsAngle[1:].strip()
	elif dmsAngle.startswith("-"):
		sign, dmsAngle = -1, dmsAngle[1:].strip()
	try:
		if sepChar=="":
			parts = [dmsAngle[:2], dmsAngle[2:4], dmsAngle[4:]]
		else:
			parts = dmsAngle.split(sepChar)
		if len(parts)==3:
			deg, min, sec = parts
		elif len(parts)==2:
			deg, min = parts
			sec = "0"
		else:
			raise ValueError("Invalid # of parts")
		arcSecs = sign*(int(deg)*3600+float(min)*60+float(sec or "0"))
	except ValueError:
		raise misctricks.logOldExc(
			ValueError("Invalid dms value with sepChar %s: %s"%(
				repr(sepChar), repr(dmsAngle))))
	return arcSecs/3600


[docs]def fracHoursToDeg(fracHours: float) -> float:
	"""returns the time angle fracHours given in decimal hours in degrees.
	"""
	return float(fracHours)*360./24.


[docs]@codetricks.document
def degToHms(deg: float,
		sepChar: str=" ",
		secondFracs: int=3,
		truncate: bool=False) -> str:
	"""converts a float angle in degrees to an time angle (hh:mm:ss.mmm).

	This takes a lot of optional arguments:

	* sepChar is the char separating the components
	* secondFracs is the number for fractional seconds to generate
	* truncate can be set to True if fractional seconds should be truncated
	  rather then rounded (as necessary for building IAU identifiers)

	>>> degToHms(0, sepChar=":")
	'00:00:00.000'
	>>> degToHms(122.057, secondFracs=1)
	'08 08 13.7'
	>>> degToHms(122.057, secondFracs=1, truncate=True)
	'08 08 13.6'
	>>> degToHms(-0.055, secondFracs=0)
	'-00 00 13'
	>>> degToHms(-0.055, secondFracs=0, truncate=True)
	'-00 00 13'
	>>> degToHms(-1.056, secondFracs=0)
	'-00 04 13'
	>>> degToHms(-1.056, secondFracs=0)
	'-00 04 13'
	>>> degToHms(359.9999999)
	'24 00 00.000'
	>>> degToHms(359.2222, secondFracs=4, sepChar=":")
	'23:56:53.3280'
	>>> "%.4f"%hmsToDeg(degToHms(256.25, secondFracs=9))
	'256.2500'
	"""
	if truncate:
		return angle_formats.hours_to_string(
			deg/360*24, sep=(sepChar, sepChar),
			precision=secondFracs+8, pad=True)[:-8].rstrip(".")

	return angle_formats.hours_to_string(
		deg/360*24, sep=(sepChar, sepChar), precision=secondFracs, pad=True)


[docs]@codetricks.document
def hoursToHms(decimal_hours: float,
		sepChar: str=":", secondFracs:int =0) -> str:
	"""returns a time span in hours in sexagesmal time (h:m:s).

	The optional arguments are as for degToHms.

	>>> hoursToHms(0)
	'00:00:00'
	>>> hoursToHms(23.5)
	'23:30:00'
	>>> hoursToHms(23.55)
	'23:33:00'
	>>> hoursToHms(23.525)
	'23:31:30'
	>>> hoursToHms(23.553, secondFracs=2)
	'23:33:10.80'
	>>> hoursToHms(123.553, secondFracs=2)
	'123:33:10.80'
	"""
	return angle_formats.hours_to_string(
		decimal_hours, sep=(sepChar, sepChar), precision=secondFracs, pad=True)


[docs]@codetricks.document
def degToDms(deg: float,
		sepChar: str=" ",
		secondFracs: int=2,
		preserveLeading: bool=False,
		truncate: bool=False,
		addSign: bool=True) -> str:
	"""converts a float angle in degrees to a sexagesimal string.

	This takes a lot of optional arguments:

	* sepChar is the char separating the components
	* secondFracs is the number for fractional seconds to generate
	* preserveLeading can be set to True if leading zeroes should be
	  preserved
	* truncate can be set to True if fractional seconds should be truncated
	  rather then rounded (as necessary for building IAU identifiers)
	* addSign, if true, makes the function return a + in front of positive
	  values (the default)

	>>> degToDms(-3.24722, "", 0, True, True)
	'-031449'
	>>> degToDms(0)
	'+0 00 00.00'
	>>> degToDms(0, addSign=False)
	'0 00 00.00'
	>>> degToDms(-0.25, sepChar=":")
	'-0:15:00.00'
	>>> degToDms(-23.50, secondFracs=4)
	'-23 30 00.0000'
	>>> "%.4f"%dmsToDeg(degToDms(-25.6835, sepChar=":"), sepChar=":")
	'-25.6835'
	"""
	if truncate:
		secondFracs += 8

	fmted = angle_formats.degrees_to_string(
		deg, sep=(sepChar, sepChar), precision=secondFracs,
		pad=preserveLeading)

	if truncate:
		fmted = fmted[:-8].rstrip(".")

	if addSign and deg>=0:
		fmted = "+"+fmted
	
	return fmted


[docs]@codetricks.document
def makeIAUId(prefix: str,
		long: float, lat: float,
		longSec: int=0, latSec: int=0) -> str:
	"""returns an (equatorial) IAU identifier for an object at long and lat.

	The rules are given on https://cds.unistra.fr/Dic/iau-spec.html

	The prefix, including the system identifier, you have to pass in.
	You cannot build identifiers using only minutes precision.
	If you want to include sub-arcsec precision, pass in longSec and/or
	latSec (the number of factional seconds to preserve).
	"""
	longPart = degToHms(long, "", longSec, True)
	latPart = degToDms(lat, "", latSec, True, True)
	return f"{prefix}{longPart}{latPart}"


[docs]def datetimeToRFC2616(dt: datetime.datetime) -> str:
	"""returns a UTC datetime object in the format required by http.

	This may crap when you fuzz with the locale.  In general, when handling
	"real" times within the DC, prefer unix timestamps over datetimes and
	use the other ``*RFC2616`` functions.
	"""
	return dt.strftime('%a, %d %b %Y %H:%M:%S GMT')


[docs]def parseRFC2616Date(s: str) -> float:
	"""returns seconds since unix epoch representing UTC from the HTTP-compatible
	time specification s.
	"""
	parts = emailutils.parsedate_tz(s)
	assert parts is not None
	return emailutils.mktime_tz(parts)


# The following timegm implementation is due to Frederik Lundh
def _d(y: int, m: int, d: int,
		days: Tuple[int, ...]=(0,31,59,90,120,151,181,212,243,273,304,334,365)
		) -> int:
	return (((y - 1901)*1461)//4 + days[m-1] + d + (
		(m > 2 and not y % 4 and (y % 100 or not y % 400)) and 1))

[docs]def timegm(tm: time.struct_time, epoch: float=_d(1970,1,1)):
		year, month, day, h, m, s = tm[:6]
		return (_d(year, month, day) - epoch)*86400 + h*3600 + m*60 + s


[docs]def formatRFC2616Date(secs: Optional[float]=None) -> str:
	"""returns an RFC2616 date string for UTC seconds since unix epoch.
	"""
	if secs is None:
		secs = time.time()
	return emailutils.formatdate(secs, localtime=False, usegmt=True)


_isoDTRE = re.compile(r"(?P<year>\d\d\d\d)-?(?P<month>\d\d)-?(?P<day>\d\d)"
		r"(?:[T ](?P<hour>\d\d):?(?P<minute>\d\d):?"
		r"(?P<seconds>\d\d)(?P<secFracs>\.\d*)?Z?(\+00:00)?)?$")


[docs]@codetricks.document
def parseISODT(literal: str, useTime: bool=False) -> datetime.datetime:
	"""returns a datetime object for a ISO time literal.

	There's no real timezone support yet, but we accept and ignore various
	ways of specifying UTC.

	By default, this uses plain python datetime because it usually covers a large
	date range than the time module.  The downside is that it does not know about
	leap seconds.  Pass useTime=True to go through time tuples, which know how to
	deal with them (but may not deal with dates far in the past or future).

	>>> parseISODT("1998-12-14")
	datetime.datetime(1998, 12, 14, 0, 0)
	>>> parseISODT("1998-12-14T13:30:12")
	datetime.datetime(1998, 12, 14, 13, 30, 12)
	>>> parseISODT("1998-12-14T13:30:12Z")
	datetime.datetime(1998, 12, 14, 13, 30, 12)
	>>> parseISODT("1998-12-14T13:30:12.224Z")
	datetime.datetime(1998, 12, 14, 13, 30, 12, 224000)
	>>> parseISODT("19981214T133012Z")
	datetime.datetime(1998, 12, 14, 13, 30, 12)
	>>> parseISODT("19981214T133012+00:00")
	datetime.datetime(1998, 12, 14, 13, 30, 12)
	>>> parseISODT("2016-12-31T23:59:60")
	Traceback (most recent call last):
	ValueError: second must be in 0..59
	>>> parseISODT("2016-12-31T23:59:60", useTime=True)
	datetime.datetime(2017, 1, 1, 0, 0)
	>>> parseISODT("junk")
	Traceback (most recent call last):
	ValueError: Bad ISO datetime literal: junk (required format: yyyy-mm-ddThh:mm:ssZ)
	"""
	if isinstance(literal, datetime.datetime):
		return literal

	literal = literal.rstrip("Z")
	mat = _isoDTRE.match(literal.strip())
	if not mat:
		raise ValueError("Bad ISO datetime literal: %s"
			" (required format: yyyy-mm-ddThh:mm:ssZ)"%literal)
	parts = mat.groupdict()
	if parts["hour"] is None:
		parts["hour"] = parts["minute"] = parts["seconds"] = 0
	if parts["secFracs"] is None:
		parts["secFracs"] = 0
	else:
		parts["secFracs"] = "0"+parts["secFracs"]
	
	if useTime:
		return datetime.datetime.utcfromtimestamp(
			calendar.timegm((
				# type calculus broken for RE results in mypy 1.0.1
				int(parts["year"]), int(parts["month"]), int(parts["day"]),  # type: ignore
				int(parts["hour"]), int(parts["minute"]), int(parts["seconds"]),  # type: ignore
				-1, -1, -1)))

	else:
		return datetime.datetime(int(parts["year"]), int(parts["month"]), # type: ignore
			int(parts["day"]), int(parts["hour"]), int(parts["minute"]),    # type: ignore
			int(parts["seconds"]), int(float(parts["secFracs"])*1000000))   # type: ignore


_SUPPORTED_DT_FORMATS =[
	'%Y-%m-%dT%H:%M:%S',
	'%Y-%m-%d %H:%M:%S',
	'%Y-%m-%d',]

[docs]def parseDefaultDatetime(literal: Optional[Union[str, datetime.datetime]]
		) -> Optional[datetime.datetime]:
	"""returns a datetime from string or passes through datetimes and Nones.

	The function will try to parse a string in various ways; we will
	try not to drop formats from one minor version to the next.
	"""
	if literal is None or isinstance(literal, datetime.datetime):
		return literal
	if literal.endswith("Z"):
		literal = literal[:-1]

	for format in _SUPPORTED_DT_FORMATS:
		try:
			return datetime.datetime(
				*time.strptime(literal, format)[:6])
		except ValueError:
			pass

	return parseISODT(literal)


[docs]def parseDefaultDate(literal: Optional[Union[str, datetime.date]]
		) -> Optional[datetime.date]:
	"""parseDefaultDatetime's little sister.
	"""
	if literal is None or isinstance(literal, datetime.date):
		return literal
	return datetime.date(*time.strptime(literal, '%Y-%m-%d')[:3])


[docs]def parseDefaultTime(literal: Optional[Union[str, datetime.time]]
		) -> Optional[datetime.time]:
	"""parseDefaultDatetime's other little sister.
	"""
	if literal is None or isinstance(literal, datetime.time):
		return literal
	# as long as we're building on top of time, we can't do fractional seconds
	return datetime.time(*time.strptime(literal, '%H:%M:%S')[3:6])


[docs]def roundToSeconds(dt: datetime.datetime) -> datetime.datetime:
	"""returns a datetime instance rounded to whole seconds.

	This also recklessly clears any time zone marker.  So, don't pass
	in anything with a meaningful time zone.
	"""
	if dt.microsecond>500000:
		return dt.replace(microsecond=0, tzinfo=None
			)+datetime.timedelta(seconds=1)
	else:
		return dt.replace(microsecond=0, tzinfo=None)


[docs]def formatISODT(dt: datetime.datetime) -> str:
	"""returns some ISO8601 representation of a datetime instance.

	The reason for preferring this function over a simple str is that
	datetime's default representation is too difficult for some other
	code (e.g., itself); hence, this code suppresses any microsecond part
	and always adds a Z (where strftime works, utils.isoTimestampFmt produces
	an identical string).

	The behaviour of this function for timezone-aware datetimes is undefined.

	For convenience, None is returned as None.

	Also for convenience, you can pass in a string; this will then be parsed
	first, which provides both some basic format validation and guaranteed
	DALI-compliant serialisation.

	>>> formatISODT(datetime.datetime(2015, 10, 20, 12, 34, 22, 250))
	'2015-10-20T12:34:22Z'
	>>> formatISODT(datetime.datetime(1815, 10, 20, 12, 34, 22, 250))
	'1815-10-20T12:34:22Z'
	>>> formatISODT(datetime.datetime(2018, 9, 21, 23, 59, 59, 640000))
	'2018-09-22T00:00:00Z'
	"""
	if dt is None:
		return None
	if isinstance(dt, str):
		dt = parseDefaultDatetime(dt)

	return roundToSeconds(dt).isoformat()+"Z"


[docs]@codetricks.document
def formatFloat(f: float) -> str:
	"""returns floating-point numbers somewhat suitable for human consumption.

	The idea of this function is to slowly migrate ad-hoc formatting of
	this kind that's in all kind of different places in DaCHS here and
	thus have a central knob we can eventually use to adapt to tastes.

	>>> formatFloat(1)
	'1'
	>>> formatFloat(1/3)
	'0.333333'
	>>> formatFloat(-1/3e20)
	'-3.33333e-21'
	>>> import math;formatFloat(math.pi)
	'3.14159'
	>>> formatFloat(20000000.23)
	'2e+07'
	"""
	return "{:.6g}".format(f)


[docs]class NameMap(object):
	"""is a name mapper fed from a simple text file.

	The text file format simply is:

	<target-id> "TAB" <src-id>{whitespace <src-id>}

	src-ids have to be encoded quoted-printable when they contain whitespace
	or other "bad" characters ("="!).  You can have #-comments and empty
	lines.

	The file is supposed to be ASCII, with non-ASCII encoded quoted-printable.
	The qp-decoded strings are assumed to be utf-8 encoded, but there's a
	constructor argument to change that.
	"""
	def __init__(self, src: Filename, missingOk: bool=False, enc: str="utf-8"):
		self._parseSrc(src, missingOk, enc)
	
	def __contains__(self, name: str) -> bool:
		return name in self.namesDict

	def _parseSrc(self, src: Filename, missingOk: bool, enc: str) -> None:
		self.namesDict: StrToStrMap = {}
		try:
			f = open(src, "rb")
		except IOError:
			if not missingOk:
				raise
			else:
				return
		try:
			for ln in f:
				if ln.startswith(b"#") or not ln.strip():
					continue
				ob, names = re.split(b"\t+", ln)
				for name in names.lower().split():
					self.namesDict[quopri.decodestring(name).decode(enc)
						] = ob.decode(enc)
		except ValueError:
			raise misctricks.logOldExc(ValueError(
				"Syntax error in %s: Line %s not understood."%(src, repr(ln))))
		f.close()
	
[docs]	def resolve(self, name: str) -> str:
		return self.namesDict[name.lower()]


_STANDARD_ENTITIES = {
		'lt': '<',
		'gt': '>',
		'amp': '&',
		'apos': "'",
		'quot': '"',
}


def _decodeEntityref(matob: re.Match) -> str:
	entRef = matob.group(1)
	if entRef in _STANDARD_ENTITIES:
		return _STANDARD_ENTITIES[entRef]
	elif entRef.startswith("#x"):
		return chr(int(entRef[2:], 16))
	elif entRef.startswith("#"):
		return chr(int(entRef[1:]))
	else:
		raise ValueError("Unknown entity reference: &%s;"%entRef)


[docs]def replaceXMLEntityRefs(unicodeString: str) -> str:
	"""retplaces all known HTML entities in unicodeString with actual unicode
	chars.

	(and dies on unknown entities).

	TODO: this is unused and probably not very useful to clients.  Discard?
	"""
	return entityrefPat.sub(_decodeEntityref, unicodeString)


[docs]def ensureOneSlash(s: str) -> str:
	"""returns s with exactly one trailing slash.
	"""
	return s.rstrip("/")+"/"


_SimpleTextGenerator = Generator[Tuple[int, str], None, None]

def _iterSimpleTextNoContinuation(f: TextIO
		) -> _SimpleTextGenerator:
	"""helps iterSimpleText.
	"""
	for (lineNumber, curLine) in enumerate(f):
		curLine = curLine.strip()
		if curLine and not curLine.startswith("#"):
			yield (lineNumber+1), curLine


[docs]@codetricks.document
def iterSimpleText(f: TextIO) -> _SimpleTextGenerator:
	"""iterates over ``(physLineNumber, line)`` in f with some usual
	conventions for simple data files.

	You should use this function to read from simple configuration and/or
	table files that don't warrant a full-blown grammar/rowmaker combo.
	The intended use is somewhat like this::
		
		with open(rd.getAbsPath("res/mymeta")) as f:
			for lineNumber, content in iterSimpleText(f):
				try:
					...
				except Exception, exc:
					sys.stderr.write("Bad input line %s: %s"%(lineNumber, exc))

	The grammar rules are, specifically:

	* leading and trailing whitespace is stripped
	* empty lines are ignored
	* lines beginning with a hash are ignored
	* lines ending with a backslash are joined with the following line;
	  to have intervening whitespace, have a blank in front of the backslash.
	"""
	iter = _iterSimpleTextNoContinuation(f)
	try:
		while True:
			lineNumber, curLine = next(iter)

			while curLine.endswith("\\"):
				try:
					lineNumber, newStuff = next(iter)
				except StopIteration:
					raise SourceParseError("File ends with a backslash",
						location="line %d"%lineNumber)
				curLine = curLine[:-1]+newStuff

			yield lineNumber, curLine
	except StopIteration:  # all done, leave loop
		pass


_RANDOM_STRING_OK_CHARS = string.ascii_letters+string.digits+"_.,"

[docs]def getRandomString(length: int) -> str:
	"""returns a random string of harmless printable characters.
	"""
	return "".join(
		random.choice(_RANDOM_STRING_OK_CHARS) for c in range(length))


[docs]def safe_str(val: Any) -> str:
	"""returns a reasonable string from pretty much anything.
	"""
	if isinstance(val, str):
		return val
	elif isinstance(val, bytes):
		return val.decode("ascii", "ignore")
	else:
		return str(val)


[docs]def bytify(s: Union[str, bytes]) -> bytes:
	"""returns s utf-8 encoded if it is a string, unmodified otherwise.
	"""
	if isinstance(s, str):
		return s.encode("utf-8")
	return s


[docs]def debytify(b: Union[str, bytes], enc: str="ascii"):
	"""returns a bytestring b as a normal string.

	This will return b unless it's bytes.  If it is bytes, it will be
	decoded as enc (and the thing will fail when the encoding is wrong).
	"""
	if isinstance(b, bytes):
		return b.decode(enc, "ignore")
	return b


DEFUSE_NONPRINTABLE = bytes.maketrans(bytes(range(32)), b" "*32)

[docs]def defuseFileName(fName: Union[Filename, bytes], replaceSlash: bool=True):
	"""returns fName without any non-ASCII or slashes but in a way that
	people can still work out what the file name has been.

	This is basically a quoted-printable encoding.  What's returned is
	a string that's guaranteed to be ASCII only.  With replaceSlash=False,
	it can also double as a reasonable asciification.
	"""
	if isinstance(fName, pathlib.Path):
		fName = str(fName)

	nameBytes = fName.encode("utf-8") if isinstance(fName, str) else fName
	res = quopri.encodestring(nameBytes
		).translate(DEFUSE_NONPRINTABLE
		).decode("ascii")
	if replaceSlash:
		res = res.replace('/', "=2F")
	return res


[docs]def parseAccept(aString: str) -> StrToStrMap:
	"""parses an RFC 2616 accept header and returns a dict mapping media
	type patterns to their (unparsed) parameters.

	If aString is None, an empty dict is returned

	If we ever want to do fancy things with http content negotiation, this
	will be further wrapped to provide something implementing the complex
	RFC 2616 rules; this primitive interface really is intended for telling
	apart browsers (which accept text/html) from other clients (which
	hopefully do not) at this point.

	>>> sorted(parseAccept("text/html, text/*; q=0.2; level=3").items())
	[('text/*', 'q=0.2; level=3'), ('text/html', '')]
	>>> parseAccept(None)
	{}
	"""
	res = {}
	if aString is not None:
		for item in aString.split(","):
			if ";" in item:
				key, params = item.split(";", 1)
			else:
				key, params = item, ""
			res[key.strip()] = params.strip()
	
	return res


[docs]def parseMediaType(mediaType: str) -> Tuple[str, dict]:
	"""returns a pair of basic, lowercased media type and a parameter
	dictionary for an RFC 2045 media type.

	Except for lowercasing the type, normalisation follows whatever the mail
	package does, which is probably too little.

	Regrettably, EmailMessage's parsing code silently falls back to text/plain
	when it cannot make out anything.  We can't use that, so we have extra
	logic that doubts text/plain and raises a ValueError if it suspects
	that's a hallucination.

	>>> parseMediaType("application/x-VOTable+xml;serialisation=tabledata; content=datalink")
	('application/x-votable+xml', {'serialisation': 'tabledata', 'content': 'datalink'})
	>>> parseMediaType("not a media type at all.")
	Traceback (most recent call last):
	ValueError: Cannot parse media type 'not a media type at all.'
	"""
	# yeah, it's weird to construct an email message here, but it's what
	# the purgers of the cgi module wanted
	m = emailmessage.EmailMessage()
	m.add_header("content-type", mediaType)
	baseType = "{}/{}".format(m.get_content_maintype(), m.get_content_subtype())
	if baseType=="text/plain":
		if not mediaType.lower().startswith(baseType):
			raise ValueError(f"Cannot parse media type '{mediaType}'")

	return baseType, dict(m.get("content-type").params)


if __name__=="__main__": # pragma: no cover
	import doctest
	doctest.testmod()
Source code for gavo.utils.texttricks

gavo

Navigation

Related Topics