# tzparse.py

# Copyright The IETF Trust 2012-2019, All Rights Reserved
#!/usr/bin/env python

"""
NAME
    tzparse

SYNOPSIS
    >>> tzparse("2008-09-08 14:40:35 +0200", "%Y-%m-%d %H:%M:%S %Z")
    datetime.datetime(2008, 9, 8, 14, 40, 35, tzinfo=pytz.FixedOffset(120))
    
    >>> print(tzparse("14:40:35 CEST, 08 Sep 2008", "%H:%M:%S %Z, %d %b %Y"))
    2008-09-08 14:40:35+02:00

DESCRIPTION
    This describes the python 'tzparse' module. It exports only one function: tzparse().

    tzparse() parses a string according to a specified format, exactly as time.strptime()
    does, but with the added capability to parse most common timezone specifications,
    such as 'UTC', the standard timezones ('NST', 'EST', 'CST', 'MST', 'PST', 'HNY'
    [North America], 'WET', 'CET', 'EET', 'MSK' [Europe], and more), the summer timezones
    ('CEST', 'EEST', 'EDT', 'PDT' etc.), military timezones ('A' .. 'Z') and numeric
    timezone indications ('+0200', '-0700', '-03:30' etc.).

    The time zone specification may be placed anywhere, not only at the end.

    tzparse() calls time.strptime() to parse everything except the timezone. To parse
    the timezone, it first tries to use the pytz module, but if that doesn't give
    any joy, it falls back to a hardcoded list of common time zone abbreviations and
    their offset from UTC.

BUGS

    * tzparse() cannot parse all valid RFC 3339 formats: it doesn't extract
    fractional seconds, and the underlying time.strptime() doesn't parse fractional
    seconds.

    * Parsing according to format specifications using the generic %c, %x and %X
    specifiers will only succeed if there are explicit delimiting characters
    between the %Z specifier and the %c, %x or %X part.

COPYRIGHT
    Copyright 2009 Henrik Levkowetz

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
"""

import re
import time
from datetime import datetime as Datetime, timedelta as Timedelta # For re-export.  pyflakes:ignore
import pytz

# Fallback table mapping timezone abbreviations (including the single-letter
# military zones 'A' .. 'Z') to their offset from UTC, written as "+HHMM" or
# "-HHMM".  tzparse() consults it only after pytz fails to recognise a name.
#
# Several abbreviations are ambiguous.  This table used to list them more than
# once and relied on the last duplicate key of the dict literal winning; only
# the effective values are kept here, so the resulting mapping is unchanged:
#   CDT -> -0500  (shadowed: +1030)
#   CST -> -0600  (shadowed: +1030, +0930)
#   EDT -> -0400  (shadowed: +1100)
#   EST -> -0500  (shadowed: +1100, +1000)
#   WST -> +0800  (shadowed: +0900)
tzdef = {
    "A": "+0100", "ACDT": "+1030", "ACST": "+0930", "ADT": "-0300",
    "AEDT": "+1100", "AEST": "+1000", "AKDT": "-0800", "AKST": "-0900",
    "AST": "-0400", "AWDT": "+0900", "AWST": "+0800", "B": "+0200",
    "BST": "+0100", "C": "+0300", "CDT": "-0500",
    "CEDT": "+0200", "CEST": "+0200", "CET": "+0100", "CST": "-0600",
    "CXT": "+0700", "D": "+0400",
    "E": "+0500", "EDT": "-0400", "EEDT": "+0300",
    "EEST": "+0300", "EET": "+0200", "EST": "-0500",
    "F": "+0600", "G": "+0700", "GMT": "+0000",
    "H": "+0800", "HAA": "-0300", "HAC": "-0500", "HADT": "-0900",
    "HAE": "-0400", "HAP": "-0700", "HAR": "-0600", "HAST": "-1000",
    "HAT": "-0230", "HAY": "-0800", "HNA": "-0400", "HNC": "-0600",
    "HNE": "-0500", "HNP": "-0800", "HNR": "-0700", "HNT": "-0330",
    "HNY": "-0900", "I": "+0900", "IST": "+0100", "K": "+1000",
    "L": "+1100", "M": "+1200", "MDT": "-0600", "MESZ": "+0200",
    "MEZ": "+0100", "MSD": "+0400", "MSK": "+0300", "MST": "-0700",
    "N": "-0100", "NDT": "-0230", "NFT": "+1130", "NST": "-0330",
    "O": "-0200", "P": "-0300", "PDT": "-0700", "PST": "-0800",
    "Q": "-0400", "R": "-0500", "S": "-0600", "T": "-0700",
    "U": "-0800", "UTC": "+0000", "V": "-0900", "W": "-1000",
    "WDT": "+0900", "WEDT": "+0100", "WEST": "+0100", "WET": "+0000",
    "WST": "+0800", "X": "-1100", "Y": "-1200",
    "Z": "+0000",
}


def _fmt2pat(fmt):
    """Translate a strptime() format fragment into regular expression source.

    Numeric directives become ``\\d+``, name directives (%a %A %b %B %p)
    become ``\\w+``, the locale-dependent %c, %x and %X become ``.+`` and
    ``%%`` becomes a literal percent sign.  All other text -- including
    directives not listed above -- is regex-escaped so that characters such
    as ``(``, ``.`` or ``+`` in the format are matched literally.
    """
    pieces = []
    # re.split() with a capturing group puts literal text at even indices and
    # the two-character '%x' directives at odd indices.
    for index, token in enumerate(re.split(r"(%.)", fmt)):
        if index % 2 == 0:
            pieces.append(re.escape(token))
            continue
        code = token[1]
        if code in "dHIjmMSUwWyY":
            pieces.append(r"\d+")
        elif code in "aAbBp":
            pieces.append(r"\w+")
        elif code in "cxX":
            pieces.append(".+")
        elif code == "%":
            pieces.append("%")
        else:
            pieces.append(re.escape(token))
    return "".join(pieces)


def tzparse(string, format):
    """
    Given a time specification string and a format, tzparse() returns a localized
    datetime.datetime.

    The format is a time.strptime() format in which %Z marks the position of
    the timezone.  The text before and after %Z is located in the string with
    patterns derived from the format, handed to time.strptime(), and whatever
    lies in between is taken as the timezone specification.

    The timezone specification is resolved in this order:

      1. as a name known to pytz.timezone(),
      2. as an abbreviation listed in tzdef,
      3. as a numeric offset of the form +HHMM, -HHMM, +HH:MM or -HH:MM.

    NOTE(review): because pytz is tried first, an abbreviation that pytz also
    knows as a zone name is localized by pytz's rules rather than by the fixed
    offset in tzdef -- confirm this is the intended precedence.

    If the format has no %Z, or the timezone part of the string is empty,
    time.tzname[0] (the local non-DST timezone name) is used instead, so the
    result then depends on the timezone of the machine running the code.

    Raises ValueError if time.strptime() cannot parse the date/time part, if
    the format contains more than one %Z, or if the timezone specification
    is not recognised.

    >>> print(tzparse("9 Oct 2009 CEST 13:58", "%d %b %Y %Z %H:%M"))
    2009-10-09 13:58:00+02:00

    >>> print(tzparse("9 Oct 2009 13:58:00 Europe/Stockholm", "%d %b %Y %H:%M:%S %Z"))
    2009-10-09 13:58:00+02:00

    >>> print(tzparse("9 Oct 2009 13:58:00 +0200", "%d %b %Y %H:%M:%S %Z"))
    2009-10-09 13:58:00+02:00

    >>> print(tzparse("Fri, 9 Oct 2009 13:58:00 +0200", "%a, %d %b %Y %H:%M:%S %Z"))
    2009-10-09 13:58:00+02:00

    >>> print(tzparse("2009-10-09 13:58:00 EST", '%Y-%m-%d %H:%M:%S %Z'))
    2009-10-09 13:58:00-05:00

    >>> print(tzparse("2009-10-09 13:58:00+02:00", "%Y-%m-%d %H:%M:%S%Z"))
    2009-10-09 13:58:00+02:00

    >>> print(tzparse("1985-04-12T23:20:50Z", "%Y-%m-%dT%H:%M:%S%Z"))
    1985-04-12 23:20:50+00:00

    >>> print(tzparse("1996-12-19T16:39:57-08:00", "%Y-%m-%dT%H:%M:%S%Z"))
    1996-12-19 16:39:57-08:00

    Text that occurs both before and after the timezone is handled:

    >>> print(tzparse("9 CEST 9 2009", "%d %Z %m %Y"))
    2009-09-09 00:00:00+02:00

    Without %Z the local timezone is applied; the output below is what a
    machine configured for CET produces, so the example is not executed:

    >>> print(tzparse("1996-12-19T16:39:57", "%Y-%m-%dT%H:%M:%S"))  # doctest: +SKIP
    1996-12-19 16:39:57+01:00

    """
    # It's surprising that there's no tz parsing capability in the python standard
    # library...

    if "%Z" not in format:
        timetuple = time.strptime(string, format)
        tzstr = time.tzname[0]
    else:
        fmtparts = format.split("%Z")
        if len(fmtparts) != 2:
            raise ValueError(
                "Format '%s' must not contain more than one %%Z specifier" % format)
        frontfmt, backfmt = fmtparts

        # Peel the leading date/time text off the start of the string ...
        frontmatch = re.search("^" + _fmt2pat(frontfmt), string)
        frontstr = frontmatch.group(0) if frontmatch else ""
        rest = string[len(frontstr):]

        # ... and the trailing date/time text off the end of what remains.
        # Slicing by position (rather than str.replace()) keeps the timezone
        # intact even when the leading or trailing text occurs more than once.
        backmatch = re.search(_fmt2pat(backfmt) + "$", rest)
        if backmatch:
            backstr = backmatch.group(0)
            tzstr = rest[:backmatch.start()]
        else:
            backstr = ""
            tzstr = rest

        timetuple = time.strptime(frontstr + backstr, frontfmt + backfmt)
    dt = Datetime(*timetuple[:6])

    if not tzstr:
        # %Z was present in the format but matched nothing: use the local zone.
        tzstr = time.tzname[0]
    try:
        tz = pytz.timezone(tzstr)
    except KeyError:
        # Not a pytz zone name.  If we know the offset of the abbreviation,
        # fall back to that; otherwise treat the string itself as an offset.
        offset = tzdef.get(tzstr, tzstr)
        offsetmatch = re.search("^([+-])([0-9][0-9]):?([0-9][0-9])$", offset)
        if not offsetmatch:
            raise ValueError("Unknown timezone '%s'" % tzstr)
        sign, hours, minutes = offsetmatch.groups()
        # convert numeric timezone to minutes
        total = int(hours) * 60 + int(minutes)
        if sign == "-":
            total = -total
        tz = pytz.FixedOffset(total)

    return tz.localize(dt)

# Script entry point: with exactly two command line arguments (a time string
# and a format) print the parsed result; otherwise run the module's doctests.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Running module tests:\n")
        import doctest
        print(doctest.testmod())
    else:
        timestring, timeformat = cli_args
        print(tzparse(timestring, timeformat))