Skip to content

Instantly share code, notes, and snippets.

@marklap
Created August 9, 2021 21:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marklap/133f1dbd51113de460475321b467aa70 to your computer and use it in GitHub Desktop.
Save marklap/133f1dbd51113de460475321b467aa70 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2021 Mark LaPerriere
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################
"""
Converts an archaic base64 encoded Parquet timestamp (int96) to an iso8601 format. The value must first be
base64 decoded then split - the first 8 bytes are the nanoseconds since midnight and the last 4 bytes are
the Julian Day (https://en.wikipedia.org/wiki/Julian_day). Since python calculates ordinal dates using the
Rata Die system, the Julian Day value must first be converted to a Rata Die anchored Julian Day.
Requires: Python3.6+
"""
import argparse
import base64
import logging
import os
import sys
from datetime import datetime, timedelta
# Root logging is configured once at import time; the LOG_LEVEL environment
# variable (case-insensitive, defaults to INFO) selects the threshold.
logging.basicConfig(
    level=logging.getLevelName(os.getenv('LOG_LEVEL', 'INFO').upper()),
)
logger = logging.getLogger()

# Offset subtracted from a Julian Day to obtain the ordinal that
# datetime.fromordinal expects. Python's ordinal dates are anchored to
# Rata Die: https://en.wikipedia.org/wiki/Rata_Die
RATA_DIE = 1721424.5

# Divisor that turns a nanosecond count into microseconds.
NANOS_IN_MICROS = 1000

# Text handed to argparse when the command-line parser is built.
PROJECT_DESCRIPTION = (
    'Convert an Apache Impala Timestamp (int96) to ISO8601 format '
    '(ref: https://tiny.amazon.com/16uw64dw5/stacques5310cast5310)'
)
def bytes_to_int(data: bytearray) -> int:
    """Pack a big-endian byte sequence into a single unsigned int.

    The first byte of ``data`` is the most significant one. Python ints have
    arbitrary precision, so any length is accepted; empty input yields 0.

    :param data: bytes to pack, most significant byte first
    :return: the unsigned integer value of ``data``
    """
    # int.from_bytes is the standard-library equivalent of shifting each byte
    # into place and OR-ing the results together.
    return int.from_bytes(data, byteorder='big')
def julian_day_from_ts(data: bytes) -> int:
    """Extract the Julian Day from raw int96 timestamp data.

    The Julian Day occupies bytes 8-11 (the last 4 of the 12 bytes) and is
    read as a little-endian unsigned integer.

    :param data: the decoded timestamp bytes
    :return: the Julian Day number
    """
    # Decode little-endian directly instead of reversing a copy of the slice
    # and packing it big-endian.
    return int.from_bytes(data[8:12], byteorder='little')
def julian_day_to_datetime(jd: int) -> datetime:
    """Turn a Julian Day number into the datetime for that calendar day.

    The RATA_DIE offset is subtracted to move from Julian Day numbering to
    the ordinal numbering used by ``datetime.fromordinal``.

    :param jd: the Julian Day number
    :return: a datetime built from the resulting ordinal
    """
    # int() discards the fractional half day introduced by the offset.
    ordinal = int(jd - RATA_DIE)
    return datetime.fromordinal(ordinal)
def nanos_from_ts(data: bytes) -> int:
    """Extract the nanoseconds since midnight from raw int96 timestamp data.

    The nanosecond count occupies bytes 0-7 (the first 8 of the 12 bytes) and
    is read as a little-endian unsigned integer.

    :param data: the decoded timestamp bytes
    :return: the number of nanoseconds since midnight
    """
    # Decode little-endian directly instead of reversing a copy of the slice
    # and packing it big-endian.
    return int.from_bytes(data[0:8], byteorder='little')
def nanos_to_timedelta(ns: int) -> timedelta:
    """Build a timedelta that represents a nanosecond count.

    :param ns: duration in nanoseconds
    :return: the same duration as a timedelta
    """
    # timedelta has no nanosecond field, so scale down to microseconds first.
    micros = ns / NANOS_IN_MICROS
    return timedelta(microseconds=micros)
def combine_julian_day_and_nanos(jd: datetime, ns: timedelta) -> datetime:
    """Add the time-of-day offset to the day to get the full timestamp.

    :param jd: datetime derived from the Julian Day
    :param ns: timedelta derived from the nanoseconds since midnight
    :return: ``jd`` shifted forward by ``ns``
    """
    moment = jd + ns
    return moment
def parse_input(ts: str) -> bytes:
    """Decode raw user input into the 12 bytes of an int96 timestamp.

    :param ts: the base64 encoded timestamp
    :return: the 12 decoded bytes (8 bytes of nanoseconds, 4 bytes of Julian Day)
    :raises ValueError: if the input is not valid base64 (``binascii.Error``
        is a ``ValueError`` subclass) or does not decode to exactly 12 bytes
    """
    decoded = base64.b64decode(ts)
    # An int96 is exactly 12 bytes. Any other length would be sliced into
    # truncated fields and silently produce a wrong date, so reject it here.
    if len(decoded) != 12:
        raise ValueError(
            f'expected 12 bytes after base64 decoding, got {len(decoded)}')
    return decoded
def cli() -> argparse.Namespace:
    """Parse the command line.

    :return: the parsed arguments; ``timestamp`` is a one-element list holding
        the base64 encoded value (because of ``nargs=1``)
    """
    # The first positional parameter of ArgumentParser is ``prog``, so the
    # description has to be passed by keyword to appear as help text rather
    # than as the program name.
    parser = argparse.ArgumentParser(description=PROJECT_DESCRIPTION)
    parser.add_argument(nargs=1, dest='timestamp', metavar='BASE64_IMPALA_DATE',
                        help='base64 encode byte string')
    return parser.parse_args()
def main(args=None):
    """Convert the timestamp given on the command line and print it as ISO 8601.

    :param args: parsed arguments exposing ``timestamp`` as a one-element
        list; when omitted, the command line is parsed via ``cli()``
    :return: 0, used as the process exit status
    """
    if args is None:
        # The signature advertises a default, so honour it instead of failing
        # with AttributeError on ``None.timestamp``.
        args = cli()

    ts = parse_input(args.timestamp[0])
    logger.debug(f'Converted {args.timestamp[0]} TS to 0x{ts.hex()} bytes')

    # Date part: the Julian Day, converted to a datetime.
    jd = julian_day_from_ts(ts)
    jd_dt = julian_day_to_datetime(jd)
    logger.debug(f'Julian Day from TS: {jd} ({jd_dt})')

    # Time part: nanoseconds since midnight, converted to a timedelta.
    ns = nanos_from_ts(ts)
    ns_td = nanos_to_timedelta(ns)
    logger.debug(f'Nanos since Midnight of Julian Day: {ns} ({ns_td.seconds}s)')

    dt = combine_julian_day_and_nanos(jd_dt, ns_td)
    logger.debug(f'Datetime: {dt}')

    print(dt.isoformat())
    return 0
if __name__ == '__main__':
    # Parse the command line, run the conversion, and exit with its status.
    parsed_args = cli()
    exit_status = main(parsed_args)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment