nattaylor/ndbc.py

## ndbc.py
"""Work with NDBC historical data"""

import io
import csv
import datetime
import gzip
import requests
import re
from typing import Iterable, Optional

class NdbcStdmet:
    """Load NDBC historical standard-meteorological (stdmet) observations.

    See: https://www.ndbc.noaa.gov/historical_data.shtml#stdmet

    Usage:

    >>> ndbc = NdbcStdmet('44013', 2020)
    >>> print(list(ndbc.load())[:3])
    [(datetime.datetime(2019, 12, 31, 23, 50), '44013', 5.4, 7.0 etc]

    The station is echoed back unchanged as the second element of every row,
    so it comes out in whatever type it was passed in.
    """
    # Archive location; formatted with the instance's station and year.
    URL = 'https://www.ndbc.noaa.gov/data/historical/stdmet/{station}h{year}.txt.gz'
    # Field texts that are reported as None instead of being converted to float.
    MISSING = ('99.00', '999', '999.9', '99.0', '999.0', '')
    # Measurement columns, in order, that every parsed row exposes after the
    # timestamp and station.
    HEADERS = ['WSPD', 'GST', 'WVHT', 'DPD', 'APD', 'MWD', 'PRES', 'ATMP', 'WTMP', 'DEWP', 'VIS', 'TIDE']
    # Seconds to wait on the server before giving up instead of hanging forever.
    TIMEOUT = 60

    station = None
    year = None

    def __init__(self, station, year):
        self.station = station
        self.year = year

    def load(self) -> Iterable[tuple]:
        """Download the station/year archive and return its parsed observations.

        The download and the header validation happen immediately; the data
        rows themselves are converted lazily as the result is iterated.

        Returns:
            An iterable of tuples ``(datetime, station, *measurements)`` with
            one measurement per entry of ``HEADERS`` (``None`` when missing).

        Raises:
            requests.RequestException: the download failed, timed out or the
                server answered with an error status.
            ValueError: the file has no header or an unexpected column layout.
        """
        url = self.URL.format(station=self.station, year=self.year)
        response = requests.get(url, timeout=self.TIMEOUT)
        # Fail on 4xx/5xx here; otherwise the error page would be fed to gzip
        # and surface as an unrelated "not a gzipped file" error.
        response.raise_for_status()
        text = io.TextIOWrapper(gzip.GzipFile(fileobj=io.BytesIO(response.content)), 'utf8')
        return self.parse(text)

    def parse(self, lines: Iterable[str]) -> Iterable[tuple]:
        """Parse already-decoded stdmet text (an iterable of lines).

        Args:
            lines: text lines of one stdmet file, header first. Any iterable
                of ``str`` works, including an open text file.

        Returns:
            Same shape as :meth:`load`.

        Raises:
            ValueError: the input has no header or an unexpected column layout.
        """
        reader = csv.reader(lines, delimiter=' ', skipinitialspace=True)

        header = next(reader, None)
        # A trailing blank on the header line yields one empty extra field;
        # data rows get the same treatment in _row below.
        if header and header[-1] == '':
            header = header[:-1]
        if not header:
            raise ValueError('Corrupt header.  No header row found')

        # Number of leading date/time fields: up to and including 'mm' when
        # the file has a minutes column, otherwise four.
        dt_last_col = header.index('mm') + 1 if 'mm' in header else 4

        # Must be decided BEFORE the '#' is stripped: a bare 'YY' column gets
        # '19' prepended to its year values, a '#YY' column does not.
        year_prefix = '19' if header[0] == 'YY' else ''

        # Remove # and skip row for new format starting in 2007 with 2-header rows and # prefix
        if header[0].startswith('#'):
            header[0] = header[0][1:]
            next(reader, None)

        # TIDE is often missing; pad it so every row has the same width.
        pad = 0
        if header[-1] != 'TIDE':
            header.append('TIDE')
            pad = 1

        # The single column right after the date/time fields is not part of
        # HEADERS and is skipped, both here and in every row.
        found = header[dt_last_col + 1:]
        if self.HEADERS != found:
            raise ValueError(f'Corrupt header.  Missing: {set(self.HEADERS) - set(found)}')

        def _float(s: str) -> Optional[float]:
            return None if s in self.MISSING else float(s)

        def _row(row: list) -> tuple:
            # Discard empty string extra column
            if row[-1] == '':
                row = row[:-1]
            # '' is listed in MISSING, so a padded TIDE comes out as None.
            row = row + [''] * pad
            dt = datetime.datetime(int(year_prefix + row[0]), *(int(r) for r in row[1:dt_last_col]))
            return (dt, self.station, *(_float(r) for r in row[dt_last_col + 1:]))

        # Blank or whitespace-only lines carry no observation; skip them.
        return (_row(row) for row in reader if any(row))
	"""Work with NDBC historical data"""

	import io
	import csv
	import datetime
	import gzip
	import requests
	import re
	from typing import Iterable, Optional

	class NdbcStdmet:
		"""Load NDBC historical standard-meteorological (stdmet) observations.

		See: https://www.ndbc.noaa.gov/historical_data.shtml#stdmet

		Usage:

		>>> ndbc = NdbcStdmet('44013', 2020)
		>>> print(list(ndbc.load())[:3])
		[(datetime.datetime(2019, 12, 31, 23, 50), '44013', 5.4, 7.0 etc]

		The station is echoed back unchanged as the second element of every row,
		so it comes out in whatever type it was passed in.
		"""
		# Archive location; formatted with the instance's station and year.
		URL = 'https://www.ndbc.noaa.gov/data/historical/stdmet/{station}h{year}.txt.gz'
		# Field texts that are reported as None instead of being converted to float.
		MISSING = ('99.00', '999', '999.9', '99.0', '999.0', '')
		# Measurement columns, in order, that every parsed row exposes after the
		# timestamp and station.
		HEADERS = ['WSPD', 'GST', 'WVHT', 'DPD', 'APD', 'MWD', 'PRES', 'ATMP', 'WTMP', 'DEWP', 'VIS', 'TIDE']
		# Seconds to wait on the server before giving up instead of hanging forever.
		TIMEOUT = 60

		station = None
		year = None

		def __init__(self, station, year):
			self.station = station
			self.year = year

		def load(self) -> Iterable[tuple]:
			"""Download the station/year archive and return its parsed observations.

			The download and the header validation happen immediately; the data
			rows themselves are converted lazily as the result is iterated.

			Returns:
				An iterable of tuples ``(datetime, station, *measurements)`` with
				one measurement per entry of ``HEADERS`` (``None`` when missing).

			Raises:
				requests.RequestException: the download failed, timed out or the
					server answered with an error status.
				ValueError: the file has no header or an unexpected column layout.
			"""
			url = self.URL.format(station=self.station, year=self.year)
			response = requests.get(url, timeout=self.TIMEOUT)
			# Fail on 4xx/5xx here; otherwise the error page would be fed to gzip
			# and surface as an unrelated "not a gzipped file" error.
			response.raise_for_status()
			text = io.TextIOWrapper(gzip.GzipFile(fileobj=io.BytesIO(response.content)), 'utf8')
			return self.parse(text)

		def parse(self, lines: Iterable[str]) -> Iterable[tuple]:
			"""Parse already-decoded stdmet text (an iterable of lines).

			Args:
				lines: text lines of one stdmet file, header first. Any iterable
					of ``str`` works, including an open text file.

			Returns:
				Same shape as :meth:`load`.

			Raises:
				ValueError: the input has no header or an unexpected column layout.
			"""
			reader = csv.reader(lines, delimiter=' ', skipinitialspace=True)

			header = next(reader, None)
			# A trailing blank on the header line yields one empty extra field;
			# data rows get the same treatment in _row below.
			if header and header[-1] == '':
				header = header[:-1]
			if not header:
				raise ValueError('Corrupt header. No header row found')

			# Number of leading date/time fields: up to and including 'mm' when
			# the file has a minutes column, otherwise four.
			dt_last_col = header.index('mm') + 1 if 'mm' in header else 4

			# Must be decided BEFORE the '#' is stripped: a bare 'YY' column gets
			# '19' prepended to its year values, a '#YY' column does not.
			year_prefix = '19' if header[0] == 'YY' else ''

			# Remove # and skip row for new format starting in 2007 with 2-header rows and # prefix
			if header[0].startswith('#'):
				header[0] = header[0][1:]
				next(reader, None)

			# TIDE is often missing; pad it so every row has the same width.
			pad = 0
			if header[-1] != 'TIDE':
				header.append('TIDE')
				pad = 1

			# The single column right after the date/time fields is not part of
			# HEADERS and is skipped, both here and in every row.
			found = header[dt_last_col + 1:]
			if self.HEADERS != found:
				raise ValueError(f'Corrupt header. Missing: {set(self.HEADERS) - set(found)}')

			def _float(s: str) -> Optional[float]:
				return None if s in self.MISSING else float(s)

			def _row(row: list) -> tuple:
				# Discard empty string extra column
				if row[-1] == '':
					row = row[:-1]
				# '' is listed in MISSING, so a padded TIDE comes out as None.
				row = row + [''] * pad
				dt = datetime.datetime(int(year_prefix + row[0]), *(int(r) for r in row[1:dt_last_col]))
				return (dt, self.station, *(_float(r) for r in row[dt_last_col + 1:]))

			# Blank or whitespace-only lines carry no observation; skip them.
			return (_row(row) for row in reader if any(row))