Last active
February 21, 2019 00:15
-
-
Save smcateer/8930c8f4fcb96830e5e51345352df776 to your computer and use it in GitHub Desktop.
Cheap and nasty function for grabbing public holiday data from data.gov.au and converting it into a more usable format (as a pandas.DataFrame).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def load_ph_data():
    """
    Grab public holiday data from data.gov.au and convert it into a more
    usable format (as a pandas.DataFrame).

    Returns:
        pandas.DataFrame with columns APPLICABLE_TO, DATE, HOLIDAY_NAME,
        INFORMATION and MORE_INFORMATION — one row per (state, date),
        de-duplicated and sorted; DATE is a 'YYYY-MM-DD' string.

    Note: performs network I/O against data.gov.au on every call.
    """
    import pandas as pd
    # FIX: plain `import urllib` does NOT make `urllib.request` available in
    # Python 3 — the original `import urllib as ul` only worked when another
    # library had already imported the submodule as a side effect.
    import urllib.request
    import json
    import io
    import re
    # Grab the metadata from data.gov.au:
    # https://data.gov.au/dataset/australian-holidays-machine-readable-dataset
    # The CSVs change over time (more are added each year), but the metadata
    # should provide a reliable list of the required CSVs at any given
    # time ... hopefully.
    # ====
    # The metadata URL. This lists the URLs of the CSV files, which are added
    # to over time. Avoids the need to hard-code the individual CSV URLs
    # (which would require maintenance).
    print('Getting metadata ...', end='')
    url = 'https://data.gov.au/api/3/action/package_show?id=b1bc6077-dadd-4f61-9f8c-002ab2cdff10'
    # `with` ensures the HTTP response is closed even if read() raises.
    with urllib.request.urlopen(url) as resp:
        metadata = resp.read()
    # Metadata is in JSON format; unpack it to a dict.
    metadata_dict = json.loads(metadata)
    # Grab the CSV URLs.
    urls = [x['url'] for x in metadata_dict['result']['resources']]
    # How many did we get? (%d, not %g — these are integer counts.)
    print(' %d CSV URLs found.' % len(urls))
    # Do they all look like CSVs? Raw string avoids the invalid '\.' escape
    # warning the original non-raw pattern triggers on modern Python.
    if any(not re.search(r'\.csv$', u) for u in urls):
        print('Not all URLs were CSVs:')
        for u in urls:
            print(u)
    # Grab the data from data.gov.au
    # ====
    # Download the CSVs and union them (there is overlap, handled later).
    dfs = []
    for i, csv_url in enumerate(urls):
        print('Getting CSV %d of %d ...' % (i + 1, len(urls)), end='')
        # Pull the CSVs.
        with urllib.request.urlopen(csv_url) as resp:
            data_dl = resp.read()
        df_dl = pd.read_csv(io.BytesIO(data_dl), encoding='latin-1')
        # In some years the column 'Applicable To' is called 'Jurisdiction'
        # with lower-case values — normalise both the name and the case.
        df_dl = df_dl.rename(columns={'Jurisdiction': 'Applicable To'})
        df_dl.loc[:, 'Applicable To'] = df_dl.loc[:, 'Applicable To'].str.upper()
        dfs.append(df_dl)
        print(' done.')
    data = pd.concat(dfs)
    del dfs
    # The states ('Applicable To') are listed in the format 'VIC|NSW|...'
    # except where a holiday applies to all states, recorded as 'NAT'.
    # Replace the 'NAT's with the explicit bar-delimited list of all states.
    # ====
    # Form the bar-delimited version of 'NAT'.
    states = data.loc[:, 'Applicable To'].unique()
    all_states = '|'.join(x for x in states if len(x) < 4 and x != 'NAT')
    # Replace the 'NAT's.
    data.loc[data.loc[:, 'Applicable To'] == 'NAT', 'Applicable To'] = all_states

    # A fn for replicating a column with delimited values (one copy for each
    # delimited var), e.g. [['A', '1,2']] --> [['A', '1'], ['A', '2']].
    # Here we use it to split the bar-delimited states.
    # from https://gist.github.com/jlln/338b4b0b55bd6984f883
    def split_data_frame_list(df, target_column, separator):
        """
        df = dataframe to split,
        target_column = the column containing the values to split
        separator = the symbol used to perform the split
        returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
        """
        def split_list_to_rows(row, row_accumulator, target_column, separator):
            # One new row per delimited value; all other columns copied as-is.
            split_row = row[target_column].split(separator)
            for s in split_row:
                new_row = row.to_dict()
                new_row[target_column] = s
                row_accumulator.append(new_row)
        new_rows = []
        df.apply(split_list_to_rows, axis=1, args=(new_rows, target_column, separator))
        return pd.DataFrame(new_rows)

    # Split/replicate the state entries.
    data = split_data_frame_list(data, 'Applicable To', '|')
    # Remove duplicated (state, date) entries, and sort.
    data = (data.loc[~data.loc[:, ['Applicable To', 'Date']].duplicated()]
                .sort_values(['Applicable To', 'Date'])
                .reset_index(drop=True))
    # Just cosmetic stuff from here
    # ===
    # Change the YYYYMMDD dates to YYYY-MM-DD format.
    data.Date = data.Date.map(
        lambda x: '-'.join(re.split(r'(\d{4})(\d{2})(\d{2})', str(x))[1:-1]))
    # Rename the columns.
    data.columns = data.columns.str.upper().str.replace(' ', '_')
    # Drop unneeded columns.
    data = data.loc[:, ['APPLICABLE_TO', 'DATE', 'HOLIDAY_NAME', 'INFORMATION', 'MORE_INFORMATION']]
    return data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment