Last active
February 21, 2019 00:15
-
-
Save smcateer/8930c8f4fcb96830e5e51345352df776 to your computer and use it in GitHub Desktop.
Cheap and nasty function for grabbing public holiday data from data.gov.au and converting it into a more usable format (as a pandas.DataFrame).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def load_ph_data():
    """
    Grab public holiday data from data.gov.au and convert it into a more
    usable format (as a pandas.DataFrame).

    Returns:
        pandas.DataFrame with columns APPLICABLE_TO, DATE, HOLIDAY_NAME,
        INFORMATION and MORE_INFORMATION — one row per (state, date),
        de-duplicated and sorted; DATE is a 'YYYY-MM-DD' string.

    Note: performs network I/O against data.gov.au on every call.
    """
    import pandas as pd
    # FIX: plain `import urllib` does NOT make `urllib.request` available in
    # Python 3 — the original `import urllib as ul` only worked when another
    # library had already imported the submodule as a side effect.
    import urllib.request
    import json
    import io
    import re
    # Grab the metadata from data.gov.au:
    # https://data.gov.au/dataset/australian-holidays-machine-readable-dataset
    # The CSVs change over time (more are added each year), but the metadata
    # should provide a reliable list of the required CSVs at any given
    # time ... hopefully.
    # ====
    # The metadata URL. This lists the URLs of the CSV files, which are added
    # to over time. Avoids the need to hard-code the individual CSV URLs
    # (which would require maintenance).
    print('Getting metadata ...', end='')
    url = 'https://data.gov.au/api/3/action/package_show?id=b1bc6077-dadd-4f61-9f8c-002ab2cdff10'
    # `with` ensures the HTTP response is closed even if read() raises.
    with urllib.request.urlopen(url) as resp:
        metadata = resp.read()
    # Metadata is in JSON format; unpack it to a dict.
    metadata_dict = json.loads(metadata)
    # Grab the CSV URLs.
    urls = [x['url'] for x in metadata_dict['result']['resources']]
    # How many did we get? (%d, not %g — these are integer counts.)
    print(' %d CSV URLs found.' % len(urls))
    # Do they all look like CSVs? Raw string avoids the invalid '\.' escape
    # warning the original non-raw pattern triggers on modern Python.
    if any(not re.search(r'\.csv$', u) for u in urls):
        print('Not all URLs were CSVs:')
        for u in urls:
            print(u)
    # Grab the data from data.gov.au
    # ====
    # Download the CSVs and union them (there is overlap, handled later).
    dfs = []
    for i, csv_url in enumerate(urls):
        print('Getting CSV %d of %d ...' % (i + 1, len(urls)), end='')
        # Pull the CSVs.
        with urllib.request.urlopen(csv_url) as resp:
            data_dl = resp.read()
        df_dl = pd.read_csv(io.BytesIO(data_dl), encoding='latin-1')
        # In some years the column 'Applicable To' is called 'Jurisdiction'
        # with lower-case values — normalise both the name and the case.
        df_dl = df_dl.rename(columns={'Jurisdiction': 'Applicable To'})
        df_dl.loc[:, 'Applicable To'] = df_dl.loc[:, 'Applicable To'].str.upper()
        dfs.append(df_dl)
        print(' done.')
    data = pd.concat(dfs)
    del dfs
    # The states ('Applicable To') are listed in the format 'VIC|NSW|...'
    # except where a holiday applies to all states, recorded as 'NAT'.
    # Replace the 'NAT's with the explicit bar-delimited list of all states.
    # ====
    # Form the bar-delimited version of 'NAT'.
    states = data.loc[:, 'Applicable To'].unique()
    all_states = '|'.join(x for x in states if len(x) < 4 and x != 'NAT')
    # Replace the 'NAT's.
    data.loc[data.loc[:, 'Applicable To'] == 'NAT', 'Applicable To'] = all_states

    # A fn for replicating a column with delimited values (one copy for each
    # delimited var), e.g. [['A', '1,2']] --> [['A', '1'], ['A', '2']].
    # Here we use it to split the bar-delimited states.
    # from https://gist.github.com/jlln/338b4b0b55bd6984f883
    def split_data_frame_list(df, target_column, separator):
        """
        df = dataframe to split,
        target_column = the column containing the values to split
        separator = the symbol used to perform the split
        returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
        """
        def split_list_to_rows(row, row_accumulator, target_column, separator):
            # One new row per delimited value; all other columns copied as-is.
            split_row = row[target_column].split(separator)
            for s in split_row:
                new_row = row.to_dict()
                new_row[target_column] = s
                row_accumulator.append(new_row)
        new_rows = []
        df.apply(split_list_to_rows, axis=1, args=(new_rows, target_column, separator))
        return pd.DataFrame(new_rows)

    # Split/replicate the state entries.
    data = split_data_frame_list(data, 'Applicable To', '|')
    # Remove duplicated (state, date) entries, and sort.
    data = (data.loc[~data.loc[:, ['Applicable To', 'Date']].duplicated()]
                .sort_values(['Applicable To', 'Date'])
                .reset_index(drop=True))
    # Just cosmetic stuff from here
    # ===
    # Change the YYYYMMDD dates to YYYY-MM-DD format.
    data.Date = data.Date.map(
        lambda x: '-'.join(re.split(r'(\d{4})(\d{2})(\d{2})', str(x))[1:-1]))
    # Rename the columns.
    data.columns = data.columns.str.upper().str.replace(' ', '_')
    # Drop unneeded columns.
    data = data.loc[:, ['APPLICABLE_TO', 'DATE', 'HOLIDAY_NAME', 'INFORMATION', 'MORE_INFORMATION']]
    return data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment