Cheap and nasty function for grabbing public holiday data from data.gov.au and converting it into a more usable format (as a pandas.DataFrame).
def load_ph_data():
    """
    Cheap and nasty function for grabbing public holiday data from data.gov.au and converting it into a more usable
    format (as a pandas.DataFrame).
    """
    import pandas as pd
    import urllib.request as ul
    import json
    import io
    import re
    # Grab the metadata from data.gov.au: https://data.gov.au/dataset/australian-holidays-machine-readable-dataset
    # The CSVs change over time (more are added each year), but the metadata should provide a reliable list of the
    # required CSVs at any given time ... hopefully.
    # ====
    # The metadata URL. This lists the URLs of the CSV files, which are added to over time. Avoids the need to
    # hard-code the individual CSV URLs (which would require maintenance).
    print('Getting metadata ...', end='')
    url = 'https://data.gov.au/api/3/action/package_show?id=b1bc6077-dadd-4f61-9f8c-002ab2cdff10'
    req = ul.urlopen(url)
    metadata = req.read()
    # Metadata is in JSON format, unpack it to a dict
    metadata_dict = json.loads(metadata)
    # Grab the CSV URLs
    urls = [x['url'] for x in metadata_dict['result']['resources']]
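    # For reference, the (assumed) rough shape of the JSON consumed above, inferred from the keys accessed:
    # {"result": {"resources": [{"url": "https://.../some-holidays.csv", ...}, ...]}}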
    # How many did we get?
    print(' %d CSV URLs found.' % len(urls))
    # Do they all look like CSVs?
    if any(not re.match(r'.*\.csv$', x) for x in urls):
        print('Not all URLs were CSVs:')
        for url in urls:
            print(url)
    # Grab the data from data.gov.au
    # ====
    # Download the CSVs and union them (there is overlap, we take care of this later)
    dfs = []
    for i, url in enumerate(urls):
        print('Getting CSV %d of %d ...' % (i + 1, len(urls)), end='')
        # Pull the CSVs
        req = ul.urlopen(url)
        data_dl = req.read()
        df_dl = pd.read_csv(io.BytesIO(data_dl), encoding='latin-1')
        # In some of the CSVs the 'Applicable To' column is called 'Jurisdiction' and its values are lower-case
        df_dl = df_dl.rename(columns={'Jurisdiction': 'Applicable To'})
        df_dl.loc[:, 'Applicable To'] = df_dl.loc[:, 'Applicable To'].str.upper()
        dfs.append(df_dl)
        print(' done.')
    data = pd.concat(dfs)
    del dfs
    # The states ('Applicable To') are listed in the format 'VIC|NSW|...', except where a holiday applies to all
    # states, in which case it is recorded as 'NAT'. This just replaces the 'NAT's with the bar-delimited format.
    # ====
    # Form the bar-delimited version of 'NAT'
    states = data.loc[:, 'Applicable To'].unique()
    all_states = '|'.join([x for x in states if len(x) < 4 and x != 'NAT'])
    # Replace the 'NAT's
    data.loc[data.loc[:, 'Applicable To'] == 'NAT', 'Applicable To'] = all_states
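    # For example (illustrative; the order depends on what unique() returns), all_states ends up
    # something like 'ACT|NSW|NT|QLD|SA|TAS|VIC|WA', so ex-'NAT' rows are split into one row per state below.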
    # A fn for replicating a column with delimited values (one copy for each delimited value).
    # e.g. [['A', '1,2'], ['B', '3,4']] --> [['A', '1'], ['A', '2'], ['B', '3'], ['B', '4']]
    # Here we're using it to split the bar-delimited states.
    # ====
    # from https://gist.github.com/jlln/338b4b0b55bd6984f883
    def split_data_frame_list(df, target_column, separator):
        """
        df = dataframe to split,
        target_column = the column containing the values to split
        separator = the symbol used to perform the split
        returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
        The values in the other columns are duplicated across the newly divided rows.
        """
        def split_list_to_rows(row, row_accumulator, target_column, separator):
            split_row = row[target_column].split(separator)
            for s in split_row:
                new_row = row.to_dict()
                new_row[target_column] = s
                row_accumulator.append(new_row)
        new_rows = []
        df.apply(split_list_to_rows, axis=1, args=(new_rows, target_column, separator))
        new_df = pd.DataFrame(new_rows)
        return new_df
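    # Quick sanity check of the splitter on a toy frame (illustrative only):
    #   split_data_frame_list(pd.DataFrame({'k': ['A'], 'v': ['1|2']}), 'v', '|')
    # gives two rows: ('A', '1') and ('A', '2').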
    # Split/replicate the state entries
    data = split_data_frame_list(data, 'Applicable To', '|')
    # Remove duplicated entries, and sort
    data = (data.loc[~data.loc[:, ['Applicable To', 'Date']].duplicated()]
                .sort_values(['Applicable To', 'Date'])
                .reset_index(drop=True))
    # Just cosmetic stuff from here
    # ====
    # Change the YYYYMMDD dates to YYYY-MM-DD format
    data.Date = data.Date.map(lambda x: '-'.join(re.split(r'(\d{4})(\d{2})(\d{2})', str(x))[1:-1]))
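    # e.g. 20190101 -> '2019-01-01' (the capturing groups in re.split keep the year/month/day pieces)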
    # Rename the columns
    data.columns = data.columns.str.upper().str.replace(' ', '_')
    # Drop unneeded columns
    data = data.loc[:, ['APPLICABLE_TO', 'DATE', 'HOLIDAY_NAME', 'INFORMATION', 'MORE_INFORMATION']]
    return data
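
A minimal usage sketch (illustrative only; assumes the dataset is still published at the URLs the metadata lists, and uses the column names selected above):

holidays = load_ph_data()
# Holidays for a single jurisdiction, e.g. Victoria
vic_holidays = holidays[holidays['APPLICABLE_TO'] == 'VIC']
print(vic_holidays[['DATE', 'HOLIDAY_NAME']].head())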