Skip to content

Instantly share code, notes, and snippets.

View mjam03's full-sized avatar

Mark Jamison mjam03

  • Belfast / London
View GitHub Profile
# earliest records for weekly death data back to 1974 for scotland
START_DATE = dt.date(1973, 12, 29)
# define start dates and end dates
start_date_list = [START_DATE+dt.timedelta(days=7*x) for x in range(0,52*50)]
start_date_list = [x for x in start_date_list if x < (dt.date.today() + dt.timedelta(days=28))]
end_date_list = [x + dt.timedelta(days=-1) for x in start_date_list[1:]]
# create a df for them and a week count per year
df_week_map = pd.DataFrame({'week_start': start_date_list[:-1], 'week_end': end_date_list})
# name the urls
NRS_ROOT = 'https://www.nrscotland.gov.uk'
NRS_DEATH_STRING = 'statistics/covid19/covid-deaths-21'
# request website, parse and identify from the html only the url link elements
req = Request(NRS_ROOT+'/covid19stats', headers=hdr)
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")
links = []
for link in soup.findAll('a'):
l = link.get('href')
# these tables are for detailing COVID Mentions
scot_20_m_fd = [x for x in zips.namelist() if '2020' in x and 'Table 1' in x][0]
scot_21_m_fd = [x for x in zips.namelist() if '2021' in x and 'Table 1' in x][0]
# these tables breakdown underlying cod across categories
scot_20_cod_fd = [x for x in zips.namelist() if '2020' in x and 'Table 3' in x][0]
scot_21_cod_fd = [x for x in zips.namelist() if '2021' in x and 'Table 3' in x][0]
# grab 2020 and 2021 excel tables, extract, reformat and concat
# 2020
df_s_m_20 = pd.read_csv(zips.extract(scot_20_m_fd), header=2)[:4].T[2:][[0,3]]
df_s_m_20 = df_s_m_20.dropna()
df_s_m_20.columns = ['week_start', 'Deaths']
df_s_m_20['Week'] = df_s_m_20.index
df_s_m_20['Year'] = 2020
# 2021
# 2020
df_s_cod_20 = pd.read_csv(zips.extract(scot_20_cod_fd), header=4)[9:16].T
df_s_cod_20.columns = df_s_cod_20.iloc[1].values
df_s_cod_20 = df_s_cod_20.iloc[2:]
df_s_cod_20 = df_s_cod_20[~df_s_cod_20.index.str.contains('Unnamed')]
df_s_cod_20.index.name = 'week_start'
df_s_cod_20 = df_s_cod_20.reset_index()
df_s_cod_20['Week'] = df_s_cod_20.index + 1
df_s_cod_20['Year'] = 2020
cods = [x for x in df_s_cod if x!='Week' and x!='Year' and x!='Country' and x!='week_start']
df_s_cod = pd.melt(df_s_cod, id_vars=['Country', 'Year', 'Week', 'week_start'], value_vars=cods, var_name='Cause', value_name='Deaths')
df_s_cod = pd.concat([df_s_cod, df_s_m])
df_s_cod['Deaths'] = pd.to_numeric(df_s_cod['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))
df_s_cod['week_start'] = pd.to_datetime(df_s_cod['week_start'])
# homogenise week_start / week_end with ONS
df_s_cod = pd.merge(left=df_s_cod.drop(columns=['week_start']), right=df_week_map, how='left', on=['Year', 'Week'])
df_s_cod.tail()
# fortunately this file doesn't change and we have one file for the last 45+ years
r = requests.get('https://www.nrscotland.gov.uk/files//statistics/weekly-monthly-births-deaths-data/2020/dec/weekly-december-20-tab-2.zip')
zips = zipfile.ZipFile(io.BytesIO(r.content))
zips.namelist()
# extract scottish death time series data from spreadsheet
df_s_ed = pd.read_csv(zips.extract(zips.namelist()[0]), header=3)
df_s_ed.head()
# rename col, drop redundant rows, melt to column format
df_s_ed = df_s_ed.rename(columns={'Unnamed: 0': 'Week'}).dropna()
df_s_ed = df_s_ed[[x for x in df_s_ed.columns if 'Unnamed' not in x]]
df_s_ed = pd.melt(df_s_ed, id_vars='Week', value_vars=[x for x in df_s_ed.columns if x!='Week'], value_name='Deaths', var_name='Year')
# format data types
df_s_ed['Week'] = pd.to_numeric(df_s_ed['Week'].str[-2:])
df_s_ed['Year'] = pd.to_numeric(df_s_ed['Year'])
# remove number formatting from death numbers and '-' for zero/unknown
df_s_ed['Deaths'] = pd.to_numeric(df_s_ed['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))
# merge on dates for {Year, Week} pair, then chronologically sort
df_s_ed = pd.merge(left=df_s_ed, right=df_week_map, how='left', on=['Year', 'Week'])
df_s_ed = df_s_ed.sort_values(['Year', 'Week'])
df_s_ed['Country'] = 'Scotland'
df_s_ed = df_s_ed[['Country', 'Year', 'Week' ,'week_start', 'week_end', 'Deaths']]