Mark Jamison mjam03

## 51abc171-968e-4544-8189-fd98a1eecd25.py
# earliest records for weekly death data back to 1974 for scotland
START_DATE = dt.date(1973, 12, 29)

# define start dates and end dates
start_date_list = [START_DATE+dt.timedelta(days=7*x) for x in range(0,52*50)]
start_date_list = [x for x in start_date_list if x < (dt.date.today() + dt.timedelta(days=28))]
end_date_list = [x + dt.timedelta(days=-1) for x in start_date_list[1:]]

# create a df for them and a week count per year
df_week_map = pd.DataFrame({'week_start': start_date_list[:-1], 'week_end': end_date_list})

## 55d95f40-a71c-46ab-9619-9a632c530e4f.py
# name the urls
NRS_ROOT = 'https://www.nrscotland.gov.uk'
NRS_DEATH_STRING = 'statistics/covid19/covid-deaths-21'
# request website, parse and identify from the html only the url link elements
req = Request(NRS_ROOT+'/covid19stats', headers=hdr)
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")
links = []
for link in soup.findAll('a'):
    l = link.get('href')

## 7f2c4202-4fa2-4e1d-86a1-dfb9559e1a8c.py
# these tables are for detailing COVID Mentions
scot_20_m_fd = [x for x in zips.namelist() if '2020' in x and 'Table 1' in x][0]
scot_21_m_fd = [x for x in zips.namelist() if '2021' in x and 'Table 1' in x][0]

# these tables breakdown underlying cod across categories
scot_20_cod_fd = [x for x in zips.namelist() if '2020' in x and 'Table 3' in x][0]
scot_21_cod_fd = [x for x in zips.namelist() if '2021' in x and 'Table 3' in x][0]

## 3b6a5c36-956b-44dd-8270-f2f48efebd0f.py
# grab 2020 and 2021 excel tables, extract, reformat and concat

# 2020
df_s_m_20 = pd.read_csv(zips.extract(scot_20_m_fd), header=2)[:4].T[2:][[0,3]]
df_s_m_20 = df_s_m_20.dropna()
df_s_m_20.columns = ['week_start', 'Deaths']
df_s_m_20['Week'] = df_s_m_20.index
df_s_m_20['Year'] = 2020

# 2021

## 6d363a9b-daea-4524-a216-8d4826be75fc.py
# 2020
df_s_cod_20 = pd.read_csv(zips.extract(scot_20_cod_fd), header=4)[9:16].T
df_s_cod_20.columns = df_s_cod_20.iloc[1].values
df_s_cod_20 = df_s_cod_20.iloc[2:]
df_s_cod_20 = df_s_cod_20[~df_s_cod_20.index.str.contains('Unnamed')]

df_s_cod_20.index.name = 'week_start'
df_s_cod_20 = df_s_cod_20.reset_index()
df_s_cod_20['Week'] = df_s_cod_20.index + 1
df_s_cod_20['Year'] = 2020

## 305371f6-9688-4165-a689-afa6bfc481b6.py
cods = [x for x in df_s_cod if x!='Week' and x!='Year' and x!='Country' and x!='week_start']
df_s_cod = pd.melt(df_s_cod, id_vars=['Country', 'Year', 'Week', 'week_start'], value_vars=cods, var_name='Cause', value_name='Deaths')
df_s_cod = pd.concat([df_s_cod, df_s_m])

df_s_cod['Deaths'] = pd.to_numeric(df_s_cod['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))
df_s_cod['week_start'] = pd.to_datetime(df_s_cod['week_start'])

# homogenise week_start / week_end with ONS
df_s_cod = pd.merge(left=df_s_cod.drop(columns=['week_start']), right=df_week_map, how='left', on=['Year', 'Week'])
df_s_cod.tail()

## 82f54667-dc08-4d50-ac3e-58b0446249fc.py
# fortunately this file doesn't change and we have one file for the last 45+ years
r = requests.get('https://www.nrscotland.gov.uk/files//statistics/weekly-monthly-births-deaths-data/2020/dec/weekly-december-20-tab-2.zip')
zips = zipfile.ZipFile(io.BytesIO(r.content))
zips.namelist()

## c2d90342-51e2-4094-8570-184806b39305.py
# extract scottish death time series data from spreadsheet
df_s_ed = pd.read_csv(zips.extract(zips.namelist()[0]), header=3)
df_s_ed.head()

## 37157fe7-a5fb-4810-a0f3-a868fb372067.py
# rename col, drop redundant rows, melt to column format
df_s_ed = df_s_ed.rename(columns={'Unnamed: 0': 'Week'}).dropna()
df_s_ed = df_s_ed[[x for x in df_s_ed.columns if 'Unnamed' not in x]]
df_s_ed = pd.melt(df_s_ed, id_vars='Week', value_vars=[x for x in df_s_ed.columns if x!='Week'], value_name='Deaths', var_name='Year')

# format data types
df_s_ed['Week'] = pd.to_numeric(df_s_ed['Week'].str[-2:])
df_s_ed['Year'] = pd.to_numeric(df_s_ed['Year'])
# remove number formatting from death numbers and '-' for zero/unknown
df_s_ed['Deaths'] = pd.to_numeric(df_s_ed['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))

## d69727bb-5677-4f05-a12c-73b4ff539854.py
# merge on dates for {Year, Week} pair, then chronologically sort
df_s_ed = pd.merge(left=df_s_ed, right=df_week_map, how='left', on=['Year', 'Week'])
df_s_ed = df_s_ed.sort_values(['Year', 'Week'])
df_s_ed['Country'] = 'Scotland'
df_s_ed = df_s_ed[['Country', 'Year', 'Week' ,'week_start', 'week_end', 'Deaths']]
	# earliest records for weekly death data back to 1974 for scotland
	START_DATE = dt.date(1973, 12, 29)

	# define start dates and end dates
	start_date_list = [START_DATE+dt.timedelta(days=7x) for x in range(0,5250)]
	start_date_list = [x for x in start_date_list if x < (dt.date.today() + dt.timedelta(days=28))]
	end_date_list = [x + dt.timedelta(days=-1) for x in start_date_list[1:]]

	# create a df for them and a week count per year
	df_week_map = pd.DataFrame({'week_start': start_date_list[:-1], 'week_end': end_date_list})
	# name the urls
	NRS_ROOT = 'https://www.nrscotland.gov.uk'
	NRS_DEATH_STRING = 'statistics/covid19/covid-deaths-21'
	# request website, parse and identify from the html only the url link elements
	req = Request(NRS_ROOT+'/covid19stats', headers=hdr)
	html_page = urlopen(req)
	soup = BeautifulSoup(html_page, "lxml")
	links = []
	for link in soup.findAll('a'):
	l = link.get('href')
	# these tables are for detailing COVID Mentions
	scot_20_m_fd = [x for x in zips.namelist() if '2020' in x and 'Table 1' in x][0]
	scot_21_m_fd = [x for x in zips.namelist() if '2021' in x and 'Table 1' in x][0]

	# these tables breakdown underlying cod across categories
	scot_20_cod_fd = [x for x in zips.namelist() if '2020' in x and 'Table 3' in x][0]
	scot_21_cod_fd = [x for x in zips.namelist() if '2021' in x and 'Table 3' in x][0]
	# grab 2020 and 2021 excel tables, extract, reformat and concat

	# 2020
	df_s_m_20 = pd.read_csv(zips.extract(scot_20_m_fd), header=2)[:4].T[2:][[0,3]]
	df_s_m_20 = df_s_m_20.dropna()
	df_s_m_20.columns = ['week_start', 'Deaths']
	df_s_m_20['Week'] = df_s_m_20.index
	df_s_m_20['Year'] = 2020

	# 2021
	# 2020
	df_s_cod_20 = pd.read_csv(zips.extract(scot_20_cod_fd), header=4)[9:16].T
	df_s_cod_20.columns = df_s_cod_20.iloc[1].values
	df_s_cod_20 = df_s_cod_20.iloc[2:]
	df_s_cod_20 = df_s_cod_20[~df_s_cod_20.index.str.contains('Unnamed')]

	df_s_cod_20.index.name = 'week_start'
	df_s_cod_20 = df_s_cod_20.reset_index()
	df_s_cod_20['Week'] = df_s_cod_20.index + 1
	df_s_cod_20['Year'] = 2020
	cods = [x for x in df_s_cod if x!='Week' and x!='Year' and x!='Country' and x!='week_start']
	df_s_cod = pd.melt(df_s_cod, id_vars=['Country', 'Year', 'Week', 'week_start'], value_vars=cods, var_name='Cause', value_name='Deaths')
	df_s_cod = pd.concat([df_s_cod, df_s_m])

	df_s_cod['Deaths'] = pd.to_numeric(df_s_cod['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))
	df_s_cod['week_start'] = pd.to_datetime(df_s_cod['week_start'])

	# homogenise week_start / week_end with ONS
	df_s_cod = pd.merge(left=df_s_cod.drop(columns=['week_start']), right=df_week_map, how='left', on=['Year', 'Week'])
	df_s_cod.tail()
	# fortunately this file doesn't change and we have one file for the last 45+ years
	r = requests.get('https://www.nrscotland.gov.uk/files//statistics/weekly-monthly-births-deaths-data/2020/dec/weekly-december-20-tab-2.zip')
	zips = zipfile.ZipFile(io.BytesIO(r.content))
	zips.namelist()
	# extract scottish death time series data from spreadsheet
	df_s_ed = pd.read_csv(zips.extract(zips.namelist()[0]), header=3)
	df_s_ed.head()
	# rename col, drop redundant rows, melt to column format
	df_s_ed = df_s_ed.rename(columns={'Unnamed: 0': 'Week'}).dropna()
	df_s_ed = df_s_ed[[x for x in df_s_ed.columns if 'Unnamed' not in x]]
	df_s_ed = pd.melt(df_s_ed, id_vars='Week', value_vars=[x for x in df_s_ed.columns if x!='Week'], value_name='Deaths', var_name='Year')

	# format data types
	df_s_ed['Week'] = pd.to_numeric(df_s_ed['Week'].str[-2:])
	df_s_ed['Year'] = pd.to_numeric(df_s_ed['Year'])
	# remove number formatting from death numbers and '-' for zero/unknown
	df_s_ed['Deaths'] = pd.to_numeric(df_s_ed['Deaths'].apply(lambda x: x.replace('-', '').replace(',', '').strip()))
	# merge on dates for {Year, Week} pair, then chronologically sort
	df_s_ed = pd.merge(left=df_s_ed, right=df_week_map, how='left', on=['Year', 'Week'])
	df_s_ed = df_s_ed.sort_values(['Year', 'Week'])
	df_s_ed['Country'] = 'Scotland'
	df_s_ed = df_s_ed[['Country', 'Year', 'Week' ,'week_start', 'week_end', 'Deaths']]