Skip to content

Instantly share code, notes, and snippets.

@rcsmit
Last active June 24, 2024 12:33
Show Gist options
  • Save rcsmit/8f2954eef8009c747df47dbca885e32e to your computer and use it in GitHub Desktop.
Save rcsmit/8f2954eef8009c747df47dbca885e32e to your computer and use it in GitHub Desktop.
Get from Eurostat : Deaths by week, sex and 5-year age group
def get_data_eurostat():
"""Get from Eurostat : Deaths by week, sex and 5-year age group
Data from https://ec.europa.eu/eurostat/databrowser/product/view/demo_r_mwk_05?lang=en
https://ec.europa.eu/eurostat/databrowser/bookmark/fbd80cd8-7b96-4ad9-98be-1358dd80f191?lang=en
https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/dataflow/ESTAT/DEMO_R_MWK_05/1.0?references=descendants&detail=referencepartial&format=sdmx_2.1_generic&compressed=true
Returns:
df: dataframe with weekly mortality in 5 year tranches
DATAFLOW LAST UPDATE freq age sex ... OBS_VALUE OBS_FLAG age_sex jaar weeknr
0 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL F ... 1868.0 p TOTAL_F 2000 1
1 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL M ... 1699.0 p TOTAL_M 2000 1
2 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W TOTAL T ... 3567.0 p TOTAL_T 2000 1
...
80071 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W Y_LT5 M ... 8.0 p Y_LT5_M 2024 19
80072 ESTAT:DEMO_R_MWK_05(1.0) 24/06/24 23:00:00 W Y_LT5 T ... 12.0 p Y_LT5_T 2024 19
W = Weekly
NL = Netherlands
NR = Number
P = Provisory
"""
code = "DEMO_R_MWK_05"
# ['freq', 'age', 'sex', 'unit', 'geo']
# pars = eurostat.get_pars(code)
# result : ['freq', 'age', 'sex', 'unit', 'geo']
# for p in pars:
# par_values = eurostat.get_par_values(code,p)
# print (f"{p} ------------------------------")
# print (par_values)
my_filter_pars = {'beginPeriod': 2015, 'geo': ['NL']} # beginPeriod is ignored somehow
flags = True
if flags:
df = eurostat.get_data_df(code, flags=True, filter_pars=my_filter_pars, verbose=True, reverse_time=False)
print (df)
# Identify value and flag columns
value_columns = [col for col in df.columns if col.endswith('_value')]
flag_columns = [col for col in df.columns if col.endswith('_flag')]
# Melt the value columns
df_values = df.melt(id_vars=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD'],
value_vars=value_columns,
var_name='TIME_PERIOD', value_name='OBS_VALUE')
# Remove '_value' suffix from TIME_PERIOD column
df_values['TIME_PERIOD'] = df_values['TIME_PERIOD'].str.replace('_value', '')
# Melt the flag columns
df_flags = df.melt(id_vars=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD'],
value_vars=flag_columns,
var_name='TIME_PERIOD', value_name='OBS_FLAG')
# Remove '_flag' suffix from TIME_PERIOD column
df_flags['TIME_PERIOD'] = df_flags['TIME_PERIOD'].str.replace('_flag', '')
# Merge the values and flags dataframes
df_long = pd.merge(df_values, df_flags, on=['freq', 'age', 'sex', 'unit', 'geo\\TIME_PERIOD', 'TIME_PERIOD'])
# Add additional columns
df_long['DATAFLOW'] = 'ESTAT:DEMO_R_MWK_05(1.0)'
df_long['LAST UPDATE'] = '14/06/24 23:00:00'
# Rename the columns to match the desired output
df_long.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)
# Filter out rows with None values in OBS_VALUE
df_long = df_long[df_long['OBS_VALUE'].notna()]
# Reorder the columns
df_long = df_long[['DATAFLOW', 'LAST UPDATE', 'freq', 'age', 'sex', 'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG']]
else:
df = eurostat.get_data_df(code, flags=False, filter_pars=my_filter_pars, verbose=True, reverse_time=False)
print (df)
# Melt the dataframe to long format
df_long = df.melt(id_vars=['freq', 'age', 'sex', 'unit', r'geo\TIME_PERIOD'],
var_name='TIME_PERIOD', value_name='OBS_VALUE')
# Add additional columns, made to be reverse compatible with older code
df_long['DATAFLOW'] = 'ESTAT:DEMO_R_MWK_05(1.0)'
df_long['LAST UPDATE'] = '24/06/24 23:00:00'
#df_long['OBS_FLAG'] = 'p'
# Rename the columns to match the desired output
df_long.rename(columns={'geo\\TIME_PERIOD': 'geo'}, inplace=True)
# Filter out rows with None values in OBS_VALUE
df_long = df_long[df_long['OBS_VALUE'].notna()]
# Reorder the columns
df_long = df_long[['DATAFLOW', 'LAST UPDATE', 'freq', 'age', 'sex', 'unit', 'geo', 'TIME_PERIOD', 'OBS_VALUE', 'OBS_FLAG']]
df_long["age_sex"] = df_long["age"] + "_" +df_long["sex"]
df_long["jaar"] = (df_long["TIME_PERIOD"].str[:4]).astype(int)
df_long["weeknr"] = (df_long["TIME_PERIOD"].str[6:]).astype(int)
# Display the resulting dataframe
print (df_long)
return (df_long)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment