Last active
April 7, 2020 09:24
-
-
Save ebergam/7725f1fd982a6e1078ace700ae99afed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
## The script merges and reshapes the data provided by
## https://github.com/kylemcdonald/covid-mobility-data
## Assumes all tsv files are in a folder called "archive".
## Reads one wide TSV per country/state and outputs long-form tidy data.

ARCHIVE_DIR = 'archive'
OUTPUT_CSV = 'google_mobility.csv'
REPORT_SUFFIX = '_Mobility_Report_en.pdf.tsv'
# Identifier columns that are kept as-is when the date columns are unpivoted.
ID_COLUMNS = ['Category', 'Name', 'n', 'Kind']
# Leading "YYYY-MM-DD_" report date found at the start of each file name.
_DATE_PREFIX = re.compile(r'^\d{4}-\d{2}-\d{2}_')


def country_code(filename):
    """Return the country/state code embedded in a report file name.

    The leading ``YYYY-MM-DD_`` report date and the trailing
    ``_Mobility_Report_en.pdf.tsv`` are removed, e.g.
    ``2020-03-29_US_Alabama_Mobility_Report_en.pdf.tsv`` -> ``US_Alabama``.
    Any report date is accepted, not only 2020-03-29.
    """
    code = _DATE_PREFIX.sub('', filename)
    if code.endswith(REPORT_SUFFIX):
        code = code[:-len(REPORT_SUFFIX)]
    return code


def load_reports(folder=ARCHIVE_DIR):
    """Read every ``.tsv`` file in *folder* into one wide DataFrame.

    Each file's rows get an extra ``n`` column holding the code derived
    from its file name. Entries that are not ``.tsv`` files are skipped.

    Raises:
        FileNotFoundError: if *folder* contains no ``.tsv`` files.
    """
    frames = []
    # sorted() only makes the read order reproducible across platforms.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.tsv'):
            continue
        frame = pd.read_csv(os.path.join(folder, filename), sep='\t')
        frame['n'] = country_code(filename)
        frames.append(frame)
    if not frames:
        raise FileNotFoundError(
            "no .tsv files found in {!r}".format(folder))
    # One concat instead of DataFrame.append in a loop: append was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True, sort=False)


def to_long(wide):
    """Reshape the wide report table (one column per date) to long form.

    Returns a DataFrame with columns ``Category``, ``Name``,
    ``CountryCode``, ``Kind``, ``date`` (datetime) and ``val``, sorted by
    ``CountryCode``, ``Name`` and ``date``. Missing values are kept as NaN.
    """
    tidy = wide.melt(id_vars=ID_COLUMNS, var_name='date', value_name='val')
    tidy = tidy.rename(columns={'n': 'CountryCode'})
    tidy['date'] = pd.to_datetime(tidy['date'], format='%Y-%m-%d')
    tidy = tidy.sort_values(['CountryCode', 'Name', 'date'])
    return tidy.reset_index(drop=True)


# The guard keeps imports side-effect free; when run as a script or in a
# notebook cell, `bdf` and `d` are still created as module-level names.
if __name__ == '__main__':
    bdf = load_reports(ARCHIVE_DIR)
    print("Categories: {}".format(len(bdf.Category.unique())))
    print("States: {}".format(len(bdf.n.unique())))
    print("Regions: {}".format(len(bdf.Name.unique())))

    d = to_long(bdf)
    d.to_csv(OUTPUT_CSV, index=False)  # back it up

## Visualize all combinations for comparison
# Uncomment and create an "all_charts/" folder first.
# NOTE: the long table has no 'Region' column; regions live in 'Name'.
# NOTE: the original passed quality=80 to savefig, a JPEG-only option that
# does not affect PNG files; it is left out here.
#from matplotlib import pyplot as plt; plt.ioff()
#%matplotlib agg
## for r in d.Name.unique():  # this would plot *all* combos
#for r in ['Italy', 'Belgium', 'Spain']:
#    for c in d.Category.unique():
#        m = d[(d['Name']==r) & (d['Category']==c)]
#        title = r + ' - ' + c
#        fig = m.plot(x='date', y='val', title=title, style='k-', figsize=(8,5))
#        plt.savefig('all_charts/'+title+'.png', dpi=100, bbox_inches='tight')
### Viz to check code
# %matplotlib inline
# msno.matrix(d) #all
# msno.matrix(d[d['Kind']=='region']) #countrylevel
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment