Skip to content

Instantly share code, notes, and snippets.

@cgivre
Created March 24, 2020 00:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cgivre/85d2a6b6f122a24808a80451527316e8 to your computer and use it in GitHub Desktop.
Save cgivre/85d2a6b6f122a24808a80451527316e8 to your computer and use it in GitHub Desktop.
Clean CSSE COVID-19 Data
import pandas as pd
import glob
import os
import re
# This script cleans the daily report data from CSSE's GitHub repo:
# https://github.com/CSSEGISandData/COVID-19
# Root directory that is joined with DATA_PATH to locate the CSV files;
# the value below is a placeholder and must be replaced before running.
BASE_PATH = "<path to data>"
# Directory containing the daily report CSV files, relative to BASE_PATH.
DATA_PATH = "csse_covid_19_data/csse_covid_19_daily_reports"
def dateFixer(d):
    """Transform a date/timestamp string into a date formatted yyyy-mm-dd.

    Recognizes the US-style ``m/d/yyyy`` and ``m/d/yy`` forms at the start
    of the string (with or without a trailing time) and zero-pads the month
    and day.  Two-digit years are taken to be in the 2000s.  Any other
    string is assumed to already begin with a ``yyyy-mm-dd`` date (for
    example ``2020-03-08T05:31:00``), so its first ten characters are
    returned.

    Args:
        d: Raw value from the ``Last Update`` column.

    Returns:
        The date as a ``yyyy-mm-dd`` string.  Non-string input (such as the
        NaN pandas produces for a blank cell) is returned unchanged rather
        than raising.
    """
    if not isinstance(d, str):
        return d

    # Month / day / year, where the year is either four digits or exactly
    # two digits (the lookahead stops "20" from matching a longer number).
    match = re.match(
        r'([0-9]{1,2})/([0-9]{1,2})/([0-9]{4}|[0-9]{2}(?![0-9]))', d
    )
    if match is None:
        # Not a slash-separated date: keep the leading ISO date portion.
        return d[0:10]

    month, day, year = match.groups()
    if len(year) == 2:
        year = "20" + year
    # zfill pads single-digit months/days without double-padding "10".."12".
    return year + "-" + month.zfill(2) + "-" + day.zfill(2)
# Iterate over data files, cleaning each daily report in place
for filename in glob.glob(os.path.join(BASE_PATH, DATA_PATH, '*.csv')):
    # Load the report. read_csv opens and closes the file itself, so no
    # separate open() handle is held while the file is overwritten below.
    df = pd.read_csv(filename)
    # Convert all dates to YYYY-MM-DD format
    df['Last Update'] = df['Last Update'].apply(dateFixer)
    # Clean up stats columns: fill blanks with zeros and convert to ints
    for column in ('Confirmed', 'Deaths', 'Recovered'):
        df[column] = df[column].fillna(0).astype('int64')
    # Save as CSV, replacing the original file
    df.to_csv(filename, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment