Last active
March 9, 2020 08:51
-
-
Save qtangs/fd3948ac33a7c1ea6620d941269a04c8 to your computer and use it in GitHub Desktop.
Clean Covid 19 data from https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
COUNTRY_COLUMN = "Country/Region" | |
LOCATION_COLUMN = "Location" | |
DATE_COLUMN = "Date" | |
TYPE_COLUMN = "Type" | |
INFECTED_COLUMN = "Infected" | |
RECOVERED_COLUMN = "Recovered" | |
DEATHS_COLUMN = "Deaths" | |
CURRENT_COLUMN = "Current" | |
VALUE_COLUMN = "Total" | |
NEW_COLUMN = "New" | |
alternate_names = { | |
"Faroe Islands": "Faeroe Islands", | |
"Macau": "Macao", | |
"Mainland China": "China", | |
"North Macedonia": "Macedonia", | |
"Russia": "Russian Federation", | |
"Saint Barthelemy": "Saint-Barthélemy", | |
"South Korea": "Republic of Korea", | |
"UK": "United Kingdom", | |
"US": "United States", | |
} | |
def get_data_by_date(input_data, value_column): | |
by_date = input_data.groupby(COUNTRY_COLUMN).sum() | |
by_date.reset_index(inplace=True) | |
by_date = by_date.melt(id_vars=[COUNTRY_COLUMN], var_name=DATE_COLUMN, value_name=value_column) | |
by_date = by_date[(by_date[DATE_COLUMN] != "Lat") & (by_date[DATE_COLUMN] != "Long")] | |
by_date[DATE_COLUMN] = pd.to_datetime(by_date[DATE_COLUMN], format="%m/%d/%y") | |
by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, value_column] | |
return by_date | |
def get_diff_data(input_data): | |
diff_data = input_data.pivot(index=DATE_COLUMN, columns=LOCATION_COLUMN, values=VALUE_COLUMN) | |
diff_data = diff_data.diff().fillna(0).reset_index() | |
diff_data = diff_data.melt(id_vars=[DATE_COLUMN], var_name=LOCATION_COLUMN, value_name=NEW_COLUMN) | |
return diff_data | |
if __name__ == "__main__": | |
# https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv | |
confirmed = pd.read_csv("time_series_19-covid-Confirmed.csv") | |
confirmed[COUNTRY_COLUMN] = [ | |
alternate_names[p] if p in alternate_names else p for p in confirmed[COUNTRY_COLUMN] | |
] | |
recovered = pd.read_csv("time_series_19-covid-Recovered.csv") | |
recovered[COUNTRY_COLUMN] = [ | |
alternate_names[p] if p in alternate_names else p for p in recovered[COUNTRY_COLUMN] | |
] | |
deaths = pd.read_csv("time_series_19-covid-Deaths.csv") | |
deaths[COUNTRY_COLUMN] = [ | |
alternate_names[p] if p in alternate_names else p for p in deaths[COUNTRY_COLUMN] | |
] | |
dates = pd.to_datetime(confirmed.columns, errors="coerce") | |
latest_date = dates.max() | |
latest_date_column = confirmed.columns[dates == latest_date][0] | |
print("latest_date", latest_date, "latest_date_column", latest_date_column) | |
# By Country | |
latest_by_country = confirmed[[COUNTRY_COLUMN, latest_date_column]] | |
latest_by_country.columns = [LOCATION_COLUMN, INFECTED_COLUMN] | |
latest_by_country = latest_by_country.groupby(LOCATION_COLUMN).sum() | |
latest_by_country.reset_index(inplace=True) | |
latest_by_country.sort_values(by=[LOCATION_COLUMN], inplace=True) | |
latest_by_country.to_csv("latest_by_country.csv", encoding='utf-8', index=False) | |
print(latest_by_country.head()) | |
# By Date | |
confirmed_by_date = get_data_by_date(confirmed, INFECTED_COLUMN) | |
recovered_by_date = get_data_by_date(recovered, RECOVERED_COLUMN) | |
deaths_by_date = get_data_by_date(deaths, DEATHS_COLUMN) | |
# confirmed_by_date.to_csv("data/confirmed_by_date.csv", encoding='utf-8', index=False) | |
print(confirmed_by_date.head()) | |
print(recovered_by_date.head()) | |
print(deaths_by_date.head()) | |
# all_by_date = pd.merge(confirmed_by_date, recovered_by_date, how='outer', on=[LOCATION_COLUMN, DATE_COLUMN]) | |
# all_by_date = pd.merge(all_by_date, deaths_by_date, how='outer', on=[LOCATION_COLUMN, DATE_COLUMN]) | |
current_by_date = pd.concat( | |
[ | |
confirmed_by_date[LOCATION_COLUMN], | |
confirmed_by_date[DATE_COLUMN], | |
confirmed_by_date[INFECTED_COLUMN] - recovered_by_date[RECOVERED_COLUMN] - deaths_by_date[DEATHS_COLUMN] | |
], | |
axis=1 | |
) | |
confirmed_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN] | |
recovered_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN] | |
deaths_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN] | |
current_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN] | |
confirmed_by_date[TYPE_COLUMN] = INFECTED_COLUMN | |
recovered_by_date[TYPE_COLUMN] = RECOVERED_COLUMN | |
deaths_by_date[TYPE_COLUMN] = DEATHS_COLUMN | |
current_by_date[TYPE_COLUMN] = CURRENT_COLUMN | |
confirmed_by_date_diff = get_diff_data(confirmed_by_date) | |
recovered_by_date_diff = get_diff_data(recovered_by_date) | |
deaths_by_date_diff = get_diff_data(deaths_by_date) | |
confirmed_by_date = pd.merge(confirmed_by_date, confirmed_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN]) | |
recovered_by_date = pd.merge(recovered_by_date, recovered_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN]) | |
deaths_by_date = pd.merge(deaths_by_date, deaths_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN]) | |
current_by_date[NEW_COLUMN] = None | |
all_by_date = pd.concat([confirmed_by_date, current_by_date, recovered_by_date, deaths_by_date], ignore_index=True) | |
all_by_date.to_csv("all_by_date.csv", encoding='utf-8', index=False) | |
print(all_by_date.head()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment