Skip to content

Instantly share code, notes, and snippets.

@qtangs
Last active March 9, 2020 08:51
Show Gist options
  • Save qtangs/fd3948ac33a7c1ea6620d941269a04c8 to your computer and use it in GitHub Desktop.
Save qtangs/fd3948ac33a7c1ea6620d941269a04c8 to your computer and use it in GitHub Desktop.
import pandas as pd
COUNTRY_COLUMN = "Country/Region"
LOCATION_COLUMN = "Location"
DATE_COLUMN = "Date"
TYPE_COLUMN = "Type"
INFECTED_COLUMN = "Infected"
RECOVERED_COLUMN = "Recovered"
DEATHS_COLUMN = "Deaths"
CURRENT_COLUMN = "Current"
VALUE_COLUMN = "Total"
NEW_COLUMN = "New"
alternate_names = {
"Faroe Islands": "Faeroe Islands",
"Macau": "Macao",
"Mainland China": "China",
"North Macedonia": "Macedonia",
"Russia": "Russian Federation",
"Saint Barthelemy": "Saint-Barthélemy",
"South Korea": "Republic of Korea",
"UK": "United Kingdom",
"US": "United States",
}
def get_data_by_date(input_data, value_column):
by_date = input_data.groupby(COUNTRY_COLUMN).sum()
by_date.reset_index(inplace=True)
by_date = by_date.melt(id_vars=[COUNTRY_COLUMN], var_name=DATE_COLUMN, value_name=value_column)
by_date = by_date[(by_date[DATE_COLUMN] != "Lat") & (by_date[DATE_COLUMN] != "Long")]
by_date[DATE_COLUMN] = pd.to_datetime(by_date[DATE_COLUMN], format="%m/%d/%y")
by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, value_column]
return by_date
def get_diff_data(input_data):
diff_data = input_data.pivot(index=DATE_COLUMN, columns=LOCATION_COLUMN, values=VALUE_COLUMN)
diff_data = diff_data.diff().fillna(0).reset_index()
diff_data = diff_data.melt(id_vars=[DATE_COLUMN], var_name=LOCATION_COLUMN, value_name=NEW_COLUMN)
return diff_data
if __name__ == "__main__":
# https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv
confirmed = pd.read_csv("time_series_19-covid-Confirmed.csv")
confirmed[COUNTRY_COLUMN] = [
alternate_names[p] if p in alternate_names else p for p in confirmed[COUNTRY_COLUMN]
]
recovered = pd.read_csv("time_series_19-covid-Recovered.csv")
recovered[COUNTRY_COLUMN] = [
alternate_names[p] if p in alternate_names else p for p in recovered[COUNTRY_COLUMN]
]
deaths = pd.read_csv("time_series_19-covid-Deaths.csv")
deaths[COUNTRY_COLUMN] = [
alternate_names[p] if p in alternate_names else p for p in deaths[COUNTRY_COLUMN]
]
dates = pd.to_datetime(confirmed.columns, errors="coerce")
latest_date = dates.max()
latest_date_column = confirmed.columns[dates == latest_date][0]
print("latest_date", latest_date, "latest_date_column", latest_date_column)
# By Country
latest_by_country = confirmed[[COUNTRY_COLUMN, latest_date_column]]
latest_by_country.columns = [LOCATION_COLUMN, INFECTED_COLUMN]
latest_by_country = latest_by_country.groupby(LOCATION_COLUMN).sum()
latest_by_country.reset_index(inplace=True)
latest_by_country.sort_values(by=[LOCATION_COLUMN], inplace=True)
latest_by_country.to_csv("latest_by_country.csv", encoding='utf-8', index=False)
print(latest_by_country.head())
# By Date
confirmed_by_date = get_data_by_date(confirmed, INFECTED_COLUMN)
recovered_by_date = get_data_by_date(recovered, RECOVERED_COLUMN)
deaths_by_date = get_data_by_date(deaths, DEATHS_COLUMN)
# confirmed_by_date.to_csv("data/confirmed_by_date.csv", encoding='utf-8', index=False)
print(confirmed_by_date.head())
print(recovered_by_date.head())
print(deaths_by_date.head())
# all_by_date = pd.merge(confirmed_by_date, recovered_by_date, how='outer', on=[LOCATION_COLUMN, DATE_COLUMN])
# all_by_date = pd.merge(all_by_date, deaths_by_date, how='outer', on=[LOCATION_COLUMN, DATE_COLUMN])
current_by_date = pd.concat(
[
confirmed_by_date[LOCATION_COLUMN],
confirmed_by_date[DATE_COLUMN],
confirmed_by_date[INFECTED_COLUMN] - recovered_by_date[RECOVERED_COLUMN] - deaths_by_date[DEATHS_COLUMN]
],
axis=1
)
confirmed_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN]
recovered_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN]
deaths_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN]
current_by_date.columns = [LOCATION_COLUMN, DATE_COLUMN, VALUE_COLUMN]
confirmed_by_date[TYPE_COLUMN] = INFECTED_COLUMN
recovered_by_date[TYPE_COLUMN] = RECOVERED_COLUMN
deaths_by_date[TYPE_COLUMN] = DEATHS_COLUMN
current_by_date[TYPE_COLUMN] = CURRENT_COLUMN
confirmed_by_date_diff = get_diff_data(confirmed_by_date)
recovered_by_date_diff = get_diff_data(recovered_by_date)
deaths_by_date_diff = get_diff_data(deaths_by_date)
confirmed_by_date = pd.merge(confirmed_by_date, confirmed_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN])
recovered_by_date = pd.merge(recovered_by_date, recovered_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN])
deaths_by_date = pd.merge(deaths_by_date, deaths_by_date_diff, on=[LOCATION_COLUMN, DATE_COLUMN])
current_by_date[NEW_COLUMN] = None
all_by_date = pd.concat([confirmed_by_date, current_by_date, recovered_by_date, deaths_by_date], ignore_index=True)
all_by_date.to_csv("all_by_date.csv", encoding='utf-8', index=False)
print(all_by_date.head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment