Skip to content

Instantly share code, notes, and snippets.

@romeokienzler
Created April 16, 2020 17:12
Show Gist options
  • Save romeokienzler/8c58b91aabeb2c6d913ebf3f8aada782 to your computer and use it in GitHub Desktop.
Save romeokienzler/8c58b91aabeb2c6d913ebf3f8aada782 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": "# The code was removed by Watson Studio for sharing."
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": "import types\nimport pandas as pd\nfrom botocore.client import Config\nimport ibm_boto3\n\ndef __iter__(self): return 0\n\n\nclient = ibm_boto3.client(service_name='s3',\n ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],\n ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],\n config=Config(signature_version='oauth'),\n endpoint_url=credentials_1['ENDPOINT'])\nbucket=credentials_1['BUCKET']"
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "--2020-04-16 17:10:50-- http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv\nResolving api.worldbank.org (api.worldbank.org)... 107.23.127.3\nConnecting to api.worldbank.org (api.worldbank.org)|107.23.127.3|:80... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 78803 (77K) [application/zip]\nSaving to: \u2018SP.POP.TOTL?downloadformat=csv\u2019\n\n100%[======================================>] 78,803 --.-K/s in 0.06s \n\n2020-04-16 17:10:51 (1.22 MB/s) - \u2018SP.POP.TOTL?downloadformat=csv\u2019 saved [78803/78803]\n\nArchive: SP.POP.TOTL?downloadformat=csv\n inflating: Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv \n inflating: API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv \n inflating: Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv \n"
}
],
"source": "#get world's population data\n!rm -f SP.POP.TOTL?downloadformat=csv\n!rm -f API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv\n!rm -f Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv\n!rm -f Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv\n!wget http://api.worldbank.org/v2/en/indicator/SP.POP.TOTL?downloadformat=csv\n!unzip SP.POP.TOTL?downloadformat=csv\n!tail -n +5 API_SP.POP.TOTL_DS2_en_csv_v2_936048.csv > pop.csv"
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": "client.upload_file(Filename='pop.csv',Bucket=bucket,Key='pop.csv')"
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": "df_pop=pd.read_csv(\"pop.csv\")\n"
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": "import datetime\n\ntoday = datetime.date.today()\nurl = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-'+str(today)+'.xlsx'\nurl = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-04-15.xlsx'\n#url = './COVID-19-geographic-disbtribution-worldwide-2020-03-24.xlsx'"
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": "import pandas as pd\ndf_raw =pd.read_excel(url)\n"
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "['Czechia', 'Dominican_Republic', 'Egypt', 'Iran', 'Russia', 'South_Korea', 'United_Kingdom', 'United_States_of_America']\n"
}
],
"source": "group_by_deaths = df_raw.groupby(['countriesAndTerritories']).sum()['deaths']\nbad_countries = list(group_by_deaths[group_by_deaths>100].index)\ndf_raw = df_raw[df_raw['countriesAndTerritories'].isin(bad_countries)]\ncountry_names = df_raw['countriesAndTerritories'].unique()\npop_coutry_names = df_pop['Country Name'].unique()\nprint([x for x in country_names if x not in pop_coutry_names])\n\ndf_pop = df_pop.replace('Iran, Islamic Rep.','Iran')\ndf_pop = df_pop.replace('Korea, Rep.','South_Korea')\ndf_pop = df_pop.replace('United Kingdom','United_Kingdom')\ndf_pop = df_pop.replace('United States','United_States_of_America')\ndf_pop = df_pop.replace('Dominican Republic','Dominican_Republic')\ndf_pop = df_pop.replace('Czech Republic','Czechia')\ndf_pop = df_pop.replace('Russian Federation','Russia')\ndf_pop = df_pop.replace('Egypt, Arab Rep.','Egypt')\n\n\n\n\n\n\npop_coutry_names = df_pop['Country Name'].unique()\nassert len([x for x in country_names if x not in pop_coutry_names]) == 0, 'ETL error, not all country names matching: '+str([x for x in country_names if x not in pop_coutry_names])"
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": "df_pop=df_pop.rename(columns={'Country Name': 'countriesAndTerritories'})\n\ndf_raw = pd.merge(df_raw, df_pop, on='countriesAndTerritories', how='inner')"
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": "df_raw.to_csv('cases.csv')"
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": "client.upload_file(Filename='cases.csv',Bucket=bucket,Key='cases.csv')"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment