Created
January 8, 2017 23:20
-
-
Save martijnburger/508d97928b1160deeb8a9e1a56f7975c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numbers | |
import re | |
country_dict = {"Republic of Korea": "South Korea", | |
"United States of America": "United States", | |
"United Kingdom of Great Britain and Northern Ireland": "United Kingdom", | |
"China, Hong Kong Special Administrative Region": "Hong Kong", | |
"Korea, Rep.": "South Korea", | |
"Iran, Islamic Rep.": "Iran", | |
"Hong Kong SAR, China": "Hong Kong"} | |
def convert_county_name(name): | |
name = re.sub("\(.*\)", "", name) | |
name = re.sub("[0-9]", "", name) | |
for k, v in country_dict.items(): | |
name = name.replace(k, v) | |
return name | |
def answer_one(): | |
en_converters = {1 : lambda x: x * 1000000 if isinstance(x, numbers.Number) else x, | |
0 : convert_county_name} | |
energy = pd.read_excel("Energy Indicators.xls", | |
skiprows = 17, | |
skip_footer = 38, | |
parse_cols = [2,3,4,5], | |
names = ['Country', 'Energy Supply', 'Energy Supply per Capita', | |
'% Renewable'], | |
na_values="...", | |
converters = en_converters) | |
gdp_converters = {0 : convert_county_name} | |
GDP = pd.read_csv("world_bank.csv", | |
skiprows=4, | |
converters = gdp_converters, | |
usecols=['Country Name', '2006', '2007', '2008', '2009', '2010', '2011', | |
'2012', '2013', '2014', '2015']) | |
GDP = GDP.rename(columns = {'Country Name':'Country'}) | |
ScimEn = pd.read_excel("scimagojr-3.xlsx") | |
combined1 = pd.merge(GDP, ScimEn, how='inner', on='Country') | |
combined2 = pd.merge(combined1, energy, how='inner', on='Country') | |
filtered_columns = ['Country', 'Rank', 'Documents', 'Citable documents', 'Citations', | |
'Self-citations', 'Citations per document', 'H index', 'Energy Supply', | |
'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', | |
'2009', '2010', '2011', '2012', '2013', '2014', '2015'] | |
filtered = combined2[filtered_columns].set_index('Country') | |
#return energy[energy['Country'].str.contains("Iran")] | |
#return ScimEn[ScimEn['Country'].str.contains("Iran")]['Country'] | |
#return GDP[GDP['Country'].str.contains("Iran")]['Country'] | |
#return combined1[combined1['Country'].str.contains("Iran")] | |
#### | |
## WHY CAN'T I FIND IRAN IN COMBINED2 BELOW WHILE IT IS IN COMBINED1??? | |
#### | |
#return combined2[combined2['Country'].str.contains("Iran")] | |
#return filtered[filtered.index.str.contains("Iran")] | |
return filtered[filtered["Rank"] <= 15] | |
answer_one() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment