Skip to content

Instantly share code, notes, and snippets.

@martijnburger
Created January 8, 2017 23:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martijnburger/508d97928b1160deeb8a9e1a56f7975c to your computer and use it in GitHub Desktop.
Save martijnburger/508d97928b1160deeb8a9e1a56f7975c to your computer and use it in GitHub Desktop.
import pandas as pd
import numbers
import re
country_dict = {"Republic of Korea": "South Korea",
"United States of America": "United States",
"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
"China, Hong Kong Special Administrative Region": "Hong Kong",
"Korea, Rep.": "South Korea",
"Iran, Islamic Rep.": "Iran",
"Hong Kong SAR, China": "Hong Kong"}
def convert_county_name(name):
name = re.sub("\(.*\)", "", name)
name = re.sub("[0-9]", "", name)
for k, v in country_dict.items():
name = name.replace(k, v)
return name
def answer_one():
en_converters = {1 : lambda x: x * 1000000 if isinstance(x, numbers.Number) else x,
0 : convert_county_name}
energy = pd.read_excel("Energy Indicators.xls",
skiprows = 17,
skip_footer = 38,
parse_cols = [2,3,4,5],
names = ['Country', 'Energy Supply', 'Energy Supply per Capita',
'% Renewable'],
na_values="...",
converters = en_converters)
gdp_converters = {0 : convert_county_name}
GDP = pd.read_csv("world_bank.csv",
skiprows=4,
converters = gdp_converters,
usecols=['Country Name', '2006', '2007', '2008', '2009', '2010', '2011',
'2012', '2013', '2014', '2015'])
GDP = GDP.rename(columns = {'Country Name':'Country'})
ScimEn = pd.read_excel("scimagojr-3.xlsx")
combined1 = pd.merge(GDP, ScimEn, how='inner', on='Country')
combined2 = pd.merge(combined1, energy, how='inner', on='Country')
filtered_columns = ['Country', 'Rank', 'Documents', 'Citable documents', 'Citations',
'Self-citations', 'Citations per document', 'H index', 'Energy Supply',
'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',
'2009', '2010', '2011', '2012', '2013', '2014', '2015']
filtered = combined2[filtered_columns].set_index('Country')
#return energy[energy['Country'].str.contains("Iran")]
#return ScimEn[ScimEn['Country'].str.contains("Iran")]['Country']
#return GDP[GDP['Country'].str.contains("Iran")]['Country']
#return combined1[combined1['Country'].str.contains("Iran")]
####
## WHY CAN'T I FIND IRAN IN COMBINED2 BELOW WHILE IT IS IN COMBINED1???
####
#return combined2[combined2['Country'].str.contains("Iran")]
#return filtered[filtered.index.str.contains("Iran")]
return filtered[filtered["Rank"] <= 15]
answer_one()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment