martijnburger/answer_one.py

## answer_one.py
import pandas as pd
import numbers
import re

country_dict = {"Republic of Korea": "South Korea",
           "United States of America": "United States",
           "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
           "China, Hong Kong Special Administrative Region": "Hong Kong",
           "Korea, Rep.": "South Korea",
           "Iran, Islamic Rep.": "Iran",
           "Hong Kong SAR, China": "Hong Kong"}

def convert_county_name(name):
    name = re.sub("\(.*\)", "", name)
    name = re.sub("[0-9]", "", name)
    for k, v in country_dict.items():
        name = name.replace(k, v)
    return name

def answer_one():
    en_converters = {1 : lambda x: x * 1000000 if isinstance(x, numbers.Number) else x,
                     0 : convert_county_name}
    energy = pd.read_excel("Energy Indicators.xls",
                          skiprows = 17,
                          skip_footer = 38,
                          parse_cols = [2,3,4,5],
                          names = ['Country', 'Energy Supply', 'Energy Supply per Capita',
                                   '% Renewable'],
                          na_values="...",
                          converters = en_converters)

    gdp_converters = {0 : convert_county_name}
    GDP = pd.read_csv("world_bank.csv",
                      skiprows=4,
                      converters = gdp_converters,
                      usecols=['Country Name', '2006', '2007', '2008', '2009', '2010', '2011',
                               '2012', '2013', '2014', '2015'])
    GDP = GDP.rename(columns = {'Country Name':'Country'})

    ScimEn = pd.read_excel("scimagojr-3.xlsx")
    combined1 = pd.merge(GDP, ScimEn, how='inner', on='Country')
    combined2 = pd.merge(combined1, energy, how='inner', on='Country')
    filtered_columns = ['Country', 'Rank', 'Documents', 'Citable documents', 'Citations',
                        'Self-citations', 'Citations per document', 'H index', 'Energy Supply',
                        'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',
                        '2009', '2010', '2011', '2012', '2013', '2014', '2015']
    filtered = combined2[filtered_columns].set_index('Country')
    #return energy[energy['Country'].str.contains("Iran")]
    #return ScimEn[ScimEn['Country'].str.contains("Iran")]['Country']
    #return GDP[GDP['Country'].str.contains("Iran")]['Country']
    #return combined1[combined1['Country'].str.contains("Iran")]
    ####
    ## WHY CAN'T I FIND IRAN IN COMBINED2 BELOW WHILE IT IS IN COMBINED1???
    ####
    #return combined2[combined2['Country'].str.contains("Iran")]
    #return filtered[filtered.index.str.contains("Iran")]
    return filtered[filtered["Rank"] <= 15]
answer_one()
	import pandas as pd
	import numbers
	import re

	country_dict = {"Republic of Korea": "South Korea",
	"United States of America": "United States",
	"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
	"China, Hong Kong Special Administrative Region": "Hong Kong",
	"Korea, Rep.": "South Korea",
	"Iran, Islamic Rep.": "Iran",
	"Hong Kong SAR, China": "Hong Kong"}

	def convert_county_name(name):
	name = re.sub("\(.*\)", "", name)
	name = re.sub("[0-9]", "", name)
	for k, v in country_dict.items():
	name = name.replace(k, v)
	return name

	def answer_one():
	en_converters = {1 : lambda x: x * 1000000 if isinstance(x, numbers.Number) else x,
	0 : convert_county_name}
	energy = pd.read_excel("Energy Indicators.xls",
	skiprows = 17,
	skip_footer = 38,
	parse_cols = [2,3,4,5],
	names = ['Country', 'Energy Supply', 'Energy Supply per Capita',
	'% Renewable'],
	na_values="...",
	converters = en_converters)

	gdp_converters = {0 : convert_county_name}
	GDP = pd.read_csv("world_bank.csv",
	skiprows=4,
	converters = gdp_converters,
	usecols=['Country Name', '2006', '2007', '2008', '2009', '2010', '2011',
	'2012', '2013', '2014', '2015'])
	GDP = GDP.rename(columns = {'Country Name':'Country'})

	ScimEn = pd.read_excel("scimagojr-3.xlsx")
	combined1 = pd.merge(GDP, ScimEn, how='inner', on='Country')
	combined2 = pd.merge(combined1, energy, how='inner', on='Country')
	filtered_columns = ['Country', 'Rank', 'Documents', 'Citable documents', 'Citations',
	'Self-citations', 'Citations per document', 'H index', 'Energy Supply',
	'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',
	'2009', '2010', '2011', '2012', '2013', '2014', '2015']
	filtered = combined2[filtered_columns].set_index('Country')
	#return energy[energy['Country'].str.contains("Iran")]
	#return ScimEn[ScimEn['Country'].str.contains("Iran")]['Country']
	#return GDP[GDP['Country'].str.contains("Iran")]['Country']
	#return combined1[combined1['Country'].str.contains("Iran")]
	####
	## WHY CAN'T I FIND IRAN IN COMBINED2 BELOW WHILE IT IS IN COMBINED1???
	####
	#return combined2[combined2['Country'].str.contains("Iran")]
	#return filtered[filtered.index.str.contains("Iran")]
	return filtered[filtered["Rank"] <= 15]
	answer_one()