Last active
January 7, 2018 08:08
-
-
Save Cross22/02aa297a88d1563e9b6e55d83ea9b9a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def fixCountryNames(c): | |
return c.split('(')[0].rstrip().strip('0123456789') | |
def fixEnergyNames(energy): | |
# Rename index labels using a string processing function | |
energy.rename(index=fixCountryNames, inplace=True) | |
# Rename index labels using a dictionary | |
countryRenames = { | |
"Republic of Korea": "South Korea", | |
"United States of America": "United States", | |
"United Kingdom of Great Britain and Northern Ireland": "United Kingdom", | |
"China, Hong Kong Special Administrative Region": "Hong Kong"} | |
energy.rename(countryRenames, inplace=True) | |
def loadEnergy(): | |
columnNames=['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable'] | |
# Load excel file, ignore first 18 lines and last 38 lines | |
# Parse columns [2,3,4,5] and use the list above as their names | |
# Treat any fields with "..." as NaN value | |
df = pd.read_excel("Energy Indicators.xls", | |
header=None, | |
skiprows=18, skip_footer=38, | |
usecols=range(2,6), | |
names=columnNames, na_values = "...") | |
# First column should be index labels | |
df.set_index('Country', inplace=True) | |
# Bad data fixups | |
fixEnergyNames(df) | |
return df | |
def loadGDP(): | |
df= pd.read_csv("world_bank.csv", skiprows=4) | |
# Change single column name | |
df.rename(index=str, columns={"Country Name":"Country"}, inplace=True) | |
# First column should be index labels | |
df.set_index('Country', inplace=True) | |
# Change some row labels | |
df.rename(index= | |
{"Korea, Rep.": "South Korea", | |
"Iran, Islamic Rep.": "Iran", | |
"Hong Kong SAR, China": "Hong Kong"}, inplace=True) | |
return df | |
def loadScience(): | |
df= pd.read_excel("scimagojr-3.xlsx") | |
# First column should be index labels | |
df.set_index('Country', inplace=True) | |
return df | |
def merge(GDP, energy, ScimEn): | |
#Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names). | |
#and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15). | |
ScimEn= ScimEn[ScimEn["Rank"]<=15] | |
GDP= GDP.merge(ScimEn, left_index=True, right_index=True) | |
GDP= GDP.merge(energy, left_index=True, right_index=True) | |
keepColumns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', | |
'Citations per document', 'H index', | |
'Energy Supply', 'Energy Supply per Capita', '% Renewable', | |
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'] | |
return GDP[keepColumns] | |
def answer_one(): | |
# Load XLS and promote the "Country" column to be the index labels | |
energy = loadEnergy() | |
# Multiply all field values in column "Energy Supply" | |
energy['Energy Supply'] *= 1_000_000 | |
GDP = loadGDP() | |
ScimEn = loadScience() | |
return merge(GDP,energy,ScimEn) | |
# make a new column based on existing columns | |
# then find maximum value and label of that max value in that new column | |
def answer_seven(): | |
Top15 = answer_one() | |
Top15["ratio"]=Top15["Self-citations"] / Top15["Citations"] | |
ser = Top15["ratio"] | |
return (ser.idxmax(),ser.max()) | |
# Make nice floating point output | |
#pd.options.display.float_format = '{:.2f}'.format |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment