Skip to content

Instantly share code, notes, and snippets.

@Cross22
Last active January 7, 2018 08:08
Show Gist options
  • Save Cross22/02aa297a88d1563e9b6e55d83ea9b9a8 to your computer and use it in GitHub Desktop.
Save Cross22/02aa297a88d1563e9b6e55d83ea9b9a8 to your computer and use it in GitHub Desktop.
import pandas as pd
def fixCountryNames(c):
return c.split('(')[0].rstrip().strip('0123456789')
def fixEnergyNames(energy):
# Rename index labels using a string processing function
energy.rename(index=fixCountryNames, inplace=True)
# Rename index labels using a dictionary
countryRenames = {
"Republic of Korea": "South Korea",
"United States of America": "United States",
"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
"China, Hong Kong Special Administrative Region": "Hong Kong"}
energy.rename(countryRenames, inplace=True)
def loadEnergy():
columnNames=['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
# Load excel file, ignore first 18 lines and last 38 lines
# Parse columns [2,3,4,5] and use the list above as their names
# Treat any fields with "..." as NaN value
df = pd.read_excel("Energy Indicators.xls",
header=None,
skiprows=18, skip_footer=38,
usecols=range(2,6),
names=columnNames, na_values = "...")
# First column should be index labels
df.set_index('Country', inplace=True)
# Bad data fixups
fixEnergyNames(df)
return df
def loadGDP():
df= pd.read_csv("world_bank.csv", skiprows=4)
# Change single column name
df.rename(index=str, columns={"Country Name":"Country"}, inplace=True)
# First column should be index labels
df.set_index('Country', inplace=True)
# Change some row labels
df.rename(index=
{"Korea, Rep.": "South Korea",
"Iran, Islamic Rep.": "Iran",
"Hong Kong SAR, China": "Hong Kong"}, inplace=True)
return df
def loadScience():
df= pd.read_excel("scimagojr-3.xlsx")
# First column should be index labels
df.set_index('Country', inplace=True)
return df
def merge(GDP, energy, ScimEn):
#Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names).
#and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15).
ScimEn= ScimEn[ScimEn["Rank"]<=15]
GDP= GDP.merge(ScimEn, left_index=True, right_index=True)
GDP= GDP.merge(energy, left_index=True, right_index=True)
keepColumns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
'Citations per document', 'H index',
'Energy Supply', 'Energy Supply per Capita', '% Renewable',
'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
return GDP[keepColumns]
def answer_one():
# Load XLS and promote the "Country" column to be the index labels
energy = loadEnergy()
# Multiply all field values in column "Energy Supply"
energy['Energy Supply'] *= 1_000_000
GDP = loadGDP()
ScimEn = loadScience()
return merge(GDP,energy,ScimEn)
# make a new column based on existing columns
# then find maximum value and label of that max value in that new column
def answer_seven():
Top15 = answer_one()
Top15["ratio"]=Top15["Self-citations"] / Top15["Citations"]
ser = Top15["ratio"]
return (ser.idxmax(),ser.max())
# Make nice floating point output
#pd.options.display.float_format = '{:.2f}'.format
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment