Cross22/Pandas notes (coursera course)

## Pandas notes (coursera course)
import pandas as pd

def fixCountryNames(c):
    return c.split('(')[0].rstrip().strip('0123456789')

def fixEnergyNames(energy):
    # Rename index labels using a string processing function
    energy.rename(index=fixCountryNames, inplace=True)

    # Rename index labels using a dictionary
    countryRenames = {
        "Republic of Korea": "South Korea",
        "United States of America": "United States",
        "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
        "China, Hong Kong Special Administrative Region": "Hong Kong"}
    energy.rename(countryRenames, inplace=True)

def loadEnergy():
    columnNames=['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']

    # Load excel file, ignore first 18 lines and last 38 lines
    # Parse columns [2,3,4,5] and use the list above as their names
    # Treat any fields with "..." as NaN value
    df = pd.read_excel("Energy Indicators.xls",
                            header=None,
                            skiprows=18, skip_footer=38,
                            usecols=range(2,6),
                            names=columnNames, na_values = "...")

    # First column should be index labels
    df.set_index('Country', inplace=True)

    # Bad data fixups
    fixEnergyNames(df)
    return df

def loadGDP():
    df= pd.read_csv("world_bank.csv", skiprows=4)
    # Change single column name
    df.rename(index=str, columns={"Country Name":"Country"}, inplace=True)

    # First column should be index labels
    df.set_index('Country', inplace=True)

    # Change some row labels
    df.rename(index=
           {"Korea, Rep.": "South Korea",
            "Iran, Islamic Rep.": "Iran",
            "Hong Kong SAR, China": "Hong Kong"}, inplace=True)

    return df

def loadScience():
    df= pd.read_excel("scimagojr-3.xlsx")
    # First column should be index labels
    df.set_index('Country', inplace=True)
    return df

def merge(GDP, energy, ScimEn):
    #Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names).
    #and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15).
    ScimEn= ScimEn[ScimEn["Rank"]<=15]
    GDP= GDP.merge(ScimEn, left_index=True, right_index=True)
    GDP= GDP.merge(energy, left_index=True, right_index=True)

    keepColumns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
                   'Citations per document', 'H index',
                   'Energy Supply', 'Energy Supply per Capita', '% Renewable',
                    '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
    return GDP[keepColumns]

def answer_one():
    # Load XLS and promote the "Country" column to be the index labels
    energy = loadEnergy()
    # Multiply all field values in column "Energy Supply"
    energy['Energy Supply'] *= 1_000_000

    GDP = loadGDP()
    ScimEn = loadScience()

    return merge(GDP,energy,ScimEn)

# make a new column based on existing columns
# then find maximum value and label of that max value in that new column
def answer_seven():
    Top15 = answer_one()
    Top15["ratio"]=Top15["Self-citations"] / Top15["Citations"]
    ser = Top15["ratio"]
    return (ser.idxmax(),ser.max())

# Make nice floating point output
#pd.options.display.float_format = '{:.2f}'.format
	import pandas as pd

	def fixCountryNames(c):
	return c.split('(')[0].rstrip().strip('0123456789')

	def fixEnergyNames(energy):
	# Rename index labels using a string processing function
	energy.rename(index=fixCountryNames, inplace=True)

	# Rename index labels using a dictionary
	countryRenames = {
	"Republic of Korea": "South Korea",
	"United States of America": "United States",
	"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
	"China, Hong Kong Special Administrative Region": "Hong Kong"}
	energy.rename(countryRenames, inplace=True)

	def loadEnergy():
	columnNames=['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']

	# Load excel file, ignore first 18 lines and last 38 lines
	# Parse columns [2,3,4,5] and use the list above as their names
	# Treat any fields with "..." as NaN value
	df = pd.read_excel("Energy Indicators.xls",
	header=None,
	skiprows=18, skip_footer=38,
	usecols=range(2,6),
	names=columnNames, na_values = "...")

	# First column should be index labels
	df.set_index('Country', inplace=True)

	# Bad data fixups
	fixEnergyNames(df)
	return df

	def loadGDP():
	df= pd.read_csv("world_bank.csv", skiprows=4)
	# Change single column name
	df.rename(index=str, columns={"Country Name":"Country"}, inplace=True)

	# First column should be index labels
	df.set_index('Country', inplace=True)

	# Change some row labels
	df.rename(index=
	{"Korea, Rep.": "South Korea",
	"Iran, Islamic Rep.": "Iran",
	"Hong Kong SAR, China": "Hong Kong"}, inplace=True)

	return df

	def loadScience():
	df= pd.read_excel("scimagojr-3.xlsx")
	# First column should be index labels
	df.set_index('Country', inplace=True)
	return df

	def merge(GDP, energy, ScimEn):
	#Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names).
	#and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15).
	ScimEn= ScimEn[ScimEn["Rank"]<=15]
	GDP= GDP.merge(ScimEn, left_index=True, right_index=True)
	GDP= GDP.merge(energy, left_index=True, right_index=True)

	keepColumns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
	'Citations per document', 'H index',
	'Energy Supply', 'Energy Supply per Capita', '% Renewable',
	'2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']
	return GDP[keepColumns]

	def answer_one():
	# Load XLS and promote the "Country" column to be the index labels
	energy = loadEnergy()
	# Multiply all field values in column "Energy Supply"
	energy['Energy Supply'] *= 1_000_000

	GDP = loadGDP()
	ScimEn = loadScience()

	return merge(GDP,energy,ScimEn)

	# make a new column based on existing columns
	# then find maximum value and label of that max value in that new column
	def answer_seven():
	Top15 = answer_one()
	Top15["ratio"]=Top15["Self-citations"] / Top15["Citations"]
	ser = Top15["ratio"]
	return (ser.idxmax(),ser.max())

	# Make nice floating point output
	#pd.options.display.float_format = '{:.2f}'.format