edencakir/assignment.py

## assignment.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 27 11:30:33 2017

@author: mllearner
"""

import pandas
import numpy

# Load the Gapminder data set; the relative path is resolved against the
# current working directory.
data = pandas.read_csv('gapminder.csv')
print(len(data))          # Number of observations
print(len(data.columns))  # Number of variables

# NOTE: the column names are already lower case, so the usual step of
# normalising their case is skipped.
# Display floats with the fixed-point '%f' format.
pandas.set_option('display.float_format', lambda x: '%f' % x)

datavariable1 = "femaleemployrate"
datavariable2 = "internetuserate"
datavariable3 = "urbanrate"

# Coerce the three columns to numbers; values that cannot be parsed become NaN.
# pandas.to_numeric replaces Series.convert_objects(convert_numeric=True),
# which is deprecated and no longer exists in current pandas releases.
data[datavariable1] = pandas.to_numeric(data[datavariable1], errors='coerce')
data[datavariable2] = pandas.to_numeric(data[datavariable2], errors='coerce')
data[datavariable3] = pandas.to_numeric(data[datavariable3], errors='coerce')

# Keep only the rows where both rates are strictly positive. NaN compares as
# False, so rows with a missing value in either column are dropped as well.
# More elaborate boolean conditions can be combined here to narrow the subset.
sub1 = data[(data['femaleemployrate'] > 0) & (data['internetuserate'] > 0)]

# Work on an independent copy so the assignments below do not touch sub1.
sub2 = sub1.copy()

# The label still mentions femaleemployrate, but the counts are of polityscore.
print("counts for femaleemployrate // switched to polityscore due to readibilty")
c11 = sub2['polityscore'].value_counts(sort=False)
print(c11)

# Recode the string '-2' as missing, then count again with NaN included.
# NOTE(review): polityscore is never converted to numeric, so this only
# matches while the column holds strings -- confirm the dtype read_csv yields.
sub2['polityscore'] = sub2['polityscore'].replace('-2', numpy.nan)
ctest = sub2['polityscore'].value_counts(sort=False, dropna=False)
print(ctest)
# Author's note: doing the same recode on femaleemployrate showed no NaN
# values (attributed to a floating-point matching issue), which is why
# polityscore is used for this step instead.

# Flag which of the first 25 polityscore values are missing, since the data
# set contains missing values. The result is stored but not printed.
cnull = sub2['polityscore'].head(25).isnull()
# Author's note from the original run: none of them were null; the author
# also remarked "polityscore has 2, 4 and 210".
# Replace urbanrate with its square (element-wise product with itself).
sub2['urbanrate'] = sub2['urbanrate'] * sub2['urbanrate']
	#!/usr/bin/env python2
	# -*- coding: utf-8 -*-
	"""
	Created on Thu Jan 27 11:30:33 2017

	@author: mllearner
	"""

	import pandas
	import numpy

	# Load the Gapminder data set; the relative path is resolved against the
	# current working directory.
	data = pandas.read_csv('gapminder.csv')
	print(len(data))          # Number of observations
	print(len(data.columns))  # Number of variables

	# NOTE: the column names are already lower case, so the usual step of
	# normalising their case is skipped.
	# Display floats with the fixed-point '%f' format.
	pandas.set_option('display.float_format', lambda x: '%f' % x)

	datavariable1 = "femaleemployrate"
	datavariable2 = "internetuserate"
	datavariable3 = "urbanrate"

	# Coerce the three columns to numbers; values that cannot be parsed become NaN.
	# pandas.to_numeric replaces Series.convert_objects(convert_numeric=True),
	# which is deprecated and no longer exists in current pandas releases.
	data[datavariable1] = pandas.to_numeric(data[datavariable1], errors='coerce')
	data[datavariable2] = pandas.to_numeric(data[datavariable2], errors='coerce')
	data[datavariable3] = pandas.to_numeric(data[datavariable3], errors='coerce')

	# Keep only the rows where both rates are strictly positive. NaN compares as
	# False, so rows with a missing value in either column are dropped as well.
	# More elaborate boolean conditions can be combined here to narrow the subset.
	sub1 = data[(data['femaleemployrate'] > 0) & (data['internetuserate'] > 0)]

	# Work on an independent copy so the assignments below do not touch sub1.
	sub2 = sub1.copy()

	# The label still mentions femaleemployrate, but the counts are of polityscore.
	print("counts for femaleemployrate // switched to polityscore due to readibilty")
	c11 = sub2['polityscore'].value_counts(sort=False)
	print(c11)

	# Recode the string '-2' as missing, then count again with NaN included.
	# NOTE(review): polityscore is never converted to numeric, so this only
	# matches while the column holds strings -- confirm the dtype read_csv yields.
	sub2['polityscore'] = sub2['polityscore'].replace('-2', numpy.nan)
	ctest = sub2['polityscore'].value_counts(sort=False, dropna=False)
	print(ctest)
	# Author's note: doing the same recode on femaleemployrate showed no NaN
	# values (attributed to a floating-point matching issue), which is why
	# polityscore is used for this step instead.

	# Flag which of the first 25 polityscore values are missing, since the data
	# set contains missing values. The result is stored but not printed.
	cnull = sub2['polityscore'].head(25).isnull()
	# Author's note from the original run: none of them were null; the author
	# also remarked "polityscore has 2, 4 and 210".
	# Replace urbanrate with its square (element-wise product with itself).
	sub2['urbanrate'] = sub2['urbanrate'] * sub2['urbanrate']