Skip to content

Instantly share code, notes, and snippets.

@edencakir
Created January 26, 2017 20:46
Show Gist options
  • Save edencakir/18cccde112601345e7d048d632233060 to your computer and use it in GitHub Desktop.
Save edencakir/18cccde112601345e7d048d632233060 to your computer and use it in GitHub Desktop.
Coursera Data Analysis First Assignment
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 21:44:11 2017
@author: mllearner
"""
import pandas
import numpy # Ignore the warning. going to be used later on.
# Columns analysed by this script, paired with the description used in the
# headings printed for the full data set.
# NOTE: the column names are used exactly as they appear in the CSV header;
# the original author skipped upper-casing them because they are already
# known to be lower case.
VARIABLES = [
    ("femaleemployrate",
     "Female Employ Rate - usual frequency how females are employed"),
    ("internetuserate",
     "Internet Use Rate - usual frequency how many people are using the internet"),
    ("urbanrate",
     "Urban Rate - usual frequency how many people are living in the urbans"),
]

# Columns that must hold a value for a row to be kept in the subset.
REQUIRED_COLUMNS = ["femaleemployrate", "internetuserate"]


def to_numeric_columns(frame, columns):
    """Convert the given columns of *frame* to numeric values, in place.

    Values that cannot be parsed as numbers (for example blank cells) become
    NaN.  ``pandas.to_numeric(..., errors='coerce')`` replaces the removed
    ``Series.convert_objects(convert_numeric=True)`` call.

    Returns *frame* so calls can be chained.
    """
    for column in columns:
        frame[column] = pandas.to_numeric(frame[column], errors='coerce')
    return frame


def frequency_tables(series):
    """Return ``(counts, percentages)`` for the distinct values of *series*.

    Both tables are unsorted (``sort=False``).  ``value_counts`` skips NaN by
    default, so the percentages are fractions of the non-missing values.
    """
    counts = series.value_counts(sort=False)
    percentages = series.value_counts(sort=False, normalize=True)
    return counts, percentages


def complete_cases(frame, columns):
    """Return a copy of the rows of *frame* with a value >= 0 in every column.

    Comparing NaN with ``>= 0`` is False, so rows with missing data in any of
    *columns* are dropped.  More elaborate conditions could be combined here
    to narrow the result further.
    """
    keep = (frame[columns] >= 0).all(axis=1)
    return frame[keep].copy()


def report_frequencies(frame, column, description):
    """Print the count and percentage tables of ``frame[column]``.

    Each table is preceded by a heading built from *description*.
    Returns the ``(counts, percentages)`` pair that was printed.
    """
    counts, percentages = frequency_tables(frame[column])
    print("counts for " + description)
    print(counts)
    print("percentages for " + description)
    print(percentages)
    return counts, percentages


def main(csv_path='gapminder.csv'):
    """Load the gapminder CSV and print frequency tables for VARIABLES."""
    data = pandas.read_csv(csv_path)
    print(len(data))  # Number of observations
    print(len(data.columns))  # Number of variables

    # Show floats in plain fixed-point notation instead of scientific.
    pandas.set_option('display.float_format', lambda x: '%f' % x)

    to_numeric_columns(data, [column for column, _ in VARIABLES])

    # Full data set.  The original author notes that with many rows (213
    # countries) the individual percentages are very low.
    for column, description in VARIABLES:
        report_frequencies(data, column, description)

    # Subset of observations with no empty data in the required columns.
    subset = complete_cases(data, REQUIRED_COLUMNS)
    for column, _ in VARIABLES:
        report_frequencies(subset, column, column)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment