Created
January 26, 2017 20:46
-
-
Save edencakir/18cccde112601345e7d048d632233060 to your computer and use it in GitHub Desktop.
Coursera Data Analysis First Assignment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 21:44:11 2017
@author: mllearner
"""
import pandas
import numpy  # Not used yet (a linter will warn about it); needed later on.

# Load the Gapminder data set. The relative path means the CSV has to sit in
# the current working directory.
data = pandas.read_csv('gapminder.csv')

# A parenthesised single-argument print works under both Python 2 and 3.
print(len(data))  # Number of observations
print(len(data.columns))  # Number of variables

# IMPORTANT NOTE: All variable names in this data set are already lower case,
# so the step that normalises the case of the column names is skipped.

# Render floats in fixed-point notation ('%f') when pandas displays them.
pandas.set_option('display.float_format', lambda x: '%f' % x)

datavariable1 = "femaleemployrate"
datavariable2 = "internetuserate"
datavariable3 = "urbanrate"

# Coerce the three analysed columns to numeric values; anything that cannot
# be parsed as a number becomes NaN. pandas.to_numeric is the replacement for
# Series.convert_objects(convert_numeric=True), which current pandas releases
# no longer provide.
for column in (datavariable1, datavariable2, datavariable3):
    data[column] = pandas.to_numeric(data[column], errors='coerce')
# Frequency tables computed over every row of the data set. Each heading is
# followed by the table it announces. value_counts() leaves out missing values
# (NaN) by default, so rows without a value are not counted.
print("counts for Female Employ Rate - usual frequency how females are employed")
c1 = data[datavariable1].value_counts(sort=False)
print(c1)

print("percentages for Female Employ Rate - usual frequency how females are employed")
p1 = data[datavariable1].value_counts(sort=False, normalize=True)
print(p1)

print("counts for Internet Use Rate - usual frequency how many people are using the internet")
c2 = data[datavariable2].value_counts(sort=False)
print(c2)

print("percentages for Internet Use Rate - usual frequency how many people are using the internet")
p2 = data[datavariable2].value_counts(sort=False, normalize=True)
print(p2)

print("counts for Urban Rate - usual frequency how many people are living in the urbans")
c3 = data[datavariable3].value_counts(sort=False)
print(c3)

print("percentages for Urban Rate - usual frequency how many people are living in the urbans")
# NOTE: this table is called p4 (not p3); the name is kept so that anything
# already referring to it keeps working.
p4 = data[datavariable3].value_counts(sort=False, normalize=True)
print(p4)

# With so many rows (213, one per country), each individual percentage is
# very small.
# Restrict the data to observations that have both a female employment rate
# and an internet use rate. A comparison against NaN is False, so rows missing
# either value fail the ">= 0" test. More elaborate conditions can be combined
# in the same way to narrow the results further.
has_employ_rate = data['femaleemployrate'].ge(0)
has_internet_rate = data['internetuserate'].ge(0)
sub1 = data[has_employ_rate & has_internet_rate]
sub2 = sub1.copy()  # work on an independent copy of the filtered rows

# Counts and relative frequencies for each of the three variables, computed
# on the filtered copy.
c11 = sub2[datavariable1].value_counts(sort=False)
p12 = sub2[datavariable1].value_counts(sort=False, normalize=True)
c21 = sub2[datavariable2].value_counts(sort=False)
p22 = sub2[datavariable2].value_counts(sort=False, normalize=True)
c31 = sub2[datavariable3].value_counts(sort=False)
p32 = sub2[datavariable3].value_counts(sort=False, normalize=True)

# Print every table directly beneath its heading, in a fixed order.
for heading, table in (
    ("counts for femaleemployrate", c11),
    ("percentages for femaleemployrate", p12),
    ("counts for internetuserate", c21),
    ("percentages for internetuserate", p22),
    ("counts for urbanrate", c31),
    ("percentages for urbanrate", p32),
):
    print(heading)
    print(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment