Created
January 27, 2017 19:38
-
-
Save edencakir/ddf6a2d63293fef5826e4c6d882d637f to your computer and use it in GitHub Desktop.
Making Data Management Decisions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 27 11:30:33 2017
@author: mllearner

Data-management steps on the Gapminder data set ('gapminder.csv'):
coerce three rate columns to numeric, keep the rows where both
femaleemployrate and internetuserate are positive, recode one
polityscore value as missing, and derive a squared urbanrate column.
"""
import pandas
import numpy

data = pandas.read_csv('gapminder.csv')
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(data))  # Number of observations
print(len(data.columns))  # Number of variables

# IMPORTANT NOTE: All variable names in this data set are already lower case,
# so the usual step of normalising the column-name case is skipped.
pandas.set_option('display.float_format', lambda x: '%f' % x)

datavariable1 = "femaleemployrate"
datavariable2 = "internetuserate"
datavariable3 = "urbanrate"

# Series.convert_objects(convert_numeric=True) was deprecated and later
# removed from pandas. pandas.to_numeric(errors='coerce') is its replacement:
# values that cannot be parsed as numbers become NaN.
for column in (datavariable1, datavariable2, datavariable3):
    data[column] = pandas.to_numeric(data[column], errors='coerce')

# Subset of observations where both rates are strictly positive. Rows with
# missing values drop out as well, because NaN > 0 evaluates to False.
# More complicated logic statements can be added to narrow the results further.
sub1 = data[(data[datavariable1] > 0) & (data[datavariable2] > 0)]
sub2 = sub1.copy()  # independent copy, so the edits below do not touch `data`

print("counts for femaleemployrate // switched to polityscore due to readibilty")
c11 = sub2['polityscore'].value_counts(sort=False)
print(c11)

# Recode the value '-2' as missing and show the counts again, this time
# including NaN.
# NOTE(review): the replacement matches the *string* '-2'; if polityscore was
# parsed as a numeric column it will not match anything -- confirm the dtype.
sub2['polityscore'] = sub2['polityscore'].replace('-2', numpy.nan)
ctest = sub2['polityscore'].value_counts(sort=False, dropna=False)
print(ctest)

# Author's observation: using femaleemployrate instead of polityscore here
# shows no NaN values because of a floating-point issue, hence the switch to
# polityscore.

# Check the first 25 polityscore values for nulls, since the data set
# includes missing values. The result is kept for interactive inspection only.
cnull = sub2['polityscore'].head(25).isnull()
# Author's observation: none of them is null; polityscore has 2, 4 and 210.

# Secondary variable: replace urbanrate with its square.
sub2[datavariable3] = sub2[datavariable3] * sub2[datavariable3]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment