Skip to content

Instantly share code, notes, and snippets.

@edencakir
Created January 27, 2017 19:38
Show Gist options
  • Save edencakir/ddf6a2d63293fef5826e4c6d882d637f to your computer and use it in GitHub Desktop.
Save edencakir/ddf6a2d63293fef5826e4c6d882d637f to your computer and use it in GitHub Desktop.
Making Data Management Decisions
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 27 11:30:33 2017
@author: mllearner
"""
import pandas
import numpy
# Load the GapMinder dataset; 'gapminder.csv' is resolved relative to the
# current working directory.
data = pandas.read_csv('gapminder.csv')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(len(data))  # Number of observations
print(len(data.columns))  # Number of variables

# IMPORTANT NOTE: the column names in this dataset are already lower case, so
# the usual step of normalising the case of every column name is skipped.

# Show floats in fixed-point notation instead of scientific notation.
pandas.set_option('display.float_format', lambda x: '%f' % x)

datavariable1 = "femaleemployrate"
datavariable2 = "internetuserate"
datavariable3 = "urbanrate"

# Coerce the three columns of interest to numeric dtype; values that cannot be
# parsed become NaN.  pandas.to_numeric replaces Series.convert_objects, which
# was deprecated in pandas 0.17 and removed in pandas 1.0.
data[datavariable1] = pandas.to_numeric(data[datavariable1], errors='coerce')
data[datavariable2] = pandas.to_numeric(data[datavariable2], errors='coerce')
data[datavariable3] = pandas.to_numeric(data[datavariable3], errors='coerce')

# Keep only the observations where both rates are strictly positive.  A NaN
# compares False against 0, so rows with missing values in either column are
# dropped as well.  More elaborate conditions can be combined here to narrow
# the subset further.
sub1 = data[(data[datavariable1] > 0) & (data[datavariable2] > 0)]
# Work on an independent copy so the assignments below do not write into a
# slice of `data`.
sub2 = sub1.copy()

print("counts for femaleemployrate // switched to polityscore due to readibilty")
c11 = sub2['polityscore'].value_counts(sort=False)
print(c11)

# Recode the value '-2' as missing.
# NOTE(review): the target is the *string* '-2' and polityscore is not
# converted to numeric above, so this presumes the column was read as text;
# if it is numeric, nothing is replaced -- confirm against the data.
sub2['polityscore'] = sub2['polityscore'].replace('-2', numpy.nan)
# dropna=False makes value_counts report the NaN bucket too.
ctest = sub2['polityscore'].value_counts(sort=False, dropna=False)
print(ctest)

# Original author's notes: running the same recode on femaleemployrate instead
# of polityscore shows no NaN values because of floating-point representation
# issues, which is why polityscore is used for this demonstration.

# Check the first 25 polityscore values for missing entries, since the dataset
# contains them.  The result is kept in `cnull` but not printed.
cnull = sub2['polityscore'].head(25).isnull()
# Original author's notes: "none of them is null. oh polityscore has 2, 4 and
# 210. bingo."

# Derived variable: overwrite urbanrate with its square.
sub2['urbanrate'] = sub2['urbanrate'] * sub2['urbanrate']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment