Create a gist now

Instantly share code, notes, and snippets.

Basic data manipulation in Python
#------------------------------------------------------------------
# PROGRAM NAME: Python Examples.py
# DATE: 6/1/16
# CREATED BY: MATT BOGARD
# PROJECT FILE:
#----------------------------------------------------------------
# PURPOSE: BASIC DATA MANAGEMENT AND STATS IN PYTHON
# ---------------------------------------------------------------
#---------------------------------
# reading data
#----------------------------------
import pandas as pd #primary package for data manipulation
import numpy as np #primary scientific package for data analysis
# read csv
# The path is a raw string so the Windows backslashes stay literal instead of
# depending on '\D' and '\y' not being recognised escape sequences.
yield_data = pd.read_csv(r'C:\Documents\yield_plots.csv')
# print() with parentheses runs on both Python 2 and Python 3
print(yield_data)
# look at a few rows (the value is only echoed in an interactive session)
yield_data.head(n=5)
# make a data frame manually: one list of ten values per column
data = {
    'GARST': [150, 140, 145, 137, 141, 145, 149, 153, 157, 161],
    'PIO': [160, 150, 146, 138, 142, 146, 150, 154, 158, 162],
    'MYC': [137, 148, 151, 139, 143, 120, 115, 136, 130, 129],
    'DEK': [150, 149, 145, 140, 144, 148, 152, 156, 160, 164],
    'PLOT': list(range(1, 11)),
    'BT': ['Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y'],
    'RR': ['Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N'],
}
# spell out the column order so the frame is laid out left to right as listed
column_order = ['GARST', 'PIO', 'MYC', 'DEK', 'PLOT', 'BT', 'RR']
yield_data = pd.DataFrame(data, columns=column_order)
#---------------------------
# subsetting data
#---------------------------
# keep only the GARST and PIO columns
my_hybrids = yield_data[['GARST', 'PIO']]
# keep only the rows where GARST is 150 and PIO is 160
is_high_yield = (yield_data['GARST'] == 150) & (yield_data['PIO'] == 160)
high_yields = yield_data.loc[is_high_yield]
# keep only the rows where both BT and RR are flagged 'Y'
is_stacked = (yield_data['BT'] == "Y") & (yield_data['RR'] == "Y")
stacked_traits = yield_data.loc[is_stacked]
#--------------------------------------
# creating and adding new variables
#--------------------------------------
# new column: GARST yield minus PIO yield, computed row by row
garst_yield = yield_data['GARST']
pio_yield = yield_data['PIO']
yield_data['d_grst_pio'] = garst_yield - pio_yield
# first five rows, now including the new column
yield_data.head(5)
#----------------------------------
# conditional processing
#----------------------------------
# tutorial: http://anh.cs.luc.edu/python/hands-on/3.1/handsonHtml/ifstatements.html
# Label every plot by its traits. Each update is one .loc[rows, 'GMO']
# assignment: the chained form yield_data['GMO'][mask] = ... assigns into an
# intermediate object, which pandas flags with SettingWithCopyWarning and
# which may never reach yield_data.
has_rr = yield_data['RR'] == 'Y'
has_bt = yield_data['BT'] == 'Y'
# set a default value
yield_data['GMO'] = 'Non-GMO '
# check for RR traits
yield_data.loc[has_rr, 'GMO'] = 'Single Trait '
# check for BT traits
yield_data.loc[has_bt, 'GMO'] = 'Single Trait '
# check for double stacked traits (applied last, so it replaces the single-trait label)
yield_data.loc[has_bt & has_rr, 'GMO'] = 'Stacked Trait'
#-----------------------------------------
# stacking and merging data
#-----------------------------------------
# reference: http://pandas.pydata.org/pandas-docs/stable/merging.html
# split the rows in two: PLOT <= 5 and PLOT > 5
in_top = yield_data['PLOT'] <= 5
in_bottom = yield_data['PLOT'] > 5
top = yield_data.loc[in_top]
bottom = yield_data.loc[in_bottom]
# put the two pieces back on top of each other with a fresh 0..n-1 index
stack = pd.concat([top, bottom], ignore_index=True)
# two narrow frames that share the PLOT column
hybrid = yield_data[['GARST', 'PLOT']]
traits = yield_data[['GMO', 'PLOT']]
# left join: keep every row of hybrid and attach the matching GMO label by PLOT
hybrid_traits = pd.merge(hybrid, traits, how='left', on='PLOT')
#-------------------------------------
# sorting data
#------------------------------------
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Each call returns a new sorted frame and leaves hybrid_traits itself
# unchanged, so assign the result if the sorted order is needed later.
# sort descending by PLOT
hybrid_traits.sort_values('PLOT', ascending=False)
# sort ascending by trait and by descending GARST yield
hybrid_traits.sort_values(['GMO', 'GARST'], ascending=[True, False])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment