# *-----------------------------------------------------------------
# | PROGRAM NAME: py code pallette.py
# | DATE: 7/17/20 (original: 12/28/18)
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: examples of data munging and analysis in python
# *----------------------------------------------------------------

# see also: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_r.html

###############################################################################
###############################################################################
#################           under construction             ####################
###############################################################################
###############################################################################
#----------------------------------
# basic data management
#----------------------------------

import pandas as pd

pd.set_option('display.max_rows', None) # untruncated output (see: https://thispointer.com/python-pandas-how-to-display-full-dataframe-i-e-print-all-rows-columns-without-truncation/)
pd.set_option('display.float_format', '{:.2f}'.format) # suppress scientific notation

# list all data frames in the current session
sheets = []
for var in dir():
    if isinstance(locals()[var], pd.core.frame.DataFrame) and var[0] != '_':
        sheets.append(var)
print(sheets)
# delete a data frame and release memory
import gc
del df
gc.collect()
df = pd.DataFrame()

# multiple dfs
del df1, df
gc.collect()
df1 = pd.DataFrame()
df = pd.DataFrame()
### reading and writing data

import pandas as pd

df1 = pd.read_csv('bank_marketing.csv')

# Create a list of the new column labels: new_labels
new_labels = ['year', 'ID']

# Read in the file, specifying the header and names parameters: df2
df2 = pd.read_csv('bank_marketing.csv', header=0, names=new_labels)

# example with specified file path
df = pd.read_csv('C:/Users/Documents/Tools and References/Data/bank_marketing.csv')

# Save the cleaned up DataFrame to a CSV file without the index
# (file_clean is a path string defined elsewhere)
df2.to_csv(file_clean, index=False)

# import by including index_col
cars = pd.read_csv('cars.csv', index_col=0)

# find working directory
import os
os.getcwd()
print("Current Working Directory ", os.getcwd()) # check & print current directory
### export or 'pickle' a python data frame for future use

# change working directory
os.chdir("//Projects//Data")

# export as pickle file
df.to_pickle('df.pkl')

# remove the in-memory copy
import gc
del df
gc.collect()

# read file back in - un-pickle
df = pd.read_pickle('df.pkl')
### export to csv with a specified file path vs working directory
bank_mkt_scored.to_csv('test123.csv') # this writes to the working directory
path = 'C:\\Users\\mtb2901\\Documents\\Tools and References\\Data'
path2 = path + '\\test123.csv'
bank_mkt_scored.to_csv(path2) # this writes to the directory specified above

# write to a tab-delimited text file
df.to_csv('ProgramOutcome_DE_DB_2.txt', sep='\t', index=False)
### reading a txt file
import pandas as pd

# Assign filename: file
file = 'P:/Data/cohortfile.txt'

# Import file: df
df = pd.read_csv(file, sep='\t', encoding="ISO-8859-1")
#
# extracting zip files from the web
#

import requests
import zipfile
import io

url = 'https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip'
r = requests.get(url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall('//econometrics//data')
#### cleaning up temp files

# delete data frames to free up memory
del X
del X_score
del X_test
del X_train
### create a toy pandas data frame

# define a dictionary (note: the 'NaN' in WYF is a string; it gets coerced
# to a true missing value further below)
data = {'GARST': [150,140,145,137,141,145,149,153,157,161],
        'PIO': [160,150,146,138,142,146,150,154,158,162],
        'MYC': [137,148,151,139,143,120,115,136,130,129],
        'DEK': [150,149,145,140,144,148,152,156,160,164],
        'WYF': [110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT': [1,2,3,4,5,6,7,8,9,10],
        'BT': ['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR': ['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID': [1,2,3,4,5,6,7,8,9,10]
        }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
### inspecting your data
df.columns
list(df.columns)          # untruncated list
sorted(list(df2.columns)) # sorted list
df.head()
df.tail()
df.info()
df.shape # attribute, not a method
df.index
### misc things you can do to manipulate values and fields
df['ID'] = df['ID'].astype(str)     # convert ID to string
df['PLOT'] = df['PLOT'].astype(str) # convert plot to string
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce', downcast='integer')

### example working with data types
df['xvar'] = df['GARST']
df['xvar'] = df['xvar'].astype(str) # convert to string
# use pandas to convert back to numeric
df['xvar'] = pd.to_numeric(df['xvar'], errors='coerce')

# pad leading zeros
df['GARST0'] = df['GARST'].apply(lambda x: '{:0>6}'.format(x))
df.head()

# create unique ID for each line in the data frame
df['ID2'] = list(range(1, 11))

# Print out type of GARST
print(type(df['GARST']))

# Print out length of GARST
print(len(df['GARST']))

# drop variables (drop returns a copy, so assign the result)
df = df.drop(['xvar', 'ID2'], axis=1)

# create dummy variables (example; assumes ORIGIN and DEST columns from another dataset)
df = pd.get_dummies(df, columns=['ORIGIN', 'DEST'])
df.head()
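# a runnable variant of the same idea on the toy frame above, using its own
# categorical columns (BT and RR); assigned to a new name so df is preserved
df_dummies = pd.get_dummies(df, columns=['BT', 'RR'])
df_dummies.head() # check: indicator columns like BT_N, BT_Y, RR_N, RR_Y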
#----------------------------------
# working with duplicated data
#----------------------------------

data = {'WT': [150,148,145,200,198,196,191,175,175,161],
        'HT': [72,72,72,68,68,68,68,69,69,71],
        'EX': ['Y','Y','Y','Y','Y','Y','Y','N','N','N'],
        'DIET': ['N','N','N','Y','Y','Y','Y','N','N','N'],
        'PLAN': ['A','A','B','A','A','B','B','A','A','B'],
        'ID': [1,1,1,2,2,2,2,3,3,4]
        }

# convert to a data frame (the columns list keeps a subset; HT is dropped)
tmp = pd.DataFrame(data, columns=['ID','WT','EX','DIET','PLAN'])
print(tmp)

# get unique members
tmp2 = tmp.drop_duplicates(['ID'])
print(tmp2)

# get unique id and plan combinations
tmp3 = tmp.drop_duplicates(['ID','PLAN'])
print(tmp3)

# identify duplicates (example with a data frame df1)
tmp = df1.groupby(['ID']).size().reset_index(name='count') # count duplicates
tmp = tmp.sort_values(['count'], ascending=[False])         # sort
print(tmp) # check
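# the same pattern is runnable on the toy data built above; a minimal sketch
# counting plans per ID from the unique id/plan frame tmp3:
dup_counts = tmp3.groupby(['ID']).size().reset_index(name='count')
dup_counts = dup_counts.sort_values(['count'], ascending=[False])
print(dup_counts) # IDs 1 and 2 carry two plans each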
#-----------------------------------
# high level summary stats
#-----------------------------------

df.describe() # summary stats for all numerical columns

# example summary stats with specified percentiles for variable yields
perc = [.25, .50, .75, .90, .95, .99]
df1.yields.describe(percentiles=perc)

# Print the value counts for BT and RR
print(df['BT'].value_counts(dropna=False))
print(df['RR'].value_counts(dropna=False))

# sort by GARST yield
df = df.sort_values('GARST', ascending=False)
print(df)
df = df.sort_values('GARST')
print(df)

# sort ascending by trait and descending by GARST yield
df = df.sort_values(['BT', 'GARST'], ascending=[True, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, True])
print(df)

# sort by index to restore original order
df = df.sort_index()
print(df)
#------------------------------------
# subsetting and basic data frame manipulation
#------------------------------------

# Print out GARST column as Pandas Series
print(df['GARST'])

# Print out GARST column as Pandas DataFrame (use double brackets)
print(df[['GARST']])

# Print out DataFrame with GARST and PLOT columns
print(df[['GARST','PLOT']])

# subset data via variable selection
my_hybrids = df[['GARST','PIO']]
my_hybrids.head() # check
my_hybrids.info() # check

### example using .loc
# Create the list of column labels: cols and get rows
cols = ['GARST','PIO','PLOT']
rows = df.index

# Create the new DataFrame
my_hybrids = df.loc[rows, cols]
print(my_hybrids)

# get only the first three rows
cols = ['GARST','PIO','PLOT']
rows = [0, 1, 2]
my_hybrids = df.loc[rows, cols]
print(my_hybrids)

### subset / subquery similar to the %in% operator in R (example)
dups = df1[df1['ID'].isin(['99999991',
                           '99999992',
                           '99999993',
                           '99999994'])]

# 'not' in (example)
df2 = df1[~df1['ID'].isin(['99999991',
                           '99999992',
                           '99999993',
                           '99999994'])]

### filtering based on logical conditions
# numpy logical operators:
# np.logical_and(), np.logical_or() and np.logical_not()

# subset data based on observed values
import numpy as np

# define logical condition
hi = np.logical_and(df['GARST'] > 150, df['PIO'] > 150)
hi_yields = df[hi] # subset df based on condition
hi_yields.head()   # check

# define logical condition
stack = np.logical_and(df['BT'] == "Y", df['RR'] == "Y")
stacked_traits = df[stack] # subset based on condition
stacked_traits.head()      # check
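# np.logical_or() and np.logical_not() follow the same pattern; a quick
# sketch on the toy frame:
any_trait = np.logical_or(df['BT'] == "Y", df['RR'] == "Y") # either trait
conventional = df[np.logical_not(any_trait)]                # neither trait
conventional.head() # check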
# we don't have to use numpy
stack = (df['BT'] == "Y") & (df['RR'] == "Y")
stacked_traits = df[stack]
print(stacked_traits)

# or similarly
mask = (tmp['year'] == 2016)
tmp = tmp[mask]

# Create the boolean array: hi
hi = df['GARST'] > 150

# Filter with array: hi
hi_garst = df.loc[hi]
print(hi_garst)
#------------------------------------------
# if else logic
#------------------------------------------

def traits(BT):
    # return gmo vs non-gmo trait
    if BT == "Y":
        return "bt"
    else:
        return "non-bt"

df['trait'] = df.BT.apply(traits)
print(df)

# you can do this with a lambda function
df['trait'] = df['BT'].apply(lambda BT: 'bt' if BT == 'Y' else 'non-bt')
print(df)
### create categories based on value ranges
conditions = [
    (df['GARST'] < 140),
    (df['GARST'] >= 140) & (df['GARST'] < 150),
    (df['GARST'] >= 150)]
choices = ['low', 'med', 'high']
df['lvl'] = np.select(conditions, choices, default='na')
print(df)

# another example
conditions = [
    (df['RR'] == "Y"),
    (df['BT'] == "Y")]
choices = ['gmo', 'gmo']
df['gmo'] = np.select(conditions, choices, default='non-gmo')

# compact version for two levels
df['trait'] = np.where(df['BT'] == 'Y', 'bt', 'non-bt')
print(df)

# create a binary flag
df['flag'] = np.where(df['gmo'] == 'gmo', 1, 0)
print(df)
df.columns
### example from DataCamp - creating custom segments

# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 10)):
        return 'Middle'
    else:
        return 'Low'

# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)
#-------------------------------------
# merging data
#------------------------------------

# inner
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'], how='inner')
# left
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'], how='left')
# right
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'], how='right')
# outer
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'], how='outer')

# other examples
df3 = df1.merge(df2[['ID','Date','Country']], on=['ID','Date'], how='left')
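# bronze/gold above come from a DataCamp exercise; a minimal runnable sketch
# of the join types with hypothetical toy frames keyed on ID:
left = pd.DataFrame({'ID': [1, 2, 3], 'yield': [150, 140, 145]})
right = pd.DataFrame({'ID': [2, 3, 4], 'trait': ['bt', 'rr', 'bt']})
print(pd.merge(left, right, on='ID', how='inner')) # IDs 2 and 3 only
print(pd.merge(left, right, on='ID', how='left'))  # all left IDs; trait is NaN for ID 1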
#------------------------------------
# strings
#------------------------------------

### string methods
# ex
variety = "garst8590bt"
print(type(variety))

# convert to upper case
variety_up = variety.upper()
print(variety)
print(variety_up)

# Print out the number of t's in variety
print(variety.count('t'))

# strip white space from column names
df.columns = df.columns.str.strip()

string = "freeCodeCamp"
print(string[0:5])

# substring operations
data = {'HYBRID': ['P1324RR','P1498RR','P1365BT','DKC2988RR','DKC4195BT'],
        'YIELD': [160,150,146,138,142],
        'ID': [1,2,3,4,5]
        }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','HYBRID','YIELD'])
print(df)

df['HYBRID_CD'] = df.HYBRID.str.slice(0, 1) # this gets the first character
df['HYBRID_CD2'] = df.HYBRID.str[:1]        # this gets the first character
df['HYBRID_CD2'] = df.HYBRID.str[1:]        # this skips the first character
df['TRAIT'] = df['HYBRID'].str[-2:]         # this gets the last 2 characters
#------------------------------------
# loops
#-----------------------------------

### loop over a list
# ex
garst = [150, 140, 145, 137, 141]
for yields in garst:
    print(yields)

### basic while loop
x = 1
while x < 4:
    print(x)
    x = x + 1

### loop over a data frame
# ex: create new string column for BT that is lower case
for lab, row in df.iterrows():
    df.loc[lab, "bt"] = row["BT"].lower()
df.head() # check

# this can similarly be accomplished via 'apply' with a string function
df["rr"] = df["RR"].apply(str.lower)
df.head() # check

# ex: create new column giving amount GARST yields over (or under) 150
for lab, row in df.iterrows():
    df.loc[lab, "amt_over_150"] = (row["GARST"] - 150)
df.head() # check
#---------------------------------------------
# functions
#---------------------------------------------

# example of function syntax
def fun(a, b):
    """State what the function does here"""
    # Computation performed here
    return x, y

# define function
def square(num):
    new_value = num ** 2
    return new_value

square(5) # call function

### basic function to calculate amount over 150 for a given hybrid
def diff(df, hybrid):
    """calculates difference in yield from 150"""
    for lab, row in df.iterrows():
        df.loc[lab, "amt_over_150"] = (row[hybrid] - 150)

diff(df, "PIO") # call function
df.head() # check

### function that finds the yield advantage for a variety
def elite(df, var1):
    max_yield = max(df[var1])
    yield_advantage = max_yield - 150
    return yield_advantage

elite(df, "GARST") # for garst
elite(df, "PIO")   # for pio
elite(df, "DEK")   # for dekalb
elite(df, "MYC")   # for myc

# make the function more general and use apply
def elite(x):
    max_yield = max(x)
    yield_advantage = max_yield - 150
    return yield_advantage

df_elites = df[['GARST','PIO','DEK','MYC']].apply(elite)
print(df_elites)

# application: determine which hybrid has
# at least a 10 bu yield advantage
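# a minimal sketch for the application above: df_elites is a Series of each
# hybrid's max yield minus 150, so a boolean filter answers the question
print(df_elites[df_elites >= 10]) # hybrids with at least a 10 bu advantage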
#---------------------------------------------
# transposing or reshaping data
#---------------------------------------------

# use melt to convert columns into rows
df2 = pd.melt(df, id_vars=['ID','BT','RR'], var_name='hybrid', value_name='yield')
print(df2) # check
df2['yield'] = df2['yield'].astype(int) # fix loss of format

# pivot this data back into tidy form
df3 = df2.pivot_table(index=['ID','BT','RR'], columns='hybrid', values='yield')
df3.head() # check

# convert indices back to columns (reset_index returns a copy, so assign it)
df3 = df3.reset_index(level=['ID','BT','RR'])
df3.reset_index() # this would have worked too
df3.info()

### example with toy panel data

# create toy panel (long) data
data = {'id': [1,1,1,2,2,2,3,3,3],
        'measure': ["depth","temp","width","depth","temp","width","depth","temp","width"],
        'values': [2,50,18,1.5,53,18,2.5,60,18],
        }

# convert to a data frame
tmp = pd.DataFrame(data, columns=['id','measure','values'])
print(tmp)

# pivot this data into tidy (wide) form
df1 = tmp.pivot_table(index=['id'], columns='measure', values='values')

# convert indices back to columns
df1 = df1.reset_index()
print(df1) # check

# use melt to convert columns back into rows (panel or long form)
df2 = pd.melt(df1, id_vars=['id'], var_name='measure', value_name='values')
print(df2)
#--------------------------------------
# group by operations
#--------------------------------------

df.groupby('BT')['GARST'].mean() # average GARST yield by BT trait

# check this the hard way:
garst = df[['GARST','BT']] # double brackets give a data frame vs a series
print(garst)
bt = garst['BT'] == "Y"
print(garst.loc[bt])  # check
print(garst.loc[~bt]) # check
garst['GARST'].loc[bt].mean()  # average for BT
garst['GARST'].loc[~bt].mean() # average for non-BT

# group by a list of hybrids of interest
hybrids = ['GARST','PIO','MYC']
df.groupby('BT')[hybrids].mean()

# overall means by BT trait
df.groupby('BT').mean()

# check the hard way
df.loc[df['BT'] == 'Y'].mean() # mean for BT overall
df.loc[df['BT'] == 'N'].mean() # mean for non-BT overall

# create a new data frame aggregating by a variable (example)
tmp2 = tmp.groupby('ID')[['purchases']].mean().reset_index()
### aggregations using the agg function
# see: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html

import numpy as np

# create toy data (note how this leverages numpy to create NaN values)
df = pd.DataFrame([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9],
                   [np.nan, np.nan, np.nan]],
                  columns=['A', 'B', 'C'])

# ex: sum and min for each column
df.agg(['sum', 'min'])

# different aggregations for each specified column
df.agg({'A': ['sum', 'min'], 'B': ['min', 'max']})

# aggregate over columns
df.agg("mean", axis="columns")

# use agg to create new variables, including a custom function
# (the custom lambda must return a scalar; with one function per column the
# result is a Series, so rename its index entries rather than columns)
data_mart = df.agg({
    'A': lambda x: x.max() - x.min(),
    'B': 'count',
    'C': 'mean',
})

# Rename the entries
data_mart = data_mart.rename({'A': 'MaxDiff',
                              'B': 'Frequency',
                              'C': 'Mean'})
# apply to panel data

# create toy panel (long) data
data = {'id': [1,1,1,2,2,2,3,3,3],
        'measure': ["depth","temp","width","depth","temp","width","depth","temp","width"],
        'values': [2,50,18,1.5,53,18,2.5,60,18],
        }

# convert to a data frame
df = pd.DataFrame(data, columns=['id','measure','values'])
print(df)

# sum total values by id
data_mart = df.groupby(['id']).agg({'values': 'sum'})

#-----------------------------------------
# cross tabs
#----------------------------------------

# for more info: https://pbpython.com/pandas-crosstab.html
pd.crosstab(dat.BT, dat.RR)
pd.crosstab(df1.Gender_Code, df1.treat, normalize=True)        # % in each combination
pd.crosstab(df1.treat, df1.Diag_Grouping, normalize='index')   # normalize='index' gives % row wise
pd.crosstab(df1.Diag_Grouping, df1.treat, normalize='columns') # easier to read columnwise
# see also groupby
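# the same table can be built with groupby; a minimal sketch assuming a frame
# with categorical BT and RR columns (like the toy trait frame earlier):
df.groupby(['BT', 'RR']).size().unstack(fill_value=0) # counts by BT x RR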
#--------------------------------------
# missing data
#--------------------------------------

df.isnull().sum(axis=0) # count of missing values by variable in dataframe
df1.isnull().sum()      # total missing per column
df[df.isnull().values.any(axis=1)].head() # view rows that contain missing values
tmp = df.isnull().sum(axis=0) # check missing values
df.dropna(subset=['ZipCode','Income'], how='all', inplace=True) # drop rows where both ZipCode and Income are missing

# drop missing values
wyf = df[["WYF"]]
wyf2 = wyf.dropna(axis=0, how='any')

# how pandas handles missing data in summary stats (NaNs are skipped by default)
wyf.mean()
wyf2.mean()
wyf.std()
wyf2.std()
del wyf
del wyf2

# Replace the NaN price values with 0
purchase_data.price = np.where(np.isnan(purchase_data.price), 0, purchase_data.price)
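# purchase_data comes from a DataCamp exercise; the same pattern works on the
# toy frame, assuming WYF was coerced to numeric earlier (so it holds a real
# NaN) - isnull() is safer than np.isnan for columns that may not be float
df['WYF0'] = np.where(df['WYF'].isnull(), 0, df['WYF'])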
# replace missing categorical values
conditions = [(tmp['RiskScore'].isnull())]
choices = ['Missing']
tmp['HasRiskScore'] = np.select(conditions, choices, default='Yes')
print(tmp['HasRiskScore'].value_counts(dropna=False)) # check values

### replace missing and None string values

# create data with blank and None string values
data = {'GARST': [150,140,145,137,141,145,149,153,157,161],
        'PIO': [160,150,146,138,142,146,150,154,158,162],
        'MYC': [137,148,151,139,143,120,115,136,130,129],
        'DEK': [150,149,145,140,144,148,152,156,160,164],
        'WYF': [110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT': [1,2,3,4,5,6,7,8,9,10],
        'BT': ['Y','Y',None,'N','N','N','Y','N','Y',''],
        'RR': ['Y','N','Y','N','N','N','N','Y',None,'N'],
        'ID': [1,2,3,4,5,6,7,8,9,10]
        }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)

# preview data
df.head(10)

# replace None with 'Missing'
df = df.replace({None: 'Missing'})

# replace blank with 'Missing'
df = df.replace({'': 'Missing'})
#-------------------------------------
# imputation
#-------------------------------------

# Write a function that imputes the median
def impute_median(series):
    return series.fillna(series.median())

df.WYF.median() # what is the median

# Impute median
df['WYFimp'] = df.WYF.transform(impute_median)
print(df) # check
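# the same imputation can be done inline with fillna (mean shown for
# contrast), again assuming WYF has been coerced to numeric
df['WYFimp2'] = df['WYF'].fillna(df['WYF'].mean())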
#--------------------------------------
# basic descriptives
#--------------------------------------

print(df['RR'].value_counts(dropna=False))
df.GARST.mean()
df.GARST.std()
df.GARST.var()

#-------------------------------------
# covariance matrix operations
#-------------------------------------

covariance_matrix = np.cov(df.GARST, df.PIO)

# Print covariance matrix
print(covariance_matrix)

# Extract covariance of garst and pioneer
garst_pio = covariance_matrix[0, 1]
print(garst_pio)

# same as above (the matrix is symmetric)
garst_pio = covariance_matrix[1, 0]
print(garst_pio)

# variance of pioneer
pio_var = covariance_matrix[1, 1]
print(pio_var)

# variance of garst
garst_var = covariance_matrix[0, 0]
print(garst_var)
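# the entries above also give the correlation - a worked check derived from
# the covariance matrix itself: cov(x, y) / sqrt(var(x) * var(y))
corr = garst_pio / np.sqrt(garst_var * pio_var)
print(corr) # should match np.corrcoef(df.GARST, df.PIO)[0, 1]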
#-------------------------------------
# data visualization
#------------------------------------

# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### boxplot
# Create the boxplot
df.boxplot(column='GARST')
df.boxplot(column='GARST', by='RR', rot=90)

# Display the plot
plt.show()

### Box-and-whisker plot with seaborn
# Create box plot with Seaborn's default settings
_ = sns.boxplot(x='RR', y='GARST', data=df)

# Label the axes
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')

# Show the plot
plt.show()

### countplot / bar plot
sns.countplot(x="total_flags", data=df2) # example from another dataset
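# a runnable variant on the toy trait frame: count rows per BT level
sns.countplot(x='BT', data=df)
plt.show()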
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant', 'rabbit', 'giraffe', 'coyote', 'horse']
df = pd.DataFrame({'speed': speed, 'lifespan': lifespan}, index=index)
print(df)
ax = df.plot.bar(rot=0)
ax = df.plot.bar(y='speed', rot=0) # single column

### histogram
# Plot histogram of GARST yields
_ = plt.hist(df.GARST)

# Label axes
_ = plt.xlabel('yield')
_ = plt.ylabel('count')

# Show histogram
plt.show()

### histogram with pandas
# Plot the PDF (density=True replaces the deprecated normed=True)
df.GARST.plot(kind='hist', density=True, bins=30, range=(100, 200))
plt.show()

# Plot the CDF
df.GARST.plot(kind='hist', density=True, cumulative=True, bins=30, range=(100, 200))
plt.show()
### bee swarm plot
_ = sns.swarmplot(x='RR', y='GARST', data=df)
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')
plt.show()

### ECDF
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y
    y = np.arange(1, n + 1) / n
    return x, y
### Plotting the ECDF
# Compute ECDF for GARST yields: x_val, y_val
x_val, y_val = ecdf(df.GARST)

# Generate plot
_ = plt.plot(x_val, y_val, marker='.', linestyle='none')

# Label the axes
_ = plt.xlabel('yield')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()

### Computing percentiles
# Specify array of percentiles: percentiles
percentiles = [2.5, 25, 50, 75, 97.5]

# Compute percentiles: ptiles
ptiles = np.percentile(df.GARST, percentiles)

# Print the result
print(ptiles)

### scatter plot
_ = plt.plot(df.GARST, df.PIO, marker='.', linestyle='none')

# Label the axes
_ = plt.xlabel('garst')
_ = plt.ylabel('pioneer')

# Show the result
plt.show()
#----------------------------------------
# time series
#----------------------------------------

### create data frame with date time index
data = {'date': ['2014-05-01 18:00:00', '2014-05-01 18:30:00', '2014-05-02 17:00:00', '2014-05-02 16:00:00', '2014-05-02 15:30:00', '2014-05-02 14:00:00', '2014-05-03 13:00:00', '2014-05-03 18:00:00', '2014-04-30 15:00:00', '2014-04-30 18:00:00'],
        'aphids': [15, 25, 26, 12, 17, 14, 26, 32, 48, 41]}
df = pd.DataFrame(data, columns=['date', 'aphids'])
print(df)
df.info()

# Convert df['date'] from string to datetime
# Prepare a format string that matches the data, including seconds: time_format
time_format = '%Y-%m-%d %H:%M:%S'

# Convert the date column into datetime objects
df['date'] = pd.to_datetime(df['date'], format=time_format)
df.info()

# function for parsing calendar date
import datetime as dt
def get_day(x): return dt.datetime(x.year, x.month, x.day)

# Create day column
df['day'] = df['date'].apply(get_day)
df.head()

# Set df['date'] as the index and delete the column
df.index = df['date']
del df['date']
df        # check
df.info() # check

# sort by the datetime index (partial-string slicing needs a monotonic index)
df = df.sort_index()

### date operations
# Extract the hours from 2pm to 4pm on '2014-05-02': ts1
ts1 = df.loc['2014-05-02 14:00:00':'2014-05-02 16:00:00']

# Extract '2014-05-02': ts2
ts2 = df.loc['2014-05-02']

# Extract data from '2014-05-03' to '2014-05-05': ts3
ts3 = df.loc['2014-05-03':'2014-05-05']

# Downsample to get totals within 2-hour windows
df1 = df['aphids'].resample('2h').sum()
print(df)  # compare
print(df1) # check

# Downsample to get daily total aphid counts
df1 = df['aphids'].resample('D').sum()

# get daily high counts
daily_highs = df['aphids'].resample('D').max()

# get counts for April
april = df['aphids']['2014-Apr']
print(april)
# examples from DataCamp Customer Analytics and A/B Testing

# Provide the correct format for the date Saturday January 27, 2017
date_data_one = pd.to_datetime(date_data_one, format="%A %B %d, %Y")
print(date_data_one)

# Provide the correct format for the date 2017-08-01
date_data_two = pd.to_datetime(date_data_two, format="%Y-%m-%d")
print(date_data_two)

# Provide the correct format for the date 08/17/1978
date_data_three = pd.to_datetime(date_data_three, format='%m/%d/%Y')
print(date_data_three)

# Provide the correct format for the date 2016 March 01 01:56
date_data_four = pd.to_datetime(date_data_four, format="%Y %B %d %H:%M")
print(date_data_four)

# great reference: https://www.dataquest.io/blog/python-datetime-tutorial/

# hacky way to deal with dirty stringy dates
df3['END_DT'] = df3['END_DT'].str.slice(stop=10)
df3['END_DT'] = pd.to_datetime(df3.END_DT, format='%Y/%m/%d')
print(df3.END_DT.head())

# calculate 12 months pre and post dates based on an index date
df_chrt['date_post'] = df_chrt['INDEX_DT'] + pd.DateOffset(months=12)
df_chrt['date_pre'] = df_chrt['INDEX_DT'] - pd.DateOffset(months=12)
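# df_chrt/INDEX_DT come from a project file; a minimal runnable sketch of the
# same offset pattern on a hypothetical two-row cohort:
df_chrt = pd.DataFrame({'INDEX_DT': pd.to_datetime(['2014-05-01', '2015-01-15'])})
df_chrt['date_post'] = df_chrt['INDEX_DT'] + pd.DateOffset(months=12)
df_chrt['date_pre'] = df_chrt['INDEX_DT'] - pd.DateOffset(months=12)
print(df_chrt)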
#-------------------------------------
# random numbers
#-------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### random uniform
# Seed the random number generator
np.random.seed(42)

# Initialize random numbers: random_numbers
random_numbers = np.empty(100000)

# Generate random numbers by looping over range(100000)
for i in range(100000):
    random_numbers[i] = np.random.random()

# Plot a histogram
_ = plt.hist(random_numbers)

# Show the plot
plt.show()

# a loop isn't actually necessary - a vectorized draw does the same thing
rs = np.random.random(10000)
plt.hist(rs)

# both approaches create an array containing the random values
type(rs)
type(random_numbers)
#### poisson
samples_poisson = np.random.poisson(3, size=10000)

# Print the mean and standard deviation
print('Poisson: ', np.mean(samples_poisson),
      np.std(samples_poisson))
plt.hist(samples_poisson)

### normal distribution
normdist = np.random.normal(20, 1, size=100000)

# pdf - histogram (density=True replaces the deprecated normed=True)
plt.hist(normdist, bins=100, density=True, histtype='step')

# normal cdf
x1, y1 = ecdf(normdist)
_ = plt.plot(x1, y1, marker='.', linestyle='none')
_ = plt.legend(['normal cdf'], loc='lower right') # pass labels as a list
plt.show()
### compare GARST to a normal distribution
mean = np.mean(df.GARST) # get empirical mean
std = np.std(df.GARST)   # get empirical std deviation
rnorm = np.random.normal(mean, std, size=1000) # simulate normal data
x, y = ecdf(df.GARST)        # empirical distribution
x_norm, y_norm = ecdf(rnorm) # normal distribution
sns.set() # apply seaborn templates
_ = plt.plot(x_norm, y_norm) # plot normally distributed data points
_ = plt.plot(x, y, marker='.', linestyle='none') # plot empirical data
_ = plt.xlabel('yield')
_ = plt.ylabel('CDF')
plt.show()

#------------------------------------------------------
# simulation
#------------------------------------------------------

df1['prob'] = np.random.uniform(0, 1, df1.shape[0]) # add a random uniformly distributed column
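# one possible use of the uniform draw above - a hedged sketch of random
# 50/50 assignment to a hypothetical treatment flag:
df1['treat'] = (df1['prob'] < 0.5).astype(int)
print(df1['treat'].value_counts()) # roughly balanced groups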