# *-----------------------------------------------------------------
# | PROGRAM NAME: py code pallette.py
# | DATE: 7/17/20 (original: 12/28/18)
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: examples of data munging and analysis in python
# *----------------------------------------------------------------
# see also: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_r.html
###############################################################################
###############################################################################
################# under construction ############################
###############################################################################
###############################################################################
#----------------------------------
# basic data management
# ---------------------------------
import pandas as pd
pd.set_option('display.max_rows', None) # to get untruncated output (see: https://thispointer.com/python-pandas-how-to-display-full-dataframe-i-e-print-all-rows-columns-without-truncation/)
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation
# list all data frames
sheets = []
for var in dir():
    if isinstance(locals()[var], pd.core.frame.DataFrame) and var[0] != '_':
        sheets.append(var)
print(sheets)
# delete and release a data frame from memory
import gc
del df
gc.collect()
df = pd.DataFrame() # re-bind the name to an empty frame if it will be reused
# multiple dfs
del df1, df
gc.collect()
df1 = pd.DataFrame()
df = pd.DataFrame()
### reading and writing data
import pandas as pd
df1 = pd.read_csv('bank_marketing.csv')
# Create a list of the new column labels: new_labels
new_labels = ['year','ID']
# Read in the file, specifying the header and names parameters: df2
df2 = pd.read_csv('bank_marketing.csv', header=0, names=new_labels)
# example with specified file path
df = pd.read_csv('C:/Users/Documents/Tools and References/Data/bank_marketing.csv')
# Save the cleaned up DataFrame to a CSV file without the index
# (file_clean is a path string defined elsewhere)
df2.to_csv(file_clean, index=False)
# import by including index_col
cars = pd.read_csv('cars.csv', index_col = 0)
# find working directory
import os
os.getcwd()
print("Current Working Directory " , os.getcwd()) # check & print current directory
### export or 'pickle' python data frame
# export data frame for future
# change working directory
os.chdir("//Projects//Data")
# export as pickle file
df.to_pickle('df.pkl')
# remove the in-memory copy before re-loading
import gc
del df
gc.collect()
# read file back in - un-pickle
df = pd.read_pickle('df.pkl')
### export to csv with a specified file path vs working directory
bank_mkt_scored.to_csv('test123.csv') # this will write to working directory
path='C:\\Users\\mtb2901\\Documents\\Tools and References\\Data'
path2 = path + '\\test123.csv'
bank_mkt_scored.to_csv(path2) # this writes to directory above
# write to text file
df.to_csv('ProgramOutcome_DE_DB_2.txt', sep='\t', index=False)
### reading a txt file
import pandas as pd
# Assign filename: file
file = 'P:/Data/cohortfile.txt'
# Import file: data
df = pd.read_csv(file, sep='\t',encoding = "ISO-8859-1")
# list all data frames
sheets = []
for var in dir():
    if isinstance(locals()[var], pd.core.frame.DataFrame) and var[0] != '_':
        sheets.append(var)
print(sheets)
#
# extracting zip files from web
#
import requests
import zipfile
import io
url = 'https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip'
r = requests.get(url,stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall('//econometrics//data')
#### cleaning up temp files
# delete df to clear up memory
del X
del X_score
del X_test
del X_train
### create a toy pandas data frame
# define a dictionary
data = {'GARST' :[150,140,145,137,141,145,149,153,157,161],
'PIO':[160,150,146,138,142,146,150,154,158,162],
'MYC':[137,148,151,139,143,120,115,136,130,129],
'DEK':[150,149,145,140,144,148,152,156,160,164],
'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
'PLOT':[1,2,3,4,5,6,7,8,9,10],
'BT': ['Y','Y', 'N','N','N','N','Y','N','Y','Y'],
'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
'ID':[1,2,3,4,5,6,7,8,9,10]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
### inspecting your data
df.columns
list(df.columns) # untruncated list
sorted(list(df2.columns)) # sorted list
df.head()
df.tail()
df.info()
df.shape # note: shape is an attribute, not a method
df.index
### misc things you can do to manipulate values and fields
df['ID'] = df['ID'].astype(str) # convert ID to string
df['PLOT'] = df['PLOT'].astype(str) # convert plot to string
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce', downcast ='integer')
### example working with data types
df['xvar'] = df['GARST']
df['xvar'] = df['xvar'].astype(str) # convert to string
# use pandas to convert back to numeric
df['xvar'] = pd.to_numeric(df['xvar'], errors='coerce')
# pad leading zeros
df['GARST0'] = df['GARST'].apply(lambda x:'{:0>6}'.format(x))
df.head()
# create unique ID for each line in data frame
df['ID2'] = list(range(1,11))
# Print out type of GARST
print(type(df['GARST']))
# Print out length of var1
print(len(df['GARST']))
# drop variables (assign the result; drop is not in place by default)
df = df.drop(['xvar', 'ID2'], axis=1)
# create dummy variables (example; ORIGIN and DEST are columns from a different dataset)
df = pd.get_dummies(df, columns=['ORIGIN', 'DEST'])
df.head()
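# a runnable version of the same idea against the toy data frame above,
# encoding the BT and RR trait columns (a sketch; output columns like
# BT_Y follow the pandas prefix convention)
df_dummies = pd.get_dummies(df, columns=['BT', 'RR'])
df_dummies.head()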
#----------------------------------
# working with duplicated data
#----------------------------------
data = {'WT' :[150,148,145,200,198,196,191,175,175,161],
'HT':[72,72,72,68,68,68,68,69,69,71],
'EX': ['Y','Y','Y','Y','Y','Y','Y','N','N','N'],
'DIET':['N','N','N','Y','Y','Y','Y','N','N','N'],
'PLAN':['A','A','B','A','A','B','B','A','A','B'],
'ID':[1,1,1,2,2,2,2,3,3,4]
}
# convert to a data frame
tmp = pd.DataFrame(data,columns=['ID','WT','HT','EX','DIET','PLAN'])
print(tmp)
# get unique members
tmp2 = tmp.drop_duplicates(['ID'])
print(tmp2)
# get unique id and plan combinations
tmp3 = tmp.drop_duplicates(['ID','PLAN'])
print(tmp3)
# identify duplicates (example)
tmp = df1.groupby(['ID']).size().reset_index(name='count') # count duplicates
tmp = tmp.sort_values(['count'], ascending=[False]) # sort
print(tmp) # check
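# to inspect the duplicated rows themselves, duplicated() with keep=False
# marks every member of a duplicated ID group (a sketch using df1 above)
dup_rows = df1[df1.duplicated(['ID'], keep=False)]
print(dup_rows)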
#-----------------------------------
# high level summary stats
#-----------------------------------
df.describe() # summary stats for all numerical columns
# example summary stats with specified percentiles for variable yields
perc =[.25, .50, .75, .90,.95,.99]
df1.yields.describe(percentiles = perc)
# Print the value counts for BT and RR
print(df['BT'].value_counts(dropna=False))
print(df['RR'].value_counts(dropna=False))
# sort by GARST yield
df= df.sort_values('GARST', ascending=False)
print(df)
df= df.sort_values('GARST')
print(df)
# sort ascending by trait and by descending GARST yield_data
df = df.sort_values(['BT', 'GARST'], ascending=[True, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, True])
print(df)
# sort by index to restore original order
df = df.sort_index()
print(df)
#------------------------------------
# subsetting and basic data frame manipulation
#------------------------------------
# Print out GARST column as Pandas Series
print(df['GARST'])
# Print out GARST column as Pandas DataFrame (use double brackets)
print(df[['GARST']])
# Print out DataFrame with GARST and PLOT columns
print(df[['GARST','PLOT']])
# subset data via variable selection
my_hybrids = df[['GARST','PIO']]
my_hybrids.head() # check
my_hybrids.info() # check
### example using .loc
# Create the list of column labels: cols and get rows
cols = ['GARST','PIO','PLOT']
rows = df.index
# Create the new DataFrame
my_hybrids= df.loc[rows,cols]
print(my_hybrids)
# get only the first three rows
cols = ['GARST','PIO','PLOT']
rows = [0,1,2]
my_hybrids= df.loc[rows,cols]
print(my_hybrids)
### subset / subquery similar to %in% operator in R (example)
dups = df1[df1['ID'].isin(['99999991',
                           '99999992',
                           '99999993',
                           '99999994'])]
# 'not' in (example)
df2 = df1[~df1['ID'].isin(['99999991',
                           '99999992',
                           '99999993',
                           '99999994'])]
### filtering based on logical conditions
# numpy logical operators
# np.logical_and(), np.logical_or() and np.logical_not()
# subset data based on observed values
import numpy as np
# define logical condition
hi = np.logical_and(df['GARST'] > 150, df['PIO'] > 150)
hi_yields = df[hi] # subset df based on condition
hi_yields.head() # check
# define logical condition
stack = np.logical_and(df['BT'] == "Y",df['RR'] == "Y")
stacked_traits = df[stack] # subset based on condition
stacked_traits.head() # check
# we don't have to use numpy
stack = (df['BT'] == "Y") & (df['RR'] == "Y")
stacked_traits = df[stack]
print(stacked_traits)
# or similarly
mask = (tmp['year'] == 2016)
tmp = tmp[mask]
# Create the boolean array: high_turnout
hi = df['GARST'] > 150
# Filter with array: hi
hi_garst = df.loc[hi]
print(hi_garst)
#------------------------------------------
# if else logic
#------------------------------------------
def traits(BT):
    # return gmo vs non-gmo trait
    if BT == "Y":
        return "bt"
    else:
        return "non-bt"

df['trait'] = df.BT.apply(traits)
print(df)
# you can do this with a lambda function
df['trait'] = df['BT'].apply(lambda BT: 'bt' if BT == 'Y' else 'non-bt')
print(df)
### create categories based on value ranges
conditions = [
(df['GARST'] < 140),
(df['GARST'] >= 140) & (df['GARST'] < 150),
(df['GARST'] >= 150)]
choices = ['low', 'med', 'high']
df['lvl'] = np.select(conditions, choices, default='na')
print(df)
# another example
conditions = [
(df['RR'] == "Y"),
(df['BT'] == "Y")]
choices = ['gmo', 'gmo']
df['gmo'] = np.select(conditions, choices, default='non-gmo')
# compact for two levels
df['trait'] = np.where(df['BT']=='Y', 'bt', 'non-bt')
print(df)
# create a binary flag
df['flag'] = np.where(df['gmo']=='gmo', 1,0)
print(df)
df.columns
### example from datacamp - creating custom segments
# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 10)):
        return 'Middle'
    else:
        return 'Low'
# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)
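# a vectorized alternative for range-based segments is pd.cut; the bin
# edges below are chosen to mirror the rfm_level logic above (a sketch)
import numpy as np
datamart['RFM_Level2'] = pd.cut(datamart['RFM_Score'],
                                bins=[-np.inf, 6, 10, np.inf],
                                labels=['Low', 'Middle', 'Top'],
                                right=False)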
#-------------------------------------
# merging data
#------------------------------------
# inner
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='inner')
# left
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='left')
# right
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='right')
# outer
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='outer')
# other examples
df3 = df1.merge(df2[['ID','Date','Country']], on = ['ID','Date'], how = 'left')
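# passing indicator=True adds a _merge column showing where each row
# matched, a quick way to audit a left join (a sketch using the same
# df1/df2 names as above)
chk = df1.merge(df2[['ID','Date','Country']], on=['ID','Date'], how='left', indicator=True)
print(chk['_merge'].value_counts())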
#------------------------------------
# strings
#------------------------------------
### string methods
# ex
variety = "garst8590bt"
print(type(variety))
# convert to uppercase
variety_up = variety.upper()
print(variety)
print(variety_up)
# Print out the number of t's in variety
print(variety.count('t'))
# strip white spaces
df.columns = df.columns.str.strip()
string = "freeCodeCamp"
print(string[0:5])
# substring operations
data = {'HYBRID' :['P1324RR','P1498RR','P1365BT','DKC2988RR','DKC4195BT'],
'YIELD':[160,150,146,138,142],
'ID':[1,2,3,4,5]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','HYBRID','YIELD'])
print(df)
df['HYBRID_CD'] = df.HYBRID.str.slice(0, 1) # this gets the first character
df['HYBRID_CD2'] = df.HYBRID.str[:1] # this gets the first character
df['HYBRID_CD3'] = df.HYBRID.str[1:] # this skips the first character
df['TRAIT'] = df['HYBRID'].str[-2:] # this gets the last 2 characters
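# str.extract with a regular expression can pull out the numeric portion
# of the hybrid code (a sketch; the pattern assumes digits sit between
# the brand prefix and the trait suffix)
df['HYBRID_NUM'] = df['HYBRID'].str.extract(r'(\d+)', expand=False)
print(df)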
#------------------------------------
# loops
#-----------------------------------
### loop over a list
# ex
garst= [150,140,145,137,141]
for yields in garst:
    print(yields)
### basic loop
x = 1
while x < 4:
    print(x)
    x = x + 1
### loop over data frame
# ex: create new string column for BT that is lower case
for lab, row in df.iterrows():
    df.loc[lab,"bt"] = row["BT"].lower()
df.head() # check
# this can similarly be accomplished via 'apply' with string function
df["rr"] = df["RR"].apply(str.lower)
df.head() # check
# ex create new column giving amount of GARST yielding > 150 (or under)
for lab, row in df.iterrows():
    df.loc[lab, "amt_over_150"] = (row["GARST"] - 150)
df.head() # check
#---------------------------------------------
# functions
#---------------------------------------------
# example of function syntax
def fun(a,b):
    """State what function does here"""
    # Computation performed here
    return x, y
# define function
def square(num):
    new_value = num ** 2
    return new_value
square(5) # call function
### basic function to calculate amount over 150 for a given hybrid
def diff(df,hybrid):
    """calculates difference in yield from 150"""
    for lab, row in df.iterrows():
        df.loc[lab, "amt_over_150"] = (row[hybrid] - 150)
diff(df,"PIO") # call function
df.head() # check
### function that finds the yield advantage for a variety
def elite(df,var1):
    max_yield = max(df[var1])
    yield_advantage = max_yield - 150
    return yield_advantage
elite(df,"GARST") # for garst
elite(df,"PIO") # for pio
elite(df,"DEK") # for dekalb
elite(df,"MYC") # for myc
# make the function more general and use apply
def elite(x):
    max_yield = max(x)
    yield_advantage = max_yield - 150
    return yield_advantage
df_elites = df[['GARST','PIO','DEK','MYC']].apply(elite)
print(df_elites)
# application: determine which hybrid has
# at least a 10 bu yield advantage
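# a minimal sketch of that application using the df_elites result above:
# keep only hybrids whose max yield beats 150 by at least 10 bu
print(df_elites[df_elites >= 10])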
#---------------------------------------------
# transposing or reshaping data
#---------------------------------------------
# use melt to convert columns into rows
df2 = pd.melt(df, id_vars=['ID','BT','RR'], var_name='hybrid', value_name='yield')
print(df2) # check
df2['yield'] = df2['yield'].astype(int) # fix loss of format
# pivot this data back into tidy form
df3 = df2.pivot_table(index = ['ID','BT','RR'],columns = 'hybrid',values= 'yield')
df3.head() # check
# convert indices back to columns
df3 = df3.reset_index(level=['ID','BT','RR'])
# a plain df3.reset_index() would have worked too
df3.info()
### example with toy panel data
# create toy panel (long) data
data = {'id' :[1,1,1,2,2,2,3,3,3],
'measure':["depth","temp","width","depth","temp","width","depth","temp","width"],
'values': [2,50,18,1.5,53,18,2.5,60,18],
}
# convert to a data frame
tmp = pd.DataFrame(data,columns=['id','measure','values'])
print(tmp)
# pivot this data back into tidy form (wide)
df1 = tmp.pivot_table(index = ['id'],columns = 'measure',values= 'values')
# convert indices back to columns
df1 = df1.reset_index()
print(df1) # check
# use melt to convert columns back into rows (panel or long)
df2 = pd.melt(df1, id_vars=['id'], var_name='measure', value_name='values')
print(df2)
#--------------------------------------
# group by operations
#--------------------------------------
df.groupby('BT')['GARST'].mean() # average yield for garst BT
# check this the hard way:
garst = df[['GARST','BT']] # double brackets gives a data frame vs series in python
print(garst)
bt = garst['BT'] == "Y"
print(garst.loc[bt]) # check
print(garst.loc[~bt]) # check
garst['GARST'].loc[bt].mean() # average for BT
garst['GARST'].loc[~bt].mean() # average for non-BT
# group by list of hybrids of interest
hybrids = ['GARST','PIO','MYC']
df.groupby('BT')[hybrids].mean()
# overall means by BT trait
df.groupby('BT').mean()
# check the hard way
df.loc[df['BT'] == 'Y'].mean() # just get mean for BT overall
df.loc[df['BT'] == 'N'].mean() # just get mean for non BT overall
# create new data frame aggregating by variable
tmp2 = tmp.groupby('ID')[['purchases']].mean().reset_index()
### aggregations using agg functon
# see: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html
import numpy as np
# create toy data (note how this leverages numpy to create NaN values)
df = pd.DataFrame([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[np.nan, np.nan, np.nan]],
columns=['A', 'B', 'C'])
# ex: sum and min for each column
df.agg(['sum', 'min'])
# different aggregations specific for each specified column
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
# aggregate over columns
df.agg("mean", axis="columns")
# use aggregate to create new variables and custom function
data_mart = df.agg({
'A': lambda x: ( x.max() - x),
'B': 'count',
'C': 'mean',
})
# Rename the columns
data_mart.rename(columns={'A': 'MaxDiff',
'B': 'Frequency',
'C': 'Mean'}, inplace=True)
# apply to panel data
# create toy panel (long) data
data = {'id' :[1,1,1,2,2,2,3,3,3],
'measure':["depth","temp","width","depth","temp","width","depth","temp","width"],
'values': [2,50,18,1.5,53,18,2.5,60,18],
}
# convert to a data frame
df = pd.DataFrame(data,columns=['id','measure','values'])
print(df)
# sum total values by id
data_mart = df.groupby(['id']).agg({'values': 'sum'})
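# named aggregation (pandas >= 0.25) gives readable multi-stat summaries;
# a sketch on the same toy panel data
summary = df.groupby('id')['values'].agg(total='sum', avg='mean', n='count')
print(summary)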
#-----------------------------------------
# cross tabs
#----------------------------------------
# for more info: https://pbpython.com/pandas-crosstab.html
pd.crosstab(dat.BT, dat.RR)
pd.crosstab(df1.Gender_Code,df1.treat, normalize=True) # gives % in each combination
pd.crosstab(df1.treat,df1.Diag_Grouping, normalize='index') # example: normalize gives % row wise
pd.crosstab(df1.Diag_Grouping,df1.treat, normalize='columns') # this is easier to read columnwise
# see also groupby
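# a groupby equivalent of a simple two-way count crosstab (a sketch,
# assuming the toy trait data frame df with BT and RR columns)
print(df.groupby(['BT','RR']).size().unstack(fill_value=0))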
#--------------------------------------
# missing data
#--------------------------------------
df.isnull().sum(axis=0) # count of missing values by variable in dataframe
df1.isnull().sum() # total missing per column
df[df.isnull().values.any(axis=1)].head() # check data where there are missing values
tmp = df.isnull().sum(axis=0) # check missing values
df.dropna(subset=['ZipCode','Income'], how='all', inplace=True) # drop rows where both ZipCode and Income are missing
# drop missing values
wyf = df[["WYF"]]
wyf2 = wyf.dropna(axis=0,how='any')
# how pandas handles missing data: NaN values are skipped by default
# in aggregations like mean and std, so these should agree
wyf.mean()
wyf2.mean()
wyf.std()
wyf2.std()
del wyf
del wyf2
# Replace the NaN price values with 0
purchase_data.price = np.where(np.isnan(purchase_data.price),0, purchase_data.price)
# replace missing categorical
conditions = [tmp['RiskScore'].isnull()]
choices = ['Missing']
tmp['HasRiskScore'] = np.select(conditions, choices, default='Yes')
print(tmp['HasRiskScore'].value_counts(dropna=False)) # check values
### replace missing and None string values
# create data with blank and None string values
data = {'GARST' :[150,140,145,137,141,145,149,153,157,161],
'PIO':[160,150,146,138,142,146,150,154,158,162],
'MYC':[137,148,151,139,143,120,115,136,130,129],
'DEK':[150,149,145,140,144,148,152,156,160,164],
'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
'PLOT':[1,2,3,4,5,6,7,8,9,10],
'BT': ['Y','Y', None,'N','N','N','Y','N','Y',''],
'RR':['Y','N','Y','N','N','N','N','Y',None,'N'],
'ID':[1,2,3,4,5,6,7,8,9,10]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
# preview data
df.head(10)
# replace None with 'Missing'
df = df.replace({None: 'Missing'})
# replace blank with missing
df = df.replace({'': 'Missing'})
#-------------------------------------
# imputation
#-------------------------------------
# Write a function that imputes median
def impute_median(series):
    return series.fillna(series.median())
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce') # the 'NaN' string becomes a true NaN
df.WYF.median() # what is the median
# Impute median
df['WYFimp'] = df.WYF.transform(impute_median)
print(df) # check
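# imputing within groups is a common variation; a sketch imputing the
# median WYF yield within each BT trait group (assumes df and the
# impute_median function defined above)
df['WYFimp2'] = df.groupby('BT')['WYF'].transform(impute_median)
print(df) # check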
#--------------------------------------
# basic descriptives
#--------------------------------------
print(df['RR'].value_counts(dropna=False))
df.GARST.mean()
df.GARST.std()
df.GARST.var()
#-------------------------------------
# covariance matrix operations
#-------------------------------------
covariance_matrix = np.cov(df.GARST,df.PIO)
# Print covariance matrix
print(covariance_matrix)
# Extract covariance of garst and pioneer
garst_pio= covariance_matrix[0,1]
print(garst_pio)
# same as above
garst_pio= covariance_matrix[1,0]
print(garst_pio)
# variance of pioneer
pio_var= covariance_matrix[1,1]
print(pio_var)
# variance of garst
garst_var= covariance_matrix[0,0]
print(garst_var)
#-------------------------------------
# data visualization
#------------------------------------
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
### boxplot
# Create the boxplot
df.boxplot(column = 'GARST')
df.boxplot(column='GARST', by='RR', rot=90)
# Display the plot
plt.show()
### Box-and-whisker plot with seaborn
# Create box plot with Seaborn's default settings
_ = sns.boxplot(x='RR', y='GARST', data=df)
# Label the axes
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')
# Show the plot
plt.show()
### countplot / bar plot
sns.countplot(x="total_flags", data=df2)
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant', 'rabbit', 'giraffe', 'coyote', 'horse']
df = pd.DataFrame({'speed': speed,'lifespan': lifespan}, index=index)
print(df)
ax = df.plot.bar(rot=0)
ax = df.plot.bar(y='speed', rot=0) # single column
### histogram
# Plot histogram of GARST yields
_ = plt.hist(df.GARST)
# Label axes
_ = plt.xlabel('yield')
_ = plt.ylabel('count')
# Show histogram
plt.show()
### histogram with pandas
# Plot the PDF (note: the old 'normed' argument was removed from matplotlib; use density=True)
df.GARST.plot(kind='hist', density=True, bins=30, range=(100,200))
plt.show()
# Plot the CDF
df.GARST.plot(kind='hist', density=True, cumulative=True, bins=30, range=(100,200))
plt.show()
### bee swarm plot
_ = sns.swarmplot(x='RR', y='GARST', data=df)
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')
plt.show()
### ECDF
def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: y
    y = np.arange(1, n+1) / n
    return x, y
### Plotting the ECDF
# Compute ECDF for GARST yields: x_val, y_val
x_val, y_val = ecdf(df.GARST)
# Generate plot
_ = plt.plot(x_val, y_val, marker='.',linestyle = 'none')
# Label the axes
_ = plt.xlabel('yield')
_ = plt.ylabel('ECDF')
# Display the plot
plt.show()
### Computing percentiles
# Specify array of percentiles: percentiles
percentiles = [2.5,25,50,75,97.5]
# Compute percentiles: ptiles_vers
ptiles = np.percentile(df.GARST, percentiles)
# Print the result
print(ptiles)
### scatter plot
_ = plt.plot(df.GARST, df.PIO, marker='.', linestyle='none')
# Label the axes
_ = plt.xlabel('garst')
_ = plt.ylabel('pioneer')
# Show the result
plt.show()
#----------------------------------------
# time series
#----------------------------------------
### create data frame with date time index
data = {'date': ['2014-05-01 18:00:00', '2014-05-01 18:30:00', '2014-05-02 17:00:00', '2014-05-02 16:00:00', '2014-05-02 15:30:00', '2014-05-02 14:00:00', '2014-05-03 13:00:00', '2014-05-03 18:00:00', '2014-04-30 15:00:00', '2014-04-30 18:00:00'],
'aphids': [15, 25, 26, 12, 17, 14, 26, 32, 48, 41]}
df = pd.DataFrame(data, columns = ['date', 'aphids'])
print(df)
df.info()
# Convert df['date'] from string to datetime
# Prepare a format string matching the data (note the seconds): time_format
time_format = '%Y-%m-%d %H:%M:%S'
# Convert date_list into a datetime object: my_datetimes
df['date'] = pd.to_datetime(df['date'], format=time_format)
df.info()
# function for parsing calendar date
import datetime as dt
def get_day(x): return dt.datetime(x.year, x.month, x.day)
# Create day column
df['day'] = df['date'].apply(get_day)
df.head()
# Set df['date'] as the index and delete the column
df.index = df['date']
del df['date']
df # check
df.info() # check
### date operations
# Extract the hour from 2pm to 4pm on '2014-05-02': ts1
ts1 = df.loc['2014-05-02 14:00:00':'2014-05-02 16:00:00']
# Extract '2014-05-02': ts2
ts2 = df.loc['2014-05-02']
# Extract data from '2014-05-03' to '2014-05-05': ts3
ts3 = df.loc['2014-05-03':'2014-05-05']
# Downsample to get total within 2 hours
df1 = df['aphids'].resample('2h').sum()
print(df) # compare
print(df1) # check
# Downsample to get daily total aphid counts
df1 = df['aphids'].resample('D').sum()
# get daily high counts
daily_highs = df['aphids'].resample('D').max()
# get counts for April via partial-string indexing on the DatetimeIndex
april = df['aphids']['2014-04']
print(april)
# examples from DataCamp Customer Analytics and A/B Testing
# Provide the correct format for the date Saturday January 27, 2017
date_data_one = pd.to_datetime(date_data_one, format="%A %B %d, %Y")
print(date_data_one)
# Provide the correct format for the date 2017-08-01
date_data_two = pd.to_datetime(date_data_two, format= "%Y-%m-%d")
print(date_data_two)
# Provide the correct format for the date 08/17/1978
date_data_three = pd.to_datetime(date_data_three, format='%m/%d/%Y')
print(date_data_three)
# Provide the correct format for the date 2016 March 01 01:56
date_data_four = pd.to_datetime(date_data_four, format="%Y %B %d %H:%M")
print(date_data_four)
# great reference: https://www.dataquest.io/blog/python-datetime-tutorial/
# hacky way to deal with dirty stringy dates
df3['END_DT'] = df3['END_DT'].str.slice(stop=10)
df3['END_DT'] = pd.to_datetime(df3.END_DT, format='%Y/%m/%d')
print(df3.END_DT.head())
# calculate 12 months pre and post dates based on index date
df_chrt['date_post'] = df_chrt['INDEX_DT'] + pd.DateOffset(months=12)
df_chrt['date_pre'] = df_chrt['INDEX_DT'] - pd.DateOffset(months=12)
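# a minimal sketch of using those bounds to window records around the
# index date; df_claims and SVC_DT are hypothetical names for a claims
# table keyed on the same ID
tmp = df_claims.merge(df_chrt[['ID','date_pre','date_post']], on='ID', how='inner')
in_window = tmp[(tmp['SVC_DT'] >= tmp['date_pre']) & (tmp['SVC_DT'] <= tmp['date_post'])]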
#-------------------------------------
# random numbers
#-------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
### random uniform
# Seed the random number generator
np.random.seed(42)
# Initialize random numbers: random_numbers
random_numbers = np.empty(100000)
# Generate random numbers by looping over range(100000)
for i in range(100000):
    random_numbers[i] = np.random.random()
# Plot a histogram
_ = plt.hist(random_numbers)
# Show the plot
plt.show()
# the loop is not actually necessary; the vectorized call is equivalent
rs = np.random.random(10000)
plt.hist(rs)
# both approaches create an array containing the random values
type(rs)
type(random_numbers)
### poisson
samples_poisson = np.random.poisson(3,size =10000)
# Print the mean and standard deviation
print('Poisson: ', np.mean(samples_poisson),
np.std(samples_poisson))
plt.hist(samples_poisson)
### normal distribution
normdist = np.random.normal(20, 1, size=100000)
# pdf - histogram
plt.hist(normdist, bins=100, density=True, histtype='step') # 'normed' was removed; use density
# normal cdf
x1, y1 = ecdf(normdist)
_ = plt.plot(x1,y1, marker='.',linestyle = 'none')
_ = plt.legend(['normal cdf'], loc='lower right') # pass labels as a list, not a bare string
plt.show()
### compare GARST to a normal distribution
mean = np.mean(df.GARST) # get empirical mean
std = np.std(df.GARST) # get empirical std deviation
rnorm = np.random.normal(mean, std, size=1000) # simulate normal data
x, y = ecdf(df.GARST) # empirical distribution
x_norm, y_norm = ecdf(rnorm) # normal distribution
sns.set() # apply seaborn templates
_ = plt.plot(x_norm, y_norm) # plot normally distributed data points
_ = plt.plot(x, y, marker='.', linestyle='none') # plot empirical data
_ = plt.xlabel('yield')
_ = plt.ylabel('CDF')
plt.show()
#------------------------------------------------------
# simulation
#------------------------------------------------------
df1['prob'] = np.random.uniform(0, 1, df1.shape[0]) # add random uniformly distributed value as a column
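# a minimal bootstrap sketch building on the random number tools above:
# resample GARST yields with replacement to approximate the sampling
# distribution of the mean (assumes the toy yield data frame df)
np.random.seed(123)
boot_means = [df['GARST'].sample(frac=1, replace=True).mean() for _ in range(1000)]
print(np.mean(boot_means), np.percentile(boot_means, [2.5, 97.5]))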