Skip to content

Instantly share code, notes, and snippets.

@domdfcoding
Created May 3, 2019 10:58
Show Gist options
  • Save domdfcoding/6b437829e2c3d6e01dcb3307a89d5ba3 to your computer and use it in GitHub Desktop.
Save domdfcoding/6b437829e2c3d6e01dcb3307a89d5ba3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Additional data analysis to accompany
# "Improving the Analysis of Organic Gunshot Residue with GunShotMatch"
#
# Copyright 2019 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
# Based on:
# http://cleverowl.uk/2015/07/01/using-one-way-anova-and-tukeys-test-to-compare-data-sets/
# https://www.psychometrica.de/effect_size.html#transform
# https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
# and Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates
import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison
def pooled_sd(sample1, sample2, weighted=False):
"""Formula from weighted=True for weighted pooled SD
Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
sd1 = np.std(sample1)
sd2 = np.std(sample2)
n1 = len(sample1)
n2 = len(sample2)
if weighted:
return np.sqrt( (((n1-1)*(sd1**2)) + ((n2-1)*(sd2**2))) / (n1+n2-2) )
else:
return np.sqrt( ((sd1**2) + (sd2**2)) / 2 )
def g_hedge(sample1, sample2):
"""Hedge's g statistic
Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
mean1 = np.mean(sample1)
mean2 = np.mean(sample2)
return (mean1-mean2)/pooled_sd(sample1, sample2, True)
def g_durlak_bias(g, n):
"""Application of Durlak's bias correction to the Hedge's g statistic.
Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
n = n1+n2"""
Durlak = ((n-3)/(n-2.25))*np.sqrt((n-2)/n)
return g*Durlak
def interpret_d(d_or_g):
"""Interpret Cohen's d or Hedge's g values using Table 1
from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/"""
if d_or_g < 0:
return f"{interpret_d(np.abs(d_or_g)).split(' ')[0]} Adverse Effect"
elif 0.0 <= d_or_g < 0.2:
return "No Effect"
elif 0.2 <= d_or_g < 0.5:
return "Small Effect"
elif 0.5 <= d_or_g < 0.8:
return "Intermediate Effect"
elif 0.8 <= d_or_g:
return "Large Effect"
"""Dibutyl Phthalate ANOVA"""
CBC = [193540.383333333, 156797.01984127, 877545.732539682, 185331.853968254, 198447.588095238]
Magtech = [388224.406349206, 1299156.4515873, 775447.952380952, 797549.789682539, 1469425.29206349]
Magtech_CleanRange = [190675.011904762, 183011.074603174, 138076.892063492, 112654.025396825, 278997.476190476]
a_value = 0.05
f, p = stats.f_oneway(CBC, Magtech, Magtech_CleanRange)
print ('Dibutyl Phthalate One-way ANOVA')
print ('===============================')
print ('F value:', f)
print ('P value:', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')
"""Tukey's"""
data = []
for area in CBC:
data.append(("CBC", area))
for area in Magtech:
data.append(("Magtech", area))
for area in Magtech_CleanRange:
data.append(("Magtech CleanRange", area))
data = np.rec.array(data, dtype = [('Sample',"|U20"),('Peak Area','float64')])
mc = MultiComparison(data['Peak Area'], data['Sample'])
print(mc.tukeyhsd())
print("")
print("""Hedge's g\n""")
print("CBC v Magtech")
g = g_hedge(CBC, Magtech)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))
print("\nCBC v Magtech CleanRange")
g = g_hedge(CBC, Magtech_CleanRange)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))
print("\nMagtech v Magtech CleanRange")
g = g_hedge(Magtech, Magtech_CleanRange)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))
print("\n")
print("""Student's t-test for DBP: CBC v CleanRange""")
t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
np.mean(Magtech_CleanRange),
np.std(Magtech_CleanRange),
len(Magtech_CleanRange))
print ('t Statistic:', f)
print ('P value: ', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')
print("""Welch's t-test for DBP: CBC v CleanRange""")
t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
np.mean(Magtech_CleanRange),
np.std(Magtech_CleanRange),
len(Magtech_CleanRange), False)
print ('t Statistic:', f)
print ('P value: ', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')
print("=========================================================================")
print("\n\n")
print("""Ethyl Centralite Hedge's g""")
CBC = [2360313.56904762,9205011.71269841,4368357.48412698,6818562.4468254,12065016.2349207]
Magtech = [112587.645238095,532979.707936508,422138.485714286,436944.629365079,772629.48015873]
g = g_hedge(CBC, Magtech)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment