domdfcoding/data_analysis.py

## data_analysis.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  Additional data analysis to accompany
#  "Improving the Analysis of Organic Gunshot Residue with GunShotMatch"
#
#  Copyright 2019 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#
# Based on:
#   http://cleverowl.uk/2015/07/01/using-one-way-anova-and-tukeys-test-to-compare-data-sets/
#   https://www.psychometrica.de/effect_size.html#transform
#   https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
# and Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates


import numpy as np
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

def pooled_sd(sample1, sample2, weighted=False):
	"""Formula from weighted=True for weighted pooled SD
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
	sd1 = np.std(sample1)
	sd2 = np.std(sample2)
	n1 = len(sample1)
	n2 = len(sample2)
	if weighted:
		return np.sqrt( (((n1-1)*(sd1**2)) + ((n2-1)*(sd2**2))) / (n1+n2-2) )
	else:
		return np.sqrt( ((sd1**2) + (sd2**2)) / 2 )


def g_hedge(sample1, sample2):
	"""Hedge's g statistic
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
	mean1 = np.mean(sample1)
	mean2 = np.mean(sample2)
	return (mean1-mean2)/pooled_sd(sample1, sample2, True)


def g_durlak_bias(g, n):
	"""Application of Durlak's bias correction to the Hedge's g statistic.
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm

	n = n1+n2"""
	Durlak = ((n-3)/(n-2.25))*np.sqrt((n-2)/n)
	return g*Durlak


def interpret_d(d_or_g):
	"""Interpret Cohen's d or Hedge's g values using Table 1
	from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/"""

	if d_or_g < 0:
		return f"{interpret_d(np.abs(d_or_g)).split(' ')[0]} Adverse Effect"
	elif 0.0 <= d_or_g < 0.2:
		return "No Effect"
	elif 0.2 <= d_or_g < 0.5:
		return "Small Effect"
	elif 0.5 <= d_or_g < 0.8:
		return "Intermediate Effect"
	elif 0.8 <= d_or_g:
		return "Large Effect"


"""Dibutyl Phthalate ANOVA"""
CBC = [193540.383333333, 156797.01984127, 877545.732539682, 185331.853968254, 198447.588095238]
Magtech = [388224.406349206, 1299156.4515873, 775447.952380952, 797549.789682539, 1469425.29206349]
Magtech_CleanRange = [190675.011904762, 183011.074603174, 138076.892063492, 112654.025396825, 278997.476190476]

a_value = 0.05

f, p = stats.f_oneway(CBC, Magtech, Magtech_CleanRange)
print ('Dibutyl Phthalate One-way ANOVA')
print ('===============================')

print ('F value:', f)
print ('P value:', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')

"""Tukey's"""
data = []
for area in CBC:
	data.append(("CBC", area))
for area in Magtech:
	data.append(("Magtech", area))
for area in Magtech_CleanRange:
	data.append(("Magtech CleanRange", area))

data = np.rec.array(data, dtype = [('Sample',"|U20"),('Peak Area','float64')])

mc = MultiComparison(data['Peak Area'], data['Sample'])
print(mc.tukeyhsd())

print("")
print("""Hedge's g\n""")

print("CBC v Magtech")
g = g_hedge(CBC, Magtech)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))

print("\nCBC v Magtech CleanRange")
g = g_hedge(CBC, Magtech_CleanRange)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))

print("\nMagtech v Magtech CleanRange")
g = g_hedge(Magtech, Magtech_CleanRange)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))

print("\n")
print("""Student's t-test for DBP: CBC v CleanRange""")
t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
								  np.mean(Magtech_CleanRange),
								  np.std(Magtech_CleanRange),
								  len(Magtech_CleanRange))
print ('t Statistic:', f)
print ('P value:    ', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')
print("""Welch's t-test for DBP: CBC v CleanRange""")
t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
								  np.mean(Magtech_CleanRange),
								  np.std(Magtech_CleanRange),
								  len(Magtech_CleanRange), False)
print ('t Statistic:', f)
print ('P value:    ', p)
print("Reject = ",end='')
print("False" if p > a_value else "True")
print('')

print("=========================================================================")


print("\n\n")
print("""Ethyl Centralite Hedge's g""")
CBC = [2360313.56904762,9205011.71269841,4368357.48412698,6818562.4468254,12065016.2349207]
Magtech = [112587.645238095,532979.707936508,422138.485714286,436944.629365079,772629.48015873]

g = g_hedge(CBC, Magtech)
print(g)
print(interpret_d(g_durlak_bias(g, 10)))
	#!/usr/bin/env python
	# -- coding: utf-8 --
	#
	# Additional data analysis to accompany
	# "Improving the Analysis of Organic Gunshot Residue with GunShotMatch"
	#
	# Copyright 2019 Dominic Davis-Foster <dominic@davis-foster.co.uk>
	#
	# This program is free software; you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation; either version 2 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	# GNU General Public License for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with this program; if not, write to the Free Software
	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
	# MA 02110-1301, USA.
	#
	#
	# Based on:
	# http://cleverowl.uk/2015/07/01/using-one-way-anova-and-tukeys-test-to-compare-data-sets/
	# https://www.psychometrica.de/effect_size.html#transform
	# https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
	# and Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates


	import numpy as np
	import scipy.stats as stats
	from statsmodels.stats.multicomp import pairwise_tukeyhsd
	from statsmodels.stats.multicomp import MultiComparison

	def pooled_sd(sample1, sample2, weighted=False):
	"""Formula from weighted=True for weighted pooled SD
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
	sd1 = np.std(sample1)
	sd2 = np.std(sample2)
	n1 = len(sample1)
	n2 = len(sample2)
	if weighted:
	return np.sqrt( (((n1-1)(sd12)) + ((n2-1)(sd2**2))) / (n1+n2-2) )
	else:
	return np.sqrt( ((sd12) + (sd22)) / 2 )


	def g_hedge(sample1, sample2):
	"""Hedge's g statistic
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm"""
	mean1 = np.mean(sample1)
	mean2 = np.mean(sample2)
	return (mean1-mean2)/pooled_sd(sample1, sample2, True)


	def g_durlak_bias(g, n):
	"""Application of Durlak's bias correction to the Hedge's g statistic.
	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm

	n = n1+n2"""
	Durlak = ((n-3)/(n-2.25))*np.sqrt((n-2)/n)
	return g*Durlak


	def interpret_d(d_or_g):
	"""Interpret Cohen's d or Hedge's g values using Table 1
	from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/"""

	if d_or_g < 0:
	return f"{interpret_d(np.abs(d_or_g)).split(' ')[0]} Adverse Effect"
	elif 0.0 <= d_or_g < 0.2:
	return "No Effect"
	elif 0.2 <= d_or_g < 0.5:
	return "Small Effect"
	elif 0.5 <= d_or_g < 0.8:
	return "Intermediate Effect"
	elif 0.8 <= d_or_g:
	return "Large Effect"


	"""Dibutyl Phthalate ANOVA"""
	CBC = [193540.383333333, 156797.01984127, 877545.732539682, 185331.853968254, 198447.588095238]
	Magtech = [388224.406349206, 1299156.4515873, 775447.952380952, 797549.789682539, 1469425.29206349]
	Magtech_CleanRange = [190675.011904762, 183011.074603174, 138076.892063492, 112654.025396825, 278997.476190476]

	a_value = 0.05

	f, p = stats.f_oneway(CBC, Magtech, Magtech_CleanRange)
	print ('Dibutyl Phthalate One-way ANOVA')
	print ('===============================')

	print ('F value:', f)
	print ('P value:', p)
	print("Reject = ",end='')
	print("False" if p > a_value else "True")
	print('')

	"""Tukey's"""
	data = []
	for area in CBC:
	data.append(("CBC", area))
	for area in Magtech:
	data.append(("Magtech", area))
	for area in Magtech_CleanRange:
	data.append(("Magtech CleanRange", area))

	data = np.rec.array(data, dtype = [('Sample',"\|U20"),('Peak Area','float64')])

	mc = MultiComparison(data['Peak Area'], data['Sample'])
	print(mc.tukeyhsd())

	print("")
	print("""Hedge's g\n""")

	print("CBC v Magtech")
	g = g_hedge(CBC, Magtech)
	print(g)
	print(interpret_d(g_durlak_bias(g, 10)))

	print("\nCBC v Magtech CleanRange")
	g = g_hedge(CBC, Magtech_CleanRange)
	print(g)
	print(interpret_d(g_durlak_bias(g, 10)))

	print("\nMagtech v Magtech CleanRange")
	g = g_hedge(Magtech, Magtech_CleanRange)
	print(g)
	print(interpret_d(g_durlak_bias(g, 10)))

	print("\n")
	print("""Student's t-test for DBP: CBC v CleanRange""")
	t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
	np.mean(Magtech_CleanRange),
	np.std(Magtech_CleanRange),
	len(Magtech_CleanRange))
	print ('t Statistic:', f)
	print ('P value: ', p)
	print("Reject = ",end='')
	print("False" if p > a_value else "True")
	print('')
	print("""Welch's t-test for DBP: CBC v CleanRange""")
	t, p = stats.ttest_ind_from_stats(np.mean(CBC), np.std(CBC), len(CBC),
	np.mean(Magtech_CleanRange),
	np.std(Magtech_CleanRange),
	len(Magtech_CleanRange), False)
	print ('t Statistic:', f)
	print ('P value: ', p)
	print("Reject = ",end='')
	print("False" if p > a_value else "True")
	print('')

	print("=========================================================================")


	print("\n\n")
	print("""Ethyl Centralite Hedge's g""")
	CBC = [2360313.56904762,9205011.71269841,4368357.48412698,6818562.4468254,12065016.2349207]
	Magtech = [112587.645238095,532979.707936508,422138.485714286,436944.629365079,772629.48015873]

	g = g_hedge(CBC, Magtech)
	print(g)
	print(interpret_d(g_durlak_bias(g, 10)))