Created
August 31, 2019 19:32
-
-
Save ludovikcoba/5c67b1bd671fadb302bbee378c3e5588 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import scipy.stats | |
from math import sqrt | |
def cramers_v(x, y):
    '''
    Compute the bias-corrected Cramer's V association between two
    categorical variables (Bergsma-Wicher correction).

    :param x: first categorical list/series
    :param y: second categorical list/series
    :return: corrected Cramer's V in [0, 1], or nan when the contingency
             table is degenerate (a single row or column), which previously
             raised ZeroDivisionError
    '''
    confusion_matrix = pd.crosstab(x, y)
    chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias correction: subtract the expected inflation of phi^2 under independence.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min((kcorr - 1), (rcorr - 1))
    # Guard: a 1-row or 1-column table makes the denominator <= 0;
    # the association is undefined there, so report nan instead of crashing.
    if denom <= 0:
        return np.nan
    return np.sqrt(phi2corr / denom)
def gini(list_of_values):
    '''
    Compute the Gini coefficient.

    :param list_of_values: list/series of non-negative values; a bare float
        is treated as a single observation (returns 1, kept for backward
        compatibility)
    :return: Gini coefficient in [0, 1]; 0.0 for empty or all-zero input
    '''
    if isinstance(list_of_values, float):
        return 1
    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    for value in sorted_list:
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2.
    # Empty or all-zero input has no inequality to measure; previously
    # this divided by zero (ZeroDivisionError).
    if fair_area == 0:
        return 0.0
    return (fair_area - area) / fair_area
def mean_confidence_interval(data, confidence=0.95):
    '''
    Half-width of the two-sided Student-t confidence interval for the mean.

    :param data: array-like sample of numbers
    :param confidence: confidence level (default 0.95)
    :return: the margin of error h (the interval is mean +/- h)
    '''
    sample = np.asarray(data, dtype=float)
    std_err = scipy.stats.sem(sample)
    dof = len(sample) - 1
    t_crit = scipy.stats.t.ppf((1 + confidence) / 2., dof)
    return std_err * t_crit
def median_confidence_interval(data, confidence=0.95):
    '''
    Get the confidence interval over the median.

    :param data: an array/series
    :param confidence: confidence level (default 0.95)
    :return: the margin of error h around the median
    '''
    sample = np.asarray(data, dtype=float)
    size = len(sample)
    # Binomial standard error of the median rank: sqrt(n * 0.5 * 0.5).
    std_err = sqrt(size * .5 * .5)
    t_crit = scipy.stats.t.ppf((1 + confidence) / 2., size - 1)
    return std_err * t_crit
def test_diff(a, b): | |
''' | |
Run the Mann Whitney test | |
:param a: first list/series | |
:param b: second list/series | |
:return: U and p-val | |
''' | |
a, b = np.array(a), np.array(b) | |
if len(a) == 0 or len(b) == 0: | |
return 1 | |
if np.array_equal(a, b): | |
return 1 | |
t, p = scipy.stats.mannwhitneyu(a, b) | |
return p # "U = {}, p = {}".format(t, p) | |
def crombach_alpha(two_columns_df):
    '''
    Compute the Cronbach alpha reliability coefficient.

    :param two_columns_df: df whose columns are the items
    :return: the Cronbach alpha
    '''
    n_items = len(two_columns_df.columns)
    cov_matrix = two_columns_df.cov()  # variance-covariance matrix
    item_variances = np.diagonal(cov_matrix)
    mean_item_variance = item_variances.sum() / n_items
    # Total of all off-diagonal covariances (drop the variances on the diagonal),
    # then average over the n*(n-1) item pairs.
    off_diag_total = cov_matrix.to_numpy().sum() - item_variances.sum()
    mean_inter_item_cov = off_diag_total / (n_items * n_items - n_items)
    return n_items * mean_inter_item_cov / (mean_item_variance + (n_items - 1) * mean_inter_item_cov)
def median_split(two_column_dataset):
    """
    Split dataset on the median of the second column.

    :param two_column_dataset: DataFrame whose second column is the scale
    :return: low, high subset
    """
    scale_col = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_col)  # order by scale
    cut = int(ordered.shape[0] / 2)  # index of the median row
    # Split into the bottom and top halves.
    low = ordered.iloc[:cut, ]
    high = ordered.iloc[cut:, ]
    return low, high
def low_high_split(two_column_dataset):
    """
    Split dataset in three on the second column, keeping the extremes.

    :param two_column_dataset: DataFrame whose second column is the scale
    :return: low, high subset (the middle third is discarded)
    """
    scale_col = two_column_dataset.columns[1]
    ordered = two_column_dataset.sort_values(by=scale_col)  # order by scale
    total = ordered.shape[0]
    low_cut = int(total / 3)        # end of the bottom third
    high_cut = int(2 * total / 3)   # start of the top third
    # Split off the bottom and top thirds.
    low = ordered.iloc[:low_cut, ]
    high = ordered.iloc[high_cut:, ]
    return low, high
# Smoke-test entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    print("MAIN")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment