# Nhan Tran panicpotatoe

## chi_square_example_01_p01.py
import numpy as np
import pandas as pd
import scipy.stats as stats

## chi_square_example_01_p02.py
# STEP 1: GENERATE A RANDOM DATASET
# Seed NumPy's global random generator so the sampled data is reproducible
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.seed.html
np.random.seed(10)

# Sample 1000 voter races at fixed probabilities
voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
                              p=[0.05, 0.15, 0.25, 0.05, 0.5],
                              size=1000)

# Sample 1000 voter parties, independently of race.
# NOTE(review): the party-sampling step was missing from this snippet; the
# three party labels come from the crosstab column names, but the
# probabilities below are assumed -- confirm against the intended dataset.
voter_party = np.random.choice(a=["democrat", "independent", "republican"],
                               p=[0.4, 0.2, 0.4],
                               size=1000)

# Combine both samples into the DataFrame that the crosstab step reads as
# `voters.race` and `voters.party`
voters = pd.DataFrame({"race": voter_race, "party": voter_party})

## chi_square_example_01_p03.py
# Cross-tabulate race (rows) against party (columns); margins=True appends a
# totals row and a totals column, which are relabelled below
voter_tab = pd.crosstab(index=voters.race, columns=voters.party, margins=True)
voter_tab.index = ["asian", "black", "hispanic", "other", "white", "col_totals"]
voter_tab.columns = ["democrat", "independent", "republican", "row_totals"]
# Evaluating the name displays the table in an interactive session
voter_tab

## chi_square_example_01_p05.py
"""
    Calculate the "expected" table:
    "Expected" table can be calculated using below formula:
        total_rows x total_columns / total_observations
    And these factors can be obtained by:
        - total_rows = voter_tab["row_totals"]
        - total_columns = voter_tab.loc["col_totals"]
        - total_observations = 1000
    Please note that "loc" is used for the column totals because
        "col_totals" is a row label, not a column name.
    The trailing totals entry is dropped from both vectors before taking
        the outer product, which leaves a 5 x 3 array.
"""
expected = np.outer(voter_tab["row_totals"].iloc[0:5],
                    voter_tab.loc["col_totals"].iloc[0:3]) / 1000

## chi_square_example_01_p04.py
# STEP 2: GET THE "OBSERVED" TABLE AND "EXPECTED" TABLE
"""
    Calculate the "observed" table:
    "Observed" table can be extracted from our CrossTab by excluding the
        row_totals column and the col_totals row.
    row_totals is the 4th column (position 3) and col_totals is the
        6th row (position 5).
    Slice ends are exclusive, so [0:5, 0:3] means "take the rows at
        positions 0 to 4 and the columns at positions 0 to 2 and assign
        them to a new table named [observed]".
"""
observed = voter_tab.iloc[0:5, 0:3]

## chi_square_example_01_p06.py
# Wrap the expected counts in a DataFrame and label both axes with the same
# race / party labels that the crosstab uses
expected = pd.DataFrame(data=expected)
expected.index = ["asian", "black", "hispanic", "other", "white"]
expected.columns = ["democrat", "independent", "republican"]
# Evaluating the name displays the table in an interactive session
expected

## chi_square_example_01_p07.py
# STEP 3: CALCULATE THE CHI SQUARE VALUE and CRITICAL VALUE
"""
    Chi square formula:
        chi square = sum over all cells of (observed - expected)^2 / expected
    Note: .sum() is applied twice. The first call collapses the table to
        one sum per column; the second adds those column sums together,
        giving the total over the entire 2D table.
"""
chi_squared_stat = ((observed - expected) ** 2 / expected).sum().sum()
print(chi_squared_stat)

## chi_square_example_01_p08.py
"""
    Critical value at 95% confidence with 8 degrees of freedom (df).
    Where does df = 8 come from?
        df = (total rows - 1) x (total columns - 1)
           = (5 - 1) x (3 - 1)
           = 4 x 2
           = 8
"""
# Freeze the chi-square distribution at df=8, then read off its 0.95 quantile
crit = stats.chi2(df=8).ppf(0.95)

## chi_square_example_01_p09.py
""" METHODOLOGY 02: CALCULATE USING SCIPY.STATS LIBRARY"""
# Bind the result to its own name: assigning it to `stats` would replace the
# scipy.stats module alias, so any later `stats.<function>` call (or a re-run
# of this snippet) would fail
chi2_result = stats.chi2_contingency(observed=observed)
# You can check the returned data by calling it
# The returned data includes: chi_squared_stat, p_value, df, expected_crosstab
chi2_result

## mean_median_mode.py
# NOTE(review): the `stats` alias bound on the next line is rebound by
# `import scipy.stats as stats` below, so the standard-library `statistics`
# module is not reachable as `stats` in the code that follows.
import statistics as stats
import numpy as np
import pandas as pd
import scipy.stats as stats

# STEP 1: GENERATE A RANDOM DATASET
# Seed NumPy's global random generator so the sampled data is reproducible
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.seed.html
np.random.seed(10)

# Sample 1000 voter races at fixed probabilities
voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
                              p=[0.05, 0.15, 0.25, 0.05, 0.5],
                              size=1000)

# Sample 1000 voter parties, independently of race.
# NOTE(review): the party-sampling step was missing from this script; the
# three party labels come from the crosstab column names, but the
# probabilities below are assumed -- confirm against the intended dataset.
voter_party = np.random.choice(a=["democrat", "independent", "republican"],
                               p=[0.4, 0.2, 0.4],
                               size=1000)

# Combine both samples into the DataFrame used to build the crosstab
voters = pd.DataFrame({"race": voter_race, "party": voter_party})

# Create a CrossTab from DataFrame, Assign the column names and row names
voter_tab = pd.crosstab(voters.race, voters.party, margins=True)
voter_tab.columns = ["democrat", "independent", "republican", "row_totals"]
voter_tab.index = ["asian", "black", "hispanic", "other", "white", "col_totals"]
# You can check the data of CrossTab by calling it
voter_tab

# STEP 2: GET THE "OBSERVED" TABLE AND "EXPECTED" TABLE
"""
Calculate the "observed" table:
"Observed" table can be extracted from our CrossTab by excluding the
row_totals column and the col_totals row.
row_totals is the 4th column (position 3) and col_totals is the
6th row (position 5).
Slice ends are exclusive, so [0:5, 0:3] means "take the rows at
positions 0 to 4 and the columns at positions 0 to 2 and assign
them to a new table named [observed]".
"""
observed = voter_tab.iloc[0:5, 0:3]

"""
Calculate the "expected" table:
"Expected" table can be calculated using below formula:
total_rows x total_columns / total_observations
And these factors can be obtained by:
- total_rows = voter_tab["row_totals"]
- total_columns = voter_tab.loc["col_totals"]
- total_observations = 1000
Please note that "loc" is used for the column totals because
"col_totals" is a row label, not a column name.
The trailing totals entry is dropped from both vectors before taking
the outer product, which leaves a 5 x 3 array.
"""
expected = np.outer(voter_tab["row_totals"].iloc[0:5],
                    voter_tab.loc["col_totals"].iloc[0:3]) / 1000

# Now convert into a DataFrame, Assign the column names and row names
expected = pd.DataFrame(expected)
expected.columns = ["democrat", "independent", "republican"]
expected.index = ["asian", "black", "hispanic", "other", "white"]
# You can check the data of expected table by calling it
expected

# STEP 3: CALCULATE THE CHI SQUARE VALUE and CRITICAL VALUE
"""
Chi square formula:
chi square = total of [(observed - expected)^2]/expected
Note: We call .sum() twice: once to get the column sums
and a second time to add the column sums together,
returning the sum of the entire 2D table.
"""
chi_squared_stat = (((observed - expected) ** 2) / expected).sum().sum()
print(chi_squared_stat)

"""
Find the critical value for confidence of 95% and degree of freedom (df) of 8
Why df = 8?
Degree of freedom formula:
df = (total rows - 1) x (total columns - 1)
= (5 - 1) x (3 - 1)
= 4 x 2
= 8
"""
crit = stats.chi2.ppf(q=0.95, df=8)

""" METHODOLOGY 02: CALCULATE USING SCIPY.STATS LIBRARY"""
# Bind the result to its own name: assigning it to `stats` would replace the
# scipy.stats module alias and break any later `stats.<function>` call
chi2_result = stats.chi2_contingency(observed=observed)
# You can check the returned data by calling it
# The returned data includes: chi_squared_stat, p_value, df, expected_crosstab
chi2_result