Justin Evans eustin

## leading_commas.sql
-- Leading-comma layout: every column after the first starts its line
-- with the separating comma.
select
  some_column
  , another_column
  , yet_another_column
from some_table

## stat_sig_random_19.py
# Calls score_test_proportions_2indep with the variant counts first, then the
# control counts, and alternative="larger"; the resulting p-value is printed
# as a percentage.
actual_specialness_result = score_test_proportions_2indep(
    NUM_CONVERTING_VARIANT_USERS,
    NUM_VARIANT_USERS,
    NUM_CONVERTING_CONTROL_USERS,
    NUM_CONTROL_USERS,
    alternative="larger",
)
print(f"actual 'specialness' result: {actual_specialness_result.pvalue:.2%}")

#> actual 'specialness' result: 3.83%

## stat_sig_random_18.py
# Share of the sampled differences that are at least as large as the
# difference observed in the experiment.
sampling_specialness_result = (
    (sampled_diffs >= observed_diff_in_rates).sum() / len(sampled_diffs)
)
print(f"sampled 'specialness' result: {sampling_specialness_result:.2%}")

#> sampled 'specialness' result: 3.41%

## stat_sig_random_17.py
# Histogram of the sampled differences, passing the observed difference in
# rates along to plot_hist.
plot_hist(
    sampled_diffs,
    bins=50,
    observed_rate=observed_diff_in_rates,
    title="Our range of pure randomness",
)

## stat_sig_random_16.py
# How many simulations sample_diffs_in_rates is asked to run.
NUM_SIMULATIONS = 10_000

sampled_diffs = sample_diffs_in_rates(
    all_users,
    NUM_CONTROL_USERS,
    NUM_SIMULATIONS,
)

## stat_sig_random_15.py
@njit(parallel=True)
def sample_diffs_in_rates(all_users, num_control_users, num_simulations):
    """Sample differences in rates from random re-splits of ``all_users``.

    Each simulation shuffles ``all_users``, treats the first
    ``num_control_users`` entries as the control group and the remaining
    entries as the variant group, and records ``variant mean - control mean``.

    Args:
        all_users: array of per-user values; it is shuffled in place.
        num_control_users: number of leading entries counted as control.
        num_simulations: number of shuffles to run.

    Returns:
        Array of length ``num_simulations`` with one difference per shuffle.
    """
    results = np.zeros(num_simulations)
    for i in prange(num_simulations):
        # numpy random shuffling appears to be slower when using numba
        # NOTE(review): every prange iteration shuffles the same array in
        # place -- confirm this is safe under parallel=True.
        random.shuffle(all_users)
        control_rate = all_users[:num_control_users].mean()
        # we assume the rest of the users are variant users
        variant_rate = all_users[num_control_users:].mean()
        results[i] = variant_rate - control_rate
    # Hand the sampled differences back; without this the caller gets None.
    return results

## stat_sig_random_14.py
# Experiment sizes: users per arm and how many of them converted.
NUM_CONTROL_USERS = 1_000_000
NUM_CONVERTING_CONTROL_USERS = 26_000

NUM_VARIANT_USERS = 1_000_000
NUM_CONVERTING_VARIANT_USERS = 26_400

# Control arm as a 0/1 array: the converting users (1.0) come first,
# followed by the non-converting users (0.0).
control_users = np.concatenate(
    [
        np.ones(NUM_CONVERTING_CONTROL_USERS),
        np.zeros(NUM_CONTROL_USERS - NUM_CONVERTING_CONTROL_USERS),
    ]
)
control_conversion_rate = control_users.mean()

## stat_sig_random_13.py
OBSERVED_DIFF_IN_RATES = 0.167  # the result our experiment produced

# Count how many simulated differences are at least as large as the observed one.
num_samples = len(simulated_diffs_in_rates)
num_diffs_gte_observed = (simulated_diffs_in_rates >= OBSERVED_DIFF_IN_RATES).sum()

print(
    f"{num_diffs_gte_observed:,} out of {num_samples:,} random samples show differences in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}"
)
print(
    f"percentage of random noise distribution with difference in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}: {num_diffs_gte_observed / num_samples:.2%}"
)

#> 1,267 out of 10,000 random samples show differences in rates greater than or equal to 16.7%
#> percentage of random noise distribution with difference in rates greater than or equal to 16.7%: 12.67%

## stat_sig_random_12.py
# Histogram of the simulated differences in 15 bins, passing 0.167 as the
# observed rate.
plot_hist(
    simulated_diffs_in_rates,
    bins=15,
    observed_rate=0.167,
    title="Our range of pure randomness",
)

## stat_sig_random_11.py
def plot_hist(experiment_results: np.ndarray,
              bins=100,
              observed_rate: float = None,
              title: str = None) -> None:
    """Plot a histogram of ``experiment_results``.

    Args:
        experiment_results: values passed to ``sns.histplot``.
        bins: forwarded to ``sns.histplot`` as ``bins``.
        observed_rate: when given, a red vertical line is drawn at this value
            and a legend is added below the plot.
        title: when given and non-empty, used as the plot title.
    """
    sns.histplot(experiment_results, bins=bins)
    # Compare against None so an observed rate of exactly 0 is still drawn.
    if observed_rate is not None:
        plt.axvline(observed_rate, color='r', label='Diff in rates observed in experiment')
        plt.legend(bbox_to_anchor=(0.5, -0.2), loc="lower center")
    if title:
        plt.title(title)
	-- Leading-comma layout: every column after the first starts its line
	-- with the separating comma.
	select
	some_column
	, another_column
	, yet_another_column
	from some_table
	# Calls score_test_proportions_2indep with the variant counts first, then
	# the control counts, and alternative="larger"; prints the p-value as a
	# percentage.
	actual_specialness_result = score_test_proportions_2indep(NUM_CONVERTING_VARIANT_USERS, NUM_VARIANT_USERS, NUM_CONVERTING_CONTROL_USERS, NUM_CONTROL_USERS, alternative="larger")
	print(f"actual 'specialness' result: {actual_specialness_result.pvalue:.2%}")

	#> actual 'specialness' result: 3.83%
	# Share of the sampled differences that are at least as large as the
	# observed difference in rates.
	sampling_specialness_result = (sampled_diffs >= observed_diff_in_rates).sum() / sampled_diffs.shape[0]
	print(f"sampled 'specialness' result: {sampling_specialness_result:.2%}")

	#> sampled 'specialness' result: 3.41%
	# Histogram of the sampled differences, passing the observed difference
	# along to plot_hist.
	plot_hist(sampled_diffs,
	bins=50,
	observed_rate=observed_diff_in_rates,
	title="Our range of pure randomness")
	# How many simulations sample_diffs_in_rates is asked to run.
	NUM_SIMULATIONS = 10_000

	sampled_diffs = sample_diffs_in_rates(all_users, NUM_CONTROL_USERS, NUM_SIMULATIONS)
	@njit(parallel=True)
	def sample_diffs_in_rates(all_users, num_control_users, num_simulations):
	    """Sample differences in rates from random re-splits of ``all_users``.

	    Each simulation shuffles ``all_users``, treats the first
	    ``num_control_users`` entries as the control group and the remaining
	    entries as the variant group, and records ``variant mean - control mean``.

	    Args:
	        all_users: array of per-user values; it is shuffled in place.
	        num_control_users: number of leading entries counted as control.
	        num_simulations: number of shuffles to run.

	    Returns:
	        Array of length ``num_simulations`` with one difference per shuffle.
	    """
	    results = np.zeros(num_simulations)
	    for i in prange(num_simulations):
	        # numpy random shuffling appears to be slower when using numba
	        # NOTE(review): every prange iteration shuffles the same array in
	        # place -- confirm this is safe under parallel=True.
	        random.shuffle(all_users)
	        control_rate = all_users[:num_control_users].mean()
	        # we assume the rest of the users are variant users
	        variant_rate = all_users[num_control_users:].mean()
	        results[i] = variant_rate - control_rate
	    # Hand the sampled differences back; without this the caller gets None.
	    return results
	# Experiment sizes: users per arm and how many of them converted.
	NUM_CONTROL_USERS = 1_000_000
	NUM_CONVERTING_CONTROL_USERS = 26_000

	NUM_VARIANT_USERS = 1_000_000
	NUM_CONVERTING_VARIANT_USERS = 26_400

	# create our arrays of users
	# Control arm as a 0/1 array: the first NUM_CONVERTING_CONTROL_USERS
	# entries are set to 1.0, the rest stay 0.0.
	control_users = np.zeros(NUM_CONTROL_USERS)
	control_users[:NUM_CONVERTING_CONTROL_USERS] = 1.0
	control_conversion_rate = control_users.mean()
	OBSERVED_DIFF_IN_RATES = 0.167 # this is our experiment result

	# Count how many simulated differences are at least as large as the
	# observed one, and how many simulated differences there are in total.
	num_diffs_gte_observed = (simulated_diffs_in_rates >= OBSERVED_DIFF_IN_RATES).sum()
	num_samples = simulated_diffs_in_rates.shape[0]

	print(f"{num_diffs_gte_observed:,} out of {num_samples:,} random samples show differences in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}")
	print(f"percentage of random noise distribution with difference in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}: {num_diffs_gte_observed / num_samples:.2%}")

	#> 1,267 out of 10,000 random samples show differences in rates greater than or equal to 16.7%
	#> percentage of random noise distribution with difference in rates greater than or equal to 16.7%: 12.67%
	# Histogram of the simulated differences in 15 bins, passing 0.167 as the
	# observed rate.
	plot_hist(simulated_diffs_in_rates,
	bins=15,
	observed_rate=0.167,
	title="Our range of pure randomness")
	def plot_hist(experiment_results: np.ndarray,
	              bins=100,
	              observed_rate: float = None,
	              title: str = None) -> None:
	    """Plot a histogram of ``experiment_results``.

	    Args:
	        experiment_results: values passed to ``sns.histplot``.
	        bins: forwarded to ``sns.histplot`` as ``bins``.
	        observed_rate: when given, a red vertical line is drawn at this value
	            and a legend is added below the plot.
	        title: when given and non-empty, used as the plot title.
	    """
	    sns.histplot(experiment_results, bins=bins)
	    # Compare against None so an observed rate of exactly 0 is still drawn.
	    if observed_rate is not None:
	        plt.axvline(observed_rate, color='r', label='Diff in rates observed in experiment')
	        plt.legend(bbox_to_anchor=(0.5, -0.2), loc="lower center")
	    if title:
	        plt.title(title)