Eduardo Martinez omartinez182

## nested-cv-article-s7.py
# Plot a bar chart of the per-round score difference
# (one bar per round, labelled "Non-Nested CV - Nested CV Score").
plt.figure(figsize=(10,5))
difference_plot = plt.bar(range(rounds), score_difference)
plt.xlabel("Individual Trial #")
plt.legend([difference_plot],
          ["Non-Nested CV - Nested CV Score"],
          bbox_to_anchor=(0, 1, .8, 0))
plt.ylabel("score difference", fontsize="14")
# tight_layout() is a one-shot adjustment based on what is on the figure at
# the moment it is called, so it has to run after the bars and labels exist
# (it was previously called on the still-empty figure).
plt.tight_layout()
plt.show()

## nested-cv-article-s6.py
# Plot the score of every round for non-nested and nested cross-validation.
# The 'seaborn' style sheet is named 'seaborn-v0_8' in newer matplotlib
# releases; prefer the new name when it is available and fall back to the
# old one otherwise so the snippet works on both.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available
              else 'seaborn')
# Create the figure first: calling tight_layout() before plt.figure() acts on
# an implicitly created default figure, which leaves a stray empty figure
# behind and never lays out the one that is actually drawn on.
plt.figure(figsize=(10,5))
outer_scores_line, = plt.plot(outer_scores, color='orange')
nested_line, = plt.plot(nested_scores, color='steelblue')
plt.ylabel("Score", fontsize="14")
plt.legend([outer_scores_line, nested_line],
          ["Non-Nested CV", "Nested CV"],
          bbox_to_anchor=(0, .4, .5, 0))
# Adjust the layout once the lines and labels are in place.
plt.tight_layout()

## nested-cv-article-s5.py
#Per-round gap: non-nested score minus nested score
score_difference = outer_scores - nested_scores

#Report the mean and the standard deviation of that gap
summary_template = "Avg. difference of {:6f} with std. dev. of {:6f}."
summary = summary_template.format(score_difference.mean(),
                                  score_difference.std())
print(summary)

## nested-cv-article-s4.py
# Loop for each round; the round index i is also used as the random_state of
# the cross-validation splitters created inside the loop.
for i in range(rounds):

   #Define both cross-validation objects (inner & outer).
   #Both are 5-fold stratified, shuffled splitters seeded with the same value i.
   inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
   outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

   # Non-nested parameter search: grid-search rf over rf_param_grid using
   # inner_cv as the splitter, fitted on X, y.
   # NOTE(review): no score is stored and outer_cv is not used in the visible
   # part of this loop -- presumably the rest of the loop body is missing from
   # this snippet; confirm against the full source.
   clf = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=inner_cv)
   clf.fit(X, y)

## nested-cv-article-s3.py
#Hyperparameter values to search over for the random forest
rf_param_grid = dict(max_depth=[10, 50], n_estimators=[100, 200, 400])

#Zero-filled score arrays, one slot per round
outer_scores, nested_scores = np.zeros(rounds), np.zeros(rounds)

## nested-cv-article-s2.py
#How many rounds to run
rounds = 20

#Fixed seed so that results are reproducible
seed = 42

#Random Forest classifier created with that seed
rf = RandomForestClassifier(random_state=seed)

## nested-cv-article-s1.py
#Load libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

import matplotlib.pyplot as plt
import warnings

## hypothesis-test-for-comparing-two-proportions.py
#Hypothesis Test for Comparing Two Proportions

#Libraries
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

#Null hypothesis (H0): the difference in proportions is 0.
#Alternative hypothesis (HA): the difference in proportions is ≠ 0.
#Ho holds the hypothesised difference; presumably it is passed to
#proportions_ztest later -- that call is not visible in this snippet.
Ho = 0

#Number of users converted in each campaign

## time-series-analysis-s-11.R
# Run the forecast helper on `data`; 4 and 2 are passed as its k and Y arguments.
dfOut <- MultForecast(4, data, 2)

# Plot sales over t (light blue line with points) together with the
# predictions (orange line and points).
# The chain has to end on the last layer: a trailing `+` makes R keep reading
# and pulls the next statement in the file into this plot expression.
dfOut %>%
  ggplot(aes(x=t, y=sales))+
  geom_line(col = 'light blue')+
  geom_point()+
  geom_line(aes(x=t, y=predictions), col = 'orange')+
  geom_point(aes(x=t, y=predictions), col = 'orange')+
  xlab('Time')+
  ylab('Sales')

## time-series-analysis-s-10.R
MultForecast<-function(k, df, Y){
  #Array with all of the raw values of 'Y'
  arr <- df[, Y]
  #Empty array to store the values of the moving averages
  MM <- rep(NA, length(arr))
  SI <- rep(NA, length(arr))
  index <- k-1

  #Calculate moving averages + SI
  for(i in c(1:length(arr))){
	# Plot bar chart of the difference.
	plt.figure(figsize=(10,5))
	plt.tight_layout()
	difference_plot = plt.bar(range(rounds), score_difference)
	plt.xlabel("Individual Trial #")
	plt.legend([difference_plot],
	["Non-Nested CV - Nested CV Score"],
	bbox_to_anchor=(0, 1, .8, 0))
	plt.ylabel("score difference", fontsize="14")
	plt.show()
	# Plot scores on each round for nested and non-nested cross-validation
	plt.style.use('seaborn')
	plt.tight_layout()
	plt.figure(figsize=(10,5))
	outer_scores_line, = plt.plot(outer_scores, color='orange')
	nested_line, = plt.plot(nested_scores, color='steelblue')
	plt.ylabel("Score", fontsize="14")
	plt.legend([outer_scores_line, nested_line],
	["Non-Nested CV", "Nested CV"],
	bbox_to_anchor=(0, .4, .5, 0))
	#Take the difference from the non-nested and nested scores
	score_difference = outer_scores - nested_scores

	print("Avg. difference of {:6f} with std. dev. of {:6f}."
	.format(score_difference.mean(), score_difference.std()))
	# Loop for each round
	for i in range(rounds):

	#Define both cross-validation objects (inner & outer)
	inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
	outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

	# Non-nested parameter search and scoring
	clf = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=inner_cv)
	clf.fit(X, y)
	#Define the hyperparameter grid
	rf_param_grid = {'max_depth': [10, 50],
	'n_estimators': [100, 200, 400]}

	#Create arrays to store the scores
	outer_scores = np.zeros(rounds)
	nested_scores = np.zeros(rounds)
	#Set a seed to ensure reproducibility
	seed = 42

	#Instantiate the Random Forest classifier
	rf = RandomForestClassifier(random_state=seed)

	#Number of rounds
	rounds = 20
	#Load libraries
	import numpy as np
	import pandas as pd
	from sklearn.datasets import load_wine
	from sklearn.model_selection import StratifiedKFold
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

	import matplotlib.pyplot as plt
	import warnings
	#Hypothesis Test for Comparing Two Proportions

	#Libraries
	import numpy as np
	from statsmodels.stats.proportion import proportions_ztest

	#Null Hypothesis: The difference in proportions is = 0. HA: the difference in proportions is ≠ 0.
	Ho = 0

	#Number of users converted in each campaign
	dfOut <- MultForecast(4, data, 2)

	dfOut %>%
	ggplot(aes(x=t, y=sales))+
	geom_line(col = 'light blue')+
	geom_point()+
	geom_line(aes(x=t, y=predictions), col = 'orange')+
	geom_point(aes(x=t, y=predictions), col = 'orange')+
	xlab('Time')+
	ylab('Sales')+
	MultForecast<-function(k, df, Y){
	#Array with all of the raw values of 'Y'
	arr <- df[, Y]
	#Empty array to store the values of the moving averages
	MM <- rep(NA, length(arr))
	SI <- rep(NA, length(arr))
	index <- k-1

	#Calculate moving averages + SI
	for(i in c(1:length(arr))){