# naiborhujosua naiborhujosua

## song.py
# Preview the contents of the msd DataFrame.
msd.show()

# Tally the unique values found in the userId column.
distinct_users = msd.select("userId").distinct()
user_count = distinct_users.count()
print("Number of users: ", user_count)

# Tally the unique values found in the songId column.
distinct_songs = msd.select("songId").distinct()
song_count = distinct_songs.count()
print("Number of songs: ", song_count)

## statistics.py
from pyspark.sql.functions import col,avg,min
# Smallest number of implicit ratings (rows with num_plays > 0) for any song.
# NOTE: `min` here is pyspark.sql.functions.min (imported above), which
# shadows the Python builtin of the same name.
print("Minimum implicit ratings for a song: ")
ratings_per_song = msd.filter(col("num_plays") > 0).groupBy("songId").count()
ratings_per_song.select(min("count")).show()

# Mean number of implicit ratings per song, from the same per-song counts.
print("Average implicit ratings per song: ")
ratings_per_song.select(avg("count")).show()

# Min num implicit ratings from a user

## als.py
# Candidate hyperparameter values for the ALS models.
ranks = list(range(10, 50, 10))       # 10, 20, 30, 40
maxIters = list(range(10, 50, 10))    # 10, 20, 30, 40
regParams = [0.05, 0.1, 0.15]
alphas = list(range(20, 100, 20))     # 20, 40, 60, 80

## als_functions.py
# Append one ALS estimator to model_list for every combination of
# rank, maxIter, regParam and alpha. model_list must already exist.
for r in ranks:
    for mi in maxIters:
        for rp in regParams:
            for a in alphas:
                model_list.append(
                    ALS(
                        userCol="userId",
                        itemCol="songId",
                        ratingCol="num_plays",
                        rank=r,
                        maxIter=mi,
                        regParam=rp,
                        alpha=a,
                        coldStartStrategy="drop",
                        nonnegative=True,
                        implicitPrefs=True,
                    )
                )

# Show the collected estimators together with the length of the list.
print(model_list, "Length of model_list: ", len(model_list))

## cross_validation.py
# Split the data into training (80%) and test (20%) sets. The split is
# seeded, like the fold split below, so that reruns can reproduce the same
# hold-out set; an unseeded split here would make the seeded folds differ
# from run to run anyway.
(training, test) = msd.randomSplit([0.8, 0.2], seed=1)

# Cut the training set into five parts of equal weight.
train1, train2, train3, train4, train5 = training.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed=1)

# Each foldN is the union of four parts, i.e. the training set minus one part.
fold1 = train2.union(train3).union(train4).union(train5)  # leaves out train1
fold2 = train3.union(train4).union(train5).union(train1)  # leaves out train2
fold3 = train4.union(train5).union(train1).union(train2)  # leaves out train3
fold4 = train5.union(train1).union(train2).union(train3)  # leaves out train4
fold5 = train1.union(train2).union(train3).union(train4)  # leaves out train5

## roem.py
# Import numpy
import numpy

# Locate the position of the minimum value in ROEMS.
i = numpy.asarray(ROEMS).argmin()
print("Index of smallest ROEM:", i)

# Report the ROEM stored at that position.
print("Smallest ROEM: ", ROEMS[i])

## model_hyperparameter.py
# Take the estimator stored at position 38 of model_list as the best model.
best_model = model_list[38]

# Report its rank setting.
best_rank = best_model.getRank()
print("Rank: ", best_rank)

# Report its maxIter setting.
best_max_iter = best_model.getMaxIter()
print("MaxIter: ", best_max_iter)

# Extract the RegParam value

## adfuller.py
# The function is named `adfuller`; importing `adfullers` raises ImportError
# and would leave the call below without a definition.
from statsmodels.tsa.stattools import adfuller

# Run the test on the 'close' column of df.
results = adfuller(df['close'])

## difference.py
# Difference df, then drop the missing entries that differencing leaves behind.
df_stationary = df.diff()
df_stationary = df_stationary.dropna()

## armamodel.py
# arma_generate_sample is provided by statsmodels.tsa.arima_process
# (`statsmodels.tsa.s` is not a module).
from statsmodels.tsa.arima_process import arma_generate_sample

# Coefficient lists passed as the AR and MA arguments of the generator.
ar_coeffs = [1, 0.5]
ma_coeffs = [1, 0.2]

# Generate 100 samples.
# NOTE(review): newer statsmodels releases appear to name the noise keyword
# `scale` instead of `sigma` - confirm against the installed version.
y = arma_generate_sample(ar_coeffs, ma_coeffs, nsample=100, sigma=0.5)
	# Preview the contents of the msd DataFrame.
	msd.show()

	# Tally the unique values found in the userId column.
	distinct_users = msd.select("userId").distinct()
	user_count = distinct_users.count()
	print("Number of users: ", user_count)

	# Tally the unique values found in the songId column.
	distinct_songs = msd.select("songId").distinct()
	song_count = distinct_songs.count()
	print("Number of songs: ", song_count)
	from pyspark.sql.functions import col,avg,min
	# Smallest number of implicit ratings (rows with num_plays > 0) for any song.
	# NOTE: `min` here is pyspark.sql.functions.min (imported above), which
	# shadows the Python builtin of the same name.
	print("Minimum implicit ratings for a song: ")
	ratings_per_song = msd.filter(col("num_plays") > 0).groupBy("songId").count()
	ratings_per_song.select(min("count")).show()

	# Mean number of implicit ratings per song, from the same per-song counts.
	print("Average implicit ratings per song: ")
	ratings_per_song.select(avg("count")).show()

	# Min num implicit ratings from a user
	# Candidate hyperparameter values for the ALS models.
	ranks = list(range(10, 50, 10))       # 10, 20, 30, 40
	maxIters = list(range(10, 50, 10))    # 10, 20, 30, 40
	regParams = [0.05, 0.1, 0.15]
	alphas = list(range(20, 100, 20))     # 20, 40, 60, 80
	# Append one ALS estimator to model_list for every combination of
	# rank, maxIter, regParam and alpha. model_list must already exist.
	# The loops are nested one level deeper each; with all four headers and
	# the body at the same indentation the code is not valid Python.
	for r in ranks:
		for mi in maxIters:
			for rp in regParams:
				for a in alphas:
					model_list.append(ALS(userCol="userId", itemCol="songId", ratingCol="num_plays", rank=r, maxIter=mi, regParam=rp, alpha=a, coldStartStrategy="drop", nonnegative=True, implicitPrefs=True))

	# Print the model list, and the length of model_list
	print(model_list, "Length of model_list: ", len(model_list))
	# Split the data into training (80%) and test (20%) sets. The split is
	# seeded, like the fold split below, so that reruns can reproduce the same
	# hold-out set; an unseeded split here would make the seeded folds differ
	# from run to run anyway.
	(training, test) = msd.randomSplit([0.8, 0.2], seed=1)

	# Cut the training set into five parts of equal weight.
	train1, train2, train3, train4, train5 = training.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed=1)

	# Each foldN is the union of four parts, i.e. the training set minus one part.
	fold1 = train2.union(train3).union(train4).union(train5)  # leaves out train1
	fold2 = train3.union(train4).union(train5).union(train1)  # leaves out train2
	fold3 = train4.union(train5).union(train1).union(train2)  # leaves out train3
	fold4 = train5.union(train1).union(train2).union(train3)  # leaves out train4
	fold5 = train1.union(train2).union(train3).union(train4)  # leaves out train5
	# Import numpy
	import numpy

	# Locate the position of the minimum value in ROEMS.
	i = numpy.asarray(ROEMS).argmin()
	print("Index of smallest ROEM:", i)

	# Report the ROEM stored at that position.
	print("Smallest ROEM: ", ROEMS[i])
	# Take the estimator stored at position 38 of model_list as the best model.
	best_model = model_list[38]

	# Report its rank setting.
	best_rank = best_model.getRank()
	print("Rank: ", best_rank)

	# Report its maxIter setting.
	best_max_iter = best_model.getMaxIter()
	print("MaxIter: ", best_max_iter)

	# Extract the RegParam value
	# The function is named `adfuller`; importing `adfullers` raises ImportError
	# and would leave the call below without a definition.
	from statsmodels.tsa.stattools import adfuller

	# Run the test on the 'close' column of df.
	results = adfuller(df['close'])
	# arma_generate_sample is provided by statsmodels.tsa.arima_process
	# (`statsmodels.tsa.s` is not a module).
	from statsmodels.tsa.arima_process import arma_generate_sample

	# Coefficient lists passed as the AR and MA arguments of the generator.
	ar_coeffs = [1, 0.5]
	ma_coeffs = [1, 0.2]

	# Generate 100 samples.
	# NOTE(review): newer statsmodels releases appear to name the noise keyword
	# `scale` instead of `sigma` - confirm against the installed version.
	y = arma_generate_sample(ar_coeffs, ma_coeffs, nsample=100, sigma=0.5)