Skip to content

Instantly share code, notes, and snippets.

View naiborhujosua's full-sized avatar
👨‍💻

naiborhujosua naiborhujosua

👨‍💻
View GitHub Profile
# Preview the first rows of the msd DataFrame.
msd.show()

# How many different users appear in the data?
distinct_users = msd.select("userId").distinct()
user_count = distinct_users.count()
print("Number of users: ", user_count)

# How many different songs appear in the data?
distinct_songs = msd.select("songId").distinct()
song_count = distinct_songs.count()
print("Number of songs: ", song_count)
from pyspark.sql.functions import col,avg,min
# Rows with num_plays > 0 are treated as implicit ratings; count them per song.
# Both statistics below are derived from this one grouped DataFrame.
ratings_per_song = (
    msd.filter(col("num_plays") > 0)
    .groupBy("songId")
    .count()
)

# Smallest per-song rating count.
# NOTE: `min` here is pyspark.sql.functions.min (imported above), not the builtin.
print("Minimum implicit ratings for a song: ")
ratings_per_song.select(min("count")).show()

# Mean per-song rating count.
print("Average implicit ratings per song: ")
ratings_per_song.select(avg("count")).show()

# Min num implicit ratings from a user
# (the code for this step is not part of this snippet)
# Hyperparameter grid for the implicit-feedback ALS models.
ranks = [10, 20, 30, 40]
maxIters = [10, 20, 30, 40]
regParams = [.05, .1, .15]
alphas = [20, 40, 60, 80]

# Build one (unfitted) ALS estimator per hyperparameter combination and
# append it to model_list: 4 * 4 * 3 * 4 = 192 estimators are added.
# NOTE(review): model_list and ALS are not defined in this snippet --
# presumably model_list is an existing list and ALS comes from
# pyspark.ml.recommendation; confirm both are set up before this runs.
for r in ranks:
    for mi in maxIters:
        for rp in regParams:
            for a in alphas:
                model_list.append(
                    ALS(
                        userCol="userId",
                        itemCol="songId",
                        ratingCol="num_plays",
                        rank=r,
                        maxIter=mi,
                        regParam=rp,
                        alpha=a,
                        coldStartStrategy="drop",
                        nonnegative=True,
                        implicitPrefs=True,
                    )
                )

# Print the model list, and the length of model_list
print(model_list, "Length of model_list: ", len(model_list))
# Hold out 20% of msd as the test set; the remaining 80% is the training set.
# NOTE(review): no seed is passed here, so this split differs between runs,
# unlike the seeded five-way split below -- confirm that is intended.
training, test = msd.randomSplit([0.8, 0.2])

# Cut the training set into five equally weighted pieces (fixed seed).
pieces = training.randomSplit([0.2] * 5, seed=1)
train1, train2, train3, train4, train5 = pieces

# Fold k leaves out piece k and unions the other four, starting with the
# piece right after the omitted one and wrapping around
# (fold1 = train2 + train3 + train4 + train5, ..., fold5 = train1..train4).
folds = []
for left_out in range(5):
    kept = [pieces[(left_out + step) % 5] for step in range(1, 5)]
    merged = kept[0]
    for piece in kept[1:]:
        merged = merged.union(piece)
    folds.append(merged)
fold1, fold2, fold3, fold4, fold5 = folds
# Import numpy
import numpy

# Find the index of the smallest ROEM
i = numpy.argmin(ROEMS)
print("Index of smallest ROEM:", i)

# Find ith element of ROEMS
print("Smallest ROEM: ", ROEMS[i])

# Extract the best_model using the index just computed, rather than a
# hard-coded position that silently goes stale when ROEMS changes.
# NOTE(review): assumes ROEMS[k] is the score of model_list[k] -- confirm
# against the code that fills ROEMS (not shown in this snippet).
best_model = model_list[i]

# Extract the Rank
print("Rank: ", best_model.getRank())

# Extract the MaxIter value
print("MaxIter: ", best_model.getMaxIter())

# Extract the RegParam value
# TODO(review): the statement for this step is missing from the snippet.
# The function is named `adfuller`; importing `adfullers` raises ImportError
# and would leave the call below undefined.
from statsmodels.tsa.stattools import adfuller

# Run the augmented Dickey-Fuller unit-root test on the 'close' column.
results = adfuller(df['close'])

# First-difference every column of df and drop the rows that diff() leaves
# as NaN. NOTE(review): the differenced data is not re-tested here, so the
# name df_stationary is an expectation, not something this code verifies.
df_stationary = df.diff().dropna()
# arma_generate_sample lives in statsmodels.tsa.arima_process;
# `statsmodels.tsa.s` is not a module and fails at import time.
from statsmodels.tsa.arima_process import arma_generate_sample

# Lag-polynomial coefficients, including the leading 1 for lag 0.
# NOTE(review): statsmodels expects the AR polynomial with its sign
# convention, so [1, 0.5] corresponds to an AR(1) coefficient of -0.5 --
# confirm this is the intended process.
ar_coeffs = [1, 0.5]
ma_coeffs = [1, 0.2]

# Draw 100 observations with noise standard deviation 0.5. The standard
# deviation is passed positionally because its keyword was renamed from
# `sigma` to `scale` in newer statsmodels releases; the fourth positional
# slot is the same parameter under both names.
y = arma_generate_sample(ar_coeffs, ma_coeffs, 100, 0.5)