Skip to content

Instantly share code, notes, and snippets.

View JLFDataScience's full-sized avatar

Jose Luis Fernández Nuevo JLFDataScience

  • FGCSIC
View GitHub Profile
@JLFDataScience
JLFDataScience / collaborative_function.py
Created February 12, 2020 13:07
Collaborative filtering recommendation functions
def dot_product(vector_1, vector_2):
    """Return the dot product of two equal-length numeric sequences.

    If the sequences differ in length, zip() silently truncates to the
    shorter one (unchanged from the original behavior).
    """
    # Generator expression instead of a list comprehension: sum() consumes
    # the pairs lazily, so no intermediate list is built.
    return sum(i * j for i, j in zip(vector_1, vector_2))
def get_movie_score(movie_features, user_preferences):
    """Score one movie for a user.

    The score is simply the dot product of the movie's genre-feature
    vector with the user's preference weights; this thin wrapper exists
    so it can be handed to DataFrame.apply by name.
    """
    score = dot_product(movie_features, user_preferences)
    return score
def get_movie_recommendations(user_preferences, n_recommendations):
    """Attach a per-user 'score' column to the global movies DataFrame.

    NOTE(review): relies on module-level globals `movies_df` and
    `movie_categories` defined by the other snippets — confirm they are in
    scope before calling.
    NOTE(review): the snippet appears truncated by the gist preview —
    `n_recommendations` is never used and nothing is returned; presumably
    the full gist sorts by 'score' and returns the top n rows. TODO: confirm
    against the original gist.
    """
    # We create a new column in the dataset with the value (dot-product
    # score) of each movie for this user. `args` is the extra positional
    # argument passed to get_movie_score after each row; wrapping
    # user_preferences.values() in a list makes it that single extra arg.
    movies_df['score'] = movies_df[movie_categories].apply(get_movie_score,
        args=([user_preferences.values()]), axis=1)
@JLFDataScience
JLFDataScience / user_preferences.py
Last active February 12, 2020 12:27
Configure the user preferences model
from collections import OrderedDict

# Genre-preference weights for the example user (higher = stronger liking).
# FIX: the original `OrderedDict(zip(movie_categories, []))` zipped against
# an empty list, which produces an EMPTY mapping — it only obscured the fact
# that the dict starts out empty and created a spurious dependency on
# `movie_categories`. Start from an empty OrderedDict instead; the resulting
# mapping and key order are identical.
user_preferences = OrderedDict()
user_preferences['Action'] = 5
user_preferences['Adventure'] = 5
user_preferences['Animation'] = 1
user_preferences["Children's"] = 1
user_preferences["Comedy"] = 3
user_preferences['Crime'] = 2
@JLFDataScience
JLFDataScience / Movies_dummies.py
Last active February 12, 2020 12:25
Apply the get_dummies function to the dataframe
# One-hot encode the pipe-separated genre column: str.get_dummies(sep='|')
# creates one 0/1 column per genre (1 where the movie has that genre), and
# pd.concat(axis=1) appends those columns to movies_df.
# NOTE(review): relies on `pd` and `movies_df` from the data-loading snippet.
movies_df = pd.concat([movies_df, movies_df.movie_genre.str.get_dummies(sep='|')], axis=1)
# Keep the genre column names for later scoring. Assumes the first 3 columns
# of movies_df are the original non-genre fields — TODO confirm against the
# loaded schema.
movie_categories = movies_df.columns[3:]
# In a script (outside a notebook cell) head()'s return value is discarded;
# it has no effect here.
movies_df.head()
print(movie_categories)
@JLFDataScience
JLFDataScience / load_data_movie.py
Created February 12, 2020 12:13
Explore the zip file and load the data
import pandas as pd
import numpy as np
from zipfile import ZipFile
from io import StringIO
import io
from urllib.request import urlopen
import requests
# Inspect what the MovieLens 1M zip contains before downloading and
# extracting it.
zip_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
@JLFDataScience
JLFDataScience / Pipeline_optimization.py
Created January 30, 2020 17:00
Pipeline optimization
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): this snippet appears truncated by the gist preview — the
# Pipeline step list is never closed, no final estimator is shown, and the
# grid below tunes 'kBest' and 'svm' steps that do not appear in the visible
# steps. Recover the full gist before running; as shown this is not valid
# Python.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('norm', Normalizer()),
    ('poly', PolynomialFeatures(degree=2)),
    ('norm2', Normalizer()),
    ('pca', PCA(n_components=3)),
# Hyper-parameter grid: keys follow GridSearchCV's '<step_name>__<param>'
# convention for pipeline steps.
parameters = dict(kBest__k = range(1, len(features)),
                  svm__kernel = ['rbf', 'sigmoid'],
                  svm__C = [0.1, 1, 10, 100, 1000],
                  svm__gamma = [0.1, 0.01, 0.001, 0.0001, 0.00001],
                  svm__random_state = [0])
# Exhaustive 5-fold cross-validated search over the grid.
grid = GridSearchCV(pipe, param_grid = parameters, cv = 5)
grid.fit(x_train, y_train)
@JLFDataScience
JLFDataScience / SVC_pipeline.py
Created January 30, 2020 16:55
SVC pipeline
from sklearn.svm import SVC

# Pipeline: univariate feature selection (ANOVA F-test, top 5) feeding an
# RBF-kernel SVM.
pipe = Pipeline([('kBest', SelectKBest(f_classif, k = 5)),
                 ('svm', SVC(kernel = 'rbf'))])
pipe.fit(x_train, y_train)
print(u'The performance of the model is: %0.5f' % pipe.score(x_test, y_test))

# Hyper-parameter grid keyed by pipeline step name ('kBest', 'svm').
# BUG FIX: the original grid used 'rf__*' keys (copied from the
# random-forest snippet); GridSearchCV raises ValueError for parameters of
# steps that do not exist in the pipeline, so the keys must target the
# 'svm' step, matching the grid used in the Pipeline_optimization snippet.
parameters = dict(kBest__k = range(1, len(features)),
                  svm__kernel = ['rbf', 'sigmoid'],
                  svm__C = [0.1, 1, 10, 100, 1000],
                  svm__gamma = [0.1, 0.01, 0.001, 0.0001, 0.00001],
                  svm__random_state = [0])
# 5-fold cross-validated exhaustive search, then evaluate the refit best
# estimator on the held-out test split.
grid = GridSearchCV(pipe, param_grid = parameters, cv = 5)
grid.fit(x_train, y_train)
print(u'The performance of the model is: %0.5f' % grid.score(x_test, y_test))
@JLFDataScience
JLFDataScience / Random_forest_pipeline.py
Created January 30, 2020 16:49
Random forest pipeline
from sklearn.ensemble import RandomForestClassifier

# Select the 5 features with the highest ANOVA F-scores, then classify with
# a deliberately shallow (max_depth=2) random forest.
steps = [('kBest', SelectKBest(f_classif, k = 5)),
         ('rf', RandomForestClassifier(max_depth = 2))]
pipe = Pipeline(steps)
pipe.fit(x_train, y_train)
# Accuracy of the fitted pipeline on the held-out test split.
test_score = pipe.score(x_test, y_test)
print(u'The performance of the model is: %0.5f' % test_score)
@JLFDataScience
JLFDataScience / GridSearchCV.py
Created January 30, 2020 16:46
GridSearchCV
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the number of selected features: the
# 'kBest__k' key targets the 'kBest' step of the pipeline, trying every
# value from 1 up to (but not including) the full feature count.
search_space = dict(kBest__k = range(1, len(features)))
grid = GridSearchCV(pipe, param_grid = search_space, cv = 5)
grid.fit(x_train, y_train)
# Score of the refit best estimator on the held-out test split.
test_score = grid.score(x_test, y_test)
print(u'The performance of the model is: %0.5f' % test_score)