Skip to content

Instantly share code, notes, and snippets.

View JLFDataScience's full-sized avatar

Jose Luis Fernández Nuevo JLFDataScience

  • FGCSIC
View GitHub Profile
@JLFDataScience
JLFDataScience / collaborative_function.py
Created February 12, 2020 13:07
Collaborative filtering recommendation functions
def dot_product(vector_1, vector_2):
    """Return the dot product of two equal-length numeric sequences.

    If the sequences differ in length, zip() silently truncates to the
    shorter one (unchanged from the original behavior).
    """
    # Generator expression instead of a list comprehension: sum() consumes
    # the pairs lazily, so no intermediate list is built.
    return sum(i * j for i, j in zip(vector_1, vector_2))
def get_movie_score(movie_features, user_preferences):
    """Score one movie for a user.

    The score is simply the dot product of the movie's genre-feature
    vector with the user's preference weights; this thin wrapper exists
    so it can be handed to DataFrame.apply by name.
    """
    score = dot_product(movie_features, user_preferences)
    return score
def get_movie_recommendations(user_preferences, n_recommendations):
    """Attach a per-user 'score' column to the global movies DataFrame.

    NOTE(review): relies on module-level globals `movies_df` and
    `movie_categories` defined by the other snippets — confirm they are in
    scope before calling.
    NOTE(review): the snippet appears truncated by the gist preview —
    `n_recommendations` is never used and nothing is returned; presumably
    the full gist sorts by 'score' and returns the top n rows. TODO: confirm
    against the original gist.
    """
    # We create a new column in the dataset with the value (dot-product
    # score) of each movie for this user. `args` is the extra positional
    # argument passed to get_movie_score after each row; wrapping
    # user_preferences.values() in a list makes it that single extra arg.
    movies_df['score'] = movies_df[movie_categories].apply(get_movie_score,
        args=([user_preferences.values()]), axis=1)
@JLFDataScience
JLFDataScience / user_preferences.py
Last active February 12, 2020 12:27
Configure the user preferences model
from collections import OrderedDict

# Genre-preference weights for the example user (higher = stronger liking).
# FIX: the original `OrderedDict(zip(movie_categories, []))` zipped against
# an empty list, which produces an EMPTY mapping — it only obscured the fact
# that the dict starts out empty and created a spurious dependency on
# `movie_categories`. Start from an empty OrderedDict instead; the resulting
# mapping and key order are identical.
user_preferences = OrderedDict()
user_preferences['Action'] = 5
user_preferences['Adventure'] = 5
user_preferences['Animation'] = 1
user_preferences["Children's"] = 1
user_preferences["Comedy"] = 3
user_preferences['Crime'] = 2
@JLFDataScience
JLFDataScience / Movies_dummies.py
Last active February 12, 2020 12:25
Apply the get_dummies function to the dataframe
# One-hot encode the pipe-separated genre column: str.get_dummies(sep='|')
# creates one 0/1 column per genre (1 where the movie has that genre), and
# pd.concat(axis=1) appends those columns to movies_df.
# NOTE(review): relies on `pd` and `movies_df` from the data-loading snippet.
movies_df = pd.concat([movies_df, movies_df.movie_genre.str.get_dummies(sep='|')], axis=1)
# Keep the genre column names for later scoring. Assumes the first 3 columns
# of movies_df are the original non-genre fields — TODO confirm against the
# loaded schema.
movie_categories = movies_df.columns[3:]
# In a script (outside a notebook cell) head()'s return value is discarded;
# it has no effect here.
movies_df.head()
print(movie_categories)
@JLFDataScience
JLFDataScience / load_data_movie.py
Created February 12, 2020 12:13
Explore the zip file and load the data
import pandas as pd
import numpy as np
from zipfile import ZipFile
from io import StringIO
import io
from urllib.request import urlopen
import requests
# Inspect what the MovieLens 1M zip contains before downloading and
# extracting it.
zip_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
@JLFDataScience
JLFDataScience / Pipeline_optimization.py
Created January 30, 2020 17:00
Pipeline optimization
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): this snippet appears truncated by the gist preview — the
# Pipeline step list is never closed, no final estimator is shown, and the
# grid below tunes 'kBest' and 'svm' steps that do not appear in the visible
# steps. Recover the full gist before running; as shown this is not valid
# Python.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('norm', Normalizer()),
    ('poly', PolynomialFeatures(degree=2)),
    ('norm2', Normalizer()),
    ('pca', PCA(n_components=3)),
# Hyper-parameter grid: keys follow GridSearchCV's '<step_name>__<param>'
# convention for pipeline steps.
parameters = dict(kBest__k = range(1, len(features)),
                  svm__kernel = ['rbf', 'sigmoid'],
                  svm__C = [0.1, 1, 10, 100, 1000],
                  svm__gamma = [0.1, 0.01, 0.001, 0.0001, 0.00001],
                  svm__random_state = [0])
# Exhaustive 5-fold cross-validated search over the grid.
grid = GridSearchCV(pipe, param_grid = parameters, cv = 5)
grid.fit(x_train, y_train)
@JLFDataScience
JLFDataScience / SVC_pipeline.py
Created January 30, 2020 16:55
SVC pipeline
from sklearn.svm import SVC

# Pipeline: univariate feature selection (ANOVA F-test, top 5) feeding an
# RBF-kernel SVM.
pipe = Pipeline([('kBest', SelectKBest(f_classif, k = 5)),
                 ('svm', SVC(kernel = 'rbf'))])
pipe.fit(x_train, y_train)
print(u'The performance of the model is: %0.5f' % pipe.score(x_test, y_test))

# Hyper-parameter grid keyed by pipeline step name ('kBest', 'svm').
# BUG FIX: the original grid used 'rf__*' keys (copied from the
# random-forest snippet); GridSearchCV raises ValueError for parameters of
# steps that do not exist in the pipeline, so the keys must target the
# 'svm' step, matching the grid used in the Pipeline_optimization snippet.
parameters = dict(kBest__k = range(1, len(features)),
                  svm__kernel = ['rbf', 'sigmoid'],
                  svm__C = [0.1, 1, 10, 100, 1000],
                  svm__gamma = [0.1, 0.01, 0.001, 0.0001, 0.00001],
                  svm__random_state = [0])
# 5-fold cross-validated exhaustive search, then evaluate the refit best
# estimator on the held-out test split.
grid = GridSearchCV(pipe, param_grid = parameters, cv = 5)
grid.fit(x_train, y_train)
print(u'The performance of the model is: %0.5f' % grid.score(x_test, y_test))
@JLFDataScience
JLFDataScience / Random_forest_pipeline.py
Created January 30, 2020 16:49
Random forest pipeline
from sklearn.ensemble import RandomForestClassifier

# Select the 5 features with the highest ANOVA F-scores, then classify with
# a deliberately shallow (max_depth=2) random forest.
steps = [('kBest', SelectKBest(f_classif, k = 5)),
         ('rf', RandomForestClassifier(max_depth = 2))]
pipe = Pipeline(steps)
pipe.fit(x_train, y_train)
# Accuracy of the fitted pipeline on the held-out test split.
test_score = pipe.score(x_test, y_test)
print(u'The performance of the model is: %0.5f' % test_score)
@JLFDataScience
JLFDataScience / GridSearchCV.py
Created January 30, 2020 16:46
GridSearchCV
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the number of selected features: the
# 'kBest__k' key targets the 'kBest' step of the pipeline, trying every
# value from 1 up to (but not including) the full feature count.
search_space = dict(kBest__k = range(1, len(features)))
grid = GridSearchCV(pipe, param_grid = search_space, cv = 5)
grid.fit(x_train, y_train)
# Score of the refit best estimator on the held-out test split.
test_score = grid.score(x_test, y_test)
print(u'The performance of the model is: %0.5f' % test_score)