Skip to content

Instantly share code, notes, and snippets.

@meow-d
Last active January 16, 2024 00:56
Show Gist options
  • Save meow-d/8e32ef0a843e284ff45d043f9844eeef to your computer and use it in GitHub Desktop.
Save meow-d/8e32ef0a843e284ff45d043f9844eeef to your computer and use it in GitHub Desktop.
easiest way to implement the recipe recommendation algorithm
import re
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# load dataset
# NOTE(review): the separator is a single space here. This paste has lost its
# other whitespace, so the original may have been a tab -- confirm against the
# actual layout of train.csv before relying on it.
dataset = pd.read_csv("recipe_nlg_lite/train.csv", sep=" ")

# Concatenate the text columns into one document per recipe for TF-IDF.
# Missing cells become empty strings and every value is cast to str, so a NaN
# (a float) no longer makes " ".join raise TypeError.
text_columns = ["name", "description", "ner", "steps"]
dataset["combined"] = (
    dataset[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
)
# alternative stopwords and preprocessing
# with open("stopwords.txt", "r") as file:
#     stop_words = file.read().splitlines()
# def no_number_preprocessor(tokens):
#     r = re.sub(r"(\d)+", "NUM", tokens.lower())
#     # This alternative just removes numbers:
#     # r = re.sub(r'(\d)+', '', tokens.lower())
#     return r
# tfidf_desc = TfidfVectorizer(stop_words=stop_words, preprocessor=no_number_preprocessor)
# vectorize
# Build a TF-IDF matrix with one row per recipe, ignoring English stop words.
tfidf_desc = TfidfVectorizer(stop_words="english")
tfidf_desc_matrix = tfidf_desc.fit_transform(dataset["combined"])

# calculate similarity, using linear kernel or cosine similarity
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between recipes.
sim_cosine_desc = linear_kernel(tfidf_desc_matrix, tfidf_desc_matrix)

# functions
# Lookup table: recipe uid -> row position in the similarity matrix.
# Series.drop_duplicates() would compare the *values* (the row positions, which
# are always unique) and therefore never remove a repeated uid; de-duplicate on
# the index instead, keeping the first occurrence of each uid.
indices = pd.Series(range(len(dataset)), index=dataset["uid"])
indices = indices[~indices.index.duplicated(keep="first")]
def get_recommendations(uid, similarity, num_recommend=10):
    """Return the recipes most similar to the recipe identified by ``uid``.

    Uses the module-level ``indices`` (uid -> row position) and ``dataset``
    (the recipe table) defined elsewhere in this file.

    Args:
        uid: Identifier of the query recipe; looked up in ``indices``.
        similarity: Pairwise score matrix (for example ``sim_cosine_desc``);
            ``similarity[i]`` is the sequence of scores of row ``i`` against
            every row.
        num_recommend: Maximum number of recipes to return.

    Returns:
        The rows of ``dataset`` for the best matches, highest score first.
        The query recipe itself is never part of the result.

    Raises:
        KeyError: If ``uid`` is not present in ``indices``.
    """
    idx = indices[uid]
    if isinstance(idx, pd.Series):
        # A repeated uid makes the lookup return several rows; use the first.
        idx = idx.iloc[0]

    # Drop the query row explicitly instead of assuming it sorts first: with
    # tied scores (e.g. duplicate recipes) or an all-zero row, slicing off the
    # first element would discard a real match and return the query itself.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]
    # list.sort is stable, so equally scored recipes keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_similar = candidates[: max(num_recommend, 0)]
    recipe_indices = [position for position, _ in top_similar]
    # Positions come from enumerate(), so select rows by position, not label.
    return dataset.iloc[recipe_indices]
# actually use the system
# Recommend 20 recipes for one sample uid using the description-based scores.
recommendations = get_recommendations(
    "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
    similarity=sim_cosine_desc,
    num_recommend=20,
)
# Ingredient-based variant (needs a `cosine_sim_ingr` matrix, which this file
# does not build):
# recommendation_based_on_ingr = get_recommendations(
#     "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
#     similarity=cosine_sim_ingr,
#     num_recommend=20,
# )

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_path = Path("output/description_cosine.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
recommendations.to_csv(output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment