meow-d/test.py

## test.py
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Load recipe_nlg_lite/train.csv (tab-separated) and build one text field per
# recipe for the vectorizer.
# The separator is spelled "\t": a literal tab inside the quotes is invisible
# and is easily turned into a space by an editor or a copy/paste.
dataset = pd.read_csv("recipe_nlg_lite/train.csv", sep="\t")
text_columns = ["name", "description", "ner", "steps"]
# Empty cells are read as NaN (a float), which makes " ".join raise TypeError;
# blank them out and coerce everything to str before joining the columns.
dataset["combined"] = (
    dataset[text_columns].fillna("").astype(str).apply(" ".join, axis=1)
)


# alternative stopwords and preprocessing
# with open("stopwords.txt", "r") as file:
#     stop_words = file.read().splitlines()


# def no_number_preprocessor(tokens):
#     r = re.sub("(\d)+", "NUM", tokens.lower())
#     # This alternative just removes numbers:
#     # r = re.sub('(\d)+', '', tokens.lower())
#     return r


# tfidf_desc = TfidfVectorizer(stop_words=stop_words, preprocessor=no_number_preprocessor)


# Turn every recipe's combined text into a TF-IDF row vector, ignoring
# scikit-learn's built-in English stop words.
tfidf_desc = TfidfVectorizer(stop_words="english")
tfidf_desc_matrix = tfidf_desc.fit_transform(dataset["combined"])


# Pairwise similarity of all recipes: the linear kernel (dot product) of the
# TF-IDF rows. Called with a single matrix, linear_kernel compares it against
# itself. TfidfVectorizer L2-normalises rows by default, so the dot product is
# the cosine similarity.
sim_cosine_desc = linear_kernel(tfidf_desc_matrix)


# Lookup table: recipe uid -> row label in `dataset`.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# unique for a freshly read CSV), so it never removed a repeated uid; a repeated
# uid would make indices[uid] return a Series instead of a single row. Filter on
# the index instead and keep the first occurrence of each uid.
indices = pd.Series(dataset.index, index=dataset["uid"])
indices = indices[~indices.index.duplicated(keep="first")]


def get_recommendations(uid, similarity, num_recommend=10):
    """Return the recipes most similar to the recipe identified by ``uid``.

    Args:
        uid: Recipe identifier, looked up in the module-level ``indices``.
        similarity: Square pairwise-similarity matrix (indexable by row
            position) whose rows are in the same order as ``dataset``.
        num_recommend: Maximum number of recipes to return; values below
            one yield an empty result.

    Returns:
        The matching rows of ``dataset``, most similar first. The queried
        recipe itself is never part of the result.

    Raises:
        KeyError: If ``uid`` is not present in ``indices``.
    """
    idx = indices[uid]
    # Drop the queried recipe by position instead of assuming it sorts first:
    # an exact duplicate (or an all-zero row) ties with it, and the stable
    # sort would then keep the query and discard a genuine neighbour.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity[idx])
        if position != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_similar = candidates[: max(num_recommend, 0)]
    recipe_indices = [position for position, _ in top_similar]
    # NOTE: positions are used as row labels here, which assumes `dataset`
    # still has the default 0..n-1 index it gets from read_csv.
    return dataset.loc[recipe_indices]


# Recommend the 20 recipes closest to one sample recipe and save them as CSV.
a = get_recommendations(
    "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
    similarity=sim_cosine_desc,
    num_recommend=20,
)

# Ingredient-based variant, kept for reference. It presumably needs a
# similarity matrix built from the ingredients only; cosine_sim_ingr is not
# defined in this script.
# recommendation_based_on_ingr = get_recommendations(
#     "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
#     similarity=cosine_sim_ingr,
#     num_recommend=20,
# )

# to_csv does not create missing directories, so make sure "output/" exists
# before writing; otherwise the whole run fails at the very last step.
os.makedirs("output", exist_ok=True)
a.to_csv("output/description_cosine.csv")
	import re
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import linear_kernel


	# Load recipe_nlg_lite/train.csv (tab-separated) and build one text field per
	# recipe for the vectorizer.
	# The separator must be a tab; it is spelled "\t" because a literal tab
	# inside the quotes is invisible and had degraded to a single space here.
	dataset = pd.read_csv("recipe_nlg_lite/train.csv", sep="\t")
	text_columns = ["name", "description", "ner", "steps"]
	# Empty cells are read as NaN (a float), which makes " ".join raise TypeError;
	# blank them out and coerce everything to str before joining the columns.
	dataset["combined"] = (
	    dataset[text_columns].fillna("").astype(str).apply(" ".join, axis=1)
	)


	# alternative stopwords and preprocessing
	# with open("stopwords.txt", "r") as file:
	# stop_words = file.read().splitlines()


	# def no_number_preprocessor(tokens):
	# r = re.sub("(\d)+", "NUM", tokens.lower())
	# # This alternative just removes numbers:
	# # r = re.sub('(\d)+', '', tokens.lower())
	# return r


	# tfidf_desc = TfidfVectorizer(stop_words=stop_words, preprocessor=no_number_preprocessor)


	# Turn every recipe's combined text into a TF-IDF row vector, ignoring
	# scikit-learn's built-in English stop words.
	tfidf_desc = TfidfVectorizer(stop_words="english")
	tfidf_desc_matrix = tfidf_desc.fit_transform(dataset["combined"])


	# Pairwise similarity of all recipes: the linear kernel (dot product) of the
	# TF-IDF rows. Called with a single matrix, linear_kernel compares it against
	# itself. TfidfVectorizer L2-normalises rows by default, so the dot product is
	# the cosine similarity.
	sim_cosine_desc = linear_kernel(tfidf_desc_matrix)


	# Lookup table: recipe uid -> row label in `dataset`.
	# Series.drop_duplicates() compares the *values* (the row labels, which are
	# unique for a freshly read CSV), so it never removed a repeated uid; a repeated
	# uid would make indices[uid] return a Series instead of a single row. Filter on
	# the index instead and keep the first occurrence of each uid.
	indices = pd.Series(dataset.index, index=dataset["uid"])
	indices = indices[~indices.index.duplicated(keep="first")]


	def get_recommendations(uid, similarity, num_recommend=10):
	    """Return the recipes most similar to the recipe identified by ``uid``.

	    Args:
	        uid: Recipe identifier, looked up in the module-level ``indices``.
	        similarity: Square pairwise-similarity matrix (indexable by row
	            position) whose rows are in the same order as ``dataset``.
	        num_recommend: Maximum number of recipes to return; values below
	            one yield an empty result.

	    Returns:
	        The matching rows of ``dataset``, most similar first. The queried
	        recipe itself is never part of the result.

	    Raises:
	        KeyError: If ``uid`` is not present in ``indices``.
	    """
	    idx = indices[uid]
	    # Drop the queried recipe by position instead of assuming it sorts first:
	    # an exact duplicate (or an all-zero row) ties with it, and the stable
	    # sort would then keep the query and discard a genuine neighbour.
	    candidates = [
	        (position, score)
	        for position, score in enumerate(similarity[idx])
	        if position != idx
	    ]
	    candidates.sort(key=lambda pair: pair[1], reverse=True)
	    top_similar = candidates[: max(num_recommend, 0)]
	    recipe_indices = [position for position, _ in top_similar]
	    # NOTE: positions are used as row labels here, which assumes `dataset`
	    # still has the default 0..n-1 index it gets from read_csv.
	    return dataset.loc[recipe_indices]


	# Recommend the 20 recipes closest to one sample recipe and save them as CSV.
	a = get_recommendations(
	    "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
	    similarity=sim_cosine_desc,
	    num_recommend=20,
	)

	# Ingredient-based variant, kept for reference. It presumably needs a
	# similarity matrix built from the ingredients only; cosine_sim_ingr is not
	# defined in this script.
	# recommendation_based_on_ingr = get_recommendations(
	#     "dab8b7d0-e0f6-4bb0-aed9-346e80dace1f",
	#     similarity=cosine_sim_ingr,
	#     num_recommend=20,
	# )

	# to_csv does not create missing directories, so make sure "output/" exists
	# before writing; otherwise the whole run fails at the very last step.
	os.makedirs("output", exist_ok=True)
	a.to_csv("output/description_cosine.csv")