TF-IDF model with stopwords and lemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
# Download the required NLTK data: tokenizer models, stopword list and WordNet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
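# Illustrative check of the tokenizer (the output shown is what the default
# WordNet noun lemmatization typically produces):
# LemmaTokenizer()('The cars are driving')  ->  ['The', 'car', 'are', 'driving']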
# Lemmatize the stop words so they match the tokenizer's output
# (otherwise sklearn warns that the stop words are inconsistent)
tokenizer = LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))
# Query and corpus to compare it against
search_terms = 'red tomato'
documents = ['cars drive on the road', 'tomatoes are actually fruit']
# Create TF-idf model
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)
doc_vectors = vectorizer.fit_transform([search_terms] + documents)
# Calculate similarity: tf-idf vectors are L2-normalised by default,
# so the linear kernel between them equals cosine similarity
cosine_similarities = linear_kernel(doc_vectors[0:1], doc_vectors).flatten()
document_scores = [item.item() for item in cosine_similarities[1:]]  # drop the query's self-similarity
# [0.0, 0.287]
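# Usage sketch (assumes the variables above): pair each document with its
# score and rank them to find the best match for the query
ranked = sorted(zip(document_scores, documents), reverse=True)
for score, doc in ranked:
    print(f'{score:.3f}  {doc}')
# Expected output, approximately, given the scores above:
# 0.287  tomatoes are actually fruit
# 0.000  cars drive on the road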