srang992/tfidf.py

## tfidf.py

# Build a TfidfVectorizer that ignores terms appearing in fewer than 2 documents
# (min_df=2) or in more than 70% of the documents (max_df=0.7).
tfidf = TfidfVectorizer(min_df=2, max_df=0.7)


# Learn the vocabulary from the cleaned descriptions and compute their TF-IDF matrix.
X = tfidf.fit_transform(netflix_data_copy['clean_desc'])


# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Build a title-indexed dataframe (one column per vocabulary term) to be used
# for calculating the cosine similarity, and save it.
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_df.index = netflix_data_copy['title']
tfidf_df.to_csv("data/tfidf_data.csv")

	# Build a TfidfVectorizer that ignores terms appearing in fewer than 2 documents
	# (min_df=2) or in more than 70% of the documents (max_df=0.7).
	tfidf = TfidfVectorizer(min_df=2, max_df=0.7)


	# Learn the vocabulary from the cleaned descriptions and compute their TF-IDF matrix.
	X = tfidf.fit_transform(netflix_data_copy['clean_desc'])


	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
	# use get_feature_names_out() when available and fall back on older releases.
	if hasattr(tfidf, "get_feature_names_out"):
		feature_names = tfidf.get_feature_names_out()
	else:
		feature_names = tfidf.get_feature_names()

	# Build a title-indexed dataframe (one column per vocabulary term) to be used
	# for calculating the cosine similarity, and save it.
	tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
	tfidf_df.index = netflix_data_copy['title']
	tfidf_df.to_csv("data/tfidf_data.csv")