Last active
June 27, 2019 05:49
-
-
Save NaelsonDouglas/fbe3e6a211408c3b096e57f44e012afb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Print the TF-IDF weights of the first document in a small corpus.

The script fits a ``TfidfVectorizer`` on a list of documents and prints a
one-column table (``tfidf``) with one row per vocabulary term, sorted from
the highest to the lowest weight, for the first document only.
"""
import glob

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Sorted so that "the first file" is deterministic; glob order is arbitrary.
files = sorted(glob.glob("inputs/*.txt"))

# Read data from a folder. Each file is closed as soon as it has been read.
data = []
for path in files:
    with open(path) as handle:
        data.append(handle.read())

# Sample data. NOTE: this deliberately overrides whatever was read from
# "inputs/" above; delete this assignment to run on the real files.
data = [
    'all your base are belong to us',
    'a dead horse is a lifeless animal',
    'all the horses are inside the base even the dead',
]

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
vectors = tfidf_vectorizer.fit_transform(data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Only the first document (vectors[0]) is tabulated. The row vector is
# transposed so that every vocabulary term becomes one row of the frame.
df = pd.DataFrame(
    vectors[0].T.todense(),
    index=feature_names,
    columns=["tfidf"],
)
df = df.sort_values(by=["tfidf"], ascending=False)
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
__________tfidf
belong____0.417567
to________0.417567
us________0.417567
your______0.417567
all_______0.317570
are_______0.317570
base______0.317570
animal____0.000000
dead______0.000000
even______0.000000
horse_____0.000000
horses____0.000000
inside____0.000000
is________0.000000
lifeless__0.000000
the_______0.000000