## gist by @aurora1625, created January 12, 2021
## extract TF-IDF keywords from a corpus
## the code is self-explanatory and commented
## loading some dependencies
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
## our dataset
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, categories=['alt.atheism'])
## defining a stemmer to use
stemmer = SnowballStemmer("english")
## this dictionary will come in handy later on (it maps each stemmed token back to an original surface form)
stemmed_to_original = {}
## Basic Preprocessing Functions ##
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        ## drop stopwords and very short tokens before stemming
        if token not in STOPWORDS and len(token) > 3:
            stemmed_token = lemmatize_stemming(token)
            stemmed_to_original[stemmed_token] = token
            result.append(stemmed_token)
    return result
news_data = [preprocess(doc) for doc in newsgroups_train.data]
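## quick sanity check (illustrative, not part of the original gist): preprocess lowercases,
## tokenizes, drops stopwords/short tokens, then lemmatizes and stems each remaining token
print(preprocess("Atheists were arguing about the existence of gods"))
## -> something like ['atheist', 'argu', 'exist', 'god']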
## notice: the min_df and max_df parameters are really important for getting the most informative keywords out of your corpus
## the documents are already tokenized lists, so the tokenizer is the identity function and lowercasing is disabled
vectorizer = TfidfVectorizer(stop_words=list(STOPWORDS), min_df=20, max_df=0.72,
                             tokenizer=lambda x: x, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(news_data)
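## a minimal sketch (not in the original gist): once the matrix is fitted you can also rank
## terms *per document* by tf-idf weight; `doc_idx` and `top_n` are illustrative names
import numpy as np
doc_idx, top_n = 0, 10
row = tfidf_matrix[doc_idx].toarray().ravel()
feature_names = vectorizer.get_feature_names_out()
top_terms = [(feature_names[i], row[i]) for i in np.argsort(row)[::-1][:top_n] if row[i] > 0]
print(top_terms)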
## get the idf values of all the tokens used by the vectorizer and sort them in ascending order
## it depends on how you define "important", but in most text corpora, once stopwords and
## (really frequent / really rare) words have been filtered out by the vectorizer parameters above,
## sorting by idf this way surfaces the important keywords
## make a dictionary of words and their corresponding idf weights
word_to_idf = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
## sort by ascending idf weight (this yields a list of (word, idf) pairs, lowest idf first)
word_to_idf = sorted(word_to_idf.items(), key=lambda x: x[1])
print(word_to_idf)
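## a minimal sketch (not in the original gist) showing where `stemmed_to_original` comes in handy:
## map the lowest-idf (most widespread) stemmed keywords back to an original surface form;
## `top_n` is an illustrative cutoff
top_n = 20
for stemmed, idf in word_to_idf[:top_n]:
    print(stemmed_to_original.get(stemmed, stemmed), round(idf, 3))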