Extract TF-IDF keywords from a corpus
## THE CODE IS SELF-EXPLANATORY AND COMMENTED

## loading some dependencies
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
nltk.download('wordnet')  ## required by WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

## our dataset: the alt.atheism category of the 20 newsgroups corpus
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, categories=["alt.atheism"])
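## illustrative check (not part of the original gist): the dataset object holds
## a list of raw newsgroup message strings
print(len(newsgroups_train.data))  ## number of documents in the alt.atheism train split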
## defining a stemmer to use
stemmer = SnowballStemmer("english")

## this dictionary will come in handy later on, to map stemmed tokens back to an original surface form
stemmed_to_original = {}

## basic preprocessing functions
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            stemmed_token = lemmatize_stemming(token)
            stemmed_to_original[stemmed_token] = token
            result.append(stemmed_token)
    return result

news_data = [preprocess(i) for i in newsgroups_train.data]
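## quick sanity check (illustrative example, not part of the original gist):
## preprocess lowercases, drops stopwords and short tokens, then lemmatizes and
## stems whatever survives, so a sentence collapses to its stemmed content words
print(preprocess("The believers were arguing about atheism"))
## expected output (exact stems may vary across nltk/gensim versions):
## ['believ', 'argu', 'atheism']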
## notice: the min_df and max_df parameters are really important for getting the
## most relevant keywords out of your corpus. min_df=20 drops tokens that appear
## in fewer than 20 documents; max_df=0.72 drops tokens that appear in more than
## 72% of documents.
## the identity tokenizer and lowercase=False are needed because news_data is
## already a list of token lists
vectorizer = TfidfVectorizer(stop_words=list(STOPWORDS), min_df=20, max_df=0.72,
                             tokenizer=lambda x: x, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(news_data)
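## illustrative sketch (not part of the original gist): the fitted matrix also
## gives per-document keywords, since each row holds one document's tf-idf
## weights and the highest-weighted columns are its top terms
import numpy as np
feature_names = vectorizer.get_feature_names_out()  ## use get_feature_names() on scikit-learn < 1.0
doc_weights = tfidf_matrix[0].toarray().ravel()     ## tf-idf weights of the first document
top_indices = np.argsort(doc_weights)[::-1][:10]
print([feature_names[i] for i in top_indices if doc_weights[i] > 0])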
## get the idf values of all the tokens used by the vectorizer and sort them in ascending order.
## it depends on how you define "important", but for most text corpora, once the stopwords and
## the (really frequent / really rare) words have been filtered out by the vectorizer parameters
## above, this sorting surfaces the important keywords: a low idf means the word occurs across
## many of the remaining documents

## make a dictionary of words and their corresponding idf weights
word_to_idf = {word: idf for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)}
## sort the dictionary entries in ascending order of idf weight
word_to_idf = sorted(word_to_idf.items(), key=lambda x: x[1])
print(word_to_idf)
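## illustrative follow-up (not part of the original gist): this is where
## stemmed_to_original comes in handy, mapping the stemmed keywords back to a
## readable surface form
for stemmed_word, idf in word_to_idf[:20]:
    print(stemmed_to_original.get(stemmed_word, stemmed_word), round(idf, 3))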