Extract TF-IDF keywords from a corpus
## THE CODE IS SELF-EXPLANATORY AND COMMENTED

## loading some dependencies
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
nltk.download('wordnet')  ## required by WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

## our dataset: the alt.atheism category of the 20 newsgroups corpus
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, categories=["alt.atheism"])
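## illustrative check (not part of the original gist): the dataset object holds
## a list of raw newsgroup message strings
print(len(newsgroups_train.data))  ## number of documents in the alt.atheism train split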
## defining a stemmer to use
stemmer = SnowballStemmer("english")

## this dictionary will come in handy later on, to map stemmed tokens back to an original surface form
stemmed_to_original = {}

## basic preprocessing functions
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            stemmed_token = lemmatize_stemming(token)
            stemmed_to_original[stemmed_token] = token
            result.append(stemmed_token)
    return result

news_data = [preprocess(i) for i in newsgroups_train.data]
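## quick sanity check (illustrative example, not part of the original gist):
## preprocess lowercases, drops stopwords and short tokens, then lemmatizes and
## stems whatever survives, so a sentence collapses to its stemmed content words
print(preprocess("The believers were arguing about atheism"))
## expected output (exact stems may vary across nltk/gensim versions):
## ['believ', 'argu', 'atheism']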
## notice: the min_df and max_df parameters are really important for getting the
## most relevant keywords out of your corpus. min_df=20 drops tokens that appear
## in fewer than 20 documents; max_df=0.72 drops tokens that appear in more than
## 72% of documents.
## the identity tokenizer and lowercase=False are needed because news_data is
## already a list of token lists
vectorizer = TfidfVectorizer(stop_words=list(STOPWORDS), min_df=20, max_df=0.72,
                             tokenizer=lambda x: x, lowercase=False)
tfidf_matrix = vectorizer.fit_transform(news_data)
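## illustrative sketch (not part of the original gist): the fitted matrix also
## gives per-document keywords, since each row holds one document's tf-idf
## weights and the highest-weighted columns are its top terms
import numpy as np
feature_names = vectorizer.get_feature_names_out()  ## use get_feature_names() on scikit-learn < 1.0
doc_weights = tfidf_matrix[0].toarray().ravel()     ## tf-idf weights of the first document
top_indices = np.argsort(doc_weights)[::-1][:10]
print([feature_names[i] for i in top_indices if doc_weights[i] > 0])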
## get the idf values of all the tokens used by the vectorizer and sort them in ascending order.
## it depends on how you define "important", but for most text corpora, once the stopwords and
## the (really frequent / really rare) words have been filtered out by the vectorizer parameters
## above, this sorting surfaces the important keywords: a low idf means the word occurs across
## many of the remaining documents

## make a dictionary of words and their corresponding idf weights
word_to_idf = {word: idf for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)}
## sort the dictionary entries in ascending order of idf weight
word_to_idf = sorted(word_to_idf.items(), key=lambda x: x[1])
print(word_to_idf)
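## illustrative follow-up (not part of the original gist): this is where
## stemmed_to_original comes in handy, mapping the stemmed keywords back to a
## readable surface form
for stemmed_word, idf in word_to_idf[:20]:
    print(stemmed_to_original.get(stemmed_word, stemmed_word), round(idf, 3))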