@Dref360
Created March 22, 2022 00:48
import gensim
import nltk
import pandas as pd
from gensim import corpora, models
from nltk.stem import SnowballStemmer, WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
"""
Taken from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
"""
# Conversational filler words to drop on top of gensim's stopword list.
FILTERED_TOKENS = {'okay', 'like', 'know', 'yeah', 'think', 'thing'}

# Build the stemmer and lemmatizer once instead of on every call.
STEMMER = SnowballStemmer('english', ignore_stopwords=True)
LEMMATIZER = WordNetLemmatizer()

def lemmatize_stemming(text):
    # Lemmatize as a verb first, then stem the result.
    return STEMMER.stem(LEMMATIZER.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        # Keep tokens that are not stopwords, not too short, and not fillers.
        if (token not in gensim.parsing.preprocessing.STOPWORDS
                and len(token) > 3 and token not in FILTERED_TOKENS):
            result.append(lemmatize_stemming(token))
    return result
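
# A quick illustration of the pipeline above. The output shown is indicative;
# exact stems depend on the installed gensim/NLTK versions and wordlists:
#   preprocess("Machines are learning quickly")
#   -> ['machin', 'learn', 'quick']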

def find_topics(df: pd.DataFrame, text_column: str):
    """Find topics for a dataframe.

    :note: Just prints the topics for now.
    :param df: Dataframe with the data.
    :param text_column: Name of the column holding the text segments.
    :return: Nothing
    """
    processed_docs = df[text_column].map(preprocess)
    # Map tokens to ids, dropping rare and overly common tokens.
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=15, no_above=0.2, keep_n=100)
    # Bag-of-words counts, reweighted by TF-IDF before fitting LDA.
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
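
# --- Usage sketch (not part of the original gist) ---
# A minimal, hypothetical example of calling find_topics. The CSV path and the
# "segment" column name are assumptions for illustration. Note that
# filter_extremes(no_below=15) needs a reasonably large corpus to keep tokens.
if __name__ == "__main__":
    df = pd.read_csv("transcripts.csv")  # assumed file with a "segment" column
    find_topics(df, "segment")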