@light44
Created December 21, 2016 02:10
Latent Dirichlet Allocation using gensim
import nltk
import pandas as pd
import re
import pprint
import operator
import csv
import logging
from stop_words import get_stop_words
from collections import defaultdict
from gensim import corpora
from gensim.models import ldamodel
from nltk.stem import WordNetLemmatizer
# constants
STOPWORDS = set(get_stop_words('en'))
CUSTOM_STOPWORDS = {'light', 'lights', 'sky', 'object', 'bright', 'ufo', 'quot'}
pp = pprint.PrettyPrinter(indent=4)
regex_filter = re.compile('[a-z]{2,}')
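# Note (added for clarity): used with re.match below, this pattern keeps only tokens that
# start with at least two lowercase letters, dropping single characters, numbers and the
# punctuation tokens produced by wordpunct_tokenize.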
# put your custom path here if you so choose
nltk.data.path.append('')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def tokenize_and_clean(document, stopwords=(), regex=(), lemmatizer=WordNetLemmatizer()):
    """
    :param document: a string representing a single document
    :param stopwords: a set of stopwords
    :param regex: additional regular expressions to use as a filter. Assuming these are compiled prior
    :param lemmatizer: an instance of an nltk lemmatizer
    :return: a tokenized and filtered document
    """
    raw_tokenized = nltk.tokenize.wordpunct_tokenize(document)
    tokenized = []
    for word in raw_tokenized:
        w = word.lower()
        if w not in stopwords:
            for exp in regex:
                if re.match(exp, w):
                    if lemmatizer:
                        tokenized.append(lemmatizer.lemmatize(w))
                    else:
                        tokenized.append(w)
    return tokenized
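# Illustrative usage (added for clarity, not part of the original gist):
#   tokenize_and_clean("The lights were moving!", stopwords=STOPWORDS, regex=[regex_filter])
# lower-cases each token, drops stopwords such as "the", discards the "!" token via the
# regex filter, and lemmatizes what survives (e.g. "lights" -> "light").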
def word_frequency(corpus=()):
    """
    :param corpus: a list of lists representing tokenized documents
    :return: a dict containing the frequency of each word in the corpus
    """
    frequency = defaultdict(int)
    for doc in corpus:
        for w in doc:
            frequency[w] += 1
    return dict(sorted(frequency.items(), key=operator.itemgetter(1), reverse=True))
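# Illustrative usage (added for clarity): word_frequency([['saw', 'orb'], ['orb']]) returns
# {'orb': 2, 'saw': 1}; on Python 3.7+ the dict keeps the descending-frequency order.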
def write_dict_to_csv(data, filepath):
    """
    Encapsulating this in a function - writes a dict to a csv
    :param data: a dict containing your data
    :param filepath: the filepath for your csv file
    """
    with open(filepath, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in data.items():
            writer.writerow([key, value])
# reading in the raw file - there are other interesting data that we won't analyze at this time
raw = pd.read_csv('./data/raw.csv', usecols=[7], names=['description'])
# a list for our document corpus
corpus = []
for i, row in raw.iterrows():
    corpus.append(row['description'])
tokenized_corpus = []
for doc in corpus:
    try:
        tokenized_corpus.append(tokenize_and_clean(document=doc, stopwords=STOPWORDS.union(CUSTOM_STOPWORDS), regex=[regex_filter]))
    except Exception:
        # skip rows whose description is missing or otherwise not tokenizable
        pass
freq = word_frequency(tokenized_corpus)
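# Optional (illustrative): the unused write_dict_to_csv helper above could persist these
# counts for inspection, e.g. write_dict_to_csv(freq, 'data/word_frequency.csv')
# (the output path is hypothetical).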
# filtering out low-frequency words (10 or fewer occurrences: misspellings, rare words); the very
# common words that don't discriminate between documents were already dropped via CUSTOM_STOPWORDS
tokenized_final = [[token for token in doc if freq[token] > 10] for doc in tokenized_corpus]
# creating a vocabulary of words from this corpus for streaming use
vocabulary = corpora.Dictionary(tokenized_final)
# save to disk
vocabulary.save('data/vocabulary.dict')
print(vocabulary)
# creating an mm corpus
corpus = [vocabulary.doc2bow(text) for text in tokenized_final]
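# Note (added for clarity): doc2bow turns each tokenized document into a sparse bag-of-words
# vector, a list of (token_id, count) tuples; e.g. a report mentioning "triangle" twice
# contributes (vocabulary.token2id['triangle'], 2), assuming "triangle" survived the
# frequency filter.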
corpora.MmCorpus.serialize('data/ufo.mm', corpus)
ufo_corpus = corpora.MmCorpus('data/ufo.mm')
lda = ldamodel.LdaModel(corpus=ufo_corpus, alpha='auto', id2word=vocabulary, num_topics=20, update_every=0, passes=20)
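# Note (added for clarity): update_every=0 puts gensim's LdaModel in batch mode, passes=20
# makes 20 full sweeps over the corpus, and alpha='auto' learns an asymmetric document-topic
# prior from the data instead of using a fixed symmetric one.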
topics = lda.print_topics(-1)
with open('data/lda_topics', 'w') as topics_file:
    topics_file.write(str(topics))
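# Illustrative follow-up (added, not in the original gist): once training finishes, the topic
# mixture of a single report can be inspected with
#   print(lda.get_document_topics(ufo_corpus[0]))
# and the model itself could be saved next to the other artifacts, e.g.
#   lda.save('data/ufo_lda.model')  # hypothetical path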