# Splunk search -> data -> topic modeling
# Xander Johnson
from __future__ import division # python 2, so old school
# Splunk SDK
import splunklib.client as client
import splunklib.results as results
import numpy as np
import pandas as pd
# Topic Modeling
import gensim
import pyLDAvis
import pyLDAvis.gensim as pg
# Just for the stop words
import nltk
# Standard Libraries
import collections
import re
HOST = "conf-dcr4.splunkoxygen.com"
PORT = 8089
USERNAME = "Johnson"
PASSWORD = "clear text passwords are the best"
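# (Placeholder credentials, obviously; in practice read these from a config
# file or environment variables rather than hard-coding them.)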
# Create a Service instance and log in
service = client.connect(
    host=HOST,
    port=PORT,
    username=USERNAME,
    password=PASSWORD)
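# client.connect returns a Service object that wraps Splunk's REST API
# on the management port (8089 by default).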
kwargs_export = {"earliest_time": "-1mon",
                 "latest_time": "now",
                 "search_mode": "normal"}
# Triple-quoted strings let us write a multi-line query without escaping quotes
searchquery_export = """search sourcetype=*session*
| xmlkv
| search detail=*
| rex mode=sed field=detail "s/&.+?;//g"
| stats count by detail"""
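# What the SPL above does, roughly: xmlkv extracts key/value pairs from XML
# payloads, rex mode=sed strips XML/HTML entity references (e.g. &amp;) out of
# the "detail" field, and stats count by detail collapses duplicate text values.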
exportsearch_results = service.jobs.export(searchquery_export, **kwargs_export)
# Get the results and display them using the ResultsReader
reader = results.ResultsReader(exportsearch_results)
# Print whether results are a preview from a running search
print "is_preview = %s " % reader.is_preview
# change detail to the name of the field with the text
data = [r['detail'] for r in reader
        if isinstance(r, dict)]
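# Note: ResultsReader yields plain dicts for results and results.Message
# objects for Splunk diagnostics, hence the isinstance() filter above.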
# # Scrub-a-dub-dub
#
# Stopwords, tokenization, lemmatization, oh my!
# Stop words are words like "of", "the", "an", "a", etc.
# We're going to want to remove them.
# Stop list file from http://www.ranks.nl/stopwords + nltk
with open('./stopword.txt', 'r') as stopword_file:
    stopwords_raw = stopword_file.read()
stopwords_list = stopwords_raw.split()
stopwords_list = stopwords_list + nltk.corpus.stopwords.words('english')
stopwords = list(set(stopwords_list))
# these are tokens that somehow made it through in the end; people misspell things
stopwords.append('wa')
stopwords.append('ha')
stopwords.append('le')
stopwords.append('u')
stopwords.append('splunk') # every topic would have this
stopwords.append('customer')
stopwords.append('data')
# len(stopwords) # yum
def scrub(text):
    lines = text.splitlines()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(' '.join(lines))
    # Lemmatization turns things like "running" into "run"
    lemmatizer = nltk.stem.WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token.lower())
                    for token in tokens
                    if re.search(ur'^[a-zA-Z]+', token)]
    # remove those stop words
    clean = [w for w in clean_tokens if w not in stopwords]
    return clean
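# Quick sanity check (hypothetical input; exact output depends on the stop list):
#   scrub("The customers were running Splunk searches")
#   # -> something like ['running', 'search']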
clean_tokens = scrub(' '.join(data))
df = pd.DataFrame(data, columns=['abstract'])
df['clean'] = df.abstract.map(lambda x: ' '.join(scrub(x)))
df.clean = df.clean.map(lambda x: x.decode('utf8', 'ignore'))  # drop undecodable bytes
# the actual topic modeling part
all_text = [d.split() for d in df.clean] # aka "texts"
gensim_d = gensim.corpora.Dictionary(all_text) # aka "dictionary"
corpus = [gensim_d.doc2bow(text) for text in all_text]
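# doc2bow turns each document into a sparse bag-of-words:
# a list of (token_id, token_count) pairs keyed by the gensim dictionary.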
# try changing the num_topics around a bit
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=gensim_d,
    num_topics=8,
    update_every=1,
    chunksize=100,
    passes=1)
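# A single pass over a small corpus is quick but noisy; if the topics look
# muddled, raising passes (along with num_topics) is the usual first tweak.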
vis = pg.prepare(lda, corpus, gensim_d)
# jupyter notebook display
# import matplotlib.pyplot as plt
# %matplotlib inline
# pyLDAvis.display(vis)
# or just save it as a file
with open('./conftopics_detail.html', 'w+') as f:
    pyLDAvis.save_html(vis, f)
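# The saved HTML is self-contained; open conftopics_detail.html in a browser
# to explore the topics interactively.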