Skip to content

Instantly share code, notes, and snippets.

@zackmdavis
Last active January 30, 2016 07:30
Show Gist options
  • Save zackmdavis/35010d361a4884534b03 to your computer and use it in GitHub Desktop.
Save zackmdavis/35010d361a4884534b03 to your computer and use it in GitHub Desktop.
import csv
import logging
import bleach
import textblob
import gensim
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm
# Log at INFO level so the progress messages emitted while reading the
# comments CSV are actually shown.
logging.basicConfig(level=logging.INFO)
def sentencize(comment):
    """Yield every sentence of *comment* as a list of its words.

    The raw comment is first run through ``bleach.clean`` with every tag
    disallowed (``tags=[]``) and ``strip=True``; the cleaned text is then
    wrapped in a ``TextBlob``, whose ``sentences`` and ``words`` attributes
    supply the sentence and word splitting.
    """
    plain_text = bleach.clean(comment, tags=[], strip=True)
    blob = textblob.TextBlob(plain_text)
    yield from (list(sentence.words) for sentence in blob.sentences)
class SentenceIterable:
    """Re-iterable stream of tokenized sentences read from the comments CSV.

    Every call to ``__iter__`` reopens the file, so a single instance can be
    iterated any number of times (unlike a bare generator, which is exhausted
    after one pass).

    Args:
        limit: Maximum number of comments to read per pass. ``None`` reads
            the whole file; zero or a negative number yields nothing.
        path: Location of the CSV file. Defaults to ``LWComments.csv`` in the
            current working directory.
    """

    def __init__(self, limit=None, path='LWComments.csv'):
        self.limit = limit
        self.path = path

    def __iter__(self):
        """Yield one list of words per sentence, comment by comment."""
        if self.limit is not None and self.limit <= 0:
            # Nothing was asked for, so don't even open the file.
            return
        # newline='' hands line-ending handling to the csv module, which is
        # required for quoted fields that contain line breaks to be parsed
        # as a single field.
        with open(self.path, newline='') as lw_comments_csv:
            comment_reader = csv.reader(lw_comments_csv)
            # Skip the CSV header. The default value keeps an empty file from
            # raising StopIteration inside this generator, which Python 3.7+
            # would turn into a RuntimeError.
            next(comment_reader, None)
            # Columns: ['author', 'body', 'id', 'net_votes', 'time',
            #           'url', 'vote_ratio', 'dum', 'avgKarma']
            for comment_count, comment in enumerate(comment_reader, start=1):
                _author, body, _identifier, *_other_fields = comment
                yield from sentencize(body)
                if comment_count % 1000 == 0:
                    logging.info("extracted sentences from %s comments so far",
                                 comment_count)
                if self.limit is not None and comment_count >= self.limit:
                    break
def build_model(limit=None):
    """Construct a gensim ``Word2Vec`` model from the comment sentences.

    ``limit`` is forwarded to ``SentenceIterable`` and caps how many comments
    are read; ``None`` means no cap.
    """
    sentences = SentenceIterable(limit=limit)
    return gensim.models.Word2Vec(sentences)
def principal_components_analysis(model):
    """Return a ``PCA`` estimator fitted on the model's ``syn0`` matrix."""
    # PCA.fit returns the fitted estimator itself, so the call can be chained.
    return PCA().fit(model.syn0)
def principal_components(model):
    """Return ``model.syn0`` transformed by a PCA fitted on that same matrix."""
    fitted = principal_components_analysis(model)
    vectors = model.syn0
    return fitted.transform(vectors)
def word_spectra(model, how_many, real_word_threshold):
    """Rank the vocabulary along each of the first ``how_many`` components.

    Only words whose ``count`` in ``model.vocab`` is at least
    ``real_word_threshold`` are included.

    Returns:
        A list with ``how_many`` entries; entry ``i`` is a list of
        ``(word, value)`` pairs sorted in ascending order of ``value``, the
        word's coordinate in column ``i`` of the transformed matrix.
    """
    components = principal_components(model)
    # Map each sufficiently frequent word to its row in the component matrix.
    row_of = {
        word: info.index
        for word, info in model.vocab.items()
        if info.count >= real_word_threshold
    }
    spectra = []
    for i in range(how_many):
        scored = [(word, components[row][i]) for word, row in row_of.items()]
        scored.sort(key=lambda item: item[1])
        spectra.append(scored)
    return spectra
def vector_project(v, onto):
    """Return the vector projection of ``v`` onto ``onto``.

    Computed as ``onto`` scaled by ``dot(v, onto) / dot(onto, onto)``.
    """
    scale = dot(v, onto) / dot(onto, onto)
    return scale * onto
def scalar_project(v, onto):
    """Return the signed length of ``v`` along the direction of ``onto``.

    Computed as ``dot(v, onto)`` divided by the norm of ``onto``.
    """
    overlap = dot(v, onto)
    return overlap / norm(onto)
def bipolar_spectrum(model, one_word, another_word):
    """Order the vocabulary along the axis from ``one_word`` to ``another_word``.

    The axis is ``model[another_word] - model[one_word]``. Every word in
    ``model.vocab`` is vector-projected onto it.

    Returns:
        A list of ``(word, projected_vector)`` pairs, sorted in ascending
        order of the projected vector's scalar projection onto the axis.
    """
    axis = model[another_word] - model[one_word]

    def position_on_axis(entry):
        _word, projection = entry
        return scalar_project(projection, axis)

    placed = [
        (word, vector_project(model[word], onto=axis))
        for word in model.vocab
    ]
    placed.sort(key=position_on_axis)
    return placed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment