# zackmdavis/comment_modeling.py

## comment_modeling.py
import csv
import logging

import bleach
import textblob
import gensim
from sklearn.decomposition import PCA
from numpy import dot
from numpy.linalg import norm


# Show INFO-level records so the periodic progress messages logged while
# reading comments are actually emitted.
logging.basicConfig(level=logging.INFO)


def sentencize(comment):
    """Yield each sentence of *comment* as a list of its words.

    The comment text is first run through ``bleach.clean`` with an empty
    tag allow-list and ``strip=True``; the cleaned text is then split into
    sentences and words by TextBlob.
    """
    plain_text = bleach.clean(comment, tags=[], strip=True)
    blob = textblob.TextBlob(plain_text)
    yield from (list(sentence.words) for sentence in blob.sentences)


class SentenceIterable:
    """Re-iterable stream of tokenized sentences from the comments CSV.

    Every ``iter()`` call reopens the file and starts from the first
    comment, so one instance can be iterated more than once.

    Parameters:
        limit: maximum number of comments to read per pass, or ``None``
            for no limit. ``0`` yields nothing.
        path: location of the CSV file; defaults to ``'LWComments.csv'``
            in the current working directory.
    """

    def __init__(self, limit=None, path='LWComments.csv'):
        self.limit = limit
        self.path = path

    def __iter__(self):
        comment_count = 0
        # newline='' is what the csv module asks for, so that newlines
        # embedded in quoted fields are parsed correctly.
        with open(self.path, newline='') as lw_comments_csv:
            comment_reader = csv.reader(lw_comments_csv)
            # Skip the CSV header. The default keeps an empty file from
            # raising StopIteration inside this generator, which Python
            # would turn into a RuntimeError.
            next(comment_reader, None)
            # ['author', 'body', 'id', 'net_votes', 'time',
            #  'url', 'vote_ratio', 'dum', 'avgKarma']

            for comment in comment_reader:
                # Check the limit before doing any work so that limit=0
                # really means "no comments".
                if self.limit is not None and comment_count >= self.limit:
                    break

                if not comment:
                    # csv.reader yields [] for blank lines; nothing to read.
                    continue

                _author, body, _identifier, *_other_fields = comment
                yield from sentencize(body)
                comment_count += 1

                if comment_count % 1000 == 0:
                    logging.info("extracted sentences from %s comments so far",
                                 comment_count)

def build_model(limit=None):
    """Train a gensim Word2Vec model on sentences from the comment corpus.

    ``limit`` is passed straight to ``SentenceIterable`` and caps how many
    comments are read; ``None`` reads them all.
    """
    corpus = SentenceIterable(limit=limit)
    return gensim.models.Word2Vec(corpus)


def principal_components_analysis(model):
    """Return a default-configured PCA estimator fitted to ``model.syn0``."""
    # PCA.fit returns the fitted estimator itself.
    return PCA().fit(model.syn0)


def principal_components(model):
    """Return ``model.syn0`` re-expressed in its principal-component basis.

    The PCA is fitted on the same matrix it then transforms.
    """
    vectors = model.syn0
    return principal_components_analysis(model).transform(vectors)


def word_spectra(model, how_many, real_word_threshold):
    """Rank the vocabulary along each of the first principal components.

    Only words whose ``count`` is at least ``real_word_threshold`` are
    kept. The result is a list of ``how_many`` lists; list ``i`` holds
    ``(word, coordinate)`` pairs sorted ascending by the word's coordinate
    on component ``i``.
    """
    components = principal_components(model)
    # Resolve each surviving word's row index once, up front.
    frequent_words = [
        (word, info.index)
        for word, info in model.vocab.items()
        if info.count >= real_word_threshold
    ]

    spectra = []
    for component in range(how_many):
        ranked = [(word, components[row][component])
                  for word, row in frequent_words]
        ranked.sort(key=lambda pair: pair[1])
        spectra.append(ranked)
    return spectra


def vector_project(v, onto):
    """Return the vector projection of ``v`` onto ``onto``.

    The result is ``onto`` scaled by ``(v . onto) / (onto . onto)``.
    """
    scale = dot(v, onto) / dot(onto, onto)
    return scale * onto


def scalar_project(v, onto):
    """Return the signed length of ``v``'s projection onto ``onto``.

    Computed as ``(v . onto) / |onto|``.
    """
    onto_length = norm(onto)
    return dot(v, onto) / onto_length


def bipolar_spectrum(model, one_word, another_word):
    """Order the vocabulary along the axis from one word to another.

    The axis is ``model[another_word] - model[one_word]``. Returns a list
    of ``(word, projection)`` pairs, where ``projection`` is the word's
    vector projected onto that axis, sorted ascending by the projection's
    scalar position along the axis.
    """
    axis = model[another_word] - model[one_word]

    def position_on_axis(entry):
        _word, projection = entry
        return scalar_project(projection, axis)

    projected_vocab = [
        (word, vector_project(model[word], onto=axis))
        for word in model.vocab
    ]
    projected_vocab.sort(key=position_on_axis)
    return projected_vocab
	import csv
	import logging

	import bleach
	import textblob
	import gensim
	from sklearn.decomposition import PCA
	from numpy import dot
	from numpy.linalg import norm


	# Show INFO-level records so the periodic progress messages logged while
	# reading comments are actually emitted.
	logging.basicConfig(level=logging.INFO)


	def sentencize(comment):
	    """Yield each sentence of *comment* as a list of its words.

	    The comment text is first run through ``bleach.clean`` with an empty
	    tag allow-list and ``strip=True``; the cleaned text is then split
	    into sentences and words by TextBlob.
	    """
	    content = textblob.TextBlob(bleach.clean(comment, strip=True, tags=[]))
	    for sentence in content.sentences:
	        yield list(sentence.words)


	class SentenceIterable:
	    """Re-iterable stream of tokenized sentences from the comments CSV.

	    Every ``iter()`` call reopens the file and starts from the first
	    comment, so one instance can be iterated more than once.

	    Parameters:
	        limit: maximum number of comments to read per pass, or ``None``
	            for no limit. ``0`` yields nothing.
	        path: location of the CSV file; defaults to ``'LWComments.csv'``
	            in the current working directory.
	    """

	    def __init__(self, limit=None, path='LWComments.csv'):
	        self.limit = limit
	        self.path = path

	    def __iter__(self):
	        comment_count = 0
	        # newline='' is what the csv module asks for, so that newlines
	        # embedded in quoted fields are parsed correctly.
	        with open(self.path, newline='') as lw_comments_csv:
	            comment_reader = csv.reader(lw_comments_csv)
	            # Skip the CSV header. The default keeps an empty file from
	            # raising StopIteration inside this generator, which Python
	            # would turn into a RuntimeError.
	            next(comment_reader, None)
	            # ['author', 'body', 'id', 'net_votes', 'time',
	            #  'url', 'vote_ratio', 'dum', 'avgKarma']

	            for comment in comment_reader:
	                # Check the limit before doing any work so that limit=0
	                # really means "no comments".
	                if self.limit is not None and comment_count >= self.limit:
	                    break

	                if not comment:
	                    # csv.reader yields [] for blank lines; nothing to read.
	                    continue

	                _author, body, _identifier, *_other_fields = comment
	                yield from sentencize(body)
	                comment_count += 1

	                if comment_count % 1000 == 0:
	                    logging.info("extracted sentences from %s comments so far",
	                                 comment_count)

	def build_model(limit=None):
	    """Train a gensim Word2Vec model on sentences from the comment corpus.

	    ``limit`` is passed straight to ``SentenceIterable`` and caps how
	    many comments are read; ``None`` reads them all.
	    """
	    return gensim.models.Word2Vec(SentenceIterable(limit=limit))


	def principal_components_analysis(model):
	    """Return a default-configured PCA estimator fitted to ``model.syn0``."""
	    our_analysis = PCA()
	    our_analysis.fit(model.syn0)
	    return our_analysis


	def principal_components(model):
	    """Return ``model.syn0`` re-expressed in its principal-component basis.

	    The PCA is fitted on the same matrix it then transforms.
	    """
	    our_analysis = principal_components_analysis(model)
	    return our_analysis.transform(model.syn0)


	def word_spectra(model, how_many, real_word_threshold):
	    """Rank the vocabulary along each of the first principal components.

	    Only words whose ``count`` is at least ``real_word_threshold`` are
	    kept. The result is a list of ``how_many`` lists; list ``i`` holds
	    ``(word, coordinate)`` pairs sorted ascending by the word's
	    coordinate on component ``i``.
	    """
	    components = principal_components(model)
	    vocab = {word: info for word, info in model.vocab.items()
	             if info.count >= real_word_threshold}
	    return [sorted([(word, components[info.index][i])
	                    for word, info in vocab.items()],
	                   key=lambda item: item[1])
	            for i in range(how_many)]


	def vector_project(v, onto):
	    """Return the vector projection of ``v`` onto ``onto``.

	    The result is ``onto`` scaled by ``(v . onto) / (onto . onto)``.
	    """
	    return (dot(v, onto) / dot(onto, onto)) * onto


	def scalar_project(v, onto):
	    """Return the signed length of ``v``'s projection onto ``onto``.

	    Computed as ``(v . onto) / |onto|``.
	    """
	    return dot(v, onto) / norm(onto)


	def bipolar_spectrum(model, one_word, another_word):
	    """Order the vocabulary along the axis from one word to another.

	    The axis is ``model[another_word] - model[one_word]``. Returns a
	    list of ``(word, projection)`` pairs, where ``projection`` is the
	    word's vector projected onto that axis, sorted ascending by the
	    projection's scalar position along the axis.
	    """
	    axis = model[another_word] - model[one_word]
	    projected_vocab = [
	        (word, vector_project(model[word], onto=axis))
	        for word in model.vocab]
	    return sorted(projected_vocab,
	                  key=lambda v: scalar_project(v[1], axis))