Quick code to plot significant values with words (based on FightingWords from jmhessel)
#!/usr/bin/env python
# Adapted from https://github.com/jmhessel/FightingWords
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns  # used only for its effect on plot styling
from sklearn.feature_extraction.text import CountVectorizer as CV
import string
exclude = set(string.punctuation)
def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.'''
    return_string = ''.join([ch for ch in in_string
                             if ord(ch) < 128 and ch not in exclude]).lower()
    return_string = ' '.join(return_string.split())
    return return_string
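# For instance (illustrative only): punctuation and non-ASCII characters are
# dropped, case is folded, and whitespace is collapsed:
#   basic_sanitize("Hello,   World! caf\xe9")  ->  'hello world caf'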
def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None, sig_val=2.573):
    '''
    Arguments:
    - l1, l2; lists of strings, one from each language sample
    - ngram; an int describing up to what n-gram you want to consider (1 is
      unigrams, 2 is bigrams + unigrams, etc.). Ignored if a custom
      CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing
      a prior over vocabulary items. If you're using a predefined vocabulary,
      make sure to specify that when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    - sig_val; the z-score magnitude beyond which a term is considered
      significant and labeled in the plot (2.573 is roughly the two-tailed
      z threshold for p = 0.01).
    Returns:
    - A list of length |Vocab| where each entry is an (n-gram, z-score) tuple,
      sorted from most l2-associated to most l1-associated. Also saves a
      scatter plot of the z-scores to test.pdf.
    '''
    if cv is None and type(prior) is not float:
        raise ValueError("If using a non-uniform prior, please also pass a "
                         "CountVectorizer with its vocabulary parameter set.")
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        # Default vectorizer: n-grams up to `ngram`, keeping terms that appear
        # in at least 10 documents but no more than half of them, capped at
        # 15,000 features.
        cv = CV(decode_error='ignore', min_df=10, max_df=.5,
                ngram_range=(1, ngram), binary=False, max_features=15000)
    counts_mat = cv.fit_transform(l1 + l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])
    print("Comparing language...")
    for i in range(vocab_size):
        # Compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) /
                       (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) /
                       (n2 + a0 - count_matrix[1, i] - priors[i]))
        delta = term1 - term2
        # Compute variance on delta
        var = 1. / (count_matrix[0, i] + priors[i]) + \
              1. / (count_matrix[1, i] + priors[i])
        # Store final score
        z_scores[i] = delta / np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = [(index_to_term[i], z_scores[i]) for i in sorted_indices]
    # Plot each term's z-score against its total count, labeling terms whose
    # z-score magnitude exceeds sig_val.
    x_vals = count_matrix.sum(axis=0)
    y_vals = z_scores
    sizes = abs(z_scores) * 2
    neg_color, pos_color, insig_color = ('orange', 'purple', 'grey')
    colors = []
    annots = []
    for i, y in enumerate(y_vals):
        if y > sig_val:
            colors.append(pos_color)
            annots.append(index_to_term[i])
        elif y < -sig_val:
            colors.append(neg_color)
            annots.append(index_to_term[i])
        else:
            colors.append(insig_color)
            annots.append(None)
    fig, ax = plt.subplots()
    ax.scatter(x_vals, y_vals, c=colors, s=sizes, linewidth=0)
    for i, annot in enumerate(annots):
        if annot is not None:
            ax.annotate(annot, (x_vals[i], y_vals[i]), color=colors[i],
                        size=sizes[i])
    ax.set_xscale('log')
    plt.savefig('test.pdf')
    return return_list
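
# Usage sketch (not part of the original gist): the toy corpora below are
# hypothetical placeholders; any two lists of document strings will work.
if __name__ == '__main__':
    corpus_a = ["I love this movie, it was great!",
                "What a great film. Loved the acting."]
    corpus_b = ["Terrible. I hated every minute of it.",
                "An awful, boring movie."]
    # The default min_df=10 would filter out everything in a corpus this
    # small, so pass a more permissive CountVectorizer.
    results = bayes_compare_language(corpus_a, corpus_b,
                                     cv=CV(decode_error='ignore', min_df=1))
    print(results[:5])   # terms most associated with corpus_b
    print(results[-5:])  # terms most associated with corpus_a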