Quick code to plot significant values with words (based on FightingWords from jmhessel)
#!/usr/bin/env python
# Adapted from https://github.com/jmhessel/FightingWords
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns  # used only for its effect on plot styling
from sklearn.feature_extraction.text import CountVectorizer as CV
import string
exclude = set(string.punctuation)
def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.'''
    return_string = ''.join([ch for ch in in_string
                             if ord(ch) < 128 and ch not in exclude]).lower()
    return_string = ' '.join(return_string.split())
    return return_string
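# For instance (illustrative only): punctuation and non-ASCII characters are
# dropped, case is folded, and whitespace is collapsed:
#   basic_sanitize("Hello,   World! caf\xe9")  ->  'hello world caf'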
def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None, sig_val=2.573):
    '''
    Arguments:
    - l1, l2; lists of strings, one from each language sample
    - ngram; an int describing up to what n-gram you want to consider (1 is
      unigrams, 2 is bigrams + unigrams, etc.). Ignored if a custom
      CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing
      a prior over vocabulary items. If you're using a predefined vocabulary,
      make sure to specify that when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
    - sig_val; the z-score magnitude beyond which a term is considered
      significant and labeled in the plot (2.573 is roughly the two-tailed
      z threshold for p = 0.01).
    Returns:
    - A list of length |Vocab| where each entry is an (n-gram, z-score) tuple,
      sorted from most l2-associated to most l1-associated. Also saves a
      scatter plot of the z-scores to test.pdf.
    '''
    if cv is None and type(prior) is not float:
        raise ValueError("If using a non-uniform prior, please also pass a "
                         "CountVectorizer with its vocabulary parameter set.")
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        # Default vectorizer: n-grams up to `ngram`, keeping terms that appear
        # in at least 10 documents but no more than half of them, capped at
        # 15,000 features.
        cv = CV(decode_error='ignore', min_df=10, max_df=.5,
                ngram_range=(1, ngram), binary=False, max_features=15000)
    counts_mat = cv.fit_transform(l1 + l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])
    print("Comparing language...")
    for i in range(vocab_size):
        # Compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) /
                       (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) /
                       (n2 + a0 - count_matrix[1, i] - priors[i]))
        delta = term1 - term2
        # Compute variance on delta
        var = 1. / (count_matrix[0, i] + priors[i]) + \
              1. / (count_matrix[1, i] + priors[i])
        # Store final score
        z_scores[i] = delta / np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = [(index_to_term[i], z_scores[i]) for i in sorted_indices]
    # Plot each term's z-score against its total count, labeling terms whose
    # z-score magnitude exceeds sig_val.
    x_vals = count_matrix.sum(axis=0)
    y_vals = z_scores
    sizes = abs(z_scores) * 2
    neg_color, pos_color, insig_color = ('orange', 'purple', 'grey')
    colors = []
    annots = []
    for i, y in enumerate(y_vals):
        if y > sig_val:
            colors.append(pos_color)
            annots.append(index_to_term[i])
        elif y < -sig_val:
            colors.append(neg_color)
            annots.append(index_to_term[i])
        else:
            colors.append(insig_color)
            annots.append(None)
    fig, ax = plt.subplots()
    ax.scatter(x_vals, y_vals, c=colors, s=sizes, linewidth=0)
    for i, annot in enumerate(annots):
        if annot is not None:
            ax.annotate(annot, (x_vals[i], y_vals[i]), color=colors[i],
                        size=sizes[i])
    ax.set_xscale('log')
    plt.savefig('test.pdf')
    return return_list
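
# Usage sketch (not part of the original gist): the toy corpora below are
# hypothetical placeholders; any two lists of document strings will work.
if __name__ == '__main__':
    corpus_a = ["I love this movie, it was great!",
                "What a great film. Loved the acting."]
    corpus_b = ["Terrible. I hated every minute of it.",
                "An awful, boring movie."]
    # The default min_df=10 would filter out everything in a corpus this
    # small, so pass a more permissive CountVectorizer.
    results = bayes_compare_language(corpus_a, corpus_b,
                                     cv=CV(decode_error='ignore', min_df=1))
    print(results[:5])   # terms most associated with corpus_b
    print(results[-5:])  # terms most associated with corpus_a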