l1 generative implementation with liblbfgs (owl-qn)
"""
"L1 SAGE": a BOW text model that assumes words are generated as log-linear
sparse additive combinations of their documents' metadata features.
x = word counts in a doc
y = doc's (non-textual, metadata) feature a.k.a. covariate vector
Both x and y are observed.
p(x | y) = (1/Z) exp(beta + y*gamma)
p(x_w | y) = (1/Z) exp(beta_w + y'gamma_{.,w})
where every gamma_{k,w} ~ Laplace(1/lambda)
beta (V): background log-probs (set to empirical corpus MLE)
gamma (KxV): per-feature weights across word vocabulary
If y is one-hot, it's a single-membership multiclass model, similar to
Multinomial NB except with a shared background component. If y is not one-hot,
it's more interesting: for example, multiple covariate-lexical effects can be
present at once.
Similar to Eisenstein's "SAGE" model, except uses L1 regularization or MAP
estimation, instead of VB under a different hierarchical model. See also
Taddy's "inverse regression" model. This implmentation uses OWL-QN, via
PyLibLBFGS, for the convex optimization training.
"""
# initialized from katie's codebase: https://github.com/slanglab/docprop/blob/master/code/loglin/model.py
from __future__ import division, print_function
import time, json, glob, sys
import numpy as np
from scipy.special import logsumexp  # formerly scipy.misc.logsumexp, removed in newer SciPy
from lbfgs import LBFGS, LBFGSError, fmin_lbfgs
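# Illustrative sketch of the model in the module docstring: the length-V vector
# of log p(w | y) for one document with covariate vector y, i.e. beta + y'gamma
# normalized over the vocabulary. (Hypothetical helper for exposition only; the
# training code below does this computation inline.)
def doc_word_logprobs(beta, gamma, y):
    scores = beta + y.dot(gamma)        # (V,) unnormalized log-probs
    return scores - logsumexp(scores)   # subtract log Z to normalize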
# for the pylbfgs package
def nll_and_grad(gamma, grad, beta, trainX, trainY):
    '''
    gamma:: the "corruption" vectors (class-specific vectors), size KxV
    grad:: gradient of gamma, also KxV. This will be zeroed and mutated.
    beta:: background logprobs (weights)
    '''
    assert grad.shape == gamma.shape
    K, V = gamma.shape
    D = trainY.shape[0]
    assert D != V, "stupid limitation sorry"
    assert beta.shape == (V,)
    assert np.all(np.isfinite(beta)), "bad beta"
    assert np.all(np.isfinite(gamma)), "bad gamma"
    # pyliblbfgs wants 'grad' updated in-place.
    # need to zero it out before incrementally adding in-place.
    grad.fill(0.0)
    # precompute unnormalized LMs for every doc as one big dense matrix.
    # we could do minibatches instead to save memory.
    doc_lprobs = trainY.dot(gamma)
    assert doc_lprobs.shape == (D, V)
    doc_lprobs += beta  ## should broadcast to every row.
    # could use logsumexp(axis) to batch this?
    ll = 0.0
    for i in range(D):
        w_lprobs = doc_lprobs[i] - logsumexp(doc_lprobs[i])
        w_probs = np.exp(w_lprobs)
        doclen = np.sum(trainX[i])
        assert np.all(np.isfinite(w_lprobs))
        ll += trainX[i].dot(w_lprobs)
        # dl/dgamma_{j,w} = y_j (x_w - doclen*p(w | y))
        wordpred = doclen * w_probs
        residvec = trainX[i] - wordpred
        assert residvec.shape == (V,)
        grad += np.outer(trainY[i], residvec)
        # same as:
        # for k in range(K): grad[k] += trainY[i,k] * residvec
    # switch to negative for minimization. note this is INPLACE mutation
    grad *= -1.0
    return -ll
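# Hedged sanity-check sketch (hypothetical helper, not used by train()): compare
# the analytic gradient from nll_and_grad against a central finite-difference
# estimate at a few randomly chosen gamma entries.
def _check_grad(beta, trainX, trainY, h=1e-5, n_checks=5, seed=0):
    rng = np.random.RandomState(seed)
    K, V = trainY.shape[1], trainX.shape[1]
    gamma = 0.01 * rng.randn(K, V)
    grad = np.zeros((K, V))
    nll_and_grad(gamma, grad, beta, trainX, trainY)
    dummy = np.zeros((K, V))
    for _ in range(n_checks):
        k, w = rng.randint(K), rng.randint(V)
        gp = gamma.copy(); gp[k, w] += h
        gm = gamma.copy(); gm[k, w] -= h
        numeric = (nll_and_grad(gp, dummy, beta, trainX, trainY)
                   - nll_and_grad(gm, dummy, beta, trainX, trainY)) / (2.0 * h)
        print("gamma[{},{}]: analytic {:.6g}, numeric {:.6g}".format(
            k, w, grad[k, w], numeric))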
def progress(weights, grad, fx, xnorm, gnorm, step, k, num_eval, *args):
    """https://github.com/larsmans/pylbfgs/blob/master/lbfgs/_lowlevel.pyx#L291"""
    print("iter {k} linestep {step} nnz {nnz} nll {fx} weight,grad norms {xnorm} {gnorm}".format(nnz=np.sum(weights != 0), **locals()))
    # return non-zero to stop the optimizer? or is this buggy?
    # if k>5: return -1
    return 0
#########################
def train(trainX, trainY, l1_penalty=1e-5, epsilon=1e-3, delta=1e-3, verbose=True):
    """
    Inputs:
        trainX (D,V): for each doc, vector of word counts
        trainY (D,K): for each doc, vector of doc covariates, e.g. binary vector
    where
        D = number of docs
        V = number of wordtypes in vocabulary
        K = number of doc features
    Returns:
        The 'gamma' (K x V) weights matrix of sparse differences of words'
        log-probabilities relative to the background distribution. See
        module's docs for details on the model.
    Model hyperparams:
        l1_penalty: the 'lambda' in the penalty term lambda*||gamma||_1.
            Make this higher, like 1 or 10, for more sparsity.
    Training hyperparams:
        epsilon: convergence threshold, relative grad norm
        delta: convergence threshold, relative change in objective
    """
    D1, V = trainX.shape
    D2, K = trainY.shape
    assert D1 == D2, "matrices should both have 1 row per doc"
    D = D1
    if verbose:
        print("{} docs, {} vocab, {} doc features".format(D, V, K))
    bb = LBFGS()
    bb.orthantwise_c = l1_penalty
    bb.linesearch = 'wolfe'
    # looks like it's an OR between the two convergence criteria.
    # https://github.com/larsmans/pylbfgs/blob/master/liblbfgs/lbfgs.c#L498
    bb.epsilon = epsilon
    bb.delta = delta
    corpus_wordcounts = trainX.sum(0)
    assert np.all(corpus_wordcounts > 0), "can't handle zero counts in input"
    corpus_wordprob = corpus_wordcounts / corpus_wordcounts.sum()
    beta = np.log(corpus_wordprob)
    weight_init = np.zeros((K, V))
    cb = progress if verbose else lambda *a: 0
    opt = bb.minimize(nll_and_grad, weight_init, cb, [beta, trainX, trainY])
    return opt
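# Hedged usage sketch with tiny synthetic data (all sizes and values here are
# made up for illustration): one-hot document covariates, Poisson word counts.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    D, V, K = 20, 50, 3
    trainY = np.zeros((D, K))
    trainY[np.arange(D), rng.randint(K, size=D)] = 1.0          # one-hot covariates
    trainX = rng.poisson(2.0, size=(D, V)).astype(float) + 1.0  # +1 so every word occurs
    gamma = train(trainX, trainY, l1_penalty=1.0)
    print("nonzero gamma entries: {} / {}".format(np.sum(gamma != 0), K * V))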