Gist by @nschneid, created March 24, 2014.
Preliminary attempt at sparse learning in creg2. Non-sparse counterpart code is included for comparison.
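The two driver scripts at the bottom read three tab-separated input files. The layout below is inferred from their parsing code; the contents are made-up examples:

# argv[1] -- one label per line: label <TAB> JSON label-feature dict
NOUN	{"pos=N": 1}
VERB	{"pos=V": 1}

# argv[2] -- one instance per line: id <TAB> JSON input-feature dict <TAB> JSON neighborhood
x1	{"suffix=ing": 1}	{"N": ["NOUN", "VERB"]}

# argv[3] -- gold labels: id <TAB> label
x1	VERB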
# ---- iologreg.py: dense implementation ----
import sys
import math
import random
import numpy as np

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) without overflow.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0, so exp(b - a) cannot overflow
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))
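A quick sanity check for logadd (an illustrative snippet, not part of the gist): adding 2 and 3 in log space should give log 5.

assert abs(logadd(math.log(2), math.log(3)) - math.log(5)) < 1e-12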
class IOLogisticRegression:
    """
    Logistic regression.

    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / sum_y' exp(w[y'].x)

    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (accepted but currently unused)
    l2: float, default=0
        L2 regularization strength (currently only sketched in a
        commented-out block in fit)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2
    def gradient(self, x, n, y, y_feats, W, G):
        """Accumulate into G the gradient for one instance; return its loss."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x.dot(W)
        found = False
        # unnormalized log-probabilities over the neighborhood n
        for yi in n:
            if yi == y:
                found = True
            u = xw.dot(y_feats[yi])
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x, 'gold label', y, 'not found in neighborhood', n
            raise Exception
        loss = -(log_probs[y] - z)
        # dL/dW = sum_yi (p(yi|x) - 1[yi == y]) * outer(x, y_feats[yi])
        for yi in n:
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            G += np.outer(x, y_feats[yi]) * delta
        return loss
    def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels,
            iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        G = np.zeros(shape=(infeats, outfeats))
        # AdaGrad history; the tiny initial value avoids division by zero
        H = np.ones(shape=(infeats, outfeats)) * 1e-300
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            G.fill(0)
            loss = 0
            # debugging: first 10 instances only; a real minibatch would be
            # random.sample(range(X.shape[0]), minibatch_size)
            for s in range(10):
                loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)
            # TODO: L2 regularization, along these lines:
            #for k in range(self.n_classes - 1):
            #    offset = (self.n_features + 1) * k
            #    for j in range(self.n_features):
            #        loss += self.l2 * self.coef_[offset + j]**2
            #        g[offset + j] += 2 * self.l2 * self.coef_[offset + j]
            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update (see the note after this file)
            H += np.square(G)
            self.W -= np.divide(G, np.sqrt(H)) * eta
        return self
    def predict_(self, x, n, probs):
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = xw.dot(self.y_feats[y])
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # stub: returns all-zero predictions
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        return post

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
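The last two lines of fit are a diagonal AdaGrad update: H accumulates squared (minibatch-averaged) gradients, and each weight gets the per-coordinate step size eta / sqrt(H). A minimal self-contained sketch of that rule (illustrative names, not from the gist):

import numpy as np

def adagrad_step(W, G, H, eta=1.0):
    # one diagonal-AdaGrad step, mirroring the end of fit()
    H += np.square(G)                    # accumulate squared gradients
    W -= eta * np.divide(G, np.sqrt(H))  # per-coordinate step sizes shrink over time
    return W, H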
# ---- iologreg_sparse.py: sparse implementation ----
import sys
import math
import random
import numpy as np
import scipy.sparse

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) without overflow.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))
class IOLogisticRegression:
    """
    Logistic regression (sparse-matrix variant).

    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / sum_y' exp(w[y'].x)

    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (accepted but currently unused)
    l2: float, default=0
        L2 regularization strength (currently unused; see the dense version)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2
    def gradient(self, x_, n, y, y_feats, W, infeats, outfeats):
        """Return (loss, sparse gradient) for one instance; x_ is a 1 x infeats CSR row."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x_.dot(W)
        found = False
        for yi in n:
            if yi == y:
                found = True
            u = (xw * y_feats[yi].T)[0, 0]
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x_, 'gold label', y, 'not found in neighborhood', n
            raise Exception
        loss = -(log_probs[y] - z)
        G = scipy.sparse.dok_matrix((infeats, outfeats))
        for yi in n:
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            # sparse outer product: (infeats x 1) * (1 x outfeats)
            G = G + (x_.T * y_feats[yi]) * delta
        return loss, G
    def fit(self, infeats, outfeats, X_, N, Y, y_feats, num_labels,
            iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X_.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        # AdaGrad history, kept dense; the tiny fill avoids division by zero
        H = scipy.sparse.dok_matrix((infeats, outfeats)).todense()
        H.fill(1e-300)
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            # fresh gradient accumulator each iteration (the dense version
            # resets with G.fill(0))
            G = scipy.sparse.dok_matrix((infeats, outfeats))
            loss = 0
            # debugging: first 10 instances only; a real minibatch would be
            # random.sample(range(X_.shape[0]), minibatch_size)
            for s in range(10):
                thisloss, thisG = self.gradient(X_[s], N[s], Y[s], y_feats,
                                                self.W, infeats, outfeats)
                loss += thisloss
                G = G + thisG
            # TODO: L2 regularization (see the dense counterpart)
            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update, elementwise on the sparse matrices
            Gsq = scipy.sparse.csr_matrix(G.copy())
            Gsq.data **= 2    # square each stored element
            H += Gsq
            Hsqrt = scipy.sparse.csr_matrix(H.copy())
            Hsqrt.data **= 0.5
            self.W -= (G / Hsqrt) * eta
        return self
    def predict_(self, x, n, probs):
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = (xw * self.y_feats[y].T)[0, 0]
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # stub: returns all-zero predictions
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        return post

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
# ---- driver for the dense model (script name not given in the gist) ----
import sys
import json
from sklearn import feature_extraction
from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    n = [labels[x] for x in neighborhood]
    N.append(n)
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()
sys.stderr.write('       rows(X): %d\n' % X.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X.shape[0] == len(N)
assert len(Y) == X.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels),
          iterations=1000, minibatch_size=10)
D = model.predict_proba(X, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0:
            dist[invlabels[i]] = row[i]
    print dist
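Invocation (hypothetical file names; the gist does not name the driver scripts):

python driver_dense.py label_features.tsv instances.tsv gold.tsv

The three positional arguments follow the tab-separated formats sketched near the top.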
# ---- driver for the sparse model (script name not given in the gist) ----
import sys
import json
from sklearn import feature_extraction
from iologreg_sparse import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).tocsr()
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    n = [labels[x] for x in neighborhood]
    N.append(n)
X_dict = feature_extraction.DictVectorizer()
X_ = X_dict.fit_transform(X).tocsr()
sys.stderr.write('       rows(X): %d\n' % X_.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X_.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X_.shape[0] == len(N)
assert len(Y) == X_.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X_, N, Y, label_features, len(labels),
          iterations=1000, minibatch_size=10)
D = model.predict_proba(X_, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0:
            dist[invlabels[i]] = row[i]
    print dist
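Since the dense code is included precisely for comparison, one way to exercise both is to check that the two gradient computations agree on toy data. A hedged sketch: it assumes the modules above are saved as iologreg.py and iologreg_sparse.py, sets num_labels by hand to call gradient directly, and all data here is made up.

import numpy as np
import scipy.sparse
from iologreg import IOLogisticRegression as DenseLR
from iologreg_sparse import IOLogisticRegression as SparseLR

x = np.array([0.5, 2.0])                  # one instance, 2 input features
Yf = np.array([[1.0, 0.0], [0.0, 1.0]])   # 2 labels x 2 label features
W = np.arange(4.0).reshape(2, 2) / 10.0   # arbitrary weights
n, y = [0, 1], 0                          # neighborhood and gold label

dense = DenseLR(); dense.num_labels = 2   # bypass fit() for this check
G = np.zeros((2, 2))
loss_d = dense.gradient(x, n, y, Yf, W, G)

sparse = SparseLR(); sparse.num_labels = 2
loss_s, G_ = sparse.gradient(scipy.sparse.csr_matrix(x), n, y,
                             scipy.sparse.csr_matrix(Yf), W, 2, 2)
print abs(loss_d - loss_s) < 1e-9, np.allclose(G, G_.todense())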