Gist by @nschneid, created March 24, 2014.
Preliminary attempt at sparse learning in creg2. Non-sparse counterpart code is included for comparison.
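The two driver scripts at the bottom read three tab-separated input files. The layout below is inferred from their parsing code; the contents are made-up examples:

# argv[1] -- one label per line: label <TAB> JSON label-feature dict
NOUN	{"pos=N": 1}
VERB	{"pos=V": 1}

# argv[2] -- one instance per line: id <TAB> JSON input-feature dict <TAB> JSON neighborhood
x1	{"suffix=ing": 1}	{"N": ["NOUN", "VERB"]}

# argv[3] -- gold labels: id <TAB> label
x1	VERB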
# ---- iologreg.py: dense implementation ----
import sys
import math
import random
import numpy as np

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) without overflow.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0, so exp(b - a) cannot overflow
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))
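A quick sanity check for logadd (an illustrative snippet, not part of the gist): adding 2 and 3 in log space should give log 5.

assert abs(logadd(math.log(2), math.log(3)) - math.log(5)) < 1e-12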
class IOLogisticRegression:
    """
    Logistic regression.

    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / sum_y' exp(w[y'].x)

    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (accepted but currently unused)
    l2: float, default=0
        L2 regularization strength (currently only sketched in a
        commented-out block in fit)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2
    def gradient(self, x, n, y, y_feats, W, G):
        """Accumulate into G the gradient for one instance; return its loss."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x.dot(W)
        found = False
        # unnormalized log-probabilities over the neighborhood n
        for yi in n:
            if yi == y:
                found = True
            u = xw.dot(y_feats[yi])
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x, 'gold label', y, 'not found in neighborhood', n
            raise Exception
        loss = -(log_probs[y] - z)
        # dL/dW = sum_yi (p(yi|x) - 1[yi == y]) * outer(x, y_feats[yi])
        for yi in n:
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            G += np.outer(x, y_feats[yi]) * delta
        return loss
    def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels,
            iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        G = np.zeros(shape=(infeats, outfeats))
        # AdaGrad history; the tiny initial value avoids division by zero
        H = np.ones(shape=(infeats, outfeats)) * 1e-300
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            G.fill(0)
            loss = 0
            # debugging: first 10 instances only; a real minibatch would be
            # random.sample(range(X.shape[0]), minibatch_size)
            for s in range(10):
                loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)
            # TODO: L2 regularization, along these lines:
            #for k in range(self.n_classes - 1):
            #    offset = (self.n_features + 1) * k
            #    for j in range(self.n_features):
            #        loss += self.l2 * self.coef_[offset + j]**2
            #        g[offset + j] += 2 * self.l2 * self.coef_[offset + j]
            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update (see the note after this file)
            H += np.square(G)
            self.W -= np.divide(G, np.sqrt(H)) * eta
        return self
    def predict_(self, x, n, probs):
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = xw.dot(self.y_feats[y])
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # stub: returns all-zero predictions
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        return post

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
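The last two lines of fit are a diagonal AdaGrad update: H accumulates squared (minibatch-averaged) gradients, and each weight gets the per-coordinate step size eta / sqrt(H). A minimal self-contained sketch of that rule (illustrative names, not from the gist):

import numpy as np

def adagrad_step(W, G, H, eta=1.0):
    # one diagonal-AdaGrad step, mirroring the end of fit()
    H += np.square(G)                    # accumulate squared gradients
    W -= eta * np.divide(G, np.sqrt(H))  # per-coordinate step sizes shrink over time
    return W, H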
# ---- iologreg_sparse.py: sparse implementation ----
import sys
import math
import random
import numpy as np
import scipy.sparse

INFINITY = float('inf')

def logadd(a, b):
    """
    Compute log(exp(a) + exp(b)) without overflow.
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))
class IOLogisticRegression:
    """
    Logistic regression (sparse-matrix variant).

    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / sum_y' exp(w[y'].x)

    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (accepted but currently unused)
    l2: float, default=0
        L2 regularization strength (currently unused; see the dense version)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2
    def gradient(self, x_, n, y, y_feats, W, infeats, outfeats):
        """Return (loss, sparse gradient) for one instance; x_ is a 1 x infeats CSR row."""
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x_.dot(W)
        found = False
        for yi in n:
            if yi == y:
                found = True
            u = (xw * y_feats[yi].T)[0, 0]
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x_, 'gold label', y, 'not found in neighborhood', n
            raise Exception
        loss = -(log_probs[y] - z)
        G = scipy.sparse.dok_matrix((infeats, outfeats))
        for yi in n:
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            # sparse outer product: (infeats x 1) * (1 x outfeats)
            G = G + (x_.T * y_feats[yi]) * delta
        return loss, G
    def fit(self, infeats, outfeats, X_, N, Y, y_feats, num_labels,
            iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, X_.shape[0])
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        # AdaGrad history, kept dense; the tiny fill avoids division by zero
        H = scipy.sparse.dok_matrix((infeats, outfeats)).todense()
        H.fill(1e-300)
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            # fresh gradient accumulator each iteration (the dense version
            # resets with G.fill(0))
            G = scipy.sparse.dok_matrix((infeats, outfeats))
            loss = 0
            # debugging: first 10 instances only; a real minibatch would be
            # random.sample(range(X_.shape[0]), minibatch_size)
            for s in range(10):
                thisloss, thisG = self.gradient(X_[s], N[s], Y[s], y_feats,
                                                self.W, infeats, outfeats)
                loss += thisloss
                G = G + thisG
            # TODO: L2 regularization (see the dense counterpart)
            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update, elementwise on the sparse matrices
            Gsq = scipy.sparse.csr_matrix(G.copy())
            Gsq.data **= 2    # square each stored element
            H += Gsq
            Hsqrt = scipy.sparse.csr_matrix(H.copy())
            Hsqrt.data **= 0.5
            self.W -= (G / Hsqrt) * eta
        return self
    def predict_(self, x, n, probs):
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = (xw * self.y_feats[y].T)[0, 0]
            probs[y] = u
            z = logadd(z, u)
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # stub: returns all-zero predictions
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        return post

    def predict_proba(self, X, N):
        post = np.zeros(shape=(X.shape[0], self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
# ---- driver for the dense model (script name not given in the gist) ----
import sys
import json
from sklearn import feature_extraction
from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    n = [labels[x] for x in neighborhood]
    N.append(n)
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()
sys.stderr.write('       rows(X): %d\n' % X.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X.shape[0] == len(N)
assert len(Y) == X.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels),
          iterations=1000, minibatch_size=10)
D = model.predict_proba(X, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0:
            dist[invlabels[i]] = row[i]
    print dist
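Invocation (hypothetical file names; the gist does not name the driver scripts):

python driver_dense.py label_features.tsv instances.tsv gold.tsv

The three positional arguments follow the tab-separated formats sketched near the top.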
# ---- driver for the sparse model (script name not given in the gist) ----
import sys
import json
from sklearn import feature_extraction
from iologreg_sparse import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).tocsr()
sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    n = [labels[x] for x in neighborhood]
    N.append(n)
X_dict = feature_extraction.DictVectorizer()
X_ = X_dict.fit_transform(X).tocsr()
sys.stderr.write('       rows(X): %d\n' % X_.shape[0])
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(X_.shape[0])]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert X_.shape[0] == len(N)
assert len(Y) == X_.shape[0]

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X_, N, Y, label_features, len(labels),
          iterations=1000, minibatch_size=10)
D = model.predict_proba(X_, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0:
            dist[invlabels[i]] = row[i]
    print dist
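Since the dense code is included precisely for comparison, one way to exercise both is to check that the two gradient computations agree on toy data. A hedged sketch: it assumes the modules above are saved as iologreg.py and iologreg_sparse.py, sets num_labels by hand to call gradient directly, and all data here is made up.

import numpy as np
import scipy.sparse
from iologreg import IOLogisticRegression as DenseLR
from iologreg_sparse import IOLogisticRegression as SparseLR

x = np.array([0.5, 2.0])                  # one instance, 2 input features
Yf = np.array([[1.0, 0.0], [0.0, 1.0]])   # 2 labels x 2 label features
W = np.arange(4.0).reshape(2, 2) / 10.0   # arbitrary weights
n, y = [0, 1], 0                          # neighborhood and gold label

dense = DenseLR(); dense.num_labels = 2   # bypass fit() for this check
G = np.zeros((2, 2))
loss_d = dense.gradient(x, n, y, Yf, W, G)

sparse = SparseLR(); sparse.num_labels = 2
loss_s, G_ = sparse.gradient(scipy.sparse.csr_matrix(x), n, y,
                             scipy.sparse.csr_matrix(Yf), W, 2, 2)
print abs(loss_d - loss_s) < 1e-9, np.allclose(G, G_.todense())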