Skip to content

Instantly share code, notes, and snippets.

@yuanmai
Created July 21, 2021 05:48
Show Gist options
  • Save yuanmai/9a61fadbace8d1a46645fc157062e99b to your computer and use it in GitHub Desktop.
Save yuanmai/9a61fadbace8d1a46645fc157062e99b to your computer and use it in GitHub Desktop.
Maximum Entropy
import numpy as np
def encode(featureset, label, mapping):
    """Return the sparse joint-feature encoding of ``(featureset, label)``.

    Args:
        featureset: Dict mapping feature name -> feature value for one token.
        label: Candidate label that every feature is paired with.
        mapping: Dict mapping ``(fname, fval, label)`` -> feature index.

    Returns:
        A list of ``(index, 1)`` pairs, one for each ``(fname, fval, label)``
        triple found in ``mapping``. Triples missing from ``mapping`` are
        skipped, so the result may be empty.
    """
    return [
        (mapping[(fname, fval, label)], 1)
        for fname, fval in featureset.items()
        if (fname, fval, label) in mapping
    ]
def calculate_empirical_fcount(train_toks, mapping):
    """Sum the joint-feature values observed in the training data.

    Args:
        train_toks: Iterable of ``(featureset, label)`` pairs, where
            ``featureset`` is a dict of feature name -> feature value.
        mapping: Dict mapping ``(fname, fval, label)`` -> feature index.

    Returns:
        A float array of length ``len(mapping)``; entry ``i`` is the total
        value contributed to feature ``i`` by every token paired with its
        own (gold) label.
    """
    fcount = np.zeros(len(mapping))
    for tok, label in train_toks:
        for index, val in encode(tok, label, mapping):
            fcount[index] += val
    return fcount
def prob(tok, labels, mapping, weights):
    """Return the model's label distribution for a single token.

    Each label is scored by summing ``weights[index] * val`` over the
    features returned by ``encode(tok, label, mapping)``; the scores are
    then exponentiated and normalized so the values sum to 1.

    Args:
        tok: Dict of feature name -> feature value.
        labels: Iterable of candidate labels.
        mapping: Dict mapping ``(fname, fval, label)`` -> feature index.
        weights: Sequence of feature weights indexed by feature index.

    Returns:
        Dict mapping each label to its probability. Empty when ``labels``
        is empty.
    """
    scores = {}
    for label in labels:
        total = 0.0
        for index, val in encode(tok, label, mapping):
            total += weights[index] * val
        scores[label] = total
    if not scores:
        return {}

    # Shift by the largest score before exponentiating: the normalized
    # result is unchanged, but exp() can no longer overflow to inf (which
    # would turn the division below into inf/inf = nan).
    peak = max(scores.values())
    unnormalized = {
        label: np.exp(score - peak) for label, score in scores.items()
    }
    value_sum = sum(unnormalized.values())
    return {label: value / value_sum for label, value in unnormalized.items()}
def calculate_estimated_fcount(train_toks, mapping, labels, weights):
    """Compute the feature totals the current model expects on the data.

    For every training token, each candidate label's features are added
    to the running totals, scaled by the probability that ``prob`` assigns
    to that label under ``weights``.

    Args:
        train_toks: Iterable of ``(featureset, label)`` pairs. Only the
            featureset is used; the gold label is ignored here.
        mapping: Dict mapping ``(fname, fval, label)`` -> feature index.
        labels: Iterable of all candidate labels.
        weights: Sequence of feature weights indexed by feature index.

    Returns:
        A float array of length ``len(mapping)`` with the expected totals.
    """
    fcount = np.zeros(len(mapping))
    # The gold label is deliberately unused: expectations are taken over
    # every candidate label, weighted by the model's own distribution.
    for tok, _gold_label in train_toks:
        distribution = prob(tok, labels, mapping, weights)
        for candidate, p in distribution.items():
            for index, val in encode(tok, candidate, mapping):
                fcount[index] += p * val
    return fcount
def maxent_train(train_toks, max_iter=100):
    """Train maximum-entropy feature weights by iterative scaling.

    Args:
        train_toks: Iterable of ``(featureset, label)`` pairs, where
            ``featureset`` is a dict of feature name -> feature value.
        max_iter: Number of weight-update iterations to run.

    Returns:
        A tuple ``(weights, labels, mapping)``:
        ``weights`` is a float array with one entry per joint feature,
        ``labels`` is the set of labels seen in training, and ``mapping``
        is a dict from ``(fname, fval, label)`` to the feature's index in
        ``weights``.
    """
    # The data is traversed several times below, so materialize it once in
    # case the caller handed us a one-shot iterator.
    train_toks = list(train_toks)

    mapping = {}  # maps (fname, fval, label) -> fid
    labels = set()
    feature_names = set()
    for tok, label in train_toks:
        labels.add(label)
        for fname, fval in tok.items():
            feature_names.add(fname)
            if (fname, fval, label) not in mapping:
                mapping[(fname, fval, label)] = len(mapping)

    # Scaling constant: one more than the number of distinct feature names,
    # which bounds how many features encode() can return for one
    # (token, label) pair.
    C = len(feature_names) + 1
    Cinv = 1.0 / C

    empirical_fcount = calculate_empirical_fcount(train_toks, mapping)
    # Every mapped feature was observed at least once while building the
    # mapping, so the empirical counts are >= 1 and the log is finite.
    log_empirical_fcount = np.log(empirical_fcount)
    weights = np.zeros(len(empirical_fcount))

    for _ in range(max_iter):
        estimated_fcount = calculate_estimated_fcount(
            train_toks, mapping, labels, weights
        )
        # The step is the *log* of the empirical/estimated ratio, so the
        # update shrinks to zero as the model's expectations approach the
        # observed counts. Adding the raw ratio (always positive) would
        # make every weight grow without bound.
        weights += (log_empirical_fcount - np.log(estimated_fcount)) * Cinv

    return weights, labels, mapping
if __name__ == '__main__':
    # Small demo: three tokens with binary features a, b, c and a string
    # label, trained with maxent_train and printed feature by feature.
    train_data = [
        (dict(a=1, b=1, c=1), '1'),
        (dict(a=1, b=1, c=0), '0'),
        (dict(a=0, b=1, c=1), '1'),
    ]
    weights, labels, mapping = maxent_train(train_data)
    # Show the learned weights in feature-index order instead of silently
    # discarding the training result.
    for feature, index in sorted(mapping.items(), key=lambda item: item[1]):
        print("%s -> %.4f" % (feature, weights[index]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment