Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Usage: python3 p5-bayes.py z4-mark z4-noma
import sys
import numpy as np
import scipy.stats
def load_data(path):
nr_fields = None
ids = list()
vecs = list()
with open(path) as fp:
for line in fp:
fields = line.strip().split()
fields = [fields[0]] + [int(x) for x in fields[1:]]
if nr_fields:
assert nr_fields == len(fields)
else:
nr_fields = len(fields)
ids.append(fields[0])
vec = fields[1:3] + fields[4:]
for (a,b) in [(2,3),(3,6),(6,5),(7,6),(8,6),(9,6),(10,9),(11,9),(12,6),(13,12),(14,12),(15,12),(16,12),(17,16),(18,16),(19,16),(10,16)]:
vec.append(fields[a-1]/(fields[b-1]+0.001))
vecs.append(tuple(vec))
return ids, vecs
def normalize(vecs):
# ret[sample_idx, feature_idx]
ret = np.array(vecs)
idx = np.argsort(ret, axis=0)
ret = np.zeros(ret.shape)
for (x,y), val in np.ndenumerate(idx):
ret[val, y] = x/float(ret.shape[0])
return ret
def main():
id_mark, vec_mark = load_data(sys.argv[1])
id_noma, vec_noma = load_data(sys.argv[2])
vec_norm = normalize(vec_mark + vec_noma)
print("mark=%d, nomark=%d, features=%d, shape=%s" % (len(id_mark), len(id_noma), len(vec_mark[0]), str(vec_norm.shape)))
stat_mark = list()
for i in range(len(vec_mark[0])):
sum0 = 0
sum1 = 0.0
sum2 = 0.0
for j in range(len(vec_mark)):
sum0 += 1
sum1 += vec_norm[j, i]
sum2 += vec_norm[j, i] * vec_norm[j, i]
stat_mark.append((sum1/sum0, np.sqrt(sum2/sum0 - sum1*sum1/sum0/sum0)))
stat_all = list()
for i in range(len(vec_mark[0])):
sum0 = 0
sum1 = 0.0
sum2 = 0.0
for j in range(vec_norm.shape[0]):
sum0 += 1
sum1 += vec_norm[j, i]
sum2 += vec_norm[j, i] * vec_norm[j, i]
stat_all.append((sum1/sum0, np.sqrt(sum2/sum0 - sum1*sum1/sum0/sum0)))
for i in range(len(vec_mark[0])):
print("%2d %f %f %f %f" % (i, stat_mark[i][0], stat_mark[i][1], stat_all[i][0], stat_all[i][1]))
for i in range(vec_norm.shape[0]):
bid = id_mark[i] if i < len(id_mark) else id_noma[i-len(id_mark)]
p = 0.0
for j in range(len(vec_mark[0])):
p1 = abs(vec_norm[i, j] - stat_mark[j][0]) / stat_mark[j][1]
p1 = scipy.stats.norm.cdf(-p1) * 2
if p1 < 0.000001: p1 = 0.000001 # prevent log error
p += np.log(p1)
print("%s\t%f" % (bid, p))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment