Skip to content

Instantly share code, notes, and snippets.

@eggplants
Last active April 23, 2019 23:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eggplants/b208fb974401f2acd6710a50d3ac8db2 to your computer and use it in GitHub Desktop.
Save eggplants/b208fb974401f2acd6710a50d3ac8db2 to your computer and use it in GitHub Desktop.
#00
# Word-frequency count: print every distinct word of the input file together
# with its number of occurrences, one "word count" pair per line, sorted by word.
from collections import Counter

import numpy as np  # kept: file-level import, no longer needed by this section


def count_words(lines):
    """Count whitespace-separated words across an iterable of text lines.

    Args:
        lines: Iterable of strings (for example an open text file).

    Returns:
        A ``collections.Counter`` mapping each word to its occurrence count.
    """
    counts = Counter()
    for line in lines:
        # str.split() without arguments already ignores leading/trailing
        # whitespace (including the newline), so no explicit strip() is needed.
        counts.update(line.split())
    return counts


if __name__ == "__main__":
    with open("00-test-input.txt", "r") as inp:
        word_counts = count_words(inp)
    # Sorting by word reproduces the ordering the previous implementation
    # obtained from np.unique(..., axis=0) on [word, count] rows.
    for word, count in sorted(word_counts.items()):
        print(word, count)
#01-train
# Unigram model training: print "word<TAB>probability" for every distinct
# token of the training file, sorted by word. An end-of-sentence token
# '</s>' is appended after every input line and counted like a normal word.
from collections import Counter

import numpy as np  # kept: file-level import, no longer needed by this section


def one_gram(word, data):
    """Return ``[word, probability]`` for ``word`` in the token list ``data``.

    The probability is the relative frequency of ``word`` in ``data``,
    formatted as a string with six decimal places.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    return [word, '{:.06f}'.format(data.count(word) / len(data))]


def unigram_probabilities(lines):
    """Estimate relative word frequencies from an iterable of text lines.

    Every line is split on whitespace and followed by one '</s>' token
    (blank lines therefore still contribute a '</s>').

    Args:
        lines: Iterable of strings (for example an open text file).

    Returns:
        A dict mapping each token to ``count(token) / total_tokens``.
        Empty input yields an empty dict.
    """
    counts = Counter()
    total = 0
    for line in lines:
        words = line.split()
        counts.update(words)
        counts['</s>'] += 1
        total += len(words) + 1
    # A single counting pass replaces one list.count() scan per distinct word.
    return {word: count / total for word, count in counts.items()}


if __name__ == "__main__":
    with open("01-train-input.txt", "r") as inp:
        unigram_model = unigram_probabilities(inp)
    for word, probability in sorted(unigram_model.items()):
        print("\t".join([word, '{:.06f}'.format(probability)]))
#01-test
# Unigram model evaluation: load the "word<TAB>probability" model file,
# tokenize the test file into words (plus one '</s>' per line) and print the
# average per-token entropy under the interpolated unigram model.
import numpy as np
import math
import re


def load_unigram_model(lines):
    """Parse "word<TAB>probability" lines into ``[word, probability]`` rows.

    Blank lines are skipped. A non-blank line without a tab raises
    ``IndexError`` and a non-numeric probability raises ``ValueError``.
    """
    rows = []
    for line in lines:
        if not line.strip():
            continue
        fields = line.split("\t")
        # float() tolerates the trailing newline left on the last field.
        rows.append([fields[0], float(fields[1])])
    return rows


def read_test_tokens(lines):
    """Split every line into whitespace-separated words and append '</s>'.

    The previous pattern ``\\S`` matched single characters, so any word longer
    than one character was broken into letters and never found in the
    word-level model; splitting on whitespace matches how the model is keyed.
    """
    tokens = []
    for line in lines:
        tokens.extend(line.split())
        tokens.append('</s>')
    return tokens


def entropy(model, lambda_1=0.95, lambda_unk=0.05, vocab_size=1000000):
    """Return ``-log2`` of the interpolated probability of one token.

    Args:
        model: The token's probability under the unigram model (0 when the
            token is unknown).
        lambda_1: Weight given to the model probability.
        lambda_unk: Weight given to the uniform unknown-word distribution.
        vocab_size: Size of the uniform unknown-word vocabulary.
    """
    return -math.log(lambda_1 * model + lambda_unk / vocab_size, 2)


def coverage(model, data):
    """Return the fraction of tokens in ``data`` whose word is in ``model``.

    Args:
        model: Sequence of rows whose first element is the word.
        data: List of tokens.

    Raises:
        ZeroDivisionError: If ``data`` is empty.
    """
    known = {row[0] for row in model}
    # One pass with set membership instead of a list.count() per unknown type.
    covered = sum(1 for token in data if token in known)
    return covered / len(data)


def mean_entropy(probabilities, tokens):
    """Return the average ``entropy`` over ``tokens``.

    Tokens missing from ``probabilities`` are scored with probability 0.
    The sum is divided by the actual number of tokens (the earlier code
    divided by a hard-coded 5). An empty token list gives 0.0.
    """
    if not tokens:
        return 0.0
    total = sum(entropy(probabilities.get(token, 0)) for token in tokens)
    return total / len(tokens)


if __name__ == "__main__":
    with open("01-train-answer.txt", "r") as m:
        model = load_unigram_model(m)
    with open("01-test-input.txt", "r") as d:
        data = read_test_tokens(d)
    dic = {row[0]: row[1] for row in model}
    print('entropy = %f' % mean_entropy(dic, data))

# This import was fused onto the end of the print statement above; it belongs
# to the section that follows and is kept on its own line.
import numpy as np
import re
from collections import Counter


def train_bigram(lines, boundary="</s>"):
    """Estimate maximum-likelihood bigram and unigram probabilities.

    Every line is one sentence: it is split on whitespace and wrapped in a
    ``boundary`` token on both sides. Bigrams are counted inside each
    sentence only, so no pair is formed across two sentences or between the
    last and the first token of the corpus, and a context is counted only
    where it actually has a successor.

    Args:
        lines: Iterable of strings (for example an open text file).
        boundary: Token used as sentence start and end marker.

    Returns:
        A dict with two kinds of keys:
        ``"prev word"`` -> count(prev word) / count(prev as context), and
        ``"word"`` -> count(word) / number of non-boundary tokens.
        Boundary tokens get no unigram entry.
    """
    bigram_counts = Counter()
    context_counts = Counter()
    unigram_counts = Counter()
    for line in lines:
        words = line.split()
        sentence = [boundary] + words + [boundary]
        for previous, current in zip(sentence, sentence[1:]):
            bigram_counts[previous + " " + current] += 1
            context_counts[previous] += 1
        unigram_counts.update(word for word in words if word != boundary)

    probabilities = {}
    for bigram, count in bigram_counts.items():
        # The context is everything before the last space; tokens themselves
        # never contain whitespace because they come from str.split().
        context = bigram.rsplit(" ", 1)[0]
        probabilities[bigram] = count / context_counts[context]
    word_total = sum(unigram_counts.values())
    for word, count in unigram_counts.items():
        probabilities[word] = count / word_total
    return probabilities


if __name__ == "__main__":
    # Coverage refers to the unigram model and test tokens loaded earlier in
    # this file (module-level names `coverage`, `model`, `data`). It used to be
    # printed after this section had rebound `model` and `data`, which made it
    # report 0 regardless of the input, so it is evaluated first here.
    print('coverage = %f' % coverage(model, data))

    #2-gram and 1-gram
    # The debug dump of the raw token list was dropped so that the output
    # consists of "ngram<TAB>probability" lines only.
    with open("02-train-input.txt", "r") as d:
        ngram_probabilities = train_bigram(d)
    for ngram, probability in sorted(ngram_probabilities.items()):
        print("%s\t%f" % (ngram, probability))
#02-train
# Bigram model training: print "ngram<TAB>probability" lines, sorted by
# n-gram, for every bigram and every word of the training file.
# (This section was originally marked as unsolved.)
from collections import Counter

import numpy as np  # kept: file-level import, no longer needed by this section
import re


def train_bigram(lines, boundary="</s>"):
    """Estimate maximum-likelihood bigram and unigram probabilities.

    Every line is one sentence: it is split on whitespace and wrapped in a
    ``boundary`` token on both sides. Bigrams are counted inside each
    sentence only, so no pair is formed across two sentences or between the
    last and the first token of the corpus, and a context is counted only
    where it actually has a successor.

    Args:
        lines: Iterable of strings (for example an open text file).
        boundary: Token used as sentence start and end marker.

    Returns:
        A dict with two kinds of keys:
        ``"prev word"`` -> count(prev word) / count(prev as context), and
        ``"word"`` -> count(word) / number of non-boundary tokens.
        Boundary tokens get no unigram entry.
    """
    # NOTE(review): the 01-train section of this file does give '</s>' a
    # unigram probability; confirm whether this model should as well.
    bigram_counts = Counter()
    context_counts = Counter()
    unigram_counts = Counter()
    for line in lines:
        words = line.split()
        sentence = [boundary] + words + [boundary]
        for previous, current in zip(sentence, sentence[1:]):
            bigram_counts[previous + " " + current] += 1
            context_counts[previous] += 1
        unigram_counts.update(word for word in words if word != boundary)

    probabilities = {}
    for bigram, count in bigram_counts.items():
        # The context is everything before the last space; tokens themselves
        # never contain whitespace because they come from str.split().
        context = bigram.rsplit(" ", 1)[0]
        probabilities[bigram] = count / context_counts[context]
    word_total = sum(unigram_counts.values())
    for word, count in unigram_counts.items():
        probabilities[word] = count / word_total
    return probabilities


if __name__ == "__main__":
    # The debug dump of the raw token list was dropped so that the output
    # consists of "ngram<TAB>probability" lines only.
    with open("02-train-input.txt", "r") as d:
        ngram_probabilities = train_bigram(d)
    for ngram, probability in sorted(ngram_probabilities.items()):
        print("%s\t%f" % (ngram, probability))
#03
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment