wannaphong/ner.py

## ner.py
# Apache License 2.0
file_name="data" # ชื่อไฟล์คลังข้อมูล
import codecs
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
import glob
import nltk
import re
#จัดการประโยคซ้ำ
data_not=[]
def Unique(p):
 text=re.sub("<[^>]*>","",p)
 text=re.sub("\[(.*?)\]","",text)
 text=re.sub("\[\/(.*?)\]","",text)
 if text not in data_not:
  data_not.append(text)
  return True
 else:
  return False
# เตรียมตัวตัด tag ด้วย re
pattern = r'\[(.*?)\](.*?)\[\/(.*?)\]'
tokenizer = RegexpTokenizer(pattern) # ใช้ nltk.tokenize.RegexpTokenizer เพื่อตัด [TIME]8.00[/TIME] ให้เป็น ('TIME','ไง','TIME')
# จัดการกับ tag ที่ไม่ได้ tag
def toolner_to_tag(text):
 text=text.strip()
 text=re.sub("<[^>]*>","",text)
 text=re.sub("(\[\/(.*?)\])","\\1***",text)#.replace('(\[(.*?)\])','***\\1')#  ตัดการกับพวกไม่มี tag word
 text=re.sub("(\[\w+\])","***\\1",text)
 text2=[]
 for i in text.split('***'):
  if "[" in i:
   text2.append(i)
  else:
   text2.append("[word]"+i+"[/word]")
 text="".join(text2)#re.sub("[word][/word]","","".join(text2))
 return text.replace("[word][/word]","")
# แปลง text ให้เป็น conll2002
def text2conll2002(text,pos=True):
    """
    ใช้แปลงข้อความให้กลายเป็น conll2002
    """
    text=toolner_to_tag(text) # นำไปใส่ tag [word]
    text=text.replace("''",'"')
    text=text.replace("’",'"').replace("‘",'"')#.replace('"',"")
    tag=tokenizer.tokenize(text) # แยก tag ออกมาจากข้อความ
    j=0
    conll2002="" # ประกาศตัวแปรเก็บ conll2002
    for tagopen,text,tagclose in tag: # ลูปใน tag โดยเป็น (tagopen,text,tagclose)
        word_cut=word_tokenize(text,engine="newmm") # ใช้ตัวตัดคำ newmm ของ PyThaiNLP
        i=0
        txt5=""
        while i<len(word_cut): #ลูปตามจำนวน token ที่ตัดในtag
            if word_cut[i]=="''" or word_cut[i]=='"':pass
            elif i==0 and tagopen!='word': # ไม่เป็น tag [word] และเป็น i หรือตัวเริ่มต้น tag
                txt5+=word_cut[i]
                txt5+='\t'+'B-'+tagopen
            elif tagopen!='word':
                txt5+=word_cut[i]
                txt5+='\t'+'I-'+tagopen
            else: # เป็น [word]
                txt5+=word_cut[i]
                txt5+='\t'+'O'
            txt5+='\n'
            #j+=1
            i+=1
        conll2002+=txt5
    if pos==False:
        return conll2002
    return postag(conll2002) # เพิ่ม postag ใส่
# ใช้สำหรับกำกับ pos tag เพื่อใช้กับ NER
# print(text2conll2002(t,pos=False))
def postag(text):
    listtxt=[i for i in text.split('\n') if i!='']
    list_word=[]
    for data in listtxt:
        list_word.append(data.split('\t')[0])
    #print(text)
    list_word=pos_tag(list_word,engine="perceptron")
    text=""
    i=0
    for data in listtxt:
        text+=data.split('\t')[0]+'\t'+list_word[i][1]+'\t'+data.split('\t')[1]+'\n'
        i+=1
    return text
# อ่านข้อมูลจากไฟล์
def get_data(fileopen):
	"""
    สำหรับใช้อ่านทั้งหมดทั้งในไฟล์ทีละรรทัดออกมาเป็น list
    """
	with codecs.open(fileopen, 'r',encoding='utf-8-sig') as f:
		lines = f.read().splitlines()
	return [a for a in lines if Unique(a)] # เอาไม่ซ้ำกัน

def alldata(lists):
    text=""
    for data in lists:
        text+=text2conll2002(data)
        text+='\n'
    return text

def alldata_list(lists):
    data_all=[]
    for data in lists:
        data_num=[]
        try:
            txt=text2conll2002(data,pos=True).split('\n') # นำไปแปลงเป็น conll2002
            for d in txt:
                tt=d.split('\t')
                if d!="":
                    if len(tt)==3:
                        data_num.append((tt[0],tt[1],tt[2]))
                    else:
                        data_num.append((tt[0],tt[1]))
            #print(data_num)
            data_all.append(data_num)
        except:
            print(data)
    #print(data_all)
    return data_all

def alldata_list_str(lists):
	string=""
	for data in lists:
		string1=""
		for j in data:
			string1+=j[0]+"	"+j[1]+"	"+j[2]+"\n"
		string1+="\n"
		string+=string1
	return string

def get_data_tag(listd):
	list_all=[]
	c=[]
	for i in listd:
		if i !='':
			c.append((i.split("\t")[0],i.split("\t")[1],i.split("\t")[2]))
		else:
			list_all.append(c)
			c=[]
	return list_all
def getall(lista):
    ll=[]
    for i in lista:
        o=True
        for j in ll:
            if re.sub("\[(.*?)\]","",i)==re.sub("\[(.*?)\]","",j):
                o=False
                break
        if o==True:
            ll.append(i)
    return ll

data1=getall(get_data(file_name+".txt")) # นำคลังเข้าไป แยกออกเป็น list ละบรรทัด
datatofile=alldata_list(data1) # นำไปผ่านขั้นตอน 1 2 3 4
tt=[]
with open(file_name+"-pos.conll","w") as f:
    i=0
    while i<len(datatofile):
        for j in datatofile[i]:
            f.write(j[0]+"\t"+j[1]+"\t"+j[2]+"\n")
        if i+1<len(datatofile):
            f.write("\n")
        i+=1

with open(file_name+".conll","w") as f:
    i=0
    while i<len(datatofile):
        for j in datatofile[i]:
            f.write(j[0]+"\t"+j[2]+"\n")
        if i+1<len(datatofile):
            f.write("\n")
        i+=1
from sklearn_crfsuite import scorers,metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate,train_test_split
import sklearn_crfsuite
def doc2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]
    # Features from current word
    features={
        'word.word': word,
        'word.isspace':word.isspace(),
        'postag':postag,
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        prevword = doc[i-1][0]
        postag1 = doc[i-1][1]
        features['word.prevword'] = prevword
        features['word.previsspace']=prevword.isspace()
        features['word.prepostag'] = postag1
        features['word.prevwordisdigit'] = prevword.isdigit()
    else:
        features['BOS'] = True # Special "Beginning of Sequence" tag
    # Features from next word
    if i < len(doc)-1:
        nextword = doc[i+1][0]
        postag1 = doc[i+1][1]
        features['word.nextword'] = nextword
        features['word.nextisspace']=nextword.isspace()
        features['word.nextpostag'] = postag1
        features['word.nextwordisdigit'] = nextword.isdigit()
    else:
        features['EOS'] = True # Special "End of Sequence" tag
    return features

def extract_features(doc):
    return [doc2features(doc, i) for i in range(len(doc))]

def get_labels(doc):
    return [tag for (token,postag,tag) in doc]


X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา
y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา

X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1) # แบ่ง 0.1 หรือ 10%
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True,
    model_filename=file_name+"-pos.model0" # ตั้งชื่อโมเดล
)
crf.fit(X, y); # train

labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
e=metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)
print(e) # โชว์ประสิทธิภาพ
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

def get_ner(text):
    word_cut=word_tokenize(text,engine="newmm")
    list_word=pos_tag(word_cut,engine='perceptron')
    X_test = extract_features([(data,list_word[i][1]) for i,data in enumerate(word_cut)])
    y_=crf.predict_single(X_test)
    return [(word_cut[i],list_word[i][1],data) for i,data in enumerate(y_)]


while True:
    t=input("Text : ")
    print(get_ner(t))
	# Apache License 2.0
	file_name="data" # ชื่อไฟล์คลังข้อมูล
	import codecs
	from pythainlp.tokenize import word_tokenize
	from pythainlp.tag import pos_tag
	from nltk.tokenize import RegexpTokenizer
	import glob
	import nltk
	import re
	#จัดการประโยคซ้ำ
	data_not=[]
	def Unique(p):
	text=re.sub("<[^>]*>","",p)
	text=re.sub("\[(.*?)\]","",text)
	text=re.sub("\[\/(.*?)\]","",text)
	if text not in data_not:
	data_not.append(text)
	return True
	else:
	return False
	# เตรียมตัวตัด tag ด้วย re
	pattern = r'\[(.?)\](.?)\[\/(.*?)\]'
	tokenizer = RegexpTokenizer(pattern) # ใช้ nltk.tokenize.RegexpTokenizer เพื่อตัด [TIME]8.00[/TIME] ให้เป็น ('TIME','ไง','TIME')
	# จัดการกับ tag ที่ไม่ได้ tag
	def toolner_to_tag(text):
	text=text.strip()
	text=re.sub("<[^>]*>","",text)
	text=re.sub("(\[\/(.?)\])","\\1*",text)#.replace('(\[(.?)\])','***\\1')# ตัดการกับพวกไม่มี tag word
	text=re.sub("(\[\w+\])","***\\1",text)
	text2=[]
	for i in text.split('***'):
	if "[" in i:
	text2.append(i)
	else:
	text2.append("[word]"+i+"[/word]")
	text="".join(text2)#re.sub("[word][/word]","","".join(text2))
	return text.replace("[word][/word]","")
	# แปลง text ให้เป็น conll2002
	def text2conll2002(text,pos=True):
	"""
	ใช้แปลงข้อความให้กลายเป็น conll2002
	"""
	text=toolner_to_tag(text) # นำไปใส่ tag [word]
	text=text.replace("''",'"')
	text=text.replace("’",'"').replace("‘",'"')#.replace('"',"")
	tag=tokenizer.tokenize(text) # แยก tag ออกมาจากข้อความ
	j=0
	conll2002="" # ประกาศตัวแปรเก็บ conll2002
	for tagopen,text,tagclose in tag: # ลูปใน tag โดยเป็น (tagopen,text,tagclose)
	word_cut=word_tokenize(text,engine="newmm") # ใช้ตัวตัดคำ newmm ของ PyThaiNLP
	i=0
	txt5=""
	while i<len(word_cut): #ลูปตามจำนวน token ที่ตัดในtag
	if word_cut[i]=="''" or word_cut[i]=='"':pass
	elif i==0 and tagopen!='word': # ไม่เป็น tag [word] และเป็น i หรือตัวเริ่มต้น tag
	txt5+=word_cut[i]
	txt5+='\t'+'B-'+tagopen
	elif tagopen!='word':
	txt5+=word_cut[i]
	txt5+='\t'+'I-'+tagopen
	else: # เป็น [word]
	txt5+=word_cut[i]
	txt5+='\t'+'O'
	txt5+='\n'
	#j+=1
	i+=1
	conll2002+=txt5
	if pos==False:
	return conll2002
	return postag(conll2002) # เพิ่ม postag ใส่
	# ใช้สำหรับกำกับ pos tag เพื่อใช้กับ NER
	# print(text2conll2002(t,pos=False))
	def postag(text):
	listtxt=[i for i in text.split('\n') if i!='']
	list_word=[]
	for data in listtxt:
	list_word.append(data.split('\t')[0])
	#print(text)
	list_word=pos_tag(list_word,engine="perceptron")
	text=""
	i=0
	for data in listtxt:
	text+=data.split('\t')[0]+'\t'+list_word[i][1]+'\t'+data.split('\t')[1]+'\n'
	i+=1
	return text
	# อ่านข้อมูลจากไฟล์
	def get_data(fileopen):
	"""
	สำหรับใช้อ่านทั้งหมดทั้งในไฟล์ทีละรรทัดออกมาเป็น list
	"""
	with codecs.open(fileopen, 'r',encoding='utf-8-sig') as f:
	lines = f.read().splitlines()
	return [a for a in lines if Unique(a)] # เอาไม่ซ้ำกัน

	def alldata(lists):
	text=""
	for data in lists:
	text+=text2conll2002(data)
	text+='\n'
	return text

	def alldata_list(lists):
	data_all=[]
	for data in lists:
	data_num=[]
	try:
	txt=text2conll2002(data,pos=True).split('\n') # นำไปแปลงเป็น conll2002
	for d in txt:
	tt=d.split('\t')
	if d!="":
	if len(tt)==3:
	data_num.append((tt[0],tt[1],tt[2]))
	else:
	data_num.append((tt[0],tt[1]))
	#print(data_num)
	data_all.append(data_num)
	except:
	print(data)
	#print(data_all)
	return data_all

	def alldata_list_str(lists):
	string=""
	for data in lists:
	string1=""
	for j in data:
	string1+=j[0]+" "+j[1]+" "+j[2]+"\n"
	string1+="\n"
	string+=string1
	return string

	def get_data_tag(listd):
	list_all=[]
	c=[]
	for i in listd:
	if i !='':
	c.append((i.split("\t")[0],i.split("\t")[1],i.split("\t")[2]))
	else:
	list_all.append(c)
	c=[]
	return list_all
	def getall(lista):
	ll=[]
	for i in lista:
	o=True
	for j in ll:
	if re.sub("\[(.?)\]","",i)==re.sub("\[(.?)\]","",j):
	o=False
	break
	if o==True:
	ll.append(i)
	return ll

	data1=getall(get_data(file_name+".txt")) # นำคลังเข้าไป แยกออกเป็น list ละบรรทัด
	datatofile=alldata_list(data1) # นำไปผ่านขั้นตอน 1 2 3 4
	tt=[]
	with open(file_name+"-pos.conll","w") as f:
	i=0
	while i<len(datatofile):
	for j in datatofile[i]:
	f.write(j[0]+"\t"+j[1]+"\t"+j[2]+"\n")
	if i+1<len(datatofile):
	f.write("\n")
	i+=1

	with open(file_name+".conll","w") as f:
	i=0
	while i<len(datatofile):
	for j in datatofile[i]:
	f.write(j[0]+"\t"+j[2]+"\n")
	if i+1<len(datatofile):
	f.write("\n")
	i+=1
	from sklearn_crfsuite import scorers,metrics
	from sklearn.metrics import make_scorer
	from sklearn.model_selection import cross_validate,train_test_split
	import sklearn_crfsuite
	def doc2features(doc, i):
	word = doc[i][0]
	postag = doc[i][1]
	# Features from current word
	features={
	'word.word': word,
	'word.isspace':word.isspace(),
	'postag':postag,
	'word.isdigit()': word.isdigit()
	}
	if i > 0:
	prevword = doc[i-1][0]
	postag1 = doc[i-1][1]
	features['word.prevword'] = prevword
	features['word.previsspace']=prevword.isspace()
	features['word.prepostag'] = postag1
	features['word.prevwordisdigit'] = prevword.isdigit()
	else:
	features['BOS'] = True # Special "Beginning of Sequence" tag
	# Features from next word
	if i < len(doc)-1:
	nextword = doc[i+1][0]
	postag1 = doc[i+1][1]
	features['word.nextword'] = nextword
	features['word.nextisspace']=nextword.isspace()
	features['word.nextpostag'] = postag1
	features['word.nextwordisdigit'] = nextword.isdigit()
	else:
	features['EOS'] = True # Special "End of Sequence" tag
	return features

	def extract_features(doc):
	return [doc2features(doc, i) for i in range(len(doc))]

	def get_labels(doc):
	return [tag for (token,postag,tag) in doc]


	X_data = [extract_features(doc) for doc in datatofile] # เอา คำ แยกออกมา
	y_data = [get_labels(doc) for doc in datatofile] # เอา tag แยกออกมา

	X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1) # แบ่ง 0.1 หรือ 10%
	crf = sklearn_crfsuite.CRF(
	algorithm='lbfgs',
	c1=0.1,
	c2=0.1,
	max_iterations=500,
	all_possible_transitions=True,
	model_filename=file_name+"-pos.model0" # ตั้งชื่อโมเดล
	)
	crf.fit(X, y); # train

	labels = list(crf.classes_)
	labels.remove('O')
	y_pred = crf.predict(X_test)
	e=metrics.flat_f1_score(y_test, y_pred,
	average='weighted', labels=labels)
	print(e) # โชว์ประสิทธิภาพ
	sorted_labels = sorted(
	labels,
	key=lambda name: (name[1:], name[0])
	)
	print(metrics.flat_classification_report(
	y_test, y_pred, labels=sorted_labels, digits=3
	))

	def get_ner(text):
	word_cut=word_tokenize(text,engine="newmm")
	list_word=pos_tag(word_cut,engine='perceptron')
	X_test = extract_features([(data,list_word[i][1]) for i,data in enumerate(word_cut)])
	y_=crf.predict_single(X_test)
	return [(word_cut[i],list_word[i][1],data) for i,data in enumerate(y_)]


	while True:
	t=input("Text : ")
	print(get_ner(t))