# wannaphong/toconll.py

## toconll.py
#Apache License 2.0
file_name="data" # base name of the corpus file, without extension; the script reads file_name+".txt"
import codecs
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
import glob
import nltk
import re
# Duplicate-sentence handling
data_not=[]  # tag-stripped text of every sentence accepted so far
def Unique(p):
    """Return True the first time a sentence is seen and False for repeats.

    Two sentences count as the same when they are equal after removing
    ``<...>`` markup and every ``[...]`` tag. The stripped text of each
    newly seen sentence is appended to the module-level ``data_not`` list.

    p -- one raw corpus line.
    """
    text = re.sub(r"<[^>]*>", "", p)
    # One pass removes opening and closing tags alike, because "[/TIME]"
    # also matches "[...]"; a separate pass for closing tags is not needed.
    text = re.sub(r"\[(.*?)\]", "", text)
    if text in data_not:
        return False
    data_not.append(text)
    return True
# Regex for one tagged span, with three capture groups: open tag name, enclosed text, close tag name
pattern = r'\[(.*?)\](.*?)\[\/(.*?)\]'
tokenizer = RegexpTokenizer(pattern) # nltk.tokenize.RegexpTokenizer built on the pattern above; text2conll2002 unpacks each token as (tagopen, text, tagclose), e.g. [TIME]8.00[/TIME] -> ('TIME','8.00','TIME')
# Give untagged text an explicit [word] tag
def toolner_to_tag(text):
    """Return *text* with every untagged stretch wrapped in ``[word]...[/word]``.

    Surrounding whitespace and ``<...>`` markup are removed first. Text that
    is already inside ``[TAG]...[/TAG]`` is left as it is, so the result is a
    sequence of tagged spans only.

    text -- one corpus line with optional inline ``[TAG]...[/TAG]`` spans.
    """
    text = re.sub(r"<[^>]*>", "", text.strip())
    # Put a "***" separator after every closing tag and before every opening
    # tag, so that splitting on it isolates tagged spans from plain text.
    text = re.sub(r"(\[\/(.*?)\])", r"\1***", text)
    text = re.sub(r"(\[\w+\])", r"***\1", text)
    pieces = [
        piece if "[" in piece else "[word]" + piece + "[/word]"
        for piece in text.split('***')
    ]
    # Empty pieces (e.g. at the very start or end) became "[word][/word]".
    return "".join(pieces).replace("[word][/word]", "")
# Convert text to CoNLL-2002
def text2conll2002(text,pos=True):
    """Convert one tagged sentence into CoNLL-2002 style rows.

    Every row is ``token<TAB>label`` followed by a newline. Tokens that come
    from ``[word]`` spans are labelled ``O``; inside any other span the first
    emitted token is labelled ``B-TAG`` and the following ones ``I-TAG``.
    Double-quote tokens are dropped and produce no row.

    text -- a sentence with inline ``[TAG]...[/TAG]`` annotations.
    pos  -- when true (default) the rows are passed to ``postag`` and its
            result is returned; when false the two-column rows are returned.
    """
    text = toolner_to_tag(text)  # wrap untagged text in [word]
    text = text.replace("''", '"')
    text = text.replace("’", '"').replace("‘", '"')
    rows = []
    for tagopen, segment, _tagclose in tokenizer.tokenize(text):
        first = True  # no token of this span has been emitted yet
        # Word segmentation with PyThaiNLP's newmm engine
        for token in word_tokenize(segment, engine="newmm"):
            if token == "''" or token == '"':
                # Skip quotes entirely: no row and no stray blank line.
                continue
            if tagopen == 'word':
                label = 'O'
            elif first:
                # B- goes to the first token actually emitted, even when the
                # span started with a quote that was skipped.
                label = 'B-' + tagopen
            else:
                label = 'I-' + tagopen
            first = False
            rows.append(token + '\t' + label + '\n')
    conll2002 = "".join(rows)
    if not pos:
        return conll2002
    return postag(conll2002)  # add the POS column
# Adds a POS column for use with NER
def postag(text):
    """Insert a part-of-speech column into two-column CoNLL rows.

    text -- newline-separated rows of the form ``word<TAB>label``; empty
            lines are ignored.

    Returns the rows as ``word<TAB>pos<TAB>label``, each ending in a newline,
    where the POS value is taken from ``pos_tag`` with the "perceptron" engine.
    """
    rows = [line.split('\t') for line in text.split('\n') if line != '']
    tagged = pos_tag([row[0] for row in rows], engine="perceptron")
    out = []
    for idx, row in enumerate(rows):
        out.append(row[0] + '\t' + tagged[idx][1] + '\t' + row[1] + '\n')
    return "".join(out)
# Read the corpus from a file
def get_data(fileopen):
    """Read a UTF-8 file and return its lines, keeping only unseen sentences.

    The file is decoded as ``utf-8-sig`` (a leading BOM is dropped) and split
    into lines; a line is kept only when ``Unique`` returns true for it.

    fileopen -- path of the file to read.
    """
    # Built-in open() replaces the legacy codecs.open(); the decoded text and
    # the resulting line list are the same.
    with open(fileopen, 'r', encoding='utf-8-sig') as f:
        lines = f.read().splitlines()
    return [a for a in lines if Unique(a)]  # drop repeated sentences

def alldata(lists):
    """Convert every sentence in *lists* and return the results as one string.

    Each sentence goes through ``text2conll2002`` (with its default
    arguments) and is followed by one extra newline.
    """
    return "".join(text2conll2002(sentence) + '\n' for sentence in lists)

def alldata_list(lists):
    """Convert each sentence into a list of row tuples.

    Every sentence is converted with ``text2conll2002(..., pos=True)``; each
    non-empty row becomes a 3-tuple when it has exactly three tab-separated
    fields, otherwise a 2-tuple of its first two fields. A sentence whose
    conversion raises is printed and left out of the result.

    lists -- iterable of tagged sentences.
    Returns a list with one list of tuples per converted sentence.
    """
    data_all = []
    for data in lists:
        try:
            rows = text2conll2002(data, pos=True).split('\n')
            data_num = []
            for row in rows:
                if row == "":
                    continue
                fields = row.split('\t')
                if len(fields) == 3:
                    data_num.append((fields[0], fields[1], fields[2]))
                else:
                    data_num.append((fields[0], fields[1]))
        except Exception:
            # Best effort: report the sentence and carry on. Catching
            # Exception rather than everything lets Ctrl-C and SystemExit
            # stop the run.
            print(data)
        else:
            data_all.append(data_num)
    return data_all

def alldata_list_str(lists):
    """Render sentences of 3-field rows as tab-separated text.

    lists -- iterable of sentences, each an iterable of rows whose first
             three items are strings.

    Every row becomes one tab-separated line and every sentence is followed
    by an empty line.
    """
    blocks = []
    for sentence in lists:
        lines = [row[0] + "\t" + row[1] + "\t" + row[2] + "\n" for row in sentence]
        blocks.append("".join(lines) + "\n")
    return "".join(blocks)

def get_data_tag(listd):
    """Group tab-separated rows into sentences.

    listd -- iterable of lines; a non-empty line is ``a<TAB>b<TAB>c`` and an
             empty string ends the current sentence.

    Returns a list of sentences, each a list of 3-tuples built from the first
    three fields of a line. Rows after the last empty string are returned as
    a final sentence, so input without a trailing blank line loses nothing.
    """
    list_all = []
    c = []
    for i in listd:
        if i != '':
            fields = i.split("\t")
            c.append((fields[0], fields[1], fields[2]))
        else:
            list_all.append(c)
            c = []
    if c:
        # The last sentence was not terminated by an empty line.
        list_all.append(c)
    return list_all
def getall(lista):
    """Return the items of *lista* without repeats, ignoring ``[...]`` tags.

    Two items are the same when they are equal after every ``[...]`` tag has
    been removed. The first occurrence is kept and the order is preserved.
    """
    seen = set()  # tag-stripped forms already kept
    ll = []
    for i in lista:
        key = re.sub(r"\[(.*?)\]", "", i)
        if key not in seen:
            seen.add(key)
            ll.append(i)
    return ll

data1=getall(get_data(file_name+".txt")) # load the corpus as a list of lines with repeats removed
datatofile=alldata_list(data1) # convert every line into a list of row tuples
tt=[]
# Both files hold one row per token, with one empty line between sentences
# and none after the last. The encoding is explicit so the Thai text is
# written as UTF-8 whatever the platform default is.
with open(file_name+"-pos.conll","w",encoding="utf-8") as f:
    for index, sentence in enumerate(datatofile):
        if index:
            f.write("\n")
        for j in sentence:
            f.write(j[0]+"\t"+j[1]+"\t"+j[2]+"\n")

# Same data without the middle column.
with open(file_name+".conll","w",encoding="utf-8") as f:
    for index, sentence in enumerate(datatofile):
        if index:
            f.write("\n")
        for j in sentence:
            f.write(j[0]+"\t"+j[2]+"\n")
	#Apache License 2.0
	file_name="data" # base name of the corpus file, without extension; the script reads file_name+".txt"
	import codecs
	from pythainlp.tokenize import word_tokenize
	from pythainlp.tag import pos_tag
	from nltk.tokenize import RegexpTokenizer
	import glob
	import nltk
	import re
	# Duplicate-sentence handling
	data_not=[]  # tag-stripped text of every sentence accepted so far
	def Unique(p):
	    """Return True the first time a sentence is seen and False for repeats.

	    Two sentences count as the same when they are equal after removing
	    ``<...>`` markup and every ``[...]`` tag. The stripped text of each
	    newly seen sentence is appended to the ``data_not`` list.

	    p -- one raw corpus line.
	    """
	    text = re.sub(r"<[^>]*>", "", p)
	    # One pass removes opening and closing tags alike, because "[/TIME]"
	    # also matches "[...]".
	    text = re.sub(r"\[(.*?)\]", "", text)
	    if text in data_not:
	        return False
	    data_not.append(text)
	    return True
	# Regex for one tagged span, with three capture groups: open tag name, enclosed text, close tag name.
	# Each group must be ".*?": with ".?" a group holds at most one character, so a span such as [TIME]8.00[/TIME] never matches.
	pattern = r'\[(.*?)\](.*?)\[\/(.*?)\]'
	tokenizer = RegexpTokenizer(pattern) # nltk.tokenize.RegexpTokenizer built on the pattern above; text2conll2002 unpacks each token as (tagopen, text, tagclose), e.g. [TIME]8.00[/TIME] -> ('TIME','8.00','TIME')
	# Give untagged text an explicit [word] tag
	def toolner_to_tag(text):
	    """Return *text* with every untagged stretch wrapped in ``[word]...[/word]``.

	    Surrounding whitespace and ``<...>`` markup are removed first. Text
	    already inside ``[TAG]...[/TAG]`` is left as it is.

	    text -- one corpus line with optional inline ``[TAG]...[/TAG]`` spans.
	    """
	    text = re.sub(r"<[^>]*>", "", text.strip())
	    # Put a "***" separator after every closing tag and before every
	    # opening tag; it has to be the same string that is split on below.
	    text = re.sub(r"(\[\/(.*?)\])", r"\1***", text)
	    text = re.sub(r"(\[\w+\])", r"***\1", text)
	    pieces = [
	        piece if "[" in piece else "[word]" + piece + "[/word]"
	        for piece in text.split('***')
	    ]
	    # Empty pieces became "[word][/word]"; remove them.
	    return "".join(pieces).replace("[word][/word]", "")
	# Convert text to CoNLL-2002
	def text2conll2002(text,pos=True):
	    """Convert one tagged sentence into CoNLL-2002 style rows.

	    Every row is ``token<TAB>label`` followed by a newline. Tokens from
	    ``[word]`` spans are labelled ``O``; inside any other span the first
	    emitted token is labelled ``B-TAG`` and the following ones ``I-TAG``.
	    Double-quote tokens are dropped and produce no row.

	    text -- a sentence with inline ``[TAG]...[/TAG]`` annotations.
	    pos  -- when true (default) the rows are passed to ``postag`` and its
	            result is returned; when false the two-column rows are returned.
	    """
	    text = toolner_to_tag(text)  # wrap untagged text in [word]
	    text = text.replace("''", '"')
	    text = text.replace("’", '"').replace("‘", '"')
	    rows = []
	    for tagopen, segment, _tagclose in tokenizer.tokenize(text):
	        first = True  # no token of this span has been emitted yet
	        # Word segmentation with PyThaiNLP's newmm engine
	        for token in word_tokenize(segment, engine="newmm"):
	            if token == "''" or token == '"':
	                # Skip quotes entirely: no row and no stray blank line.
	                continue
	            if tagopen == 'word':
	                label = 'O'
	            elif first:
	                # B- goes to the first token actually emitted.
	                label = 'B-' + tagopen
	            else:
	                label = 'I-' + tagopen
	            first = False
	            rows.append(token + '\t' + label + '\n')
	    conll2002 = "".join(rows)
	    if not pos:
	        return conll2002
	    return postag(conll2002)  # add the POS column
	# Adds a POS column for use with NER
	def postag(text):
	    """Insert a part-of-speech column into two-column CoNLL rows.

	    text -- newline-separated rows of the form ``word<TAB>label``; empty
	            lines are ignored.

	    Returns the rows as ``word<TAB>pos<TAB>label``, each ending in a
	    newline, with the POS value taken from ``pos_tag`` ("perceptron" engine).
	    """
	    rows = [line.split('\t') for line in text.split('\n') if line != '']
	    tagged = pos_tag([row[0] for row in rows], engine="perceptron")
	    out = []
	    for idx, row in enumerate(rows):
	        out.append(row[0] + '\t' + tagged[idx][1] + '\t' + row[1] + '\n')
	    return "".join(out)
	# Read the corpus from a file
	def get_data(fileopen):
	    """Read a UTF-8 file and return its lines, keeping only unseen sentences.

	    The file is decoded as ``utf-8-sig`` (a leading BOM is dropped) and
	    split into lines; a line is kept only when ``Unique`` returns true.

	    fileopen -- path of the file to read.
	    """
	    # Built-in open() replaces the legacy codecs.open().
	    with open(fileopen, 'r', encoding='utf-8-sig') as f:
	        lines = f.read().splitlines()
	    return [a for a in lines if Unique(a)]  # drop repeated sentences

	def alldata(lists):
	    """Convert every sentence in *lists* and return the results as one string.

	    Each sentence goes through ``text2conll2002`` (with its default
	    arguments) and is followed by one extra newline.
	    """
	    return "".join(text2conll2002(sentence) + '\n' for sentence in lists)

	def alldata_list(lists):
	    """Convert each sentence into a list of row tuples.

	    Every sentence is converted with ``text2conll2002(..., pos=True)``;
	    each non-empty row becomes a 3-tuple when it has exactly three
	    tab-separated fields, otherwise a 2-tuple of its first two fields.
	    A sentence whose conversion raises is printed and left out.

	    lists -- iterable of tagged sentences.
	    Returns a list with one list of tuples per converted sentence.
	    """
	    data_all = []
	    for data in lists:
	        try:
	            rows = text2conll2002(data, pos=True).split('\n')
	            data_num = []
	            for row in rows:
	                if row == "":
	                    continue
	                fields = row.split('\t')
	                if len(fields) == 3:
	                    data_num.append((fields[0], fields[1], fields[2]))
	                else:
	                    data_num.append((fields[0], fields[1]))
	        except Exception:
	            # Best effort: report the sentence and carry on, but let
	            # Ctrl-C and SystemExit stop the run.
	            print(data)
	        else:
	            data_all.append(data_num)
	    return data_all

	def alldata_list_str(lists):
	    """Render sentences of 3-field rows as tab-separated text.

	    lists -- iterable of sentences, each an iterable of rows whose first
	             three items are strings.

	    Every row becomes one line and every sentence is followed by an empty
	    line. Fields are joined with a tab, the separator that get_data_tag
	    splits on and that the file writers in this script use.
	    """
	    blocks = []
	    for sentence in lists:
	        lines = [row[0] + "\t" + row[1] + "\t" + row[2] + "\n" for row in sentence]
	        blocks.append("".join(lines) + "\n")
	    return "".join(blocks)

	def get_data_tag(listd):
	    """Group tab-separated rows into sentences.

	    listd -- iterable of lines; a non-empty line is ``a<TAB>b<TAB>c`` and
	             an empty string ends the current sentence.

	    Returns a list of sentences, each a list of 3-tuples built from the
	    first three fields of a line. Rows after the last empty string are
	    returned as a final sentence.
	    """
	    list_all = []
	    c = []
	    for i in listd:
	        if i != '':
	            fields = i.split("\t")
	            c.append((fields[0], fields[1], fields[2]))
	        else:
	            list_all.append(c)
	            c = []
	    if c:
	        # The last sentence was not terminated by an empty line.
	        list_all.append(c)
	    return list_all
	def getall(lista):
	    """Return the items of *lista* without repeats, ignoring ``[...]`` tags.

	    Two items are the same when they are equal after every ``[...]`` tag
	    has been removed. The first occurrence is kept, order is preserved.
	    """
	    seen = set()  # tag-stripped forms already kept
	    ll = []
	    for i in lista:
	        # ".*?" so that tags of any length are stripped, not only
	        # zero- or one-character ones.
	        key = re.sub(r"\[(.*?)\]", "", i)
	        if key not in seen:
	            seen.add(key)
	            ll.append(i)
	    return ll

	data1=getall(get_data(file_name+".txt")) # load the corpus as a list of lines with repeats removed
	datatofile=alldata_list(data1) # convert every line into a list of row tuples
	tt=[]
	# Both files hold one row per token, with one empty line between sentences
	# and none after the last. The encoding is explicit so the Thai text is
	# written as UTF-8 whatever the platform default is.
	with open(file_name+"-pos.conll","w",encoding="utf-8") as f:
	    for index, sentence in enumerate(datatofile):
	        if index:
	            f.write("\n")
	        for j in sentence:
	            f.write(j[0]+"\t"+j[1]+"\t"+j[2]+"\n")

	# Same data without the middle column.
	with open(file_name+".conll","w",encoding="utf-8") as f:
	    for index, sentence in enumerate(datatofile):
	        if index:
	            f.write("\n")
	        for j in sentence:
	            f.write(j[0]+"\t"+j[2]+"\n")