import codecs
#處理編碼的套件
import operator
##處理字典檔排序的套件
 

# Punctuation and control characters treated as sentence delimiters.
# Written as a unicode literal instead of calling ``.decode("utf-8")`` on a
# plain literal: ``str`` has no ``decode`` method on Python 3, while the ``u``
# prefix yields a text string on both Python 2 and Python 3.3+.
# NOTE(review): on Python 2 this relies on the source file being declared as
# UTF-8 (coding line or BOM) -- confirm.
cutlist = u"<>/:：;；,、＂’，.。！？｢\"\'\\\n\r《》“”!@#$%^&*()"


def cutSentence(text_path, keywords):
    """Split a UTF-8 text file into punctuation-delimited fragments.

    Each line is stripped of surrounding whitespace, every occurrence of
    each entry in ``keywords`` is removed from it, and the remaining
    characters are scanned one by one.  Characters found in the
    module-level ``cutlist`` end the current fragment; all other
    characters are added to it.

    Because line breaks are stripped before scanning, a fragment that is
    not terminated by a delimiter continues onto the next line.  Adjacent
    delimiters produce empty-string entries, as each delimiter closes a
    fragment unconditionally.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        keywords: Iterable of substrings to delete from every line before
            it is split.

    Returns:
        A list of text fragments in file order.  Text that follows the
        last delimiter in the file is included as a final fragment.
    """
    textList = []
    pending = []  # characters of the fragment currently being built

    # The context manager guarantees the file is closed even if decoding
    # fails part-way through.
    with codecs.open(text_path, "r", "utf-8") as text:
        for line in text:
            line = line.strip()

            # Remove every occurrence of each keyword from the line.
            for keyword in keywords:
                line = line.replace(keyword, "")

            for word in line:
                if word in cutlist:
                    # A delimiter closes the current fragment.
                    textList.append("".join(pending))
                    pending = []
                else:
                    pending.append(word)

    # Keep trailing text that was not followed by a delimiter; without
    # this, a file that does not end in punctuation loses its last fragment.
    if pending:
        textList.append("".join(pending))

    return textList