import codecs  # decoding-aware file access
import operator  # helpers for sorting dictionary items

# Characters that terminate a sentence. Kept as a (unicode) string so existing
# code that treats ``cutlist`` as a string keeps working. Non-ASCII characters
# are written as escapes so the module needs no source-encoding declaration.
cutlist = (
    u"<>/:;,.!?@#$%^&*()"  # ASCII punctuation
    u"\"'\\"  # ASCII quotes and backslash
    u"\n\r"  # line breaks
    u"\u3001\u3002"  # ideographic comma, ideographic full stop
    u"\u300c\u300a\u300b"  # left corner bracket, double angle brackets
    u"\u2019\u201c\u201d"  # right single quote, curly double quotes
    # NOTE(review): the previous literal repeated ':', ';', ',', '!' and held a
    # bare '"' that ended the string early, which suggests the full-width
    # forms were folded to ASCII at some point. Restored here -- confirm.
    u"\uff1a\uff1b\uff0c\uff02\uff01\uff1f"  # full-width : ; , " ! ?
)


def cutSentence(text_path, keywords):
    """Split the UTF-8 text file at *text_path* into sentence fragments.

    Each line is stripped of surrounding whitespace, every occurrence of each
    string in *keywords* is deleted from it, and the remaining characters are
    split at any character found in ``cutlist``.

    Args:
        text_path: Path of the UTF-8 encoded text file to read.
        keywords: Iterable of strings to delete from every line before
            splitting. Empty strings are ignored.

    Returns:
        A list of strings, one per fragment, in file order. Two adjacent
        delimiters produce an empty string. Because lines are stripped before
        scanning, a fragment that is not closed by a delimiter continues onto
        the next line. Text left over after the last delimiter is returned as
        the final fragment.
    """
    # Built per call so a caller that rebinds ``cutlist`` is still honoured.
    delimiters = frozenset(cutlist)
    sentences = []
    current = []  # characters of the fragment being accumulated

    with codecs.open(text_path, "r", "utf-8") as text:
        for line in text:
            line = line.strip()
            for keyword in keywords:
                # str.split("") raises ValueError, so skip empty keywords.
                if keyword:
                    line = line.replace(keyword, u"")
            for char in line:
                if char in delimiters:
                    sentences.append(u"".join(current))
                    current = []
                else:
                    current.append(char)

    # Keep trailing text that was never closed by a delimiter; skip the flush
    # when nothing is pending so files ending in punctuation gain no extra "".
    if current:
        sentences.append(u"".join(current))
    return sentences