Skip to content

Instantly share code, notes, and snippets.

@Elfsong
Created March 26, 2019 07:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Elfsong/13b92d83010e08d08ee457ca1464f2ea to your computer and use it in GitHub Desktop.
Save Elfsong/13b92d83010e08d08ee457ca1464f2ea to your computer and use it in GitHub Desktop.
故事对话分割
import json
import os
import re
import jieba
# Hard-coded absolute path used as the project root; adjust it for your
# own environment before relying on it.
ROOT_PATH = "/Users/elfsong/PycharmProjects/BERT_demo"
# The "Test text" directory located directly under ROOT_PATH.
# NOTE(review): neither constant is referenced by the code visible in this
# file -- confirm whether other code depends on them.
RESOURCE_PATH = os.path.join(ROOT_PATH, "Test text")
def avsplit1(s, n):
    """Split the sequence ``s`` into ``n`` consecutive, near-equal slices.

    The first ``len(s) % n`` slices are one element longer than the rest,
    so slice lengths never differ by more than one.  When ``n`` exceeds
    ``len(s)`` the trailing slices are empty.

    Args:
        s: Any sliceable sequence that supports ``len()`` (e.g. ``str``).
        n: Number of slices to produce.

    Returns:
        A list of ``n`` slices of ``s`` which, concatenated in order,
        reproduce ``s``.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    base, extra = divmod(len(s), n)
    pieces = []
    start = 0
    for position in range(n):
        # The remainder is spread over the leading slices, one element each.
        width = base + 1 if position < extra else base
        pieces.append(s[start:start + width])
        start += width
    return pieces
def split_jieba(s, n):
    """Cut ``s`` into pieces of at most ``n`` characters on jieba word boundaries.

    Words yielded by ``jieba.cut(s, cut_all=False)`` are appended to the
    current piece while the piece stays within ``n`` characters; otherwise
    the piece is closed and the word opens a new one.  A single word longer
    than ``n`` is kept whole, so the piece holding it exceeds ``n``.

    Args:
        s: Text to segment.
        n: Maximum length, in characters, of a piece.

    Returns:
        The non-empty pieces, in their original order.
    """
    pieces = []
    buffer = ""
    for word in jieba.cut(s, cut_all=False):
        if len(buffer) + len(word) > n:
            # The word does not fit: close the current piece and start anew.
            pieces.append(buffer)
            buffer = word
        else:
            buffer += word
    pieces.append(buffer)
    return [piece for piece in pieces if piece]
def cut_sent(para):
    """Split a paragraph into clause-level pieces after punctuation marks.

    Ideographic spaces (U+3000) are removed first.  A break is then inserted
    after each delimiter, except where a closing quote (” or ’) follows
    directly; in that case the break goes after the quote instead.  A run of
    "…" characters is treated as one delimiter, so a doubled Chinese
    ellipsis is never torn apart.

    Args:
        para: The text to split.

    Returns:
        The non-empty pieces in order; each piece keeps its trailing
        punctuation.
    """
    para = para.replace('\u3000', '')
    # Single-character delimiters.  "…" only counts when it is the last of a
    # run, which keeps "……" (Chinese ellipsis) in one piece.  The character
    # after the delimiter is consumed by the match, so a delimiter that
    # directly follows another one starts the next piece.
    para = re.sub(r'([,、:。\-—!~;?]|…(?!…))([^”’])', r'\1\n\2', para)
    # English ellipsis written as six dots.
    para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
    # A sentence terminator followed by a closing quote: break after the
    # quote unless more punctuation follows it.
    para = re.sub(r'([。!?][”’])([^,。!?])', r'\1\n\2', para)
    para = para.rstrip()
    return [piece for piece in para.split("\n") if piece]
def finetune(sentence_list, perfered_length):
    """Regroup text fragments into chunks of about ``perfered_length`` characters.

    Phase 1 joins consecutive fragments while the running chunk stays within
    ``perfered_length``; a chunk that is still too long (a single over-long
    fragment) is cut into the fewest near-equal pieces that fit.

    Phase 2 tidies every boundary between neighbouring chunks:
      * a closing mark at the start of a chunk moves to the end of the
        previous chunk;
      * an opening bracket at the end of the previous chunk moves to the
        start of the chunk;
      * a chunk no longer than 30% of ``perfered_length`` is folded into the
        previous chunk.
    Because of phase 2 a returned chunk may be slightly longer than
    ``perfered_length``.

    Args:
        sentence_list: Fragments in reading order.
        perfered_length: Target chunk length in characters; a positive
            integer.

    Returns:
        A list of non-empty chunks which, concatenated, reproduce the
        concatenation of ``sentence_list``.  Empty input yields ``[]``.

    Raises:
        ValueError: If ``perfered_length`` is smaller than 1.
    """
    if perfered_length < 1:
        raise ValueError("perfered_length must be a positive integer")

    closing_marks = (",", "。", "、", "?", "!", "…", "]", "》")
    opening_marks = ("[", "《")

    def split_to_limit(text):
        # Ceiling division: the fewest pieces that each fit the limit, so a
        # text of exactly the preferred length is left in one piece.
        piece_count = -(-len(text) // perfered_length)
        if piece_count > 1:
            return avsplit1(text, piece_count)
        return [text] if text else []

    # Phase 1: greedy accumulation of fragments.
    chunks = []
    current = ""
    for sentence in sentence_list:
        if len(current) + len(sentence) <= perfered_length:
            current += sentence
        else:
            chunks.extend(split_to_limit(current))
            current = sentence
    chunks.extend(split_to_limit(current))

    # Phase 2: boundary clean-up.  The previous chunk is always the last
    # non-empty entry of ``merged``, so a chunk that was folded away can
    # never be indexed again.
    merged = []
    for chunk in chunks:
        if not merged:
            merged.append(chunk)
            continue
        previous = merged.pop()
        if chunk[0] in closing_marks:
            previous += chunk[0]
            chunk = chunk[1:]
        if previous[-1] in opening_marks:
            chunk = previous[-1] + chunk
            previous = previous[:-1]
        if len(chunk) <= perfered_length * 0.3:
            previous += chunk
            chunk = ""
        merged.extend(piece for piece in (previous, chunk) if piece)
    return merged
# Demo run: split a sample sentence at its punctuation marks, regroup the
# pieces around a preferred length of 15 characters, and print one chunk
# per line.
sentence_list = cut_sent(
    "蚂蚁家族里,有一只[小蚂蚁],每天不干活,不觅食,就知道吃、睡和玩儿。"
)
result = finetune(sentence_list, perfered_length=15)
for sentence in result:
    print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment