Skip to content

Instantly share code, notes, and snippets.

@hyoiutu
Created December 11, 2017 15:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hyoiutu/35a0c6bcc639139a8640a6a2c3e0736d to your computer and use it in GitHub Desktop.
Save hyoiutu/35a0c6bcc639139a8640a6a2c3e0736d to your computer and use it in GitHub Desktop.
SS本文のデータ整形
import re
import glob
from itertools import chain
import MeCab
from joblib import Parallel, delayed
# Substring searched for in each parsed speaker label; utterances whose
# speaker contains it are treated as the "answer" side of a pair.
target_name = "みりあ"
# Shared MeCab tagger used when dumping sentences; "-Owakati" requests
# space-separated (wakati-gaki) tokenized output.
tagger = MeCab.Tagger("-Owakati")
def extract_conversation(line):
    """Return every ``speaker「utterance」`` fragment found in *line*.

    A fragment is the run of non-``「`` characters that precedes an opening
    bracket, followed by the bracketed text up to the first closing ``」``.
    A line without any bracket pair yields an empty list.
    """
    fragment_pattern = re.compile(r"([^「]*\s*「.*?」)")
    return fragment_pattern.findall(line)
def split_speaker(conv):
    """Split one ``speaker「sentence」`` fragment into its two parts.

    The speaker is everything before the first ``「`` with the whitespace
    directly in front of the bracket removed; the sentence is the text
    between that bracket and the last ``」``.

    Args:
        conv: A fragment such as those produced by ``extract_conversation``.

    Returns:
        A dict with the keys ``"speaker"`` and ``"sentence"``.

    Raises:
        ValueError: If *conv* contains no ``「...」`` pair.
    """
    # The speaker group is lazy and excludes 「 so that (a) the following
    # \s* really consumes the separating whitespace instead of it being
    # swallowed by a greedy group, and (b) the split happens at the FIRST
    # opening bracket, matching how extract_conversation delimits speakers.
    match = re.search(r"([^「]*?)\s*「(.*)」", conv)
    if match is None:
        raise ValueError(f"not a speaker「sentence」 fragment: {conv!r}")
    speaker, sentence = match.groups()
    return {"speaker": speaker, "sentence": sentence}
def shape_conversation(lines):
    """Build question/answer pairs for the target speaker from raw lines.

    Every line is scanned for ``speaker「sentence」`` fragments (fanned out
    through joblib), the fragments are flattened in order and parsed into
    speaker/sentence dicts, and each utterance whose speaker contains the
    module-level ``target_name`` is paired with the utterance before it.

    Args:
        lines: Iterable of text lines.

    Returns:
        A list of the pairs returned by ``generate_conv_pair``, in order of
        appearance.
    """
    run_parallel = Parallel(n_jobs=-1, verbose=7)
    fragments_per_line = run_parallel(
        [delayed(extract_conversation)(line) for line in lines]
    )

    # Flatten the per-line fragment lists, keeping the original order.
    convs = [
        split_speaker(fragment)
        for fragment in chain.from_iterable(fragments_per_line)
    ]

    # Positions of every utterance spoken by the target character.
    target_cursors = [
        position
        for position, conv in enumerate(convs)
        if target_name in conv['speaker']
    ]
    return [generate_conv_pair(convs, position) for position in target_cursors]
def generate_conv_pair(convs, cursor):
    """Pair the utterance at *cursor* with the one immediately before it.

    Args:
        convs: Sequence of ``{"speaker": ..., "sentence": ...}`` dicts.
        cursor: Index of the answering utterance within *convs*.

    Returns:
        ``{"q": <previous utterance>, "a": <utterance at cursor>}``. When
        *cursor* is 0 there is no previous utterance, so ``"q"`` is a
        ``(BOS)`` placeholder with the same keys as a real utterance.

    The pair is also echoed to stdout via ``print_interaction``.
    """
    # Compare by value: identity checks against an int literal are unreliable.
    if cursor != 0:
        question = convs[cursor - 1]
    else:
        # The placeholder must be dict-shaped: consumers index
        # pair['q']['speaker'] / pair['q']['sentence'], which would fail on
        # a bare string.
        question = {"speaker": "(BOS)", "sentence": "(BOS)"}
    conv_pair = {"q": question, "a": convs[cursor]}
    print_interaction(conv_pair)
    return conv_pair


def print_interaction(conv):
    """Print one question/answer pair to stdout, framed by blank lines.

    Args:
        conv: A pair whose ``"q"`` and ``"a"`` values are dicts with
            ``"speaker"`` and ``"sentence"`` keys.
    """
    print()
    print(f"{conv['q']['speaker']} ... {conv['q']['sentence']}")
    print("----------------------------------")
    print(f"{conv['a']['speaker']} ... {conv['a']['sentence']}")
    print()
def dump_interaction(convs, number):
    """Write the tokenized questions and answers of *convs* to two files.

    Each pair's ``q`` and ``a`` sentence is run through the module-level
    MeCab ``tagger``. Questions go to ``input_style_<number>.txt`` and
    answers to ``output_style_<number>.txt``, one entry per pair and in the
    same order, so entries at the same position in both files belong together.

    Args:
        convs: Iterable of ``{"q": {...}, "a": {...}}`` pairs whose values
            carry a ``"sentence"`` key.
        number: Suffix used to build the two output file names.
    """
    questions = [tagger.parse(conv['q']['sentence']) for conv in convs]
    answers = [tagger.parse(conv['a']['sentence']) for conv in convs]

    # Write UTF-8 explicitly: the locale default may be unable to encode
    # Japanese text and would make the output platform-dependent.
    # NOTE(review): MeCab's wakati output usually already ends with a
    # newline, so appending "\n" likely leaves a blank line after every
    # entry - confirm whether downstream consumers rely on that.
    with open(f"input_style_{number}.txt", "w", encoding="utf-8") as f:
        f.writelines(q + "\n" for q in questions)
    with open(f"output_style_{number}.txt", "w", encoding="utf-8") as f:
        f.writelines(a + "\n" for a in answers)
# Driver: process every .dat file found anywhere below ./articles. The
# enumeration index becomes the numeric suffix of that file's output files.
files = glob.iglob("./articles/**/*.dat", recursive=True)
convs = []
for number, file_name in enumerate(files):
    print(file_name)
    # Read the whole article as a list of lines (line endings kept).
    with open(file_name, "r") as f:
        lines = list(f)
    # Extract the target speaker's question/answer pairs, then dump them.
    convs = shape_conversation(lines)
    dump_interaction(convs, number)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment