Skip to content

Instantly share code, notes, and snippets.

@yohokuno
Created August 26, 2016 11:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yohokuno/c65e62c7cfcf2b871e7a3fca6e2d39f2 to your computer and use it in GitHub Desktop.
Save yohokuno/c65e62c7cfcf2b871e7a3fca6e2d39f2 to your computer and use it in GitHub Desktop.
Open Subtitle parser for neural conversation model
#!/usr/bin/env python3
import argparse
import xml.etree.ElementTree as ET
import tarfile
import gzip
# Download the OpenSubtitles2016 corpus archive (here: the Japanese one) from
# http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/ja.tar.gz
def create_pairs(sentences):
    """Yield ``(previous, current)`` tuples for every two adjacent items.

    Args:
        sentences: Any iterable (it is consumed lazily, so generators work).

    Yields:
        One 2-tuple per adjacent pair, in input order.  An input with fewer
        than two items yields nothing.
    """
    iterator = iter(sentences)
    # Prime with the real first item rather than a ``None`` sentinel, so an
    # item that is itself ``None`` is paired like any other value.
    try:
        previous = next(iterator)
    except StopIteration:
        return
    for current in iterator:
        yield (previous, current)
        previous = current
def iterate_document(document):
    """Yield the non-empty text of each ``<s>`` element directly under *document*.

    For every ``<s>`` child, all text fragments found anywhere inside it are
    stripped of surrounding whitespace and concatenated with no separator.

    Args:
        document: An ``xml.etree.ElementTree.Element`` whose direct children
            include the ``<s>`` elements to read.

    Yields:
        One string per ``<s>`` child whose concatenated text is non-empty.
    """
    # ``findall("s")`` matches direct children only, not deeper descendants.
    for s in document.findall("s"):
        sentence = "".join(text.strip() for text in s.itertext())
        # Skip elements that contain only whitespace or no text at all.
        if sentence:
            yield sentence
def main(argv=None):
    """Append adjacent-sentence pairs from a subtitle tarball to two files.

    Command-line arguments:
        tarfile: Path to a gzip-compressed tar archive whose regular members
            are themselves gzip-compressed XML files.
        source:  File opened for appending; receives the first sentence of
            every pair, one per line.
        target:  File opened for appending; receives the second sentence of
            every pair, one per line.

    Members that are not regular files are skipped.  Members that cannot be
    decompressed or parsed are reported on stdout and skipped, so one bad
    file does not abort the whole run.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tarfile")
    # Write UTF-8 explicitly: subtitle text may contain characters that the
    # platform's default encoding cannot represent.
    parser.add_argument("source", type=argparse.FileType("a", encoding="utf-8"))
    parser.add_argument("target", type=argparse.FileType("a", encoding="utf-8"))
    args = parser.parse_args(argv)

    with tarfile.open(args.tarfile, "r:gz") as archive:
        # Iterating the TarFile reads members as they are encountered instead
        # of scanning the whole compressed archive up front (getmembers()).
        for member in archive:
            # extractfile() returns None for directories, links, etc.
            if not member.isfile():
                continue
            print('Extracting and parsing', member.name)
            try:
                with gzip.GzipFile(fileobj=archive.extractfile(member)) as xml_file:
                    tree = ET.parse(xml_file)
            except ET.ParseError as e:
                print('ParseError:', e)
                continue
            except (OSError, EOFError) as e:
                # Member is not valid gzip data or is truncated.
                print('GzipError:', e)
                continue
            # Writing happens outside the try so output errors are not
            # mistaken for (and swallowed as) input errors.
            for document in tree.iter("document"):
                for source, target in create_pairs(iterate_document(document)):
                    print(source, file=args.source)
                    print(target, file=args.target)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment