Skip to content

Instantly share code, notes, and snippets.

@jonnyli1125
Last active October 20, 2021 02:04
Show Gist options
  • Save jonnyli1125/d8e4cbc50114443c29fcee96c6868f47 to your computer and use it in GitHub Desktop.
Save jonnyli1125/d8e4cbc50114443c29fcee96c6868f47 to your computer and use it in GitHub Desktop.
Preprocessing script for Japanese entries in NAIST Lang8 Corpus
import os
import argparse
import json
import re
import unicodedata
invalid_bytes_re = re.compile(r'[\x00-\x1F]+')
sline_re = re.compile(r'\[sline\].*?\[/sline\]')
color_tags = ['[f-blue]','[/f-blue]',
'[f-red]','[/f-red]',
'[f-bold]','[/f-bold]']
ja_re = re.compile(r'([ぁ-んァ-ン])')
html_re = re.compile(r'<(\/?[a-z]+)>')
subsent_delim_re = re.compile(r'\.|。|\?|!|,|、|\(|\)')
def clean_sent(sent):
sent = unicodedata.normalize('NFKC', sent.strip())
for tag in color_tags:
sent = sent.replace(tag, '')
sent = sline_re.sub('', sent).replace('[/sline]', '')
return sent
def check_sents(src_sent, tgt_sent):
if src_sent == tgt_sent:
return False
if tgt_sent.endswith('OK') or tgt_sent.endswith('GOOD'):
return False
if not ja_re.search(tgt_sent) or html_re.search(tgt_sent):
return False
src_subsents = [x for x in subsent_delim_re.split(src_sent) if x]
tgt_subsents = [x for x in subsent_delim_re.split(tgt_sent) if x]
if len(src_subsents) != len(tgt_subsents):
return False
return True
def parse_lang8_line(line):
row = json.loads(invalid_bytes_re.sub('', line))
if row[2] != 'Japanese':
return []
pairs = set()
for src_sent, tgt_sents in zip(row[4], row[5]):
if not ja_re.search(src_sent) or html_re.search(src_sent):
continue
src_sent = clean_sent(src_sent)
for tgt_sent in tgt_sents:
if not tgt_sent:
continue
tgt_sent = clean_sent(tgt_sent)
if not check_sents(src_sent, tgt_sent):
continue
pairs.add((src_sent, tgt_sent))
return list(pairs)
def main(args):
with open(args.corpus_path, encoding='utf-8') as f:
lines = f.readlines()
n = 0
os.makedirs(os.path.dirname(args.src_lines_path), exist_ok=True)
os.makedirs(os.path.dirname(args.tgt_lines_path), exist_ok=True)
with open(args.src_lines_path, 'w', encoding='utf-8') as sf:
with open(args.tgt_lines_path, 'w', encoding='utf-8') as tf:
for line in lines:
for src_sent, tgt_sent in parse_lang8_line(line):
sf.write(src_sent + '\n')
tf.write(tgt_sent + '\n')
n += 1
print(f'Wrote {n} lines to parallel corpus output')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--corpus_path', required=True,
help='Path to Lang8 corpus file')
parser.add_argument('-s', '--src_lines_path', required=True,
help='Path to source lines in parallel corpus output')
parser.add_argument('-t', '--tgt_lines_path', required=True,
help='Path to target lines in parallel corpus output')
args = parser.parse_args()
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment