Skip to content

Instantly share code, notes, and snippets.

@tokestermw
Created December 19, 2017 20:40
Show Gist options
  • Save tokestermw/75ab7520187750c01bfd5b478a081803 to your computer and use it in GitHub Desktop.
Save tokestermw/75ab7520187750c01bfd5b478a081803 to your computer and use it in GitHub Desktop.
import random
def process_line(line):
    """Split one tab-separated entry line into its text and corrections.

    The columns are read positionally as: number of corrections, serial
    number, URL, sentence number, the text itself (``ell_text``), and then
    one column per correction.

    Args:
        line: A single entry line with tab-separated columns.

    Returns:
        A ``(ell_text, corrections)`` tuple, where ``corrections`` is the list
        of every column after the text, or ``None`` when the line has fewer
        than six columns (i.e. it carries no correction column at all).
    """
    columns = line.split('\t')
    # Four metadata columns + the text + at least one correction are required.
    if len(columns) < 6:
        return None
    # Columns 0-3 (correction count, serial number, URL, sentence number)
    # are not needed by callers, so only the text and corrections are returned.
    return columns[4], columns[5:]
def split_into_correct_incorrect(path, x_path='x.txt', y_path='y.txt',
                                 encoding='utf-8'):
    """Write aligned (text, correction) pairs from an entries file.

    Every line of ``path`` is stripped and parsed with ``process_line``;
    lines it rejects are skipped.  For each correction on a line, the
    original text is written as one line of ``x_path`` and the correction as
    the matching line of ``y_path``, so line *i* of both files forms a pair.
    A text with several corrections is therefore repeated in ``x_path``.

    Args:
        path: Path of the tab-separated entries file to read.
        x_path: Output file for the original texts (overwritten).
        y_path: Output file for the corrections (overwritten).
        encoding: Text encoding used for the input and both outputs.  It is
            set explicitly so the result does not depend on the platform's
            default locale encoding; the corpus is assumed to be UTF-8.
    """
    # The input is opened first so a bad ``path`` fails before any output
    # file is created or truncated.  Pairs are streamed straight to disk
    # instead of being buffered in memory.
    with open(path, 'r', encoding=encoding) as src, \
            open(x_path, 'w', encoding=encoding) as x_out, \
            open(y_path, 'w', encoding=encoding) as y_out:
        for line in src:
            result = process_line(line.strip())
            if not result:
                continue
            ell_text, corrections = result
            for correction in corrections:
                x_out.write(ell_text + '\n')
                y_out.write(correction + '\n')
if __name__ == '__main__':
    # Usage:
    #   python preprocess.py lang-8-en-1.0/entries.train
    #   python preprocess.py lang-8-en-1.0/entries.test
    #
    # The output files are overwritten on every run, so move them aside
    # between invocations if both splits are needed.
    import sys

    # Fail with a readable usage message instead of an IndexError traceback
    # when the entries file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: python preprocess.py <entries-file>')

    split_into_correct_incorrect(sys.argv[1])
    print('donezo')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment