Skip to content

Instantly share code, notes, and snippets.

@malithjkmt
Created August 4, 2017 00:40
Show Gist options
  • Save malithjkmt/1ac7c3aa1e0c2b13b41bd8c1a6794a21 to your computer and use it in GitHub Desktop.
Save malithjkmt/1ac7c3aa1e0c2b13b41bd8c1a6794a21 to your computer and use it in GitHub Desktop.
Repeat a parallel corpus without losing alignment. Used in bootstrapping.
import sys, math
MAX_LENGTH = 200
if(len(sys.argv)<3):
sys.exit('input 2 files!')
f1 = open(sys.argv[1],'r').readlines()
f2 = open(sys.argv[2], 'r').readlines()
f_a = [[] for y in range(MAX_LENGTH)]
corpus = [ '' for y in range(len(f1))]
for i in range(0, len(f1)):
corpus[i] = f1[i] + ' | '+ f2[i]
for line in corpus:
length = len(line.split(' ')) -1
f_a[length].append(line)
# for i in f1:
# print len(i)
f1_repeated = open('out1', 'w')
f2_repeated = open('out2', 'w')
for i in range(0,MAX_LENGTH):
count = int(math.floor(len(f_a[i])*.4))
for j in range(0, count):
splitted = f_a[i][j].split('|')
f1_repeated.write(splitted[0].rstrip() + '\n')
f2_repeated.write(splitted[1].lstrip())
f1_repeated.close()
f2_repeated.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment