Skip to content

Instantly share code, notes, and snippets.

@iamaaditya
Created January 10, 2019 21:50
Show Gist options
  • Save iamaaditya/bcae0a54b250e62c3be7e78f61de10df to your computer and use it in GitHub Desktop.
Save iamaaditya/bcae0a54b250e62c3be7e78f61de10df to your computer and use it in GitHub Desktop.
Generate paraphrase data from MSCOCO
import io
import json
import sys
def group_captions(annotations):
    """Group caption strings by the image they describe.

    Args:
        annotations: iterable of dicts, each holding an ``image_id`` and a
            ``caption`` entry (the records stored under the ``annotations``
            key of the loaded captions JSON).

    Returns:
        A dict mapping each image id to the list of its captions, in the
        order in which they appear in ``annotations``.
    """
    grouped = {}
    for annotation in annotations:
        grouped.setdefault(annotation[u'image_id'], []).append(
            annotation[u'caption'])
    return grouped


def normalize_caption(caption):
    """Lower-case a caption, drop every period and trim outer whitespace."""
    return caption.lower().replace('.', '').strip()


def paraphrase_pairs(captions):
    """Yield (source, target) paraphrase pairs for one image's captions.

    Caption 0 is paired with caption 2 and caption 1 with caption 3; each
    pair is also yielded reversed to double the data. Only the first four
    captions are used. A pair is skipped when either caption is missing
    (fewer than four captions for the image) or empty after normalization,
    and the two pairs are checked independently of each other.

    Args:
        captions: list of raw caption strings belonging to one image.

    Yields:
        Tuples ``(source, target)`` of normalized captions.
    """
    normalized = [normalize_caption(caption) for caption in captions[:4]]
    for first, second in ((0, 2), (1, 3)):
        if second >= len(normalized):
            # Not enough captions for this pair; skip instead of crashing.
            continue
        source, target = normalized[first], normalized[second]
        if source and target:
            yield source, target
            # Augment the data by also emitting the reversed pair.
            yield target, source


def main(argv=None):
    """Write the paraphrase source/target files for one data split.

    Reads ``./captions_<data_type>2014.json`` and writes
    ``mscoco_<data_type>_source.txt`` and ``mscoco_<data_type>_target.txt``.
    Line *i* of the source file and line *i* of the target file form one
    paraphrase pair.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument is the data type that is
            spliced into the file names above.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit('usage: %s <data_type>' % sys.argv[0])
    data_type = argv[0]

    # io.open with an explicit encoding behaves the same on Python 2 and 3
    # and keeps non-ASCII captions intact.
    with io.open('./captions_' + data_type + '2014.json',
                 encoding='utf-8') as captions_file:
        annotations = json.load(captions_file)[u'annotations']

    source_path = 'mscoco_' + data_type + '_source.txt'
    target_path = 'mscoco_' + data_type + '_target.txt'
    with io.open(source_path, 'w', encoding='utf-8') as out_source, \
            io.open(target_path, 'w', encoding='utf-8') as out_target:
        for captions in group_captions(annotations).values():
            for source, target in paraphrase_pairs(captions):
                out_source.write(source + u'\n')
                out_target.write(target + u'\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment