Skip to content

Instantly share code, notes, and snippets.

@dedcode
Created January 27, 2016 11:21
Show Gist options
  • Save dedcode/0af8279ccd27475d4636 to your computer and use it in GitHub Desktop.
Save dedcode/0af8279ccd27475d4636 to your computer and use it in GitHub Desktop.
# coding: utf-8
# In[163]:
import codecs
import io
from collections import defaultdict
# Map tweet id -> tweet text, read from a two-column (id <TAB> text) TSV file.
# A plain dict is used: a defaultdict without a factory behaves like one anyway.
tweets = {}
with io.open('/Users/xi/Downloads/NEEL2016-training.tsv', encoding='utf-8') as tw:
    # Iterate the file object directly instead of loading every line up front.
    for line in tw:
        # Drop the line terminator so it does not end up inside the tweet text.
        line = line.rstrip('\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        # Split on the first tab only, so a tab inside the text cannot break
        # the two-way unpacking.
        tweet_id, txt = line.split('\t', 1)
        tweets[tweet_id] = txt
# In[173]:
# Map tweet id -> list of [start, end, entity] gold-standard annotations.
# Each input row has six tab-separated columns:
# id, start, end, entity, saliency, type.
gt = defaultdict(list)
with codecs.open('/Users/xi/Downloads/NEEL2016-training_neel.gs', 'r', encoding='utf-8') as tw:
    # Iterate the file object directly instead of loading every line up front.
    for line in tw:
        row = line.strip()
        if not row:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        # Unpack all six columns so a malformed row still fails loudly; the
        # last two are unused and named so they do not shadow the builtin
        # `type` (nor does `tweet_id` shadow `id`).
        tweet_id, start, end, entity, _saliency, _type = row.split('\t')
        gt[tweet_id].append([int(start), int(end), entity])
# In[174]:
# Write one tab-separated record per tweet:
#   id, tweet text, then a (mention, entity) pair for every annotation,
# where the mention is the tweet substring at the annotation's [start:end].
with codecs.open('/Users/xi/Downloads/training_microposts.txt', 'w', encoding='utf-8') as out:
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for tweet_id, tweet in tweets.items():
        # Drop any line terminator still attached to the text so the record
        # stays on a single output line. Only the tail is stripped, so the
        # start/end offsets used below are unaffected.
        tweet = tweet.rstrip(u'\r\n')
        fields = []
        # .get() avoids inserting an empty list into the defaultdict for
        # tweets that have no annotations.
        for start, end, entity in gt.get(tweet_id, ()):
            fields.append(tweet[start:end])
            fields.append(entity)
        # With no annotations the joined part is empty, which keeps the
        # trailing tab after the tweet text just as before.
        out.write(tweet_id + u'\t' + tweet + u'\t' + u'\t'.join(fields) + u'\n')
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment