Skip to content

Instantly share code, notes, and snippets.

@aryamanarora
Created September 1, 2020 06:43
Show Gist options
  • Save aryamanarora/b98d1e36ddc24d9d075ac4785260152a to your computer and use it in GitHub Desktop.
Save aryamanarora/b98d1e36ddc24d9d075ac4785260152a to your computer and use it in GitHub Desktop.
import csv
import glob
import os
from collections import Counter
def language_of(path):
    """Return the language name encoded in a metadata file path.

    The language is the file's base name with its extension removed, e.g.
    ``'./german.txt'`` -> ``'german'``.  ``str.strip`` is deliberately not
    used: it removes a *set of characters* from both ends, which mangles
    names that begin or end with ``.``, ``/``, ``t`` or ``x``
    (``'turkish.txt'`` would become ``'urkish'``).
    """
    return os.path.splitext(os.path.basename(path))[0]


def load_languages(pattern='./*.txt'):
    """Build a mapping from document id to language name.

    Every file matching ``pattern`` whose name does not contain ``'task'``
    is read as CSV.  The first row of each file is skipped, and the first
    column of every remaining non-empty row is used as a document id that
    maps to the language derived from the file name.

    Args:
        pattern: glob pattern selecting the per-language files.

    Returns:
        dict mapping document id (str) to language name (str).
    """
    mapping = {}
    # sorted() makes the result independent of directory listing order
    # should the same document id ever appear in more than one file.
    for path in sorted(glob.glob(pattern)):
        # files with 'task' in their name are parsed separately
        if 'task' in os.path.basename(path):
            continue
        language = language_of(path)
        # newline='' is what the csv module asks for when reading files
        with open(path, 'r', newline='') as fin:
            for i, row in enumerate(csv.reader(fin)):
                # skip the first row and any blank line (csv yields [] for those)
                if i == 0 or not row:
                    continue
                mapping[row[0]] = language
    return mapping


lang = load_languages()
# One tuple per annotated target:
# (lower-cased target text, label from column 3, label from column 4,
#  '<doc>_<sent>_<token id>', language of the document)
data = []
# every token row seen, in reading order
rows = []
# total number of token rows
count = 0
# '<doc>_<sent>' -> list of token forms, so context can be viewed later
sent = {}
# document id -> number of token rows
tokens = {}
# for merging all possessives into 's
possessives = ['my', 'your', 'his', 'her', 'our', 'their', 'its', 'whose']


def _empty_mwe(num=None):
    """Return a fresh accumulator for a multiword-expression (MWE) target."""
    return {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': num}


def _flush_mwe(mwe, doc):
    """Append the pending MWE to ``data`` under ``doc``'s language, if any.

    Does nothing when no target tokens have been collected.
    """
    if mwe['targets']:
        data.append((' '.join(x.lower() for x in mwe['targets']),
                     mwe['ss1'], mwe['ss2'], mwe['id'],
                     lang[doc]))


# sorted() keeps the order of ``data`` reproducible across file systems
for file in sorted(glob.glob('./task*')):
    # newline='' is what the csv module asks for when reading files
    with open(file, 'r', newline='') as fin:
        reader = csv.reader(fin)
        cur_doc = None
        cur_sent = None
        cur_mwe = _empty_mwe()
        sent_so_far = []
        for i, row in enumerate(reader):
            # skip the first row and blank lines (csv yields [] for those)
            if i == 0 or not row:
                continue
            # get document id
            if row[0].startswith('# new_doc'):
                # Flush BEFORE switching documents: otherwise the last MWE
                # of the previous document would be looked up under the new
                # document's language when it is flushed at the next
                # '# sent_id' line.
                _flush_mwe(cur_mwe, cur_doc)
                cur_mwe = _empty_mwe()
                cur_doc = row[0][15:]
            # new sentence
            elif row[0].startswith('# sent_id'):
                sent_so_far = []
                cur_sent = row[0][12:]
                # reset mwes at every sentence
                _flush_mwe(cur_mwe, cur_doc)
                cur_mwe = _empty_mwe()
            # actual tokens
            elif row[0] and not row[0].startswith('#'):
                # track current sentence so context can be viewed later
                sent_so_far.append(row[1])
                sent[cur_doc + '_' + cur_sent] = sent_so_far
                # keep count of tokens in document
                tokens[cur_doc] = tokens.get(cur_doc, 0) + 1
                rows.append(row)
                count += 1
                token_id = cur_doc + '_' + cur_sent + '_' + row[0]
                # if current token is part of an MWE target
                # this assumes MWEs do not overlap, which appears to be
                # reasonable for English
                if ':' in row[2]:
                    if row[3] and row[4]:
                        number = row[2].split(':')[0]
                        # a different MWE number means the previous MWE is
                        # complete: save it and start a new one
                        if cur_mwe['num'] != number:
                            _flush_mwe(cur_mwe, cur_doc)
                            cur_mwe = _empty_mwe(number)
                        cur_mwe['targets'].append(row[1])
                        # the id ends up being that of the MWE's last token
                        cur_mwe['id'] = token_id
                        # if the current target has supersenses then keep
                        # them (other MWE tokens may carry empty labels)
                        if row[3] != '_' and row[3] != ' ':
                            cur_mwe['ss1'], cur_mwe['ss2'] = row[3], row[4]
                # a normal non-mwe target
                elif row[2] == '*':
                    if row[3] and row[4]:
                        # Use a local instead of overwriting row[1]: the
                        # same list object is already stored in ``rows``
                        # and must keep the original token form.
                        target = row[1].lower()
                        if target in possessives:
                            target = 's'
                        data.append((target, row[3], row[4],
                                     token_id,
                                     lang[cur_doc]))
        # leftover mwe at the end of the file
        _flush_mwe(cur_mwe, cur_doc)
# Summary report: totals first, then per-language breakdowns.
print(count, len(data))
# show one sample record; guarded so an empty corpus does not crash
if data:
    print(data[0])

print('DOCUMENTS')
# Distinct (document id, language) pairs among documents that have at
# least one recorded target; the document id is the part of the target id
# before the first underscore.
doc_langs = {(y[3].split('_')[0], y[4]) for y in data}
print(Counter(language for _, language in doc_langs))

print('TOKENS')
# The four expected languages are pre-seeded so they are always printed,
# in this order; .get() lets any other language be tallied instead of
# raising KeyError.
tokens_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for doc, n_tokens in tokens.items():
    language = lang[doc]
    tokens_lang[language] = tokens_lang.get(language, 0) + n_tokens
print(tokens_lang)

# heading spelling kept as-is so the output matches earlier runs
print('SENTENCS')
sent_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for sentence in sent:
    # keys are '<doc>_<sent>'; the document id precedes the first underscore
    language = lang[sentence.split('_')[0]]
    sent_lang[language] = sent_lang.get(language, 0) + 1
print(sent_lang)

print('TARGETS')
print(Counter(x[4] for x in data))
@aryamanarora
Copy link
Author

Output:

22484 2394
('on', 'Instrument', 'Locus', '73a695f2-7da5-6266-bff7-03ce8613b181_1_14', 'german')
DOCUMENTS
Counter({'german': 74, 'french': 74, 'english': 67, 'spanish': 65})
TOKENS
{'french': 5297, 'german': 6313, 'english': 5412, 'spanish': 5462}
SENTENCS
{'french': 281, 'german': 334, 'english': 284, 'spanish': 256}
TARGETS
Counter({'german': 675, 'spanish': 601, 'english': 579, 'french': 539})

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment