Created
September 1, 2020 06:43
-
-
Save aryamanarora/b98d1e36ddc24d9d075ac4785260152a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import glob
import os
from collections import Counter
def language_name(path):
    """Return the language label for a mapping file.

    The label is the file's base name without its extension, e.g.
    './french.txt' -> 'french'.  Using basename/splitext instead of
    str.strip matters because strip() removes a *set of characters* from
    both ends, so a name such as 'text.txt' would be reduced to 'e'.
    """
    return os.path.splitext(os.path.basename(path))[0]


# Maps the first column of every non-header row of each './*.txt' file
# (used later as a document id) to the language named by that file.
lang = {}
for file in glob.glob('./*.txt'):
    # files with 'task' in their path are annotation data, not language lists
    if 'task' in file:
        continue
    # newline='' is what the csv module expects; utf-8 is assumed for the data
    with open(file, 'r', encoding='utf-8', newline='') as fin:
        for i, row in enumerate(csv.reader(fin)):
            # skip the header row and any completely blank line
            if i == 0 or not row:
                continue
            lang[row[0]] = language_name(file)
# Accumulators filled while scanning the task files.
data = []    # one tuple per target: (target text, ss1, ss2, target id, language)
rows = []    # every token row, in reading order
count = 0    # total number of token rows seen
sent = {}    # '<doc>_<sent>' -> list of token forms of that sentence
tokens = {}  # document id -> number of token rows in that document

# for merging all possessives into 's
# NOTE(review): the stored replacement is the bare string 's' (no apostrophe),
# kept as in the original -- confirm which spelling is intended.
possessives = ['my', 'your', 'his', 'her', 'our', 'their', 'its', 'whose']


def new_mwe(num=None):
    """Return an empty multiword-expression accumulator with MWE number *num*."""
    return {'targets': [], 'ss1': None, 'ss2': None, 'id': None,
            'num': num, 'doc': None}


def flush_mwe(mwe, doc_languages, data_out):
    """Append *mwe* to *data_out* as a target tuple if it holds any tokens.

    The language is looked up from the document the MWE was collected in,
    not from whatever document is current when the flush happens.
    """
    if mwe['targets']:
        data_out.append((' '.join(x.lower() for x in mwe['targets']),
                         mwe['ss1'], mwe['ss2'], mwe['id'],
                         doc_languages[mwe['doc']]))


def collect_targets(reader, doc_languages, data_out, rows_out, sent_out, tokens_out):
    """Scan the rows of one task file and record its tokens and targets.

    Parameters:
        reader: iterable of rows (lists of strings); the first row is a
            header and is skipped.  Token rows are read as
            [token id, form, MWE marker, label 1, label 2, ...].
        doc_languages: mapping from document id to language.
        data_out: list receiving one tuple per annotated target.
        rows_out: list receiving every token row.
        sent_out: dict receiving the token forms of each sentence.
        tokens_out: dict receiving the per-document token counts.

    Returns:
        The number of token rows seen in this file.

    Raises:
        KeyError: if a target belongs to a document missing from
            *doc_languages*.
    """
    seen = 0
    cur_doc = None
    cur_sent = None
    cur_mwe = new_mwe()
    sent_so_far = []
    for i, row in enumerate(reader):
        # skip the header row and completely blank lines
        if i == 0 or not row:
            continue
        first = row[0]
        # get document id
        if first.startswith('# new_doc'):
            cur_doc = first[15:]
        # new sentence
        elif first.startswith('# sent_id'):
            sent_so_far = []
            cur_sent = first[12:]
            # reset mwes at every sentence
            flush_mwe(cur_mwe, doc_languages, data_out)
            cur_mwe = new_mwe()
        # actual tokens
        elif first and not first.startswith('#'):
            # track current sentence so context can be viewed later
            sent_so_far.append(row[1])
            sent_out[cur_doc + '_' + cur_sent] = sent_so_far
            # keep count of tokens in document
            tokens_out[cur_doc] = tokens_out.get(cur_doc, 0) + 1
            rows_out.append(row)
            seen += 1
            target_id = cur_doc + '_' + cur_sent + '_' + first
            # if current token is part of an MWE target
            # this assumes MWEs do not overlap, which appears to be reasonable for English
            if ':' in row[2]:
                if row[3] and row[4]:
                    number = row[2].split(':')[0]
                    # a different MWE number means the previous MWE is
                    # complete: save it and start a new one
                    if cur_mwe['num'] != number and cur_mwe['targets']:
                        flush_mwe(cur_mwe, doc_languages, data_out)
                        cur_mwe = new_mwe(number)
                    cur_mwe['num'] = number
                    cur_mwe['targets'].append(row[1])
                    cur_mwe['id'] = target_id
                    cur_mwe['doc'] = cur_doc
                    # keep the labels of the token that actually carries them;
                    # '_' and ' ' mark tokens without their own labels
                    if row[3] != '_' and row[3] != ' ':
                        cur_mwe['ss1'], cur_mwe['ss2'] = row[3], row[4]
            # a normal non-mwe target
            elif row[2] == '*':
                if row[3] and row[4]:
                    # use a local so the row stored in rows_out is not altered
                    form = row[1].lower()
                    if form in possessives:
                        form = 's'
                    data_out.append((form, row[3], row[4], target_id,
                                     doc_languages[cur_doc]))
    # leftover mwe at the end of the file
    flush_mwe(cur_mwe, doc_languages, data_out)
    return seen


for file in glob.glob('./task*'):
    # newline='' is what the csv module expects; utf-8 is assumed for the data
    with open(file, 'r', encoding='utf-8', newline='') as fin:
        count += collect_targets(csv.reader(fin), lang, data, rows, sent, tokens)
# Summary of what was collected: token/target totals, then per-language
# counts of documents, tokens, sentences and targets.
print(count, len(data))
# show one sample target; guarded so an empty run does not raise IndexError
if data:
    print(data[0])

print('DOCUMENTS')
# a target id is '<doc>_<sent>_<token>'; de-duplicate (doc, language) pairs
# so that every document is counted once
doc_languages = {(entry[3].split('_')[0], entry[4]) for entry in data}
print(Counter(language for _, language in doc_languages))

print('TOKENS')
# the four expected languages are pre-seeded so they are always printed;
# .get() lets any other language be counted instead of raising KeyError
tokens_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for doc, n_tokens in tokens.items():
    language = lang[doc]
    tokens_lang[language] = tokens_lang.get(language, 0) + n_tokens
print(tokens_lang)

print('SENTENCES')
sent_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for sentence in sent:
    # sentence keys are '<doc>_<sent>'
    language = lang[sentence.split('_')[0]]
    sent_lang[language] = sent_lang.get(language, 0) + 1
print(sent_lang)

print('TARGETS')
print(Counter(entry[4] for entry in data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Output: