aryamanarora/main.py

## main.py
import csv, glob
from collections import Counter

def language_from_path(path):
    """Return the language label encoded in a language-list file name.

    The label is the file name without its directory and extension,
    e.g. ``'./french.txt'`` -> ``'french'``.
    """
    # Function-scope import keeps this helper self-contained.
    import os.path

    # str.strip() removes a *set of characters* from both ends, not a prefix
    # or suffix: the previous file.strip('./').strip('.txt') turned
    # './turkish.txt' into 'urkish'.
    return os.path.splitext(os.path.basename(path))[0]


def load_language_map(pattern='./*.txt'):
    """Build the id -> language mapping from the files matching *pattern*.

    Every matching file whose path does not contain 'task' is read as CSV;
    its first row is skipped and the first column of each remaining row is
    mapped to the label derived from the file name.
    """
    mapping = {}
    for path in glob.glob(pattern):
        # Paths containing 'task' are left to the './task*' loop further down.
        if 'task' in path:
            continue
        label = language_from_path(path)
        # newline='' is what the csv module asks for when given a file object.
        with open(path, 'r', newline='') as fin:
            reader = csv.reader(fin)
            next(reader, None)  # the first row is never used as data
            for row in reader:
                # csv yields [] for a blank line; row[0] would raise on it.
                if row:
                    mapping[row[0]] = label
    return mapping


lang = load_language_map()

data = []
rows = []
count = 0
sent = {}
tokens = {}

# Possessive words; as single-token targets they are all collapsed into the
# one target string 's'.
possessives = ['my', 'your', 'his', 'her', 'our', 'their', 'its', 'whose']


def new_mwe(num=None):
    """Return an empty accumulator for one multiword-expression (MWE) target."""
    return {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': num}


def flush_mwe(mwe, doc, lang_map, out):
    """Append the target accumulated in *mwe* to *out*; do nothing if it is empty.

    The language is looked up only when there is something to store, so an
    empty accumulator never touches *lang_map* (``doc`` may still be None).
    """
    if mwe['targets']:
        out.append((' '.join(x.lower() for x in mwe['targets']),
                    mwe['ss1'], mwe['ss2'], mwe['id'], lang_map[doc]))


def collect_targets(reader, lang_map, data_out, rows_out, sent_out, tokens_out):
    """Scan one annotation table and record its targets and statistics.

    Args:
        reader: iterable of rows (lists of strings), e.g. a ``csv.reader``.
            The first row is skipped. Column 0 holds either a '#' comment
            line or the token id, column 1 the token, column 2 the MWE
            marker ('N:k' for MWE parts, '*' for a single-token target) and
            columns 3 and 4 the two supersense labels.
        lang_map: mapping from document id to language label.
        data_out: list receiving one
            ``(target, ss1, ss2, target_id, language)`` tuple per target.
        rows_out: list receiving every token row.
        sent_out: dict receiving ``'<doc>_<sent>' -> [tokens]``.
        tokens_out: dict receiving ``doc -> token count``.

    Returns:
        The number of token rows seen.

    Raises:
        KeyError: if a target belongs to a document missing from *lang_map*.
    """
    cur_doc = None
    cur_sent = None
    cur_mwe = new_mwe()
    sent_so_far = []
    seen = 0

    for i, row in enumerate(reader):
        # Row 0 is skipped; csv yields [] for a blank line, which would
        # otherwise raise IndexError on row[0].
        if i == 0 or not row:
            continue
        first = row[0]

        # get document id
        if first.startswith('# new_doc'):
            # Flush BEFORE cur_doc changes: previously a pending MWE was only
            # stored at the next '# sent_id', i.e. after cur_doc had already
            # moved on, so it was labelled with the next document's language.
            flush_mwe(cur_mwe, cur_doc, lang_map, data_out)
            cur_mwe = new_mwe()
            cur_doc = first[15:]

        # new sentence
        elif first.startswith('# sent_id'):
            sent_so_far = []
            cur_sent = first[12:]
            # MWEs never span sentences: store and reset at every sentence
            flush_mwe(cur_mwe, cur_doc, lang_map, data_out)
            cur_mwe = new_mwe()

        # actual tokens
        elif first and not first.startswith('#'):
            # track current sentence so context can be viewed later
            sent_so_far.append(row[1])
            sent_out[cur_doc + '_' + cur_sent] = sent_so_far
            # keep count of tokens in document
            tokens_out[cur_doc] = tokens_out.get(cur_doc, 0) + 1
            rows_out.append(row)
            seen += 1

            # current token is part of an MWE target
            # (assumes MWEs do not overlap)
            if ':' in row[2]:
                if row[3] and row[4]:
                    number = row[2].split(':')[0]
                    # a different MWE number means the previous MWE is
                    # complete: store it and start a new one
                    if cur_mwe['num'] != number and cur_mwe['targets']:
                        flush_mwe(cur_mwe, cur_doc, lang_map, data_out)
                        cur_mwe = new_mwe(number)
                    cur_mwe['num'] = number
                    cur_mwe['targets'].append(row[1])
                    # the id is overwritten per part, so it ends up naming
                    # the last token of the MWE
                    cur_mwe['id'] = cur_doc + '_' + cur_sent + '_' + first
                    # keep the supersenses of whichever part carries real
                    # labels (other parts may hold '_' or ' ')
                    if row[3] != '_' and row[3] != ' ':
                        cur_mwe['ss1'], cur_mwe['ss2'] = row[3], row[4]

            # a normal non-mwe target
            elif row[2] == '*':
                if row[3] and row[4]:
                    if row[1].lower() in possessives:
                        # NOTE: rewrites the row in place, so the copy
                        # already stored in rows_out shows 's' as well
                        row[1] = 's'
                    data_out.append((row[1].lower(), row[3], row[4],
                                     cur_doc + '_' + cur_sent + '_' + first,
                                     lang_map[cur_doc]))

    # leftover mwe at the end of the table
    flush_mwe(cur_mwe, cur_doc, lang_map, data_out)
    return seen


for file in glob.glob('./task*'):
    # newline='' is what the csv module asks for when given a file object.
    with open(file, 'r', newline='') as fin:
        count += collect_targets(csv.reader(fin), lang, data, rows, sent, tokens)

# Summary report: overall sizes, then per-language counts of documents,
# tokens, sentences and targets.
print(count, len(data))
# When nothing was collected (no task files, or no labelled targets) the
# unconditional data[0] raised IndexError; only show a sample if one exists.
if data:
    print(data[0])

print('DOCUMENTS')
# Each (document id, language) pair is counted once; the document id is the
# part of the target id before the first '_'.
# NOTE(review): assumes document ids themselves contain no '_' - confirm.
documents = {(target[3].split('_')[0], target[4]) for target in data}
print(Counter(language for _, language in documents))

print('TOKENS')
tokens_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for doc in tokens:
    language = lang[doc]
    # .get() lets a language outside the four preset keys be counted instead
    # of raising KeyError; the preset keys keep their zero entries.
    tokens_lang[language] = tokens_lang.get(language, 0) + tokens[doc]
print(tokens_lang)

print('SENTENCS')
sent_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
for sentence in sent:
    # sentence keys are '<doc>_<sent>'; the document id is the leading part
    language = lang[sentence.split('_')[0]]
    sent_lang[language] = sent_lang.get(language, 0) + 1
print(sent_lang)

print('TARGETS')
print(Counter(target[4] for target in data))
	# NOTE(review): this tab-indented section appears to repeat the script that
	# precedes it in this file. Its nested indentation had been flattened, so
	# the block structure is restored here; the statements are unchanged.
	import csv, glob
	from collections import Counter

	# id (first CSV column) -> language label taken from the file name
	lang = {}
	for file in glob.glob('./*.txt'):
	    if 'task' in file: continue
	    with open(file, 'r') as fin:
	        reader = csv.reader(fin)
	        for i, row in enumerate(reader):
	            if i == 0: continue
	            lang[row[0]] = file.strip('./').strip('.txt')

	data = []
	rows = []
	count = 0
	sent = {}
	tokens = {}

	# for merging all possessives into 's
	possessives = ['my', 'your', 'his', 'her', 'our', 'their', 'its', 'whose']
	for file in glob.glob('./task*'):
	    with open(file, 'r') as fin:
	        reader = csv.reader(fin)
	        cur_doc = None
	        cur_sent = None
	        cur_mwe = {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': None}
	        sent_so_far = []
	        for i, row in enumerate(reader):
	            if i == 0: continue

	            # get document id
	            if row[0].startswith('# new_doc'):
	                cur_doc = row[0][15:]

	            # new sentence
	            elif row[0].startswith('# sent_id'):
	                sent_so_far = []
	                cur_sent = row[0][12:]
	                # reset mwes at every sentence
	                if cur_mwe['targets']:
	                    data.append((' '.join([x.lower() for x in cur_mwe['targets']]),
	                             cur_mwe['ss1'], cur_mwe['ss2'], cur_mwe['id'],
	                             lang[cur_doc]))
	                cur_mwe = {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': None}

	            # actual tokens
	            elif row[0] and not row[0].startswith('#'):
	                # track current sentence so context can be viewed later
	                sent_so_far.append(row[1])
	                sent[cur_doc + '_' + cur_sent] = sent_so_far
	                # keep count of tokens in document
	                if cur_doc not in tokens: tokens[cur_doc] = 0
	                tokens[cur_doc] += 1
	                rows.append(row)
	                count += 1
	                # if current token is part of an MWE target
	                # this assumes MWEs do not overlap, which appears to be reasonable for English
	                if ':' in row[2]:
	                    if row[3] and row[4]:
	                        number = row[2].split(':')[0]
	                        # if this mwe is not continuing the previous mwe token (new id),
	                        # save that token and start a new mwe
	                        if cur_mwe['num'] != number and cur_mwe['targets']:
	                            data.append((' '.join([x.lower() for x in cur_mwe['targets']]),
	                                     cur_mwe['ss1'], cur_mwe['ss2'], cur_mwe['id'],
	                                     lang[cur_doc]))
	                            cur_mwe = {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': number}
	                        cur_mwe['num'] = number
	                        cur_mwe['targets'].append(row[1])
	                        cur_mwe['id'] = cur_doc + '_' + cur_sent + '_' + row[0]
	                        # if the current target has supersenses then keep it (in case of empty labels)
	                        if row[3] != '_' and row[3] != ' ': cur_mwe['ss1'], cur_mwe['ss2'] = row[3], row[4]

	                # a normal non-mwe target
	                elif row[2] == '*':
	                    if row[3] and row[4]:
	                        if row[1].lower() in possessives:
	                            row[1] = 's'
	                        data.append((row[1].lower(), row[3], row[4],
	                                     cur_doc + '_' + cur_sent + '_' + row[0],
	                                     lang[cur_doc]))

	        # leftover mwe at the end of the file
	        if cur_mwe['targets']:
	            data.append((' '.join([x.lower() for x in cur_mwe['targets']]),
	                         cur_mwe['ss1'], cur_mwe['ss2'], cur_mwe['id'],
	                         lang[cur_doc]))
	            cur_mwe = {'targets': [], 'ss1': None, 'ss2': None, 'id': None, 'num': None}

	print(count, len(data))
	print(data[0])
	print('DOCUMENTS')
	print(Counter([x[1] for x in set([(y[3].split('_')[0], y[4]) for y in data])]))

	print('TOKENS')
	tokens_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
	for doc in tokens:
	    tokens_lang[lang[doc]] += tokens[doc]
	print(tokens_lang)

	print('SENTENCS')
	sent_lang = {'french': 0, 'german': 0, 'english': 0, 'spanish': 0}
	for sentence in sent:
	    sent_lang[lang[sentence.split('_')[0]]] += 1
	print(sent_lang)

	print('TARGETS')
	print(Counter([x[4] for x in data]))