# lizaku/parse_BNC_for_clauses.py

## parse_BNC_for_clauses.py
from conllu.parser import parse, parse_tree
import pandas as pd
import random

# Script configuration.
#PATH = 'BNC/2554/download/Texts/A/A0'
DATA_NUM = 1000  # number of feature rows main() samples for the output table
RELS = {'amod', 'acl'}  # dependency relations accepted by process_conllu()
CONLLU_PATH = 'test.conllu'  # parsed CoNLL-U input read by main()


def cleansed_text(fname, out_path=None):
    """Strip markup tags from a file and append the plain text to another file.

    Every substring matching ``<...>`` is removed from the contents of
    *fname*; the remaining text plus a trailing newline is appended to the
    output file.

    Args:
        fname: Path of the marked-up input file.
        out_path: Path of the file to append to. When omitted, the
            module-level ``TEXT_PATH`` is used.

    Raises:
        NameError: If *out_path* is omitted and ``TEXT_PATH`` is not defined.
    """
    # Imported locally because the module-level imports do not provide ``re``.
    import re

    with open(fname) as f_in:
        contents = f_in.read()
    text = re.sub(r'<[^>]*>', '', contents)

    if out_path is None:
        out_path = TEXT_PATH
    with open(out_path, 'a') as f_out:
        f_out.write(text + '\n')


def process_conllu(data):
    """Collect verb-headed clauses from the first tree parsed out of *data*.

    A node is selected when its ``deprel`` is in ``RELS`` and its
    ``xpostag`` starts with ``'V'``.

    Args:
        data: CoNLL-U text; only the first tree returned by ``parse_tree``
            is inspected.

    Returns:
        list: ``(node, head)`` pairs, where ``head`` is the node whose id
        equals the clause token's ``head`` field, or ``''`` when no such
        node has been seen. An empty list is returned when nothing parses.
    """
    trees = parse_tree(data)
    if not trees:
        # Nothing parsed (e.g. blank input): there is no root to walk.
        return []

    clauses = []
    by_id = {}
    # depth_first yields a node before its children, so a node's head is
    # normally registered in by_id before the node itself is reached.
    for const in depth_first(trees[0]):
        token = const[0]
        by_id[token['id']] = const
        # NOTE(review): the parser appears to map an unspecified ("_")
        # xpostag to None -- confirm; treat it as "not a verb" either way.
        xpostag = token['xpostag'] or ''
        if token['deprel'] in RELS and xpostag.startswith('V'):
            # '' marks a clause whose head is not part of the walked tree.
            clauses.append((const, by_id.get(token['head'], '')))
    return clauses


def depth_first(node):
    """Yield *node* and every descendant in pre-order (parent before children).

    A node is any pair-like object whose second item (``node[1]``) holds its
    child nodes.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        yield current
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(list(current[1])))


def breadth_first(node):
    """Yield *node* and all of its descendants level by level.

    The root comes first, then its children from left to right, then their
    children, and so on. A node is any pair-like object whose second item
    (``node[1]``) holds its child nodes.
    """
    from collections import deque

    queue = deque([node])
    while queue:
        current = queue.popleft()
        yield current
        # Children go to the back of the queue, so the whole current level
        # is emitted before any node of the next level.
        queue.extend(current[1])


def features(clause):
    """Build the feature dictionary describing one extracted clause.

    Args:
        clause: A ``(cl, head)`` pair. ``cl`` is the clause's root node and
            ``head`` the node it attaches to, or a falsy value when no head
            is known. Nodes are indexed as ``node[0]`` (token fields) and
            ``node[1]`` (child nodes).

    Returns:
        dict: with the keys

        * ``'type'`` -- ``'to'`` if the text starts with ``'to '``, else
          ``'rel'`` if one of the tokens is ``that``, else ``'part'``.
        * ``'position'`` / ``'binary position'`` -- ``'after'`` / ``0`` when
          the head id is smaller than the clause id, ``'before'`` / ``1``
          when it is larger, otherwise ``None``.
        * ``'text'`` -- forms of the clause subtree joined in id order.
        * ``'voice'`` / ``'binary voice'`` -- ``'active'`` / ``0`` for an
          xpostag ending in ``G`` or ``D``, ``'passive'`` / ``1`` for one
          ending in ``N``, otherwise ``None``.
        * ``'length'`` -- number of space-separated items in ``'text'``.
        * ``'distance'`` -- absolute id difference between head and clause,
          ``None`` without a head.
        * ``'head lemma'``, ``'verb lemma'`` -- lemmas of head and clause
          root (``None`` for a missing head).
    """
    cl, head = clause
    token = cl[0]
    cl_id = token['id']

    # text: forms of the whole subtree, restored to sentence order by id
    ordered = sorted(
        ((node[0]['id'], node[0]['form']) for node in depth_first(cl)),
        key=lambda m: m[0])
    forms = [form for _, form in ordered]
    text = ' '.join(forms)

    # type: relative, participial, to
    if text.startswith('to '):
        typ = 'to'
    elif 'that' in forms:
        # Whole-token match, so words merely containing "that" do not count.
        typ = 'rel'
    else:
        typ = 'part'

    # Head-dependent features stay None when there is no head; position also
    # stays None if head and clause share an id, instead of being unbound.
    position = bin_position = distance = head_lemma = None
    if head:
        head_id = head[0]['id']
        if head_id < cl_id:
            position, bin_position = 'after', 0
        elif head_id > cl_id:
            position, bin_position = 'before', 1
        distance = abs(head_id - cl_id)
        head_lemma = head[0]['lemma']

    # voice, decided by the last letter of the verb tag
    # NOTE(review): presumably -G/-D/-N are participle/past tag suffixes of
    # the tagset in use -- confirm against the tagger.
    xpostag = token['xpostag']
    if xpostag.endswith(('G', 'D')):
        voice, bin_voice = 'active', 0
    elif xpostag.endswith('N'):
        voice, bin_voice = 'passive', 1
    else:
        voice = bin_voice = None

    return {'type': typ,
            'position': position,
            'binary position': bin_position,
            'text': text,
            'voice': voice,
            'binary voice': bin_voice,
            'length': len(text.split(' ')),
            'distance': distance,
            'head lemma': head_lemma,
            'verb lemma': token['lemma'],
            }


def main():
    """Extract clause features from ``CONLLU_PATH`` and write a TSV sample.

    The input is cut into sentences at every line starting with ``'1\\t'``.
    Each sentence is passed to ``process_conllu`` and every clause found is
    turned into a feature row by ``features``. At most ``DATA_NUM`` randomly
    chosen rows are written to ``test_data.csv`` (tab-separated, missing
    values written as ``None``).
    """
    sents = []
    sent = []
    with open(CONLLU_PATH) as parsed:
        for line in parsed:
            if line.startswith('1\t'):
                # Token 1 opens a new sentence: store the lines gathered so far.
                sents.append(sent)
                sent = [line]
            else:
                sent.append(line)
    # Flush the sentence still being collected at end of file; without this
    # the last sentence of the input would be dropped.
    sents.append(sent)

    data = []
    # sents[0] holds whatever preceded the first token line, not a sentence.
    for s in sents[1:]:
        for cl in process_conllu(''.join(s)):
            data.append(features(cl))

    # optional -- decrease amount of data
    # Cap the sample size: random.sample raises ValueError when asked for
    # more items than are available.
    use_data = random.sample(data, min(DATA_NUM, len(data)))
    df = pd.DataFrame(use_data)
    df.to_csv('test_data.csv', sep='\t', na_rep='None')


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	from conllu.parser import parse, parse_tree
	import pandas as pd
	import random

	# Script configuration.
	#PATH = 'BNC/2554/download/Texts/A/A0'
	DATA_NUM = 1000  # number of feature rows main() samples for the output table
	RELS = {'amod', 'acl'}  # dependency relations accepted by process_conllu()
	CONLLU_PATH = 'test.conllu'  # parsed CoNLL-U input read by main()


	def cleansed_text(fname, out_path=None):
	    """Strip markup tags from a file and append the plain text to another file.

	    Every substring matching ``<...>`` is removed from the contents of
	    *fname*; the remaining text plus a trailing newline is appended to the
	    output file.

	    Args:
	        fname: Path of the marked-up input file.
	        out_path: Path of the file to append to. When omitted, the
	            module-level ``TEXT_PATH`` is used.

	    Raises:
	        NameError: If *out_path* is omitted and ``TEXT_PATH`` is not defined.
	    """
	    # Imported locally because the module-level imports do not provide ``re``.
	    import re

	    with open(fname) as f_in:
	        contents = f_in.read()
	    text = re.sub(r'<[^>]*>', '', contents)

	    if out_path is None:
	        out_path = TEXT_PATH
	    with open(out_path, 'a') as f_out:
	        f_out.write(text + '\n')


	def process_conllu(data):
	    """Collect verb-headed clauses from the first tree parsed out of *data*.

	    A node is selected when its ``deprel`` is in ``RELS`` and its
	    ``xpostag`` starts with ``'V'``.

	    Args:
	        data: CoNLL-U text; only the first tree returned by ``parse_tree``
	            is inspected.

	    Returns:
	        list: ``(node, head)`` pairs, where ``head`` is the node whose id
	        equals the clause token's ``head`` field, or ``''`` when no such
	        node has been seen. An empty list is returned when nothing parses.
	    """
	    trees = parse_tree(data)
	    if not trees:
	        # Nothing parsed (e.g. blank input): there is no root to walk.
	        return []

	    clauses = []
	    by_id = {}
	    # depth_first yields a node before its children, so a node's head is
	    # normally registered in by_id before the node itself is reached.
	    for const in depth_first(trees[0]):
	        token = const[0]
	        by_id[token['id']] = const
	        # NOTE(review): the parser appears to map an unspecified ("_")
	        # xpostag to None -- confirm; treat it as "not a verb" either way.
	        xpostag = token['xpostag'] or ''
	        if token['deprel'] in RELS and xpostag.startswith('V'):
	            # '' marks a clause whose head is not part of the walked tree.
	            clauses.append((const, by_id.get(token['head'], '')))
	    return clauses


	def depth_first(node):
	    """Yield *node* and every descendant in pre-order (parent before children).

	    A node is any pair-like object whose second item (``node[1]``) holds its
	    child nodes.
	    """
	    yield node
	    for child in node[1]:
	        # Re-yield everything produced by the child's own traversal.
	        for n in depth_first(child):
	            yield n


	def breadth_first(node):
	    """Yield *node* and all of its descendants level by level.

	    The root comes first, then its children from left to right, then their
	    children, and so on. A node is any pair-like object whose second item
	    (``node[1]``) holds its child nodes.
	    """
	    from collections import deque

	    queue = deque([node])
	    while queue:
	        current = queue.popleft()
	        yield current
	        # Children go to the back of the queue, so the whole current level
	        # is emitted before any node of the next level.
	        queue.extend(current[1])


	def features(clause):
	    """Build the feature dictionary describing one extracted clause.

	    Args:
	        clause: A ``(cl, head)`` pair. ``cl`` is the clause's root node and
	            ``head`` the node it attaches to, or a falsy value when no head
	            is known. Nodes are indexed as ``node[0]`` (token fields) and
	            ``node[1]`` (child nodes).

	    Returns:
	        dict: with the keys

	        * ``'type'`` -- ``'to'`` if the text starts with ``'to '``, else
	          ``'rel'`` if one of the tokens is ``that``, else ``'part'``.
	        * ``'position'`` / ``'binary position'`` -- ``'after'`` / ``0`` when
	          the head id is smaller than the clause id, ``'before'`` / ``1``
	          when it is larger, otherwise ``None``.
	        * ``'text'`` -- forms of the clause subtree joined in id order.
	        * ``'voice'`` / ``'binary voice'`` -- ``'active'`` / ``0`` for an
	          xpostag ending in ``G`` or ``D``, ``'passive'`` / ``1`` for one
	          ending in ``N``, otherwise ``None``.
	        * ``'length'`` -- number of space-separated items in ``'text'``.
	        * ``'distance'`` -- absolute id difference between head and clause,
	          ``None`` without a head.
	        * ``'head lemma'``, ``'verb lemma'`` -- lemmas of head and clause
	          root (``None`` for a missing head).
	    """
	    cl, head = clause
	    token = cl[0]
	    cl_id = token['id']

	    # text: forms of the whole subtree, restored to sentence order by id
	    ordered = sorted(
	        ((node[0]['id'], node[0]['form']) for node in depth_first(cl)),
	        key=lambda m: m[0])
	    forms = [form for _, form in ordered]
	    text = ' '.join(forms)

	    # type: relative, participial, to
	    if text.startswith('to '):
	        typ = 'to'
	    elif 'that' in forms:
	        # Whole-token match, so words merely containing "that" do not count.
	        typ = 'rel'
	    else:
	        typ = 'part'

	    # Head-dependent features stay None when there is no head; position also
	    # stays None if head and clause share an id, instead of being unbound.
	    position = bin_position = distance = head_lemma = None
	    if head:
	        head_id = head[0]['id']
	        if head_id < cl_id:
	            position, bin_position = 'after', 0
	        elif head_id > cl_id:
	            position, bin_position = 'before', 1
	        distance = abs(head_id - cl_id)
	        head_lemma = head[0]['lemma']

	    # voice, decided by the last letter of the verb tag
	    # NOTE(review): presumably -G/-D/-N are participle/past tag suffixes of
	    # the tagset in use -- confirm against the tagger.
	    xpostag = token['xpostag']
	    if xpostag.endswith(('G', 'D')):
	        voice, bin_voice = 'active', 0
	    elif xpostag.endswith('N'):
	        voice, bin_voice = 'passive', 1
	    else:
	        voice = bin_voice = None

	    return {'type': typ,
	            'position': position,
	            'binary position': bin_position,
	            'text': text,
	            'voice': voice,
	            'binary voice': bin_voice,
	            'length': len(text.split(' ')),
	            'distance': distance,
	            'head lemma': head_lemma,
	            'verb lemma': token['lemma'],
	            }


	def main():
	    """Extract clause features from ``CONLLU_PATH`` and write a TSV sample.

	    The input is cut into sentences at every line starting with ``'1\\t'``.
	    Each sentence is passed to ``process_conllu`` and every clause found is
	    turned into a feature row by ``features``. At most ``DATA_NUM`` randomly
	    chosen rows are written to ``test_data.csv`` (tab-separated, missing
	    values written as ``None``).
	    """
	    sents = []
	    sent = []
	    with open(CONLLU_PATH) as parsed:
	        for line in parsed:
	            if line.startswith('1\t'):
	                # Token 1 opens a new sentence: store the lines gathered so far.
	                sents.append(sent)
	                sent = [line]
	            else:
	                sent.append(line)
	    # Flush the sentence still being collected at end of file; without this
	    # the last sentence of the input would be dropped.
	    sents.append(sent)

	    data = []
	    # sents[0] holds whatever preceded the first token line, not a sentence.
	    for s in sents[1:]:
	        for cl in process_conllu(''.join(s)):
	            data.append(features(cl))

	    # optional -- decrease amount of data
	    # Cap the sample size: random.sample raises ValueError when asked for
	    # more items than are available.
	    use_data = random.sample(data, min(DATA_NUM, len(data)))
	    df = pd.DataFrame(use_data)
	    df.to_csv('test_data.csv', sep='\t', na_rep='None')


	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()