Last active
May 13, 2017 21:55
-
-
Save lizaku/47040bc2e8211d5b7ab8f71761422ac1 to your computer and use it in GitHub Desktop.
A script I wrote for a project in a Data Analysis course. It extracts all participial and relative clauses from the UD-parsed BNC and retrieves their basic features.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import re
from collections import deque

import pandas as pd
from conllu.parser import parse, parse_tree
# Earlier input location, kept for reference only (presumably a BNC download
# directory -- nothing in this file reads it).
#PATH = 'BNC/2554/download/Texts/A/A0'
# CoNLL-U file that main() reads.
CONLLU_PATH = 'test.conllu'
# Dependency relations a verb token must bear to be collected by
# process_conllu().
RELS = {'acl', 'amod'}
# Number of feature rows main() samples for the output table.
DATA_NUM = 1000
def cleansed_text(fname, out_path=None):
    """Strip markup tags from a file and append the remaining text to a file.

    Args:
        fname: Path of the input file to read.
        out_path: File the cleaned text is appended to. When omitted, the
            module-level ``TEXT_PATH`` is used, as the original code did.
    """
    with open(fname) as f_in:
        contents = f_in.read()
    # Drop everything that looks like a tag: '<', any run of non-'>', '>'.
    text = re.sub(r'<[^>]*>', '', contents)
    if out_path is None:
        # NOTE(review): TEXT_PATH is not defined in the visible part of this
        # file; callers must either define it or pass out_path explicitly.
        out_path = TEXT_PATH
    # Append mode: repeated calls accumulate one cleaned document per call.
    with open(out_path, 'a') as f_out:
        f_out.write(text + '\n')
def process_conllu(data):
    """Collect the verb-headed clauses of interest from one parsed sentence.

    Args:
        data: CoNLL-U text handed to ``parse_tree``. Only the first tree it
            returns is inspected.

    Returns:
        A list of ``(clause_node, head_node)`` pairs, one for every node
        whose ``deprel`` is in ``RELS`` and whose ``xpostag`` starts with
        ``'V'``. ``head_node`` is ``''`` when the token's head id has not
        been seen during the traversal. An empty list is returned when
        nothing could be parsed.
    """
    tree = parse_tree(data)
    if not tree:
        # Nothing parsed (e.g. empty input): avoid IndexError on tree[0].
        return []

    clauses = []
    by_id = {}
    for const in depth_first(tree[0]):
        token = const[0]
        # Register the node before looking up its head; depth_first yields
        # parents before children, so a real head is already in the map.
        by_id[token['id']] = const
        # A missing tag may be stored as None; treat it as "not a verb".
        xpostag = token['xpostag'] or ''
        if token['deprel'] in RELS and xpostag.startswith('V'):
            clauses.append((const, by_id.get(token['head'], '')))
    return clauses
def depth_first(node):
    """Yield ``node`` and all of its descendants in pre-order.

    Args:
        node: A tree node whose element ``[1]`` is an iterable of child
            nodes of the same shape.

    Yields:
        Every node of the subtree: a parent before its children, siblings in
        their stored order.
    """
    # An explicit stack replaces the recursive generator, so deep trees do
    # not hit the recursion limit and values are not re-yielded through one
    # generator frame per tree level.
    pending = [node]
    while pending:
        current = pending.pop()
        yield current
        # Push children reversed so the first child is popped (visited) first.
        pending.extend(reversed(list(current[1])))
def breadth_first(node):
    """Yield ``node`` and all of its descendants level by level.

    Args:
        node: A tree node whose element ``[1]`` is an iterable of child
            nodes of the same shape.

    Yields:
        The root first, then its children in stored order, then their
        children, and so on.
    """
    # FIFO queue: nodes are visited in the order they were discovered, which
    # is exactly level order.
    queue = deque([node])
    while queue:
        current = queue.popleft()
        yield current
        queue.extend(current[1])
def features(clause):
    """Describe one extracted clause as a flat dict of features.

    Args:
        clause: A ``(clause_node, head_node)`` pair as built by
            ``process_conllu``. ``head_node`` is falsy (``''``) when the
            clause has no known head.

    Returns:
        A dict with the keys ``type``, ``position``, ``binary position``,
        ``text``, ``voice``, ``binary voice``, ``length``, ``distance``,
        ``head lemma`` and ``verb lemma``. Values that cannot be determined
        are ``None``.
    """
    cl, head = clause
    verb = cl[0]
    cl_id = verb['id']

    # text: word forms of the whole subtree, put back into token-id order
    tokens = sorted(
        ((node[0]['id'], node[0]['form']) for node in depth_first(cl)),
        key=lambda m: m[0],
    )
    text = ' '.join(form for _, form in tokens)

    # type: relative, participial, to
    if text.startswith('to '):
        typ = 'to'
    elif 'that' in text:
        # NOTE(review): substring test, so any form containing "that" counts.
        typ = 'rel'
    else:
        typ = 'part'

    # Head-dependent features default to None so that a missing head (or a
    # head id equal to the clause id) never leaves a name unbound.
    position = None
    distance = None
    head_lemma = None
    if head:
        head_id = head[0]['id']
        head_lemma = head[0]['lemma']
        distance = abs(head_id - cl_id)
        # position of the clause relative to its head
        if head_id < cl_id:
            position = 'after'
        elif head_id > cl_id:
            position = 'before'
    bin_position = {'after': 0, 'before': 1}.get(position)

    # voice, decided by the last letter of the verb's xpostag
    xpostag = verb['xpostag'] or ''
    if xpostag.endswith(('G', 'D')):
        voice = 'active'
    elif xpostag.endswith('N'):
        voice = 'passive'
    else:
        voice = None
    bin_voice = {'active': 0, 'passive': 1}.get(voice)

    # length of construction, in space-separated tokens
    length = len(text.split(' '))

    return {
        'type': typ,
        'position': position,
        'binary position': bin_position,
        'text': text,
        'voice': voice,
        'binary voice': bin_voice,
        'length': length,
        'distance': distance,
        'head lemma': head_lemma,
        'verb lemma': verb['lemma'],
    }
def main():
    """Extract clause features from ``CONLLU_PATH`` and write them as a table.

    The input is cut into sentences at every line starting with ``'1\\t'``.
    Each sentence goes through ``process_conllu`` and every clause found
    through ``features``. At most ``DATA_NUM`` randomly chosen rows are then
    written to ``test_data.csv`` (tab-separated, missing values as ``None``).
    """
    sents = []
    sent = None  # None until the first token line; skips any file preamble
    with open(CONLLU_PATH) as parsed:
        for line in parsed:
            if line.startswith('1\t'):
                if sent is not None:
                    sents.append(sent)
                sent = [line]
            elif sent is not None:
                sent.append(line)
    # Flush the final sentence; no later '1\t' line would trigger it.
    if sent is not None:
        sents.append(sent)

    data = []
    for s in sents:
        data.extend(features(cl) for cl in process_conllu(''.join(s)))

    # optional -- decrease amount of data
    # Cap the sample size: random.sample raises ValueError when asked for
    # more items than the population holds.
    use_data = random.sample(data, min(DATA_NUM, len(data)))
    df = pd.DataFrame(use_data)
    df.to_csv('test_data.csv', sep='\t', na_rep='None')
# Run the extraction only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment