Skip to content

Instantly share code, notes, and snippets.

@lizaku
Last active May 13, 2017 21:55
Show Gist options
  • Save lizaku/47040bc2e8211d5b7ab8f71761422ac1 to your computer and use it in GitHub Desktop.
Save lizaku/47040bc2e8211d5b7ab8f71761422ac1 to your computer and use it in GitHub Desktop.
A script I wrote for a project in a Data Analysis course. It extracts all participial and relative clauses from the UD-parsed BNC and retrieves their basic features.
import random
import re
from collections import deque

import pandas as pd
from conllu.parser import parse, parse_tree
# Earlier input location, kept for reference (presumably a raw BNC text
# directory -- confirm before re-enabling):
#PATH = 'BNC/2554/download/Texts/A/A0'
# CoNLL-U file that main() opens and splits into sentences.
CONLLU_PATH = 'test.conllu'
# Dependency relations that process_conllu() accepts as clause candidates.
RELS = {'acl', 'amod'}
# Number of rows main() samples from the extracted data before writing the CSV.
DATA_NUM = 1000
# NOTE(review): cleansed_text() also reads a module-level TEXT_PATH, which is
# not defined anywhere in this file -- define it before relying on that default.
def cleansed_text(fname, text_path=None):
    """Strip markup tags from a file and append the plain text to an output file.

    Everything matching ``<...>`` is removed from the contents of *fname*;
    the remainder, followed by a newline, is appended to the output file.

    Args:
        fname: Path of the marked-up input file.
        text_path: Path of the file to append to. When omitted, the
            module-level ``TEXT_PATH`` is used, as in the original code.

    Raises:
        NameError: If *text_path* is omitted and ``TEXT_PATH`` is not defined
            at module level.
    """
    if text_path is None:
        # Original behaviour: write to the module-wide output location.
        text_path = TEXT_PATH
    with open(fname) as f_in:
        contents = f_in.read()
    text = re.sub('<[^>]*>', '', contents)
    # Append mode, so repeated calls accumulate one cleaned document per call.
    with open(text_path, 'a') as f_out:
        f_out.write(text + '\n')
def process_conllu(data):
    """Collect candidate clause nodes from the first tree parsed out of *data*.

    *data* is handed to ``parse_tree`` and only the first resulting tree is
    walked (with ``depth_first``). A node is selected when its ``deprel`` is
    in ``RELS`` and its ``xpostag`` starts with ``'V'``.

    Returns:
        A list of ``(node, head)`` pairs, where ``head`` is the already
        visited node whose id equals the selected node's ``head`` field, or
        the empty string ``''`` when no such node has been seen.
    """
    first_tree = parse_tree(data)[0]
    seen = {}
    found = []
    for node in depth_first(first_tree):
        token = node[0]
        # Register every visited node so later nodes can look up their head.
        seen[token['id']] = node
        is_candidate = token['deprel'] in RELS and token['xpostag'].startswith('V')
        if not is_candidate:
            continue
        found.append((node, seen.get(token['head'], '')))
    return found
def depth_first(node):
    """Yield *node* and all of its descendants in pre-order.

    A node is any indexable object whose item ``1`` is the sequence of its
    child nodes. Each node is yielded before its children, and children are
    visited left to right.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        yield current
        # Push children right-to-left so the leftmost child is popped first.
        pending.extend(reversed(current[1]))
def breadth_first(node):
    """Yield *node* and all of its descendants in level order.

    A node is any indexable object whose item ``1`` is an iterable of its
    child nodes. The root comes first, then its children left to right, then
    their children, and so on.

    The earlier draft stopped after the direct children; this version walks
    the whole subtree. Results are unchanged for trees of depth 0 or 1.
    """
    queue = deque([node])
    while queue:
        current = queue.popleft()
        yield current
        queue.extend(current[1])
def features(clause):
    """Describe one extracted clause as a flat dict of features.

    Args:
        clause: A ``(node, head)`` pair as produced by ``process_conllu``.
            ``node`` is the clause's top node; ``head`` is its head node, or
            a falsy value (``''``) when no head was found.

    Returns:
        A dict with the keys ``'type'``, ``'position'``, ``'binary position'``,
        ``'text'``, ``'voice'``, ``'binary voice'``, ``'length'``,
        ``'distance'``, ``'head lemma'`` and ``'verb lemma'``. Features that
        cannot be determined are ``None``.
    """
    cl, head = clause
    token = cl[0]

    # text: forms of every node in the clause subtree, ordered by token id
    ordered = sorted(
        ((node[0]['id'], node[0]['form']) for node in depth_first(cl)),
        key=lambda pair: pair[0],
    )
    text = ' '.join(form for _, form in ordered)

    # type: relative, participial, to
    if text.startswith('to '):
        typ = 'to'
    elif 'that' in text:
        typ = 'rel'
    else:
        typ = 'part'

    # position of the clause relative to its head. Initialised to None so it
    # is always bound, even with no head or when both ids are equal; the
    # original could raise UnboundLocalError in one of those cases.
    cl_id = token['id']
    position = None
    head_id = None
    if head:
        head_id = head[0]['id']
        if head_id < cl_id:
            position = 'after'
        elif head_id > cl_id:
            position = 'before'
    bin_position = {'after': 0, 'before': 1}.get(position)

    # voice, read off the last letter of the verb's tag
    # (presumably VBG/VBD vs. VBN style tags -- confirm against the tagset)
    tag = token['xpostag']
    if tag.endswith(('G', 'D')):
        voice = 'active'
    elif tag.endswith('N'):
        voice = 'passive'
    else:
        voice = None
    bin_voice = {'active': 0, 'passive': 1}.get(voice)

    # length of construction, in space-separated tokens
    length = len(text.split(' '))

    # distance from head and head lemma are only defined when a head exists
    if head:
        distance = abs(head_id - cl_id)
        head_lemma = head[0]['lemma']
    else:
        distance = None
        head_lemma = None

    return {
        'type': typ,
        'position': position,
        'binary position': bin_position,
        'text': text,
        'voice': voice,
        'binary voice': bin_voice,
        'length': length,
        'distance': distance,
        'head lemma': head_lemma,
        'verb lemma': token['lemma'],
    }
def _split_sentences(lines):
    """Group CoNLL-U lines into sentences, starting one at each token id 1.

    Lines before the first line that starts with ``'1\\t'`` are discarded.
    Every other line is attached to the sentence currently being collected.
    """
    sentences = []
    current = None
    for line in lines:
        if line.startswith('1\t'):
            if current is not None:
                sentences.append(current)
            current = [line]
        elif current is not None:
            current.append(line)
    # Flush the last sentence; no later "1\t" line exists to trigger it.
    if current is not None:
        sentences.append(current)
    return sentences


def main():
    """Extract clause features from ``CONLLU_PATH`` and write them to a TSV.

    The file is split into sentences, each sentence goes through
    ``process_conllu``, and every clause found is turned into a feature dict
    by ``features``. At most ``DATA_NUM`` randomly chosen rows are written to
    ``test_data.csv`` (tab-separated, missing values written as ``None``).

    Fixes over the original: the last sentence of the file is no longer
    dropped, and having fewer than ``DATA_NUM`` rows no longer makes
    ``random.sample`` raise ``ValueError``.
    """
    with open(CONLLU_PATH) as parsed:
        sents = _split_sentences(parsed)

    data = []
    for s in sents:
        data.extend(features(cl) for cl in process_conllu(''.join(s)))

    # optional -- decrease amount of data (capped at what is available)
    use_data = random.sample(data, min(DATA_NUM, len(data)))
    df = pd.DataFrame(use_data)
    df.to_csv('test_data.csv', sep='\t', na_rep='None')
# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse features()) does not read or write any files.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment