Skip to content

Instantly share code, notes, and snippets.

@mrdrozdov
Created October 3, 2019 18:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdrozdov/ecacdb33387c0af34d1f17d75e2fda5e to your computer and use it in GitHub Desktop.
Save mrdrozdov/ecacdb33387c0af34d1f17d75e2fda5e to your computer and use it in GitHub Desktop.
match_ptb_propbank.py
import re
import os
import sys
import time
import collections
import json
from tqdm import tqdm
# PTB
def convert_binary_bracketing(parse):
    """Turn a space-separated binary bracketing into tokens and transitions.

    The parse string is split on single spaces. Each piece is handled as:

    * a piece starting with ``"("`` is ignored,
    * ``")"`` appends transition ``1``,
    * anything else is a token and appends transition ``0``.

    Empty pieces (produced by an empty string or by consecutive spaces) are
    skipped instead of raising ``IndexError``.

    Args:
        parse: Bracketed parse such as ``"( ( a b ) c )"``.

    Returns:
        A ``(tokens, transitions)`` tuple of two lists.
    """
    tokens = []
    transitions = []
    for word in parse.split(' '):
        # `split(' ')` yields '' for repeated/leading/trailing spaces; those
        # carry no information and must not be indexed.
        if not word or word[0] == "(":
            continue
        if word == ")":
            transitions.append(1)
        else:
            tokens.append(word)
            transitions.append(0)
    return tokens, transitions
def read_ptb(path):
    """Lazily yield one decoded JSON object per non-blank line of ``path``.

    The file is opened as UTF-8 (the encoding JSON text is specified to use)
    rather than the platform default, and whitespace-only lines are skipped
    so that a stray blank line does not abort the whole read.

    Args:
        path: Path to a JSON-lines file.

    Yields:
        The object decoded from each line by ``json.loads``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
# PROPBANK
def parse_lines(lines):
    """Transpose the rows of one example into a dict of named columns.

    The first eight fields of every row are stored, in order, under the
    names ``fileid``, ``exampleid``, ``tokenid``, ``token``,
    ``part-of-speech``, ``parse``, ``token.00`` and ``token.01``. Any fields
    past the eighth are ignored.

    Args:
        lines: Non-empty sized collection of rows, each row a sequence of
            fields. All rows must have the same number of fields.

    Returns:
        Dict mapping each column name to the list of that column's values,
        one entry per row.

    Raises:
        AssertionError: If ``lines`` is ``None`` or empty, or if the rows do
            not all have the same length.
        IndexError: If the rows have fewer than eight fields.
    """
    column_names = (
        'fileid',
        'exampleid',
        'tokenid',
        'token',
        'part-of-speech',
        'parse',
        'token.00',
        'token.01',
    )
    assert lines is not None and len(lines) > 0

    # Every row is required to be as wide as the first one.
    expected_width = len(next(iter(lines)))
    data = {name: [] for name in column_names}
    for row in lines:
        assert len(row) == expected_width, lines
        for position, name in enumerate(column_names):
            data[name].append(row[position])
    return data
def read_file(path):
    """Read a blank-line-delimited file and parse each block of lines.

    Every non-blank line is stripped and split on whitespace. Consecutive
    non-blank lines form one block, and each block is handed to
    ``parse_lines``. Leading blank lines and runs of several blank lines are
    tolerated: an empty block is never passed to ``parse_lines`` (which
    rejects empty input).

    Args:
        path: Path of the file to read.

    Returns:
        List with one ``parse_lines`` result per block, in file order.
    """
    result = []
    lines = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                lines.append(line.split())
            elif lines:
                # A blank line closes the current block, if there is one.
                result.append(parse_lines(lines))
                lines = []
    # The last block may not be followed by a blank line.
    if lines:
        result.append(parse_lines(lines))
    return result
def read_files(options):
    """Yield every parsed example from the ``gold_conll`` files in a directory.

    Entries of ``options.propbank`` whose name ends with ``gold_conll`` are
    visited in sorted name order; each is read with ``read_file`` and its
    examples are yielded one at a time.

    Args:
        options: Object with a ``propbank`` attribute naming the directory.

    Yields:
        The objects returned by ``read_file`` for each matching file.
    """
    root = options.propbank
    conll_names = [
        name for name in sorted(os.listdir(root))
        if name.endswith('gold_conll')
    ]
    for name in conll_names:
        for example in read_file(os.path.join(root, name)):
            yield example
def main(options):
    """Count how many PTB sentences also occur in the PropBank data.

    Sentences are compared by their exact token tuple. Within each corpus
    only the first occurrence of a token tuple is kept; later duplicates are
    counted in ``skip_key_propbank`` / ``skip_key_ptb`` and skipped. Each
    kept PTB sentence whose token tuple is also a PropBank key increments
    ``key_match``. Corpus sizes and the running stats are printed after each
    corpus has been read.

    Args:
        options: Object with ``propbank`` (directory consumed by
            ``read_files``) and ``ptb`` (path consumed by ``read_ptb``).
    """
    stats = dict(skip_key_ptb=0, skip_key_propbank=0, key_match=0)

    # Index PropBank examples by their token sequence.
    propbank = {}
    for obj in tqdm(read_files(options)):
        key = tuple(obj['token'])
        if key in propbank:
            stats['skip_key_propbank'] += 1
            continue
        propbank[key] = obj
    print('propbank size = {}'.format(len(propbank)))
    print(stats)

    # Index PTB sentences the same way and look each new one up in PropBank.
    ptb = {}
    for obj in tqdm(read_ptb(options.ptb)):
        tokens, _ = convert_binary_bracketing(obj['sentence1_binary_parse'].strip())
        key = tuple(tokens)
        if key in ptb:
            stats['skip_key_ptb'] += 1
            continue
        ptb[key] = obj
        if key in propbank:
            stats['key_match'] += 1
    print('ptb size = {}'.format(len(ptb)))
    print(stats)
if __name__ == '__main__':
    import argparse

    # Default input locations live under the user's home directory.
    default_propbank = os.path.expanduser('~/data/ptb-propbank-v3-1')
    default_ptb = os.path.expanduser('~/data/ptb.jsonl')

    parser = argparse.ArgumentParser()
    parser.add_argument('--propbank', type=str, default=default_propbank)
    parser.add_argument('--ptb', type=str, default=default_ptb)
    options = parser.parse_args()
    main(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment