Created
November 6, 2018 10:16
-
-
Save miikargh/fa8f301125fa433fc796cb8376ee0dce to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import copy
import json
import os
from collections import defaultdict
from glob import glob

import pke
from sklearn.metrics import f1_score, precision_score, recall_score
# Number of top-ranked keyphrases requested from the extractor per document;
# the reference list of each document is truncated to the same length.
NUM_KEYPHRASES = 10
# Passed as the `stemming` argument when fetching the n best keyphrases.
STEMMING = True
# Glob pattern for the test documents (only matches ending in '.xml' are used).
TEST_DATA = './data/semeval-2010-pre/test/lvl-2/*'
# JSON file holding the reference keyphrases, keyed by document id.
STANDARD_KPS = './data/semeval-2010-pre/references/test.combined.stem.json'
def flatten(lst):
    '''Makes a two-dimensional list into a one-dimensional list.

    Args:
        lst: Iterable of iterables (e.g. a list of lists).

    Returns:
        (list): The items of every sublist, in their original order.
    '''
    return [item for sublist in lst for item in sublist]
def make_equal_len(lst_a, lst_b, in_place=False, empty_marker='<EMPTY>'):
    '''Pads the shorter of two lists with a marker so both have equal length.

    Args:
        lst_a (list(str)): List of strings
        lst_b (list(str)): List of strings
        in_place (bool): Modifies originals if set to True, else works on deep copies
        empty_marker (str): Value appended to the shorter list as padding

    Returns:
        (tuple(list, list)): Tuple consisting of (lengthened) a and b in their original order.
    '''
    if in_place:
        a, b = lst_a, lst_b
    else:
        a = copy.deepcopy(lst_a)
        b = copy.deepcopy(lst_b)

    # At most one of the two extends adds anything: multiplying a list by a
    # non-positive count yields an empty list.
    shortfall = len(b) - len(a)
    a.extend([empty_marker] * shortfall)
    b.extend([empty_marker] * -shortfall)
    return a, b
if __name__ == '__main__':
    # Extract keyphrases with TopicRank, keyed by document id.
    extracted = {}
    filenames = [f for f in glob(TEST_DATA) if f.endswith('.xml')]
    for fname in filenames:
        # The document id is the file name up to its first dot; basename copes
        # with whichever path separator glob returns on this platform.
        doc_id = os.path.basename(fname).split('.')[0]
        print('Extracting keyphrases for document {}'.format(doc_id))
        extractor = pke.unsupervised.TopicRank(input_file=fname)
        extractor.read_document(format='corenlp')
        extractor.candidate_selection()
        extractor.candidate_weighting()
        n_best = extractor.get_n_best(n=NUM_KEYPHRASES, stemming=STEMMING)
        # Keep only the first element of each returned entry
        # (presumably the keyphrase string; verify against the pke version in use).
        extracted[doc_id] = [k[0] for k in n_best]

    # Load standard keyphrases; each document's nested lists are flattened
    # into a single list.
    with open(STANDARD_KPS, 'r') as ref_file:
        standard = {k: flatten(v) for k, v in json.load(ref_file).items()}

    # Get list of all distinct standard keyphrases (used as the label set below).
    standard_kps = list(set(flatten(standard.values())))

    # Make two 1-dimensional lists for evaluation. Both per-document lists are
    # cut to NUM_KEYPHRASES and padded to equal length so they stay aligned.
    all_std = []
    all_ext = []
    for doc_id, ext in extracted.items():
        # Raises KeyError if a test document has no entry in the reference file.
        std = standard[doc_id]
        std_e, ext_e = make_equal_len(std[:NUM_KEYPHRASES], ext[:NUM_KEYPHRASES])
        all_std += std_e
        all_ext += ext_e

    # Evaluate
    # NOTE(review): both scores below compare the lists position by position,
    # so a keyphrase present in both lists at different ranks counts as a
    # mismatch -- confirm this is the intended evaluation protocol.
    f_micro = f1_score(all_std, all_ext, labels=standard_kps, average='micro')
    print('F-score with micro average {}'.format(f_micro * 100))

    # Make into binary lists (1 == match, 0 == no match)
    all_std_bin = [1] * len(all_std)
    all_ext_bin = [int(ext_kp == std_kp) for std_kp, ext_kp in zip(all_std, all_ext)]
    f_bin = f1_score(all_std_bin, all_ext_bin)
    print('F-score with binary average {}'.format(f_bin * 100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment