Skip to content

Instantly share code, notes, and snippets.

@miikargh
Created November 6, 2018 10:16
Show Gist options
  • Save miikargh/fa8f301125fa433fc796cb8376ee0dce to your computer and use it in GitHub Desktop.
Save miikargh/fa8f301125fa433fc796cb8376ee0dce to your computer and use it in GitHub Desktop.
import json
from glob import glob
from collections import defaultdict
import copy
import pke
from sklearn.metrics import f1_score, precision_score, recall_score
# Number of top-ranked keyphrases requested per document; also the cut-off
# applied to both reference and extracted lists before evaluation.
NUM_KEYPHRASES = 10
# Passed as the `stemming` argument when fetching the best keyphrases.
STEMMING = True
# Glob pattern for the test documents; only matches ending in '.xml' are used.
TEST_DATA = './data/semeval-2010-pre/test/lvl-2/*'
# JSON file holding the reference keyphrases, looked up by document id.
STANDARD_KPS = './data/semeval-2010-pre/references/test.combined.stem.json'
def flatten(lst):
    '''Collapses a two-dimensional list into a one-dimensional list.

    Args:
        lst: Iterable whose elements are themselves iterable
    Returns:
        (list): Every inner item, in the order encountered.
    '''
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat
def make_equal_len(lst_a, lst_b, in_place=False, empty_marker='<EMPTY>'):
    '''Pads the shorter of two lists with a marker so both have equal length.

    Args:
        lst_a (list(str)): List of strings
        lst_b (list(str)): List of strings
        in_place (bool): Modifies originals if set to True, else works on
            deep copies and leaves the inputs untouched
        empty_marker (str): Value appended to the shorter list as padding
    Returns:
        (tuple(list, list)): Tuple consisting of (lengthened) a and b in their original order.
    '''
    if in_place:
        a, b = lst_a, lst_b
    else:
        a = copy.deepcopy(lst_a)
        b = copy.deepcopy(lst_b)

    # Positive when b is the shorter list, negative when a is.
    surplus = len(a) - len(b)
    if surplus < 0:
        a.extend([empty_marker] * -surplus)
    elif surplus > 0:
        b.extend([empty_marker] * surplus)
    return a, b
if __name__ == '__main__':
    # Run TopicRank over every .xml test document and keep the top-ranked
    # keyphrases per document id.
    extracted = defaultdict(list)
    xml_paths = [path for path in glob(TEST_DATA) if path.endswith('.xml')]
    for xml_path in xml_paths:
        # Document id is the file name up to its first dot.
        doc_id = xml_path.split('/')[-1].split('.')[0]
        print('Extracting keyphrases for document {}'.format(doc_id))

        extractor = pke.unsupervised.TopicRank(input_file=xml_path)
        extractor.read_document(format='corenlp')
        extractor.candidate_selection()
        extractor.candidate_weighting()

        ranked = extractor.get_n_best(n=NUM_KEYPHRASES, stemming=STEMMING)
        # Keep only the first element of each returned entry
        # (presumably the keyphrase of a (keyphrase, score) pair -- confirm).
        extracted[doc_id] = [entry[0] for entry in ranked]

    # Load standard keyphrases; each document's nested lists are flattened
    with open(STANDARD_KPS, 'r') as refs_file:
        references = json.load(refs_file)
    standard = {ref_id: flatten(groups) for ref_id, groups in references.items()}

    # Unique reference keyphrases across all documents, used as the label set
    standard_kps = list(set(flatten(standard.values())))

    # Build two one-dimensional, equally long lists for evaluation.
    # NOTE(review): the lists are compared position by position, so an
    # extracted phrase only counts as a match when the identical reference
    # phrase sits at the same index -- confirm this is the intended metric.
    all_std = []
    all_ext = []
    for doc_id, ext in extracted.items():
        std = standard[doc_id]
        std_e, ext_e = make_equal_len(std[:NUM_KEYPHRASES], ext[:NUM_KEYPHRASES])
        all_std.extend(std_e)
        all_ext.extend(ext_e)

    # Evaluate, scoring only labels that occur in the reference set
    micro_f = f1_score(all_std, all_ext, labels=standard_kps, average='micro')
    print('F-score with micro average {}'.format(micro_f * 100))

    # Binary view (1 == match, 0 == no match); the reference side is all ones
    all_std_bin = [1] * len(all_std)
    all_ext_bin = [int(ext_kp == std_kp) for std_kp, ext_kp in zip(all_std, all_ext)]
    f_bin = f1_score(all_std_bin, all_ext_bin)
    print('F-score with binary average {}'.format(f_bin * 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment