# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @oliver-batey
# Last active November 29, 2021 11:45
# Show Gist options
# Save oliver-batey/2c6d6d369bf840e4c552ac793e137d9b to your computer and use it in GitHub Desktop.
#
# Plot keyterms of a document
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.axes import Axes
from textacy import extract, make_spacy_doc
def decompose_keyterms(
    keyterm_list: List[Tuple[str, float]],
) -> Tuple[List[str], np.ndarray]:
    """Split ``(term, score)`` pairs into display labels and a score array.

    Args:
        keyterm_list: Sequence of pairs whose first item is the keyterm
            string and whose second item is its score. Only indices 0 and 1
            of each element are read.

    Returns:
        A ``(terms, scores)`` tuple. ``terms`` is a list of the keyterm
        strings with every space replaced by a newline; ``scores`` is a
        NumPy array holding the second item of each pair, in input order.
    """
    # Spaces become newlines so that a multi-word term wraps onto several
    # short lines instead of one long one when it is used as a label.
    terms = [pair[0].replace(" ", "\n") for pair in keyterm_list]
    scores = np.asarray([pair[1] for pair in keyterm_list])
    return terms, scores
def make_barplot(
    scores: np.ndarray,
    keyterms: List[str],
    ax: Optional["Axes"] = None,
    title: str = "barplot",
    ylabel: str = "ylabel",
    color: str = "lightblue",
    edgecolor: str = "midnightblue",
    align: str = "center",
    alpha: float = 1.0,
) -> "Axes":
    """Draw one bar per keyterm on *ax* and label the axes.

    Args:
        scores: Bar heights, one per entry of ``keyterms``.
        keyterms: Labels placed under the bars, in the same order as
            ``scores``.
        ax: Axes to draw on. When ``None`` the current pyplot axes
            (``plt.gca()``) are used.
        title: Title set on the axes.
        ylabel: Label set on the y axis.
        color: Fill colour of the bars.
        edgecolor: Outline colour of the bars.
        align: Bar alignment relative to the x positions, forwarded to
            ``ax.bar``.
        alpha: Bar opacity, forwarded to ``ax.bar``.

    Returns:
        The axes that were drawn on.
    """
    # The default ax=None would otherwise fail on the first attribute access.
    if ax is None:
        ax = plt.gca()

    # Bars sit at integer positions 0..len(keyterms)-1; the same positions
    # are reused for the ticks so every bar gets its own label.
    positions = np.arange(len(keyterms))
    ax.bar(
        positions,
        scores,
        align=align,
        color=color,
        edgecolor=edgecolor,
        alpha=alpha,
    )
    ax.set_xticks(positions)
    ax.set_xticklabels(keyterms, fontsize=5)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=12)
    return ax
# Read the article text from a .txt file.
# The encoding is stated explicitly so the read does not depend on the
# platform's default locale encoding.
with open("news_article.txt", "r", encoding="utf-8") as file:
    # Line breaks are turned into spaces (not removed) so the last word of
    # one line is not fused with the first word of the next.
    data = file.read().replace("\n", " ")
# Replace non-breaking spaces with ordinary spaces.
article = data.replace("\xa0", " ")

# Create doc object
doc = make_spacy_doc(article, lang="en_core_web_sm")

# KEYTERM EXTRACTION
# Each algorithm returns a list of tuples, containing the keyterm and a score
textrank = extract.keyterms.textrank(doc, normalize="lemma")
yake = extract.keyterms.yake(doc, normalize="lemma")
scake = extract.keyterms.scake(doc, normalize="lemma")
sgrank = extract.keyterms.sgrank(doc, normalize="lemma")

# Separate terms and relevancy scores
terms_textrank, scores_textrank = decompose_keyterms(textrank)
terms_yake, scores_yake = decompose_keyterms(yake)
terms_scake, scores_scake = decompose_keyterms(scake)
terms_sgrank, scores_sgrank = decompose_keyterms(sgrank)

# Make plot: one panel per algorithm on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(11, 8))

make_barplot(
    scores_textrank,
    terms_textrank,
    axes[0, 0],
    title="TextRank algorithm",
    ylabel="Importance",
)

make_barplot(
    scores_yake,
    terms_yake,
    axes[0, 1],
    title="YAKE algorithm",
    ylabel="Importance",
    color="lightcoral",
    edgecolor="firebrick",
)

make_barplot(
    scores_scake,
    terms_scake,
    axes[1, 0],
    title="sCAKE algorithm",
    ylabel="Importance",
    color="springgreen",
    edgecolor="darkgreen",
)

make_barplot(
    scores_sgrank,
    terms_sgrank,
    axes[1, 1],
    title="SGRank algorithm",
    ylabel="Importance",
    color="moccasin",
    edgecolor="darkorange",
)

plt.tight_layout()
plt.show()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment