Skip to content

Instantly share code, notes, and snippets.

@dlukes
Last active December 2, 2022 15:48
Show Gist options
  • Save dlukes/c25fc112a4ff8d6a49121842ae090679 to your computer and use it in GitHub Desktop.
Save dlukes/c25fc112a4ff8d6a49121842ae090679 to your computer and use it in GitHub Desktop.
Dispersion plot with Shiny for Python
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import regex as re
from shiny import App, reactive, render, ui
# Global Matplotlib style for the app: draw grid lines on both axes and hide
# all four axis spines.
_GRID_STYLE = {
    "axes.grid": True,
    "axes.grid.axis": "both",
}
_SPINE_STYLE = {
    "axes.spines.left": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
    "axes.spines.bottom": False,
}
mpl.rcParams.update({**_GRID_STYLE, **_SPINE_STYLE})
# Page layout: a title, a sidebar holding the three inputs, and a main panel
# holding the two outputs. The output ids ("dispersion_plot", "freq_dist")
# match the names of the render functions defined inside ``server``.
_title = ui.panel_title("Lexical dispersion plot")
_sidebar = ui.panel_sidebar(
    ui.input_text_area("text", "Text to analyze"),
    ui.input_text("words", "Space-separated words to plot"),
    ui.input_checkbox("icase", "Ignore case"),
)
_main_panel = ui.panel_main(
    ui.output_plot("dispersion_plot"),
    ui.output_table("freq_dist"),
)
app_ui = ui.page_fluid(_title, ui.layout_sidebar(_sidebar, _main_panel))
def tokenize(text):
    """Split *text* into a list of tokens, discarding whitespace.

    A token is either

    * a word: it starts and ends with an alphabetic character and may contain
      any non-whitespace characters in between (so "don't" is one token), or
    * a run of characters that are neither whitespace nor alphabetic
      (punctuation, digits, symbols).
    """
    # ``re`` is the third-party ``regex`` module here. VERSION1 behaviour is
    # required for the ``&&`` set intersection inside the character class.
    pattern = r"\p{Alphabetic}+(?:\S+\p{Alphabetic}+)*|[\S&&\P{Alphabetic}]+"
    token_re = re.compile(pattern, flags=re.VERSION1)
    return token_re.findall(text)
def server(input, output, session):
    """Define the reactive computations and outputs of the app.

    Reads the ``text``, ``words`` and ``icase`` inputs and registers two
    outputs: the ``dispersion_plot`` figure and the ``freq_dist`` table.
    ``session`` is accepted as part of the server signature but is not used.
    """

    @reactive.Calc
    def tokenized_text():
        """Return the input text as a list of tokens (``[]`` if it is blank)."""
        text = input.text().strip()
        if not text:
            ui.notification_show("Please provide an input text.", type="warning")
            return []
        if input.icase():
            text = text.lower()
        return tokenize(text)

    @reactive.Calc
    def split_words():
        """Map each distinct requested word to its y position on the plot.

        Positions are consecutive integers starting at 0, assigned in reverse
        input order, so the first word entered gets the highest y value.
        Returns ``{}`` if no words were entered.
        """
        words = input.words().strip()
        if not words:
            ui.notification_show("Please provide words to plot.", type="warning")
            return {}
        if input.icase():
            words = words.lower()
        # De-duplicate first (keeping first-occurrence order). Enumerating the
        # raw list would give a repeated word a position >= the number of
        # distinct words, so the y ticks (one per distinct word) and their
        # labels would no longer line up with the plotted marks.
        unique_words = list(dict.fromkeys(words.split()))
        return {w: i for (i, w) in enumerate(reversed(unique_words))}

    @output
    @render.plot(alt="Dispersion plot of chosen words in input text")
    def dispersion_plot():
        """Plot one tick mark per occurrence of each chosen word.

        The x coordinate is the token offset in the text; the y coordinate is
        the word's position from ``split_words``. Returns ``None`` (no plot)
        while either the text or the word list is empty.
        """
        text = tokenized_text()
        words = split_words()
        if not (text and words):
            return
        xs = []
        ys = []
        for i, tok in enumerate(text):
            if tok in words:
                xs.append(i)
                ys.append(words[tok])
        fig, ax = plt.subplots()
        # Markers only, no connecting line: each "|" marks a single occurrence.
        ax.plot(xs, ys, marker="|", markersize=20, linestyle="")
        # Dict order matches the assigned positions, so label i belongs to y=i.
        ax.set_yticks(range(len(words)), labels=list(words))
        ax.tick_params(axis="both", length=0)
        ax.set_xlabel("Word offset")
        return fig

    @output
    @render.table()
    def freq_dist():
        """Return a word/frequency table for the chosen words found in the text.

        Rows are ordered from most to least frequent; chosen words that do not
        occur in the text are omitted. Returns ``None`` (no table) while either
        the text or the word list is empty.
        """
        text = tokenized_text()
        words = set(split_words())
        if not (text and words):
            return
        counts = Counter(t for t in text if t in words)
        return pd.DataFrame(counts.most_common(), columns=["word", "frequency"])
# Application object built from the UI definition and the server function.
app = App(app_ui, server)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment