Last active
December 2, 2022 15:48
-
-
Save dlukes/c25fc112a4ff8d6a49121842ae090679 to your computer and use it in GitHub Desktop.
Dispersion plot with Shiny for Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import regex as re | |
from shiny import App, reactive, render, ui | |
# Matplotlib defaults applied to every figure created after this point:
# grid lines drawn along both axes, and all four axes spines hidden.
_PLOT_STYLE = {
    "axes.grid": True,
    "axes.grid.axis": "both",
    "axes.spines.left": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
    "axes.spines.bottom": False,
}
mpl.rcParams.update(_PLOT_STYLE)
# Sidebar: the three user inputs, identified as "text", "words" and "icase".
_sidebar = ui.panel_sidebar(
    ui.input_text_area("text", "Text to analyze"),
    ui.input_text("words", "Space-separated words to plot"),
    ui.input_checkbox("icase", "Ignore case"),
)

# Main area: placeholders for the plot and the table; their ids must match
# the names of the output functions defined in the server.
_main = ui.panel_main(
    ui.output_plot("dispersion_plot"),
    ui.output_table("freq_dist"),
)

# Complete page: a title followed by the sidebar/main layout.
app_ui = ui.page_fluid(
    ui.panel_title("Lexical dispersion plot"),
    ui.layout_sidebar(_sidebar, _main),
)
def tokenize(text):
    """Split *text* into a list of token strings.

    A token is either

    * a run that starts and ends with an alphabetic character and may
      contain any non-whitespace characters in between, or
    * a run of characters that are neither whitespace nor alphabetic.

    Whitespace never ends up inside a token.
    """
    # ``re`` is the third-party ``regex`` module here; VERSION1 turns on the
    # ``&&`` set-intersection syntax used in the second alternative.
    token_pattern = re.compile(
        r"\p{Alphabetic}+(?:\S+\p{Alphabetic}+)*|[\S&&\P{Alphabetic}]+",
        flags=re.VERSION1,
    )
    return token_pattern.findall(text)
def server(input, output, session):
    """Register the reactive calculations and outputs of the app.

    Args:
        input: Accessor for the ``text``, ``words`` and ``icase`` inputs.
        output: Decorator used to register ``dispersion_plot`` and
            ``freq_dist`` as outputs.
        session: Not used by this function.
    """

    @reactive.Calc
    def tokenized_text():
        """Return the tokens of the input text, or ``[]`` if it is blank.

        The text is lower-cased before tokenizing when "icase" is checked.
        """
        text = input.text().strip()
        if not text:
            ui.notification_show("Please provide an input text.", type="warning")
            return []
        if input.icase():
            text = text.lower()
        return tokenize(text)

    @reactive.Calc
    def split_words():
        """Map each distinct requested word to its row (y value) in the plot.

        Returns ``{}`` when no words were entered. Otherwise the values are
        exactly ``0 .. len(result) - 1``, assigned in reverse input order so
        that the first word entered gets the highest y value.
        """
        words = input.words().strip()
        if not words:
            ui.notification_show("Please provide words to plot.", type="warning")
            return {}
        if input.icase():
            words = words.lower()
        # Drop repeated words (keeping the first occurrence) before numbering.
        # Otherwise a duplicate would leave a gap in the row numbers, and the
        # ticks placed at range(len(words)) in dispersion_plot would no longer
        # line up with the plotted rows.
        unique_words = list(dict.fromkeys(words.split()))
        return {w: i for i, w in enumerate(reversed(unique_words))}

    @output
    @render.plot(alt="Dispersion plot of chosen words in input text")
    def dispersion_plot():
        """Draw one vertical tick per occurrence of each requested word.

        The x coordinate is the token offset in the text; the y coordinate is
        the word's row from ``split_words``. Returns ``None`` when the text or
        the word list is empty.
        """
        text = tokenized_text()
        words = split_words()
        if not (text and words):
            return
        xs = []
        ys = []
        for i, tok in enumerate(text):
            if tok in words:
                xs.append(i)
                ys.append(words[tok])
        fig, ax = plt.subplots()
        # Markers only, no connecting line.
        ax.plot(xs, ys, marker="|", markersize=20, linestyle="")
        # Rows are numbered 0..len(words)-1 in dict order, so the dict's keys
        # are the tick labels in the right order.
        ax.set_yticks(range(len(words)), labels=list(words))
        ax.tick_params(axis="both", length=0)
        ax.set_xlabel("Word offset")
        return fig

    @output
    @render.table()
    def freq_dist():
        """Return a word/frequency table for the requested words.

        Only words that occur at least once in the text appear, ordered from
        most to least frequent. Returns ``None`` when the text or the word
        list is empty.
        """
        text = tokenized_text()
        words = set(split_words())
        if not (text and words):
            return
        counts = Counter(t for t in text if t in words)
        return pd.DataFrame(counts.most_common(), columns=["word", "frequency"])
# Application object tying the page layout to the server function.
app = App(ui=app_ui, server=server)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Jinja2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment