semeditor: Local GUI editor that supports semantic syntax highlighting.

Semeditor

Install with:

$ pip install semeditor@git+https://gist.github.com/player1537/1c23b91b274d2e885be80d5892bac5b7.git

Put one kind of text (e.g. text with a positive tone, text with concrete ideas, text referring to some concept) in each of the two left panes, so that the panes contrast with each other. Then, in the right pane, put the document text that you want to analyze or edit.

In the right pane, press <Control-Return> to run the semantic embedding and classification; each chunk of the document is highlighted according to how strongly it resembles one example pane or the other. Press <Control-s> in the right pane to save all three buffers and the embedding cache. In either left pane, <Control-Return> pads the example text by repetition so there are enough chunks to train the classifier.
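
The buffers are loaded from and saved to your XDG config directory (typically ~/.config/semeditor/). You can also point the CLI at explicit files or start from the built-in demo texts; the embedding model defaults to BAAI/bge-small-en and can be changed with --model-name. The file names below are only placeholders:

$ semeditor --demo
$ semeditor --positive-file positive.txt --negative-file negative.txt --primary-file draft.txt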

# Installation:
# pip install semeditor@git+https://gist.github.com/player1537/1c23b91b274d2e885be80d5892bac5b7.git
[project]
name = "semeditor"
version = "0.2.3"
description = "Local GUI editor that supports semantic syntax highlighting."
authors = [{name = "https://github.com/player1537"}]
license = {text = "wtfpl"}
classifiers = [
"Development Status :: 4 - Beta"
]
dependencies = [
"fastcdc",
"numpy",
"torch",
"scikit-learn",
"transformers",
"more-itertools",
"xdg-base-dirs",
"llm",
"mediocreatbest@git+https://gist.github.com/player1537/3457b026ed6ef6696d758517f55a58df.git",
]
[project.gui-scripts]
semeditor = "semeditor:cli"
#!/usr/bin/env python3
"""
"""
# Script Dependencies:
# fastcdc
# numpy
# torch
# scikit-learn
# transformers
# xdg-base-dirs
# more-itertools
# tqdm
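# The 'auto' class body runs all imports once and exposes the modules as class attributes (auto.np, auto.tkinter, auto.transformers, ...).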
class auto:
import collections
import functools
import io
import itertools
import json
import pathlib
import re
import textwrap
import tkinter as tk, tkinter.ttk as ttk, tkinter.scrolledtext
import typing
import fastcdc, fastcdc.fastcdc_py
import more_itertools
import numpy, numpy as np
import sklearn, sklearn.svm
import tqdm, tqdm.tk
import transformers
import xdg_base_dirs
Tokenizer = auto.transformers.AutoTokenizer.from_pretrained
Model = auto.transformers.AutoModel.from_pretrained
Chunk = auto.collections.namedtuple('Chunk', 'offset length text')
CONFIG_ROOT = auto.xdg_base_dirs.xdg_config_home() / 'semeditor'
CONFIG_PATHS = {
'positive': CONFIG_ROOT / 'positive.txt',
'negative': CONFIG_ROOT / 'negative.txt',
'primary': CONFIG_ROOT / 'primary.txt',
'cache': CONFIG_ROOT / 'cache.json',
}
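# Module-level handles filled in by main(): the tokenizer, the embedding model, the Tk root, and the text -> embedding cache.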
_g_tokenizer: Tokenizer = None
_g_model: Model = None
_g_tk: auto.tkinter.Tk = None
_g_cache: dict[str, list[float]] = None
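# Diverging red-blue color scales (ColorBrewer RdBu) with 5, 7, and 11 steps.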
RdBu5 = ['#ca0020', '#f4a582', '#f7f7f7', '#92c5de', '#0571b0' ]
RdBu7 = (
"""#b2182b
#ef8a62
#fddbc7
#f7f7f7
#d1e5f0
#67a9cf
#2166ac"""
).split('\n')
RdBu11 = (
"""#67001f
#b2182b
#d6604d
#f4a582
#fddbc7
#f7f7f7
#d1e5f0
#92c5de
#4393c3
#2166ac
#053061"""
).split('\n')
def Interpolate(a: str, b: str, r: float) -> str:
def Decode(hex: str, /) -> tuple[float, float, float]:
assert hex.startswith('#')
hex = hex.removeprefix('#')
r = int(hex[0:2], 16) / 255.0
g = int(hex[2:4], 16) / 255.0
b = int(hex[4:6], 16) / 255.0
return (r, g, b)
def Encode(rgb: tuple[float, float, float], /) -> str:
r = int(rgb[0] * 255.0)
g = int(rgb[1] * 255.0)
b = int(rgb[2] * 255.0)
return f'#{r:02x}{g:02x}{b:02x}'
def Interpolate(a: float, b: float, r: float) -> float:
return a * r + b * (1.0 - r)
a = Decode(a)
b = Decode(b)
c = (
Interpolate(a[0], b[0], r),
Interpolate(a[1], b[1], r),
Interpolate(a[2], b[2], r),
)
return Encode(c)
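# Highlight tags: 'p1'..'p4' and 'n1'..'n4' map score magnitude to progressively stronger red or blue backgrounds; the 'a'/'b' variants are interpolated shades so adjacent chunks at the same level stay visually distinct.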
COLORMAP = {
'z': RdBu11[5],
'p1': RdBu11[5-1],
'p2': RdBu11[5-2],
'p3': RdBu11[5-3],
'p4': RdBu11[5-4],
'n1': RdBu11[5+1],
'n2': RdBu11[5+2],
'n3': RdBu11[5+3],
'n4': RdBu11[5+4],
}
COLORMAP |= {
'p0': COLORMAP['z'],
'n0': COLORMAP['z'],
}
COLORMAP |= {
'p1a': Interpolate(COLORMAP['p1'], COLORMAP['p0'], 0.85),
'p1b': Interpolate(COLORMAP['p1'], COLORMAP['p2'], 0.85),
'p2a': Interpolate(COLORMAP['p2'], COLORMAP['p1'], 0.85),
'p2b': Interpolate(COLORMAP['p2'], COLORMAP['p3'], 0.85),
'p3a': Interpolate(COLORMAP['p3'], COLORMAP['p2'], 0.85),
'p3b': Interpolate(COLORMAP['p3'], COLORMAP['p4'], 0.85),
'n1a': Interpolate(COLORMAP['n1'], COLORMAP['n0'], 0.85),
'n1b': Interpolate(COLORMAP['n1'], COLORMAP['n2'], 0.85),
'n2a': Interpolate(COLORMAP['n2'], COLORMAP['n1'], 0.85),
'n2b': Interpolate(COLORMAP['n2'], COLORMAP['n3'], 0.85),
'n3a': Interpolate(COLORMAP['n3'], COLORMAP['n2'], 0.85),
'n3b': Interpolate(COLORMAP['n3'], COLORMAP['n4'], 0.85),
}
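# Split text into small variable-size chunks (32-128 bytes) with FastCDC content-defined chunking, so chunk boundaries stay mostly stable as the surrounding text is edited.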
def Chunks(text: str) -> list[Chunk]:
text: bytes = text.encode('ascii', errors='ignore')
chunks = []
for chunk in auto.fastcdc.fastcdc_py.chunk_generator(
stream=auto.io.BytesIO(text),
min_size=32,
avg_size=64,
max_size=128,
fat=False,
hf=None,
):
chunks.append(Chunk(
offset=chunk.offset,
length=chunk.length,
text=text[chunk.offset:chunk.offset + chunk.length].decode('ascii', errors='ignore'),
))
return chunks
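# Embed a single piece of text: prepend the BGE retrieval instruction, run the transformer, and cache the pooled output vector keyed by the prefixed text.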
def Embed(text: str, /) -> auto.np.ndarray:
text = f'Represent this sentence for searching relevant passages: {text}'
if text in _g_cache:
return auto.np.array(_g_cache[text])
inp = _g_tokenizer(text, return_tensors='pt')
out = _g_model(**inp)
out = out.pooler_output
out = out.detach().numpy()
if isinstance(text, str):
out = out[0, :]
_g_cache[text] = out.tolist()
return out
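# Embed every chunk together with its neighbors: slide a window of k chunks over the sequence and embed the concatenated text, so each vector carries surrounding context.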
def Embeds(chunks: list[Chunk], /, *, k: int=3, progress: auto.tkinter.ttk.Progressbar=None) -> auto.np.ndarray:
embeds = []
windows = list(auto.more_itertools.windowed(auto.itertools.chain(
[None] * (k-1-1),
chunks,
[None] * (k-1-1),
), k))
assert len(windows) == len(chunks), \
f"""Expected {len(chunks)} chunks, got {len(windows)} windows."""
if progress is not None:
progress.config(maximum=len(windows), value=0)
for window in windows:
texts = []
for chunk in window:
if chunk is None:
texts.append('')
else:
texts.append(chunk.text)
text = ''.join(texts)
embeds.append(Embed(text))
if progress is not None:
progress.step(1)
progress.update()
embeds = auto.np.array(embeds)
return embeds
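# Train an SVM with probability estimates on the chunk embeddings; positive examples are labeled 1 and negative examples 0.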
def Classifier(*, positive: str, negative: str) -> auto.sklearn.svm.SVC:
positive = Chunks(positive)
negative = Chunks(negative)
positive = Embeds(positive)
negative = Embeds(negative)
X = auto.np.vstack([positive, negative])
y = auto.np.zeros((len(X),), dtype='u4')
y[:len(positive)] = 1
model = auto.sklearn.svm.SVC(
# kernel='linear',
probability=True,
random_state=1337,
)
model.fit(X, y)
return model
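# Score each embedding as the difference between the classifier's two class probabilities, a signed value in (-1, 1).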
def Classify(classifier: auto.sklearn.svm.SVC, /, *, embeds: auto.np.ndarray, debug: bool=False) -> auto.np.ndarray:
probs = classifier.predict_proba(embeds)
if debug:
print(f'{probs=!r}')
probs = probs[:, 0] - probs[:, 1]
# probs = auto.sklearn.preprocessing.minmax_scale(probs, feature_range=(-1.0, 1.0))
return probs
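# Bindings shared by every text widget: Control-a selects all, Tab inserts spaces instead of a tab character.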
def _configure_text_defaults(text: auto.tkinter.Text):
def callback(event):
text.tag_add(auto.tkinter.SEL, '1.0', auto.tkinter.END)
text.mark_set(auto.tkinter.INSERT, '1.0')
text.see(auto.tkinter.INSERT)
return 'break' # prevent default
text.bind('<Control-KeyRelease-a>', callback)
def callback(event):
text.insert(auto.tkinter.INSERT, " ")
return 'break' # prevent default
text.bind('<Tab>', callback)
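# ScrolledText wrapper that mirrors its contents into a StringVar and keeps a small rotation-based undo/redo history (Control-z / Control-Shift-z).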
class Editor(auto.tkinter.Frame):
def __init__(self, master: auto.tkinter.ttk.Widget, *, value=None):
super().__init__(master)
self.grid_rowconfigure(0, weight=1)
self.grid_columnconfigure(0, weight=1)
self._history = auto.collections.deque(maxlen=64)
self._value = \
value = value or ''
def UpdateStringVar():
value = var.get()
self._value = value
self._history.append(value)
pause = False
def Hold():
nonlocal pause
pause = True
def Release():
nonlocal pause
pause = False
def IsHeld():
return pause
def Modified():
var.set(text.get('1.0', auto.tkinter.END))
text.edit_modified(0)
def Do(rotation_direction: auto.typing.Literal[-1, 1], /):
Hold()
initial = self._history[-1]
for _ in range(len(self._history)):
self._history.rotate(rotation_direction)
value = self._history[-1]
if value != initial:
break
else:
print(f'All history entries are the same?')
return
self._text.delete('1.0', auto.tkinter.END)
self._text.update()
self._text.insert(auto.tkinter.END, value)
Release()
def Undo():
Do(1)
def Redo():
Do(-1)
self._var = \
var = auto.tkinter.StringVar()
var.set(self.value)
def callback(name, index, mode):
UpdateStringVar()
return 'break'
var.trace('w', callback)
self._text = \
text = auto.tkinter.scrolledtext.ScrolledText(self, exportselection=False)
text.grid(row=0, column=0, sticky='nsew')
_configure_text_defaults(text)
text.insert(auto.tkinter.END, var.get())
def callback(event):
if not IsHeld():
Modified()
return 'break' # prevent default
text.bind('<<Modified>>', callback)
def callback(event):
Undo()
return 'break' # prevent default
text.bind('<Control-z>', callback)
def callback(event):
Redo()
return 'break' # prevent default
text.bind('<Control-Z>', callback)
@property
def value(self):
return self._value
@value.setter
def value(self, value):
self._text.delete('1.0', auto.tkinter.END)
self._text.insert(auto.tkinter.END, value)
# self._var.set(value)
@property
def text(self):
return self._text
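# Load the cache, tokenizer, and model, build the three-pane layout (positive and negative examples on the left, the primary document on the right, a progress bar underneath), wire up the key bindings, and run the Tk main loop.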
def main(
*,
model_name: str,
tokenizer_name: str,
positive_text: str | None,
negative_text: str | None,
primary_text: str | None,
):
try:
with open(CONFIG_PATHS['cache'], 'r') as f:
cache = auto.json.load(f)
except FileNotFoundError:
cache = {}
tokenizer = Tokenizer(
tokenizer_name,
)
model = Model(
model_name,
)
global _g_cache
_g_cache = cache
global _g_tokenizer
_g_tokenizer = tokenizer
global _g_model
_g_model = model
tk = auto.tkinter.Tk()
tk.title('Semantic Editor')
tk.geometry('640x480')
tk.attributes('-zoomed', True)
tk.grid_rowconfigure(0, weight=1)
tk.grid_rowconfigure(1, weight=0)
tk.grid_columnconfigure(0, weight=1)
tk.grid_columnconfigure(1, weight=1)
global _g_tk
_g_tk = tk
# ┌───┬───┐
# │ A │ │
# ├───┤ C │
# │ B │ │
# ├───┴───┤
# │ P │
# └───────┘
abc = auto.tkinter.Frame(tk)
abc.grid(row=0, column=0, sticky='nsew')
abc.grid_rowconfigure(0, weight=1)
abc.grid_columnconfigure(0, weight=1)
p = auto.tkinter.Frame(tk, height=32)
p.grid(row=1, column=0, sticky='sew')
p.grid_rowconfigure(0, weight=1)
p.grid_columnconfigure(0, weight=1)
ab = auto.tkinter.Frame(abc)
ab.grid(row=0, column=0, sticky='nsew')
ab.grid_rowconfigure(0, weight=1)
ab.grid_rowconfigure(1, weight=1)
ab.grid_columnconfigure(0, weight=1)
a = auto.tkinter.Frame(ab)
a.grid(row=0, column=0, sticky='nsew')
a.grid_rowconfigure(0, weight=1)
a.grid_columnconfigure(0, weight=1)
b = auto.tkinter.Frame(ab)
b.grid(row=1, column=0, sticky='nsew')
b.grid_rowconfigure(0, weight=1)
b.grid_columnconfigure(0, weight=1)
c = auto.tkinter.Frame(abc)
c.grid(row=0, column=1, sticky='nsew')
c.grid_rowconfigure(0, weight=1)
c.grid_columnconfigure(0, weight=1)
def Configure(text: auto.tkinter.Text, /, foreground: auto.typing.Literal[-1, 0, 1]=0):
# text.config(bg=RdBu11[5+background])
if foreground != 0:
text.config(fg=RdBu11[5+5*foreground])
for tag, color in COLORMAP.items():
text.tag_config(tag, background=color)
def PreventDefault(func: auto.typing.Callable, *args, **kwargs) -> auto.typing.Literal['break']:
def callback(event):
try:
func(*args, **kwargs)
finally:
return 'break' # prevent default
return callback
def Expand(editor: Editor, /):
"""Expand primary/negative buffers with duplicated text to get more data."""
def Expand(original_text: str, /, target: int=64) -> list[Chunk]:
def Fill(text: str, /) -> str:
return auto.re.sub(r'\s+', ' ', text)
text = Fill(original_text)
while len(chunks := Chunks(text)) < target:
text = Fill(text + ' ' + original_text)
return text
editor.value = Expand(editor.value)
def Colorize(*, positive: Editor, negative: Editor, primary: Editor):
"""Classify and colorize editors"""
def Clear(text: auto.tkinter.Text, /):
for tag in COLORMAP.keys():
text.tag_remove(tag, '1.0', auto.tkinter.END)
Clear(positive.text)
Clear(negative.text)
Clear(primary.text)
classifier = Classifier(
positive=positive.value,
negative=negative.value,
)
def Tag(text: auto.tkinter.Text, /, *, tag: str | None, chunk: Chunk):
if tag is not None:
text.tag_add(tag, f'1.0+{chunk.offset}c', f'1.0+{chunk.offset+chunk.length}c')
def Alternate(text: auto.tkinter.Text, chunks: list[Chunk], /, *, tags: list[str | None]):
tags = iter(auto.itertools.cycle(tags))
for chunk in chunks:
Tag(text, tag=next(tags), chunk=chunk)
# Alternate(positive.text, pos_chunks, tags=[None, 'p1'])
# Alternate(negative.text, neg_chunks, tags=[None, 'n1'])
pos_chunks = Chunks(positive.value)
neg_chunks = Chunks(negative.value)
pos_embeds = Embeds(pos_chunks)
neg_embeds = Embeds(neg_chunks)
chunks = Chunks(primary.value)
embeds = Embeds(chunks, progress=progress)
def Colorize(
*,
text: auto.tkinter.Text,
chunks: list[Chunk],
embeds: auto.np.ndarray,
debug: bool=False,
):
probs = Classify(classifier, embeds=embeds, debug=debug)
suffixes = iter(auto.itertools.cycle(['a', 'b']))
for chunk, prob in zip(chunks, probs):
if debug:
print(f'{chunk.text!r}: {prob=!r}')
tag = None
if abs(prob) > 0.25:
suffix = next(suffixes)
level = 1
if abs(prob) > 0.50:
level = 2
if abs(prob) > 0.75:
level = 3
if prob > 0:
tag = f'p{level}{suffix}'
else:
tag = f'n{level}{suffix}'
Tag(text, tag=tag, chunk=chunk)
Colorize(text=positive.text, chunks=pos_chunks, embeds=pos_embeds, debug=True)
Colorize(text=negative.text, chunks=neg_chunks, embeds=neg_embeds, debug=True)
Colorize(text=primary.text, chunks=chunks, embeds=embeds)
def Save(*, positive: Editor, negative: Editor, primary: Editor):
"""Save buffers"""
if not CONFIG_ROOT.exists():
print(f'Creating directory: {CONFIG_ROOT!r}')
CONFIG_ROOT.mkdir(parents=True, exist_ok=True)
def Save(name: str, editor: Editor, /):
path = CONFIG_PATHS[name]
print(f'Saving file: {path!r}')
path.write_text(editor.value)
Save('positive', positive)
Save('negative', negative)
Save('primary', primary)
path = CONFIG_PATHS['cache']
print(f'Saving file: {path!r}')
with open(path, 'w') as f:
auto.json.dump(cache, f)
progress = auto.tkinter.ttk.Progressbar(p, mode='determinate')
progress.grid(row=0, column=0, sticky='nsew')
positive = Editor(a, value=positive_text)
Configure(positive.text, foreground=-1)
positive.grid(row=0, column=0, sticky='nsew')
positive.text.bind('<Control-Return>',
PreventDefault(Expand, positive),
)
negative = Editor(b, value=negative_text)
Configure(negative.text, foreground=+1)
negative.grid(row=0, column=0, sticky='nsew')
negative.text.bind('<Control-Return>',
PreventDefault(Expand, negative),
)
primary = Editor(c, value=primary_text)
Configure(primary.text, foreground=0)
primary.grid(row=0, column=0, sticky='nsew')
primary.text.bind('<Control-Return>',
PreventDefault(Colorize, positive=positive, negative=negative, primary=primary),
)
primary.text.bind('<Control-s>',
PreventDefault(Save, positive=positive, negative=negative, primary=primary),
)
tk.mainloop()
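# Command-line entry point. Each buffer (positive/negative/primary) is handled by a small generator that first registers its --<name>-text/--<name>-file options and later resolves them into the text passed to main().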
def cli():
import argparse
    def Buffer(name: str, /) -> auto.typing.Generator[None, argparse.ArgumentParser | dict[str, auto.typing.Any] | None, None]:
path = CONFIG_PATHS[name]
if not path.exists():
path = None
parser = yield
parser.add_argument(f'--{name}-text',
default=None,
)
parser.add_argument(f'--{name}-file',
default=path,
type=auto.pathlib.Path,
)
args = yield
path = args.pop(f'{name}_file')
if args[f'{name}_text'] is None:
if path is not None:
args[f'{name}_text'] = path.read_text()
else:
args[f'{name}_text'] = ''
yield # dummy
buffers = []
buffers.append(Buffer('positive'))
buffers.append(Buffer('negative'))
buffers.append(Buffer('primary'))
for buffer in buffers:
next(buffer)
parser = argparse.ArgumentParser()
parser.add_argument('--model-name', default='BAAI/bge-small-en')
parser.add_argument('--tokenizer-name', default=None)
parser.add_argument('--demo', action='store_true')
for buffer in buffers:
buffer.send(parser)
args = vars(parser.parse_args())
if args['tokenizer_name'] is None:
args['tokenizer_name'] = args['model_name']
if args.pop('demo'):
def Clean(s: str) -> str:
s = auto.re.split(r'\n\n+', s)
s = [auto.re.sub(r'\s+', ' ', s) for s in s]
s = [s.strip() for s in s]
s = "\n\n".join(s)
s = s.strip()
return s
args['positive_text'] = Clean(r"""
One great way to organize your notes and research is by using
Jupyter Notebooks. These notebooks allow you to keep all of your
information in one place, making it easy to access and analyze
everything that you need for your project or paper. With an
integrative approach, you can compile all of the relevant data and
present it in a clear and concise manner. This way, you can easily
see how different pieces of information relate to each other and
build upon each other to create a comprehensive understanding of
the topic at hand. By using Jupyter Notebooks with an integrative
approach, you can streamline your research process and produce
high-quality work in less time.
""")
args['negative_text'] = Clean(r"""
As a software engineer, you know that developing robust and
scalable applications is crucial for success. In order to achieve
this, you need to leverage various technologies and techniques. One
such technique is the use of microservices, which allows you to
break down your application into smaller, more manageable
components. These components can then be developed and maintained
independently, making it easier to scale each component as needed.
To make the most out of these microservices, you'll need an
effective visualization tool that can help you understand how your
system is performing under load. This will enable you to identify
potential bottlenecks or areas where performance could be improved.
With the right tools and techniques in place, it's possible to
build a scalable and efficient application that meets the needs of
your users.
""")
args['primary_text'] = Clean(r"""
\vspace{-0.05in}
Increasingly, data exploration and visualization occurs within
computational notebooks. Their widespread utility has been
demonstrated in the fields of biology [nature], AI [colab], and
more. This trend can be largely attributed to the nature of
computational notebooks: integrative of multiple kinds of data
sources, interactivity from the out-of-order cell execution, and
iterative composition of inputs, codes, and outputs.
In terms of data representation, computational notebooks have
gained support for a variety of formats ranging from simple
tabular data to complex neuron activation networks.
In this work, we approach the challenge of integrating graph data
visualization into computational notebooks by using a general and
efficient Visualization-as-a-Service (VaaS) system. Graph is a
universal representation of complex relationships and a data
structure common in many application domains. A widespread ability
to understand the complex relationships embedded within those big
graphs will enable and accelerate discoveries.
Our VaaS system is centered on a domain specific language called
the Graph Shader (GS) language. The design of GS is intended to
provide a reusable and efficient interface to give users an
expressive means to control graph renderings. Typically,
interacting with a graph means to pan, zoom, pick or brush using a
pointing device like mouse or trackpad; we believe graph shaders
can provide an additional means of exploration. GS is inspired by
how GLSL shaders have enabled OpenGL renderings to become more
creative and expressive. From this respect, we've designed GS to
help users to express their exploratory hypotheses more effectively
using graph primitives. Corresponding to vertex, geometry and
fragment shaders that are available in GLSL shaders, GS also has
three components: (i) positional, (ii) relational, and (iii)
appearance shaders. The GS VaaS parses GS and automatically
generates the corresponding GLSL shaders in a transpiling process.
In this way, any graphics hardware that supports the current OpenGL
standard will be able to support graph renderings that use GS.
Graph Shaders also draw inspiration from the design of OpenMP
programs and their use of ``\texttt{\#pragma}'' statements to
interleave compiler directives within ordinary lines of code.
Using \texttt{pragma} directives, programs in GS can opt in to
restrictive data access patterns to improve their performance. The
combination of code in GS and \texttt{pragma} statements enables
the code to become a standalone GS program that can be executed by
the GS VaaS. Graph data is made available to GS using OpenGL's
Shader Storage Buffer Objects (SSBOs), allowing for simplistic and
powerful data access patterns.
We use three driving applications to demonstrate analytical
benefits of GS. Each application targets a particular dataset,
with sizes ranging from 1.2 to 3.8 million nodes and 6.2 to 63.5
million edges. In all cases, the driving application falls under
the use scenario where the graph that contains large and complex
relationships that will continue to evolve over time, and it's
beneficial to host the most up to date version of the data as a
cloud resource for remote users to share and use. The specific
example use cases are:
\begin{itemize}
\item To identify the evolving interdependency networks among
open-source projects using the JS-Deps dataset of JavaScript
Dependencies among NPM packages~\cite{npm} (shown in
Figure~\ref{fig:graph:JS-Deps}).
\item To reveal conversation patterns and relationships among users
on an online platform, such as Stack Overflow, using the SO-Answers
dataset from the SNAP project~\cite{snapnets} (shown in
Figure~\ref{fig:graph:SO-Answers-1234}).
\item To track the citation networks between patent applications of
US Patent and Trademark Office using the NBER-Patents
dataset~\cite{hall200213} (shown in
Figure~\ref{fig:graph:NBER-Patents-grid}).
\end{itemize}
The remainder of this work is organized as follows. We start with
describing the background and proceed with
Section~\ref{sec:graph:design} about our development of the GS VaaS
and the GS language. The results in
Section~\ref{sec:graph:results} show the efficiency of our system.
Section~\ref{sec:graph:apps} showcases three applications for the
visualization and analysis of large graph data. We conclude and
discuss future work in Section~\ref{sec:graph:conclusion}.
""")
for buffer in buffers:
buffer.send(args)
main(**args)
if __name__ == '__main__':
cli()