Skip to content

Instantly share code, notes, and snippets.

@jpwahle
Created October 20, 2022 12:23
Show Gist options
  • Save jpwahle/f9414b2ae85043225b10cd9e105d3b81 to your computer and use it in GitHub Desktop.
Save jpwahle/f9414b2ae85043225b10cd9e105d3b81 to your computer and use it in GitHub Desktop.
# Copyright 2022 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
import colorcet as cc
import dask_ml.feature_extraction.text
import datashader as ds
import pandas as pd
from datasets import load_dataset
from datashader.utils import export_image
from openTSNE import TSNE
from sklearn.decomposition import TruncatedSVD
# Open the corpus in streaming mode and shuffle it through a fixed-seed buffer
# so the sample drawn below is the same on every run.
dataset = load_dataset("codeparrot/github-code", streaming=True, split="train").shuffle(
    seed=42, buffer_size=10_000
)
# Collect only the two fields used downstream in a single pass over the
# stream, so the complete example records are not kept alive in memory for the
# rest of the script.
X = []  # "code" field of each sampled example
y = []  # "language" field of each sampled example (used as the plot category)
for example in dataset.take(1_000_000):
    X.append(example["code"])
    y.append(example["language"])
# Turn each document into a fixed-width hashed feature vector.
# NOTE: despite the variable name, HashingVectorizer applies no TF-IDF
# weighting; the name is kept as-is.
vect = dask_ml.feature_extraction.text.HashingVectorizer()
X_tfidf = vect.fit_transform(X)
# Report the matrix explicitly: a bare expression is only displayed when run
# in a notebook/REPL and is a silent no-op when run as a script.
print(repr(X_tfidf))
# Compress the hashed features to 160 components before running t-SNE.
svd = TruncatedSVD(n_components=160, n_iter=9, random_state=42)
X_svd = svd.fit_transform(X_tfidf)
# Configure t-SNE with a fixed seed so the embedding is repeatable; verbose
# output reports progress during the (long) fit.
tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
    verbose=True,
)
# Embed the SVD-reduced features; the result is later loaded into a
# two-column ("x", "y") DataFrame for plotting.
X_tsne = tsne.fit(X_svd)
# Print the shape explicitly: a bare `X_tsne.shape` expression only shows
# output in a notebook/REPL and does nothing when run as a script.
print(X_tsne.shape)
# Put the 2-D embedding into a DataFrame and attach the language labels as a
# categorical column. Direct assignment raises if the label count does not
# match the number of embedded points, instead of silently dropping rows the
# way an inner index merge would.
df = pd.DataFrame(X_tsne, columns=["x", "y"])
df["color"] = pd.Categorical(y)
# Rasterise the points onto a 750x750 canvas, counting points per category in
# every pixel.
cvs = ds.Canvas(plot_width=750, plot_height=750)
agg = cvs.points(df, "x", "y", agg=ds.count_cat("color"))
# Combine the three Glasbey palettes and drop duplicate colours while keeping
# their order. dict.fromkeys preserves insertion order, whereas set() order
# changes between interpreter runs (string hashes are randomised), which made
# the category colours differ from run to run.
long_key = list(dict.fromkeys(cc.glasbey_cool + cc.glasbey_warm + cc.glasbey_dark))
# Colour each pixel by category, place it on a black background, and write the
# image named "out" into the current directory.
img = ds.tf.set_background(ds.tf.shade(agg, color_key=long_key), "black")
export_image(img, "out", background="black", export_path=".")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment