Created
August 24, 2021 15:56
-
-
Save RensDimmendaal/cb54c0f441b3f941cbe6ce338cd136bb to your computer and use it in GitHub Desktop.
Roman Numerals Altair Embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
from pathlib import Path | |
from typing import Sequence, Union | |
import altair as alt | |
import numpy as np | |
import pandas as pd | |
import tensorflow as tf | |
import umap | |
from tensorflow.python.keras.preprocessing.image_dataset import ( | |
load_image as tf_load_image, | |
) | |
from tqdm import tqdm | |
# Roman numerals for 1 through 10, in ascending order.
ROMAN_NUMERALS = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"]
# Maps each roman numeral to its 1-based position in ROMAN_NUMERALS.
REVERSED_NUMERALS = dict(zip(ROMAN_NUMERALS, range(1, len(ROMAN_NUMERALS) + 1)))
def load_img_32(fpath):
    """Load the image at ``fpath`` as a 32x32, 3-channel integer array.

    The path is stringified before loading, so ``str`` and ``Path`` inputs
    both work. The image is resized with bilinear interpolation and without
    smart resizing, converted to a NumPy array and cast to ``int``.
    """
    image_tensor = tf_load_image(
        path=str(fpath),
        image_size=(32, 32),
        num_channels=3,
        interpolation="bilinear",
        smart_resize=False,
    )
    pixels = image_tensor.numpy()
    return pixels.astype(int)
def base64_encode_png(fpath):
    """Return the file at ``fpath`` as a ``data:image/png;base64,...`` URI.

    The file is read in binary mode and its bytes are base64-encoded; the
    content is not checked for actually being a PNG.
    """
    with open(fpath, "rb") as png_file:
        raw_bytes = png_file.read()
    payload = base64.b64encode(raw_bytes).decode()
    return "data:image/png;base64," + payload
def embed(image_paths: Sequence[Union[str, Path]]) -> np.ndarray:
    """Embed each image with a truncated, ImageNet-pretrained ResNet50.

    The network is cut at the ``conv2_block3_out`` layer and followed by
    global average pooling, so every image becomes one feature vector.
    Images are loaded with ``load_img_32`` and pushed through the model one
    at a time.

    Args:
        image_paths: Sized collection of image file paths (``len()`` is used
            to pre-allocate the output and to size the progress bar).

    Returns:
        Array of shape ``(len(image_paths), embedding_dim)`` where row ``i``
        is the embedding of ``image_paths[i]``.
    """
    # make model
    backbone = tf.keras.applications.ResNet50(
        input_shape=(32, 32, 3),
        include_top=False,
        weights="imagenet",
    )
    # Keep only the layers up to and including "conv2_block3_out".
    backbone = tf.keras.Model(
        backbone.inputs, outputs=[backbone.get_layer("conv2_block3_out").output]
    )
    inputs = tf.keras.Input(shape=(32, 32, 3))
    x = tf.keras.applications.resnet.preprocess_input(inputs)
    x = backbone(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    model = tf.keras.Model(inputs, x)

    # Take the embedding width from the model itself (previously a
    # hard-coded 256) so it cannot drift from the chosen output layer.
    embedding_dim = model.output_shape[-1]
    n_images = len(image_paths)
    embeddings = np.zeros((n_images, embedding_dim))

    # ``enumerate`` has no length, so tqdm needs an explicit total to show
    # a percentage and an ETA.
    for idx, img_path in tqdm(enumerate(image_paths), total=n_images):
        img = load_img_32(img_path)
        # The model expects a batch axis; run a batch of one and unwrap it.
        embeddings[idx, :] = model(img.reshape((1, 32, 32, 3))).numpy()[0]
    return embeddings
def reduce_dimensionality(embeddings):
    """Fit a default-configured UMAP on ``embeddings`` and return the projection.

    NOTE(review): the caller in this file stores the result in two columns,
    so the default UMAP output is presumably 2-D -- confirm against umap docs.
    """
    reducer = umap.UMAP()
    return reducer.fit_transform(embeddings)
def load_df(data_dir="data/raw"):
    """Collect every PNG under ``data_dir`` into a DataFrame.

    Columns:
        fpath: path of the PNG file (found recursively).
        base64_encoded_img32: the file as a base64 PNG data URI.
        label: name of the file's parent directory (a roman numeral).
        subset: name of the file's grandparent directory.
        arabic_label: ``label`` converted through ``REVERSED_NUMERALS``,
            with 10 replaced by 0.

    A ``label`` that is not a key of ``REVERSED_NUMERALS`` raises ``KeyError``.
    """
    png_paths = list(Path(data_dir).glob("**/*.png"))
    frame = pd.DataFrame({"fpath": png_paths})

    frame["base64_encoded_img32"] = frame["fpath"].apply(base64_encode_png)
    # Directory layout is read as <subset>/<label>/<file>.png.
    frame["label"] = frame["fpath"].apply(lambda p: p.parent.name)
    frame["subset"] = frame["fpath"].apply(lambda p: p.parent.parent.name)

    numeral_values = frame["label"].apply(lambda r: REVERSED_NUMERALS[r])
    frame["arabic_label"] = numeral_values.replace(10, 0)
    return frame
def altair_plot(
    df,
    x_axis,
    y_axis,
    color,
    text_marker,
    tooltip,
    title,
    img_col="base64_encoded_img32",
):
    """Build a brushable text scatter plot next to a strip of selected images.

    Args:
        df: DataFrame containing every column named by the other arguments.
        x_axis: Column encoded on the x axis.
        y_axis: Column encoded on the y axis.
        color: Column encoded as colour (``dark2`` scheme).
        text_marker: Column whose value is drawn as the text mark.
        tooltip: Iterable of column names shown in the tooltip.
        title: Title of the scatter chart.
        img_col: Column holding the image URL / data URI for each row.

    Returns:
        The scatter chart (with an interval selection attached) concatenated
        horizontally with a chart showing images of the brushed rows.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    tooltip = list(tooltip)  # accept tuples and other iterables, not only lists

    # One column may serve several roles (e.g. both text marker and tooltip).
    # Keep each name once, in first-seen order, so the sliced frame never
    # carries duplicate column labels.
    requested = [x_axis, y_axis, color, text_marker, img_col, *tooltip]
    ddf = df[list(dict.fromkeys(requested))]

    result = (
        alt.Chart(ddf)
        .mark_text(size=10, opacity=0.2)
        .encode(
            x=x_axis,
            y=y_axis,
            color=alt.Color(color, scale=alt.Scale(scheme="dark2")),
            tooltip=tooltip,
            text=text_marker,
        )
        .properties(title=title)
    )

    brush = alt.selection(type="interval")

    ranked_img = (
        alt.Chart(ddf)
        .mark_image(width=32, height=32)
        .encode(
            y=alt.Y("row_number:O", axis=None),
            url=img_col,
        )
        # Number all rows, keep only the brushed ones, then rank the
        # survivors so the strip can be capped.
        .transform_window(row_number="row_number()")
        .transform_filter(brush)
        .transform_window(rank="rank(row_number)")
        # Strictly-less-than: only rows ranked below 20 are shown.
        .transform_filter(alt.datum.rank < 20)
        .properties(width=50, title="Img Selection")
    )
    return result.add_selection(brush) | ranked_img
if __name__ == "__main__":
    # Pipeline: index the PNGs, embed them, project the embeddings to two
    # columns, then write an interactive chart to HTML.
    df = load_df("./data/raw/")
    embeds = embed(df["fpath"])
    df[["dim1", "dim2"]] = reduce_dimensionality(embeds)
    fig = altair_plot(
        df,
        x_axis="dim1",
        y_axis="dim2",
        color="subset",
        text_marker="label",
        # The column created by load_df is "subset"; the previous "subest"
        # spelling raised a KeyError when the columns were selected.
        tooltip=["label", "subset"],
        title="The standard Train/Validation split has style differences.",
    )
    fig.save("my_figure.html")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment