Mathias Gruber (MathiasGruber)

@MathiasGruber
MathiasGruber / image_to_tfrecords.py
Last active July 15, 2019 19:42
Convert image data to TFRecord files
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
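# A minimal sketch of the conversion idea (the function names, file handling,
# and feature keys below are assumptions, not the gist's actual code): read
# each image file and write it as a tf.train.Example into a TFRecord file.
import tensorflow as tf
from PIL import Image

def _bytes_feature(value):
    """Wrap raw bytes in a tf.train.Feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap an integer in a tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def write_tfrecord(image_paths, output_path="images.tfrecords"):
    """Serialize each image file as one tf.train.Example record."""
    with tf.io.TFRecordWriter(output_path) as writer:
        for path in image_paths:
            with open(path, "rb") as f:
                encoded = f.read()
            width, height = Image.open(path).size
            example = tf.train.Example(features=tf.train.Features(feature={
                "image/encoded": _bytes_feature(encoded),
                "image/height": _int64_feature(height),
                "image/width": _int64_feature(width),
            }))
            writer.write(example.SerializeToString())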
@MathiasGruber
MathiasGruber / delete_corrupt_images.py
Last active July 15, 2019 19:09
Iterates through rootdir and finds corrupt images
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
from skimage import io
from PIL import Image
import os
import gc
import numpy as np
import warnings
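# A hedged sketch of how the check might work (the directory path and the
# exact validation logic are assumptions, not the gist's actual code): try to
# fully decode every file under rootdir in parallel and flag the ones that fail.
def is_corrupt(path):
    """Return True if the image at `path` cannot be opened and decoded."""
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("error")   # treat decode warnings as failures
            Image.open(path).verify()        # cheap structural check
            io.imread(path)                  # full decode with skimage
        return False
    except Exception:
        return True

rootdir = "data/images"  # assumed location of the image tree
paths = [os.path.join(d, f) for d, _, files in os.walk(rootdir) for f in files]
flags = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(is_corrupt)(p) for p in tqdm_notebook(paths)
)
corrupt = [p for p, bad in zip(paths, flags) if bad]
print(f"Found {len(corrupt)} corrupt images out of {len(paths)}")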
@MathiasGruber
MathiasGruber / get_keras_activations.py
Created July 15, 2019 12:05
Given a keras model & some input, show output statistics for each layer
import tensorflow.keras.backend as K

def get_activations(model, model_inputs):
    print('----- activations -----')
    activations = []
    inp = model.input
    model_multi_inputs_cond = True
    if not isinstance(inp, list):
        # only one input! let's wrap it in a list.
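# A hedged sketch of the same idea, separate from the gist's code above:
# build one K.function per layer and print summary statistics of that layer's
# output for the given input (single-input models assumed).
import numpy as np

def layer_output_stats(model, model_inputs):
    inputs = model_inputs if isinstance(model_inputs, list) else [model_inputs]
    for layer in model.layers:
        func = K.function([model.input], [layer.output])
        out = np.asarray(func(inputs)[0])
        print(f"{layer.name}: shape={out.shape} mean={out.mean():.4f} "
              f"std={out.std():.4f} min={out.min():.4f} max={out.max():.4f}")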
@MathiasGruber
MathiasGruber / tf.dataset.boilerplate.py
Created July 16, 2019 03:55
tf.data.Dataset pipeline boilerplate
import tensorflow as tf

features = {
    'image/encoded': tf.FixedLenFeature([], tf.string),
    'image/height': tf.FixedLenFeature([], tf.int64),
    'image/width': tf.FixedLenFeature([], tf.int64)
}

def parse(record, image_size=256):
    # Parse data
    parsed = tf.parse_single_example(record, features)
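    # (Hedged continuation sketch, not the gist's actual code: decode the
    # encoded bytes, resize to a fixed size, and return the image tensor.)
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    image = tf.image.resize_images(image, [image_size, image_size])
    return image

# Wire the parser into a tf.data input pipeline (filename and batch size are assumptions)
dataset = (
    tf.data.TFRecordDataset('images.tfrecords')
    .map(parse, num_parallel_calls=4)
    .shuffle(1024)
    .batch(32)
    .prefetch(1)
)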
@MathiasGruber
MathiasGruber / dimensionality_reduction.py
Last active April 7, 2021 08:34
Running t-SNE, UMAP, TriMAP and PaCMAP
import time
import pandas as pd
import trimap
import umap
import pacmap
from sklearn.manifold import TSNE

# Read data
df = pd.read_csv('your_dataset.csv')

# Algorithms to test
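# (Hedged continuation sketch; the constructor arguments, the timing layout,
# and a purely numeric dataframe are assumptions, not the gist's actual code.)
algorithms = {
    "t-SNE": TSNE(n_components=2),
    "UMAP": umap.UMAP(n_components=2),
    "TriMAP": trimap.TRIMAP(),
    "PaCMAP": pacmap.PaCMAP(n_components=2),
}

# Fit each algorithm on the same data and time it
embeddings = {}
for name, algorithm in algorithms.items():
    start = time.time()
    embeddings[name] = algorithm.fit_transform(df.values)
    print(f"{name}: {time.time() - start:.1f}s")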
@MathiasGruber
MathiasGruber / azure_databricks.tf
Last active May 9, 2021 06:01
Basic setup for deploying a Databricks workspace to Azure
# Deploy to Azure
terraform {
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "=2.46.0"
    }
  }
}
@MathiasGruber
MathiasGruber / mnist_train.py
Last active April 15, 2021 13:26
Training MNIST with mlflow logging on Databricks
"""
Keras MNIST example from: https://keras.io/examples/vision/mnist_convnet/
Adapted to add mlflow logging
"""
import mlflow
import mlflow.keras
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
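# A hedged sketch of the adaptation (the network architecture and
# hyperparameters are assumptions, not the gist's actual values): autolog
# params, metrics, and the model to mlflow while training a small MNIST convnet.
mlflow.keras.autolog()

(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32")[..., np.newaxis] / 255.0
x_test = x_test.astype("float32")[..., np.newaxis] / 255.0

model = keras.Sequential([
    keras.Input(shape=(28, 28, 1)),
    layers.Conv2D(32, kernel_size=3, activation="relu"),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(10, activation="softmax"),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

with mlflow.start_run():
    model.fit(x_train, y_train, batch_size=128, epochs=2, validation_split=0.1)
    model.evaluate(x_test, y_test)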
@MathiasGruber
MathiasGruber / sts_sentence_embedding.py
Last active February 18, 2022 13:51
Embedding questions using a sentence-transformer model
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token dimension
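    # (Hedged continuation sketch, not the gist's actual code: divide by the
    # number of real tokens to finish the mean pooling.)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Usage sketch: embed a few questions with the referenced model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")

questions = ["How do I reset my password?", "Where can I change my password?"]
encoded_input = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])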
@MathiasGruber
MathiasGruber / get_best_match.py
Last active April 19, 2021 19:02
Getting the best match from a set of embeddings that includes the query itself
import numpy as np
from sklearn.preprocessing import normalize

# Use the first question as the query
QUERY_ID = 0

# Normalize the data
norm_data = normalize(sentence_embeddings, norm='l2')

# Calculate scores as the dot product between all embeddings & the query
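# (Hedged continuation sketch, not the gist's exact code: rank by dot product
# over the L2-normalized embeddings and skip the query's own position.)
scores = norm_data @ norm_data[QUERY_ID]
scores[QUERY_ID] = -np.inf  # exclude the query itself from the ranking
best_match = int(np.argmax(scores))
print(f"Best match for question {QUERY_ID}: question {best_match} (score {scores[best_match]:.3f})")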
@MathiasGruber
MathiasGruber / semantic_search_attention.py
Last active April 19, 2021 19:08
Creating a plot showing the attention/context of a semantic search by computing the cosine similarity between word embeddings
# For each sentence, store a list of token embeddings; i.e. a 1024-dimensional vector for each token
token_embeddings = []
for i, sentence in enumerate(valid_sentences):
    tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][i])
    embeddings = model_output[0][i]
    token_embeddings.append(
        [{"token": token, "embedding": embedding.detach().numpy()} for token, embedding in zip(tokens, embeddings)]
    )

def get_token_embeddings(embeddings_word):
    """Returns a list of tokens and list of embeddings"""