Mathias Gruber (MathiasGruber)

@MathiasGruber
MathiasGruber / image_to_tfrecords.py
Last active July 15, 2019 19:42
Convert image data to TFRecord files
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
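# A minimal sketch of the conversion idea (the function names, file handling,
# and feature keys below are assumptions, not the gist's actual code): read
# each image file and write it as a tf.train.Example into a TFRecord file.
import tensorflow as tf
from PIL import Image

def _bytes_feature(value):
    """Wrap raw bytes in a tf.train.Feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap an integer in a tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def write_tfrecord(image_paths, output_path="images.tfrecords"):
    """Serialize each image file as one tf.train.Example record."""
    with tf.io.TFRecordWriter(output_path) as writer:
        for path in image_paths:
            with open(path, "rb") as f:
                encoded = f.read()
            width, height = Image.open(path).size
            example = tf.train.Example(features=tf.train.Features(feature={
                "image/encoded": _bytes_feature(encoded),
                "image/height": _int64_feature(height),
                "image/width": _int64_feature(width),
            }))
            writer.write(example.SerializeToString())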
@MathiasGruber
MathiasGruber / delete_corrupt_images.py
Last active July 15, 2019 19:09
Iterates through rootdir and finds corrupt images
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
from skimage import io
from PIL import Image
import os
import gc
import numpy as np
import warnings
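# A hedged sketch of how the check might work (the directory path and the
# exact validation logic are assumptions, not the gist's actual code): try to
# fully decode every file under rootdir in parallel and flag the ones that fail.
def is_corrupt(path):
    """Return True if the image at `path` cannot be opened and decoded."""
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("error")   # treat decode warnings as failures
            Image.open(path).verify()        # cheap structural check
            io.imread(path)                  # full decode with skimage
        return False
    except Exception:
        return True

rootdir = "data/images"  # assumed location of the image tree
paths = [os.path.join(d, f) for d, _, files in os.walk(rootdir) for f in files]
flags = Parallel(n_jobs=multiprocessing.cpu_count())(
    delayed(is_corrupt)(p) for p in tqdm_notebook(paths)
)
corrupt = [p for p, bad in zip(paths, flags) if bad]
print(f"Found {len(corrupt)} corrupt images out of {len(paths)}")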
@MathiasGruber
MathiasGruber / get_keras_activations.py
Created July 15, 2019 12:05
Given a keras model & some input, show output statistics for each layer
import tensorflow.keras.backend as K

def get_activations(model, model_inputs):
    print('----- activations -----')
    activations = []
    inp = model.input
    model_multi_inputs_cond = True
    if not isinstance(inp, list):
        # only one input! let's wrap it in a list.
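# A hedged sketch of the same idea, separate from the gist's code above:
# build one K.function per layer and print summary statistics of that layer's
# output for the given input (single-input models assumed).
import numpy as np

def layer_output_stats(model, model_inputs):
    inputs = model_inputs if isinstance(model_inputs, list) else [model_inputs]
    for layer in model.layers:
        func = K.function([model.input], [layer.output])
        out = np.asarray(func(inputs)[0])
        print(f"{layer.name}: shape={out.shape} mean={out.mean():.4f} "
              f"std={out.std():.4f} min={out.min():.4f} max={out.max():.4f}")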
@MathiasGruber
MathiasGruber / tf.dataset.boilerplate.py
Created July 16, 2019 03:55
tf.data.Dataset pipeline boilerplate
import tensorflow as tf

features = {
    'image/encoded': tf.FixedLenFeature([], tf.string),
    'image/height': tf.FixedLenFeature([], tf.int64),
    'image/width': tf.FixedLenFeature([], tf.int64)
}

def parse(record, image_size=256):
    # Parse data
    parsed = tf.parse_single_example(record, features)
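    # (Hedged continuation sketch, not the gist's actual code: decode the
    # encoded bytes, resize to a fixed size, and return the image tensor.)
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    image = tf.image.resize_images(image, [image_size, image_size])
    return image

# Wire the parser into a tf.data input pipeline (filename and batch size are assumptions)
dataset = (
    tf.data.TFRecordDataset('images.tfrecords')
    .map(parse, num_parallel_calls=4)
    .shuffle(1024)
    .batch(32)
    .prefetch(1)
)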
@MathiasGruber
MathiasGruber / dimensionality_reduction.py
Last active April 7, 2021 08:34
Running t-SNE, UMAP, TriMAP and PaCMAP
import time
import pandas as pd
import trimap
import umap
import pacmap
from sklearn.manifold import TSNE

# Read data
df = pd.read_csv('your_dataset.csv')

# Algorithms to test
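# (Hedged continuation sketch; the constructor arguments, the timing layout,
# and a purely numeric dataframe are assumptions, not the gist's actual code.)
algorithms = {
    "t-SNE": TSNE(n_components=2),
    "UMAP": umap.UMAP(n_components=2),
    "TriMAP": trimap.TRIMAP(),
    "PaCMAP": pacmap.PaCMAP(n_components=2),
}

# Fit each algorithm on the same data and time it
embeddings = {}
for name, algorithm in algorithms.items():
    start = time.time()
    embeddings[name] = algorithm.fit_transform(df.values)
    print(f"{name}: {time.time() - start:.1f}s")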
@MathiasGruber
MathiasGruber / azure_databricks.tf
Last active May 9, 2021 06:01
Basic setup for deploying a Databricks workspace to Azure
# Deploy to Azure
terraform {
  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "=2.46.0"
    }
  }
}
@MathiasGruber
MathiasGruber / mnist_train.py
Last active April 15, 2021 13:26
Training MNIST with mlflow logging on Databricks
"""
Keras MNIST example from: https://keras.io/examples/vision/mnist_convnet/
Adapted to add mlflow logging
"""
import mlflow
import mlflow.keras
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
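# A hedged sketch of the adaptation (the network architecture and
# hyperparameters are assumptions, not the gist's actual values): autolog
# params, metrics, and the model to mlflow while training a small MNIST convnet.
mlflow.keras.autolog()

(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_train = x_train.astype("float32")[..., np.newaxis] / 255.0
x_test = x_test.astype("float32")[..., np.newaxis] / 255.0

model = keras.Sequential([
    keras.Input(shape=(28, 28, 1)),
    layers.Conv2D(32, kernel_size=3, activation="relu"),
    layers.MaxPooling2D(pool_size=2),
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(10, activation="softmax"),
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

with mlflow.start_run():
    model.fit(x_train, y_train, batch_size=128, epochs=2, validation_split=0.1)
    model.evaluate(x_test, y_test)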
@MathiasGruber
MathiasGruber / sts_sentence_embedding.py
Last active February 18, 2022 13:51
Embedding questions using a sentence-transformer model
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token dimension
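    # (Hedged continuation sketch, not the gist's actual code: divide by the
    # number of real tokens to finish the mean pooling.)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Usage sketch: embed a few questions with the referenced model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-distilroberta-base-v1")

questions = ["How do I reset my password?", "Where can I change my password?"]
encoded_input = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    model_output = model(**encoded_input)
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])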
@MathiasGruber
MathiasGruber / get_best_match.py
Last active April 19, 2021 19:02
Getting the best match from a set of embeddings that includes the query itself
import numpy as np
from sklearn.preprocessing import normalize

# Use the first question as the query
QUERY_ID = 0

# Normalize the data
norm_data = normalize(sentence_embeddings, norm='l2')

# Calculate scores as the dot product between all embeddings & the query
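# (Hedged continuation sketch, not the gist's exact code: rank by dot product
# over the L2-normalized embeddings and skip the query's own position.)
scores = norm_data @ norm_data[QUERY_ID]
scores[QUERY_ID] = -np.inf  # exclude the query itself from the ranking
best_match = int(np.argmax(scores))
print(f"Best match for question {QUERY_ID}: question {best_match} (score {scores[best_match]:.3f})")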
@MathiasGruber
MathiasGruber / semantic_search_attention.py
Last active April 19, 2021 19:08
Creating a plot showing the attention/context of a semantic search by computing the cosine similarity between word embeddings
# For each sentence, store a list of token embeddings; i.e. a 1024-dimensional vector for each token
token_embeddings = []
for i, sentence in enumerate(valid_sentences):
    tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][i])
    embeddings = model_output[0][i]
    token_embeddings.append(
        [{"token": token, "embedding": embedding.detach().numpy()} for token, embedding in zip(tokens, embeddings)]
    )

def get_token_embeddings(embeddings_word):
    """Returns a list of tokens and list of embeddings"""