Skip to content

Instantly share code, notes, and snippets.

Hannes Hapke hanneshapke

Block or report user

Report or block hanneshapke

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
hanneshapke / tfx-pipeline-for-bert-preprocessing.ipynb
Last active Feb 20, 2020
TFX Pipeline for Bert Preprocessing.ipynb
View tfx-pipeline-for-bert-preprocessing.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import base64
import googleapiclient.discovery
from example_pb2 import Example
from feature_pb2 import BytesList, Feature, Features
def _convert_to_pb(value):
""" Serialize a given sentence to the ProtoBuf Structure required to model the tf.Example data structure.
Feel free to add more features and different data types if your model requires different inputs. An overview of
View .bashrc
export CURRENT_DEV=kreuzberg
alias latest_dev='cd ~/development/$CURRENT_DEV'
# ssh tunnel
alias ssd='~/bin/ ubuntu@remote -p 823 -L 6006:gpu:6006'
# add additional paths to the PYTHONPATH
export PYTHONPATH=$PYTHONPATH:~/development/additional_package
# git shortcuts
hanneshapke /
Created May 24, 2018
Load word vectors into a redis db
import bz2
import pickle
from django.conf import settings
from djang_redis import get_redis_connection
from tqdm import tqdm
from .constants import GOOGLE_WORD2VEC_MODEL_NAME
hanneshapke /
Last active May 24, 2018
Mimicking Gensim's KeyedVectors class
import bz2
import numpy as np
import pickle
from django.conf import settings
from django_redis import get_redis_connection
from gensim.models.keyedvectors import KeyedVectors
from .constants import GOOGLE_WORD2VEC_MODEL_NAME
from .redis import load_word2vec_model_into_redis, query_redis
def get_highlighted_tokens(tokens, matrix, model, layer_name, threshold, y_labels):
indices = get_token_indices(model, layer_name, threshold, matrix, y_labels)
ctokens = []
for i, t in enumerate(tokens):
if i in indices.keys():
_color = color(indices[i], threshold=threshold)
ctokens.append(cstr(t, color=_color))
def get_token_indices(model, layer_name, threshold, matrix, y_labels):
heatmap = get_heatmap(model=model, layer_name=layer_name, matrix=matrix, y_labels=y_labels)
_, output_dim = get_conv_layer(model, layer_name)
# depending on the ratio between the input and layer output shape, we need to calculate
# how many original tokens have contributed to the layer output
dim_ratio = matrix.shape[1] / output_dim
if dim_ratio < 1.5:
window_size = 1
hanneshapke /
Last active Apr 16, 2018
Generate text color depending on the heat-map value
def color(hvalue, threshold, max=1, cdefault='black',
          colors=('red', 'yellow', 'green', 'cyan', 'blue')):
    """Map a heat-map value to a color name.

    The interval between ``threshold`` and ``max`` is divided into
    ``len(colors)`` equally wide bands. ``colors[0]`` is used for the band
    closest to ``max`` and ``colors[-1]`` for the band closest to
    ``threshold``.

    Args:
        hvalue: The heat-map value to classify.
        threshold: Values strictly below this get ``cdefault``.
        max: Upper end of the value range (name kept for caller
            compatibility even though it shadows the builtin).
        cdefault: Color returned for values below ``threshold`` or when
            ``colors`` is empty.
        colors: Sequence of color names, ordered from highest band to
            lowest band.

    Returns:
        The color name for ``hvalue``.
    """
    if hvalue < threshold:
        return cdefault

    num_colors = len(colors)
    if num_colors == 0:
        # No palette to choose from; fall back instead of returning None.
        return cdefault

    band_width = (max - threshold) / num_colors
    for i, name in enumerate(colors):
        if hvalue > max - band_width * (i + 1):
            return name

    # Reached when hvalue == threshold (or float rounding puts the last
    # bound marginally above threshold): the value belongs to the lowest band.
    return colors[-1]
hanneshapke /
Created Apr 11, 2018
Generate the text tag with the color attribute
def cstr(s, color='black'):
    """Wrap ``s`` in a ``<text>`` tag whose inline style sets ``color``.

    Neither ``s`` nor ``color`` is HTML-escaped; both are inserted as-is.
    """
    template = '<text style="color:{}">{}</text>'
    return template.format(color, s)
hanneshapke /
Last active Apr 16, 2018
Plot the layer heatmap
def plot_heatmap(heatmap, height_ratio=0.05):
    """Show a heat-map vector as an image strip via ``plt.matshow``.

    The vector is stacked on top of itself so the strip gets a visible
    height relative to its length.

    Args:
        heatmap: Object exposing ``shape[0]`` as its length
            (presumably a 1-D numpy array - confirm against callers).
        height_ratio: Desired strip height as a fraction of the vector length.
    """
    # How often the vector is repeated to reach a height relative to its
    # length. Clamp to at least one row: for short vectors the product
    # truncates to 0, which would hand matshow an empty list.
    repeat_vector_n_times = max(1, int(heatmap.shape[0] * height_ratio))
    plt.matshow([heatmap] * repeat_vector_n_times)
You can’t perform that action at this time.