Skip to content

Instantly share code, notes, and snippets.

View tomaarsen's full-sized avatar

Tom Aarsen tomaarsen

View GitHub Profile
@tomaarsen
tomaarsen / update_e5_nl.py
Created September 24, 2025 16:08
Script to update all E5-NL models to be nicely integrated with Sentence Transformers
import re
from huggingface_hub import get_collection, ModelCard
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Normalize
collection = get_collection(collection_slug="clips/e5-nl-68be9d3760240ce5c7d9f831")
ST_SNIPPET_PATTERN = r"""\
from sentence_transformers import SentenceTransformer
model = SentenceTransformer\((?:'|")([a-zA-Z0-9_\/\.-]+?)(?:'|")\)
@tomaarsen
tomaarsen / train_script.py
Created August 28, 2025 14:30
MS MARCO Contrastive and/or Distillation sample training script
import argparse
import logging
import traceback
from collections import defaultdict
from collections.abc import Iterable
from enum import Enum, auto
import torch
from datasets import load_dataset
from torch import Tensor
@tomaarsen
tomaarsen / train_script.py
Created July 9, 2025 13:00
Boilerplate to train a reranker model using Sentence Transformers
import logging
import traceback
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import (
CrossEncoder,
CrossEncoderModelCardData,
@tomaarsen
tomaarsen / train_script.py
Created July 9, 2025 12:59
Boilerplate to train a Sparse Embedding model (SPLADE architecture) using Sentence Transformers
import logging
from datasets import load_dataset
from sentence_transformers import (
SparseEncoder,
SparseEncoderModelCardData,
SparseEncoderTrainer,
SparseEncoderTrainingArguments,
)
@tomaarsen
tomaarsen / export_locally.py
Last active July 23, 2025 06:47
Export Sentence Transformer models to ONNX (+ optimization, quantization) & OpenVINO
# Export a Sentence Transformers model to ONNX and save it locally.
# requires sentence_transformers>=3.2.0
# NOTE(review): export_optimized_onnx_model / export_dynamic_quantized_onnx_model are
# imported but not used in the visible lines — presumably used further down; confirm.
from sentence_transformers import SentenceTransformer, export_optimized_onnx_model, export_dynamic_quantized_onnx_model
# The model to export to ONNX (+ optimize, quantize), OpenVINO
model_id = "mixedbread-ai/mxbai-embed-large-v1"
# Where to save the exported models locally
output_dir = model_id.replace("/", "-")  # e.g. "mixedbread-ai-mxbai-embed-large-v1" (filesystem-safe)
# backend="onnx" with model_kwargs={"export": True} triggers an on-the-fly ONNX export of the model
onnx_model = SentenceTransformer(model_id, backend="onnx", model_kwargs={"export": True})
# Writes the model (including the exported ONNX weights) into output_dir
onnx_model.save_pretrained(output_dir)
@tomaarsen
tomaarsen / snowflake_arctic_trust_remote_code.ipynb
Created April 24, 2024 15:16
Snowflake_Arctic_trust_remote_code.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Setup for evaluating a SentenceTransformer on the STS-Benchmark test split.
from functools import partial
import datasets
from sentence_transformers import (
    SentenceTransformer,
    evaluation,
)
from torch.nn import functional as F
# Load the STS-Benchmark test split (sentence pairs with similarity scores) from the Hub
stsb = datasets.load_dataset("mteb/stsbenchmark-sts", split="test")
@tomaarsen
tomaarsen / demo.py
Created December 7, 2023 20:54
Attention Sinks in `transformers` showcase
# Showcase of attention sinks in `transformers` via SinkCache with a Llama-2 model.
from transformers import AutoTokenizer, SinkCache, LlamaForCausalLM, TextStreamer
import torch
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Load in fp16 and let accelerate place the weights ("auto" device map)
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
)
# Tokenize the prompt and move the tensors to the model's device
inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device)
# Sliding-window KV cache that always keeps the first 4 "sink" tokens plus a 256-token window;
# presumably passed to model.generate(...) further down — the gist preview is truncated here.
cache = SinkCache(window_length=256, num_sink_tokens=4)
@tomaarsen
tomaarsen / handler.py
Created September 20, 2023 08:56
SpanMarker handler.py for Inference Endpoints
from typing import Any, Dict, List
from span_marker import SpanMarkerModel
class EndpointHandler:
    """Custom handler for Hugging Face Inference Endpoints wrapping a SpanMarker model.

    NOTE(review): only ``__init__`` is visible in this preview; the ``__call__``
    method Inference Endpoints invokes per request is presumably defined below —
    confirm against the full gist.
    """

    def __init__(self, model_id: str) -> None:
        # Load the pretrained SpanMarker (named-entity / span classification) model
        self.model = SpanMarkerModel.from_pretrained(model_id)
        # Try to place it on CUDA, do nothing if it fails
        self.model.try_cuda()
@tomaarsen
tomaarsen / train_span_marker_keyphrase.py
Created August 9, 2023 16:46
Keyphrase extraction model with SpanMarker
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
def main() -> None:
# Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
dataset = load_dataset("midas/inspec", "extraction")
dataset = dataset.rename_columns({"document": "tokens", "doc_bio_tags": "ner_tags"})
# Map string labels to integer labels instead