Huggingface Text Embedding Inference on Modal Labs
import subprocess

from modal import Image, Secret, Stub, gpu, web_server

MODEL_ID = "BAAI/bge-small-en-v1.5"
PORT = 8080
DOCKER_IMAGE = "ghcr.io/huggingface/text-embeddings-inference:86-0.4.0"  # 86 = Ampere (A10G) build

stub = Stub("text-embeddings-inference-2")
GPU_CONFIG = gpu.A10G()

tei_image = (
    Image.from_registry(DOCKER_IMAGE, add_python="3.10")
    # Clear the image's entrypoint so Modal can launch its own runtime.
    .dockerfile_commands("ENTRYPOINT []")
    # Optionally bake the weights into the image (download_model is not defined in this snippet):
    # .run_function(download_model, timeout=60 * 20, secrets=[Secret.from_name("huggingface-secret")])
)

@stub.function(
    image=tei_image,
    gpu=GPU_CONFIG,
    concurrency_limit=1,
)
@web_server(port=PORT, startup_timeout=120)
def run_server():
    # Start the TEI router; @web_server waits until PORT accepts connections.
    cmd = f"text-embeddings-router --model-id {MODEL_ID} --hostname 0.0.0.0 --port {PORT}"
    subprocess.Popen(cmd, shell=True)
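Once deployed, Modal prints the public URL for run_server. As a quick liveness check, here is a sketch assuming TEI's documented /health route (the URL below is a placeholder for the one Modal prints):

import requests

MODAL_URL = "https://your-workspace--text-embeddings-inference-2-run-server.modal.run"  # placeholder

# Returns 200 once the router is up and the model weights are loaded.
print(requests.get(f"{MODAL_URL}/health", timeout=10).status_code)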
This gives you an endpoint that is OpenAI-compatible: you can use the regular openai client, except that queries are routed to your endpoint instead of OpenAI's.
from openai import OpenAI

client = OpenAI(
    api_key="EMPTY",  # default for HF text-embeddings-inference; the key is not checked
    base_url="url_from_modal",  # the URL Modal prints for run_server
)

MODEL = "BAAI/bge-small-en-v1.5"  # the model the server was started with

res = client.embeddings.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
    model=MODEL,
)

# we can extract embeddings to a list
embeds = [record.embedding for record in res.data]
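As a quick sanity check on embeds, a sketch assuming numpy is installed:

import numpy as np

a, b = np.array(embeds[0]), np.array(embeds[1])
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity: {cosine:.3f}")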
Or like this:

from openai import OpenAI

MODEL_NAME = "BAAI/bge-small-en-v1.5"

client = OpenAI(
    api_key="EMPTY",  # TEI does not validate the key
    base_url="url_from_modal",
)

def get_embeddings(texts, model=MODEL_NAME):
    texts = [text.replace("\n", " ") for text in texts]
    outputs = client.embeddings.create(input=texts, model=model)
    return [outputs.data[i].embedding for i in range(len(texts))]

embeddings = get_embeddings(["first chunk of text", "second chunk of text"])
Or you can call the TEI HTTP API directly, as in the Text Embeddings Inference quick tour:
# replace 127.0.0.1:8080 with your Modal endpoint
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
https://huggingface.co/docs/text-embeddings-inference/en/quick_tour
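The same /embed route works from Python too; a sketch with requests (again, the URL is a placeholder for your Modal endpoint):

import requests

url = "https://your-workspace--text-embeddings-inference-2-run-server.modal.run"  # placeholder

resp = requests.post(
    f"{url}/embed",
    json={"inputs": "What is Deep Learning?"},
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()[0]  # TEI returns one vector per input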
I hope this helps!
Hey - how can you call this from another code base then? Like if I want to pass a list of chunks from a codebase and get back the list of embeddings? Also how would you launch the web server? Would you use something like modal deploy or modal run?
modal deploy hf-tei-modal-labs.py

modal deploy publishes a persistent endpoint; modal serve spins up a temporary one for development, and modal run is for one-off function runs.
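And to answer the first question: here is a minimal sketch that passes a list of chunks from another codebase and gets the embeddings back, reusing the OpenAI-compatible route from above (the URL is a placeholder for whatever modal deploy prints):

from openai import OpenAI

# Placeholder URL: substitute the one `modal deploy` prints for run_server.
client = OpenAI(
    api_key="EMPTY",
    base_url="https://your-workspace--text-embeddings-inference-2-run-server.modal.run",
)

def embed_chunks(chunks):
    res = client.embeddings.create(input=chunks, model="BAAI/bge-small-en-v1.5")
    return [d.embedding for d in res.data]

vectors = embed_chunks(["def foo(): ...", "class Bar: ..."])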