Skip to content

Instantly share code, notes, and snippets.

@davidzhao
Last active November 8, 2024 00:50
Show Gist options
  • Save davidzhao/5738f0e2d434dea6e5224262ee5c3cfa to your computer and use it in GitHub Desktop.
LiveKit Voice Assistant with Cartesia
import asyncio
import json
from livekit import rtc
from livekit.agents import JobContext, WorkerOptions, cli, JobProcess
from livekit.agents.llm import (
ChatContext,
ChatMessage,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.agents.log import logger
from livekit.plugins import deepgram, silero, cartesia, openai
from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
class Voice(BaseModel):
id: str
user_id: Optional[str] = None
is_public: bool
name: str
description: str
created_at: datetime
embedding: List[float]
def prewarm(proc: JobProcess):
proc.userdata["vad"] = silero.VAD.load()
async def entrypoint(ctx: JobContext):
initial_ctx = ChatContext(
messages=[
ChatMessage(
role="system",
content="You are a voice assistant created by LiveKit. Your interface with users will be voice. Pretend we're having a conversation, no special formatting or headings, just natural speech.",
)
]
)
tts = cartesia.TTS(
voice="248be419-c632-4f23-adf1-5324ed7dbf1d",
)
assistant = VoiceAssistant(
vad=ctx.proc.userdata["vad"],
stt=deepgram.STT(),
llm=openai.LLM(model="gpt-4o-mini"),
tts=tts,
chat_ctx=initial_ctx,
)
is_user_speaking = False
is_agent_speaking = False
@ctx.room.on("participant_attributes_changed")
def on_participant_attributes_changed(
changed_attributes: dict[str, str], participant: rtc.Participant
):
# ignore agent state changes
if participant == ctx.room.local_participant:
return
if "voice" in changed_attributes:
voice = changed_attributes["voice"]
logger.info(
f"participant {participant.identity} requested voice change: {voice}"
)
voice_data = json.loads(voice)
if "embedding" in voice_data:
model = "sonic-english"
language = "en"
if "language" in voice_data and voice_data["language"] != "en":
language = voice_data["language"]
model = "sonic-multilingual"
tts._opts.voice = voice_data["embedding"]
tts._opts.model = model
tts._opts.language = language
if not (is_agent_speaking or is_user_speaking):
asyncio.create_task(
assistant.say("How do I sound now?", allow_interruptions=True)
)
await ctx.connect()
@assistant.on("agent_started_speaking")
def agent_started_speaking():
nonlocal is_agent_speaking
is_agent_speaking = True
@assistant.on("agent_stopped_speaking")
def agent_stopped_speaking():
nonlocal is_agent_speaking
is_agent_speaking = False
@assistant.on("user_started_speaking")
def user_started_speaking():
nonlocal is_user_speaking
is_user_speaking = True
@assistant.on("user_stopped_speaking")
def user_stopped_speaking():
nonlocal is_user_speaking
is_user_speaking = False
assistant.start(ctx.room)
await asyncio.sleep(1)
await assistant.say("Hi there, how are you doing today?", allow_interruptions=True)
if __name__ == "__main__":
cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment