LiveKit Voice Assistant with Cartesia
import asyncio
import json

from livekit import rtc
from livekit.agents import JobContext, WorkerOptions, cli, JobProcess
from livekit.agents.llm import (
    ChatContext,
    ChatMessage,
)
from livekit.agents.voice_assistant import VoiceAssistant
from livekit.agents.log import logger
from livekit.plugins import deepgram, silero, cartesia, openai

from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime

from dotenv import load_dotenv

load_dotenv()


class Voice(BaseModel):
    id: str
    user_id: Optional[str] = None
    is_public: bool
    name: str
    description: str
    created_at: datetime
    embedding: List[float]


def prewarm(proc: JobProcess):
    # preload the Silero VAD model once per worker process so jobs start faster
    proc.userdata["vad"] = silero.VAD.load()


async def entrypoint(ctx: JobContext):
    initial_ctx = ChatContext(
        messages=[
            ChatMessage(
                role="system",
                content="You are a voice assistant created by LiveKit. Your interface with users will be voice. Pretend we're having a conversation, no special formatting or headings, just natural speech.",
            )
        ]
    )

    tts = cartesia.TTS(
        voice="248be419-c632-4f23-adf1-5324ed7dbf1d",
    )

    assistant = VoiceAssistant(
        vad=ctx.proc.userdata["vad"],
        stt=deepgram.STT(),
        llm=openai.LLM(model="gpt-4o-mini"),
        tts=tts,
        chat_ctx=initial_ctx,
    )

    # track speaking state so a voice preview is only spoken when no one is talking
    is_user_speaking = False
    is_agent_speaking = False

    @ctx.room.on("participant_attributes_changed")
    def on_participant_attributes_changed(
        changed_attributes: dict[str, str], participant: rtc.Participant
    ):
        # ignore agent state changes
        if participant == ctx.room.local_participant:
            return

        if "voice" in changed_attributes:
            voice = changed_attributes["voice"]
            logger.info(
                f"participant {participant.identity} requested voice change: {voice}"
            )
            voice_data = json.loads(voice)
            if "embedding" in voice_data:
                model = "sonic-english"
                language = "en"
                if "language" in voice_data and voice_data["language"] != "en":
                    language = voice_data["language"]
                    model = "sonic-multilingual"

                # point the existing Cartesia TTS instance at the new voice
                # (updates its internal options in place)
                tts._opts.voice = voice_data["embedding"]
                tts._opts.model = model
                tts._opts.language = language
                if not (is_agent_speaking or is_user_speaking):
                    asyncio.create_task(
                        assistant.say("How do I sound now?", allow_interruptions=True)
                    )

    await ctx.connect()

    @assistant.on("agent_started_speaking")
    def agent_started_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = True

    @assistant.on("agent_stopped_speaking")
    def agent_stopped_speaking():
        nonlocal is_agent_speaking
        is_agent_speaking = False

    @assistant.on("user_started_speaking")
    def user_started_speaking():
        nonlocal is_user_speaking
        is_user_speaking = True

    @assistant.on("user_stopped_speaking")
    def user_stopped_speaking():
        nonlocal is_user_speaking
        is_user_speaking = False

    assistant.start(ctx.room)
    await asyncio.sleep(1)
    await assistant.say("Hi there, how are you doing today?", allow_interruptions=True)


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
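
The attribute handler above expects a client to publish its requested voice as a JSON-encoded participant attribute named "voice", containing at least an "embedding" list and optionally a "language" code. The sketch below illustrates that payload shape from a Python client; the connection setup, the voice values, and the set_attributes() call are assumptions about the client side, not part of this gist.

import json
from livekit import rtc  # assumes the client also uses the Python rtc SDK

async def request_voice_change(room: rtc.Room) -> None:
    # shape matches what on_participant_attributes_changed() parses:
    # "embedding" is required; a non-"en" "language" switches the agent
    # to Cartesia's multilingual model
    payload = {
        "id": "my-cloned-voice",           # hypothetical voice id
        "embedding": [0.01, -0.42, 0.13],  # placeholder values, not a real Cartesia embedding
        "language": "fr",
    }
    # assumption: LocalParticipant.set_attributes() is available in the rtc SDK version in use
    await room.local_participant.set_attributes({"voice": json.dumps(payload)})

To try it end to end, run the worker with the livekit-agents CLI that cli.run_app() wires up, typically `python <this_file>.py dev` for local development or `start` for production; the exact subcommands depend on your livekit-agents version.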