Skip to content

Instantly share code, notes, and snippets.

@philschmid
Last active December 25, 2024 20:58
Show Gist options
  • Save philschmid/cb8c98f0781e4e52e5d364ff39e2ccd2 to your computer and use it in GitHub Desktop.
Save philschmid/cb8c98f0781e4e52e5d364ff39e2ccd2 to your computer and use it in GitHub Desktop.
import asyncio
import base64
import json
import os
import pyaudio
from websockets.asyncio.client import connect
class SimpleGeminiVoice:
    """Minimal real-time voice chat client for the Gemini bidirectional API.

    Three coroutines run concurrently once the websocket is set up:

    * ``capture_audio`` reads microphone chunks and sends them to the server,
    * ``stream_audio`` receives server messages and queues decoded audio,
    * ``play_response`` plays queued audio on the default output device.
    """

    def __init__(self):
        # Decoded model audio waiting to be played back.
        self.audio_queue = asyncio.Queue()
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        self.RATE = 16000

    async def start(self) -> None:
        """Connect, send the setup message and run the audio tasks.

        Raises:
            RuntimeError: if ``GEMINI_API_KEY`` was not set when the instance
                was created.
        """
        if not self.api_key:
            # Without this check the URL would contain "key=None" and the
            # handshake would fail with an unrelated-looking error.
            raise RuntimeError("GEMINI_API_KEY environment variable is not set")

        # The context manager closes the websocket when the tasks stop.
        async with connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        ) as ws:
            self.ws = ws
            await ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
            # Wait for the server's reply to the setup message before streaming.
            await ws.recv(decode=False)
            print("Connected to Gemini, You can start talking now")
            # Start audio streaming
            async with asyncio.TaskGroup() as tg:
                tg.create_task(self.capture_audio())
                tg.create_task(self.stream_audio())
                tg.create_task(self.play_response())

    async def capture_audio(self) -> None:
        """Read microphone chunks forever and send each one to the server."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it in a worker thread.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    @staticmethod
    def _extract_audio(response):
        """Return the decoded audio of the first part of a model turn, or None.

        Messages without ``serverContent``/``modelTurn``, with an empty
        ``parts`` list, or whose first part carries no ``inlineData`` yield
        ``None`` instead of raising.
        """
        server_content = response.get("serverContent") or {}
        parts = (server_content.get("modelTurn") or {}).get("parts") or []
        if not parts:
            return None
        encoded = (parts[0].get("inlineData") or {}).get("data")
        if encoded is None:
            return None
        return base64.b64decode(encoded)

    def _drain_audio_queue(self) -> None:
        """Discard every audio chunk that has not been played yet."""
        while not self.audio_queue.empty():
            self.audio_queue.get_nowait()

    async def stream_audio(self) -> None:
        """Receive server messages and queue the audio they carry.

        The playback queue is flushed only when the server flags the turn as
        ``interrupted``. ``turnComplete`` also arrives at the end of every
        normal turn, so flushing on it would drop audio that is still waiting
        to be played and cut the answer short.
        """
        async for msg in self.ws:
            response = json.loads(msg)
            chunk = self._extract_audio(response)
            if chunk is not None:
                self.audio_queue.put_nowait(chunk)

            server_content = response.get("serverContent") or {}
            if server_content.get("interrupted"):
                # The user spoke over the model: stop playing stale audio.
                self._drain_audio_queue()
            if server_content.get("turnComplete"):
                print("\nEnd of turn")

    async def play_response(self) -> None:
        """Play queued audio chunks on the output device, in arrival order."""
        audio = pyaudio.PyAudio()
        # Output is opened at 24000 Hz, unlike the 16000 Hz capture stream.
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        while True:
            data = await self.audio_queue.get()
            # stream.write blocks, so run it in a worker thread.
            await asyncio.to_thread(stream.write, data)
if __name__ == "__main__":
    # Script entry point: build the client and run it until it stops.
    asyncio.run(SimpleGeminiVoice().start())
import asyncio
import base64
import json
import os
import pyaudio
from websockets.asyncio.client import connect
class SimpleGeminiVoice:
    """Minimal real-time voice chat client for the Gemini bidirectional API.

    Two coroutines run concurrently once the websocket is set up:
    ``send_user_audio`` streams microphone chunks to the server and
    ``recv_model_audio`` plays the audio found in server messages as they
    arrive.
    """

    def __init__(self):
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        # Capture sample rate in Hz; must be an int, not a string.
        self.RATE = 16000

    async def start(self) -> None:
        """Connect, send the setup message and run the audio tasks.

        Raises:
            RuntimeError: if ``GEMINI_API_KEY`` was not set when the instance
                was created.
        """
        if not self.api_key:
            # Without this check the URL would contain "key=None" and the
            # handshake would fail with an unrelated-looking error.
            raise RuntimeError("GEMINI_API_KEY environment variable is not set")

        # The context manager closes the websocket when the tasks stop.
        async with connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        ) as ws:
            self.ws = ws
            await ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
            # Wait for the server's reply to the setup message before streaming.
            await ws.recv(decode=False)
            print("Connected to Gemini, You can start talking now")
            # Start audio streaming
            async with asyncio.TaskGroup() as tg:
                tg.create_task(self.send_user_audio())
                tg.create_task(self.recv_model_audio())

    async def send_user_audio(self) -> None:
        """Read microphone chunks forever and send each one to the server."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it in a worker thread.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    @staticmethod
    def _extract_audio(response):
        """Return the decoded audio of the first part of a model turn, or None.

        Messages without ``serverContent``/``modelTurn``, with an empty
        ``parts`` list, or whose first part carries no ``inlineData`` yield
        ``None`` instead of raising.
        """
        server_content = response.get("serverContent") or {}
        parts = (server_content.get("modelTurn") or {}).get("parts") or []
        if not parts:
            return None
        encoded = (parts[0].get("inlineData") or {}).get("data")
        if encoded is None:
            return None
        return base64.b64decode(encoded)

    async def recv_model_audio(self) -> None:
        """Receive server messages and play the audio they carry immediately."""
        audio = pyaudio.PyAudio()
        # Output is opened at 24000 Hz, unlike the 16000 Hz capture stream.
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        async for msg in self.ws:
            chunk = self._extract_audio(json.loads(msg))
            if chunk is None:
                # Not an audio message (for example a turn-complete notice).
                continue
            # stream.write blocks, so run it in a worker thread.
            await asyncio.to_thread(stream.write, chunk)
if __name__ == "__main__":
    # Script entry point: build the client and run it until it stops.
    asyncio.run(SimpleGeminiVoice().start())
@lakshmanok
Copy link

Something wrong with the "turn_complete" logic -- the model interrupts itself in the middle of an answer

@cheesecake100201
Copy link

Yeah keeps happening with me as well. Is there a fix for this ?

@philschmid
Copy link
Author

@lakshmanok @cheesecake100201 You have to make sure you have a good microphone. Any noise, even a very quiet one, will trigger an interruption. I want to improve this with better detection, but for now every noise coming in as input will trigger an interruption.

@cheesecake100201
Copy link

Any way we can stop these interruptions for now ??
@philschmid

@kloklojul
Copy link

Any way we can stop these interruptions for now ?? @philschmid

you can add a noise gate to your microphone setup so sounds below a threshold get canceled.

@avinashgawali
Copy link

Is it possible to add config, system instructions?

@frikazoid11
Copy link

Am I the only one who can't start the script? It just seems to crash (the black command prompt window closes as soon as it runs).

@2187Nick
Copy link

Is it possible to add config, system instructions?

This works to change the voice.
image

image

@avinashgawali
Copy link

Is it possible to add config, system instructions?

This works to change the voice. image

image

And system instruction? Generation config worked for me but not able to set system instructions

@lakshmanok
Copy link

Any way we can stop these interruptions for now ?? @philschmid

I commented out the turn complete code and it works better now ... however Gemini sometimes fills silence by repeating the last phrase until you say something :)
image

@2187Nick
Copy link

Is it possible to add config, system instructions?

image
System instruction works.
And this shows you how to include search.

Fast voice conversation in a pirate voice and she can recall current information..... so nice.

@cheesecake100201
Copy link

cheesecake100201 commented Dec 13, 2024

Here for the system instruction shouldn't the role be system rather than user?
@2187Nick

@boozuk
Copy link

boozuk commented Dec 13, 2024

Any way we can stop these interruptions for now ?? @philschmid

I commented out the turn complete code and it works better now ... however Gemini sometimes fills silence by repeating the last phrase until you say something :) image

with this code i can not interrupt his speech. If you ask him to speak for 1 hour about nature, the only way to interrupt is ctrl+c

@boozuk
Copy link

boozuk commented Dec 13, 2024

And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.

@KorowaLisa
Copy link

And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.

Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.

@boozuk
Copy link

boozuk commented Dec 13, 2024

And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.

Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.

Просто везде пишут, что он добавлен уже. И я подумал может это политика гугла такая.

@2187Nick
Copy link

Here for the system instruction shouldn't the role be system rather than user? @2187Nick

Im just winging it. Based it off of python-genai/google/genai/tests/live/live_test.py
image
But now the responses have slowed way down. Basically unusable until they get it back running faster.

@paimonian
Copy link

I would like to use it in a voice channel through discord.py, but I can't seem to get it to work.

@saharmor
Copy link

I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground
You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk

@avinashgawali
Copy link

I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk

Thank you

@KorowaLisa
Copy link

And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.

Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.

Просто везде пишут, что он добавлен уже. И я подумал может это политика гугла такая.

Пока нет. Ждем 2025. Середина января/начало февраля.

@jlia0
Copy link

jlia0 commented Dec 16, 2024

Perhaps we can use VAD to filter the noise from the audio and only send voice? @philschmid

@saharmor
Copy link

@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground

@jlia0
Copy link

jlia0 commented Dec 16, 2024

@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground

thanks so much 🫶

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment