-
-
Save philschmid/cb8c98f0781e4e52e5d364ff39e2ccd2 to your computer and use it in GitHub Desktop.
import asyncio | |
import base64 | |
import json | |
import os | |
import pyaudio | |
from websockets.asyncio.client import connect | |
class SimpleGeminiVoice:
    """Minimal full-duplex voice client for the Gemini Live (BidiGenerateContent) websocket API.

    Captures microphone audio, streams it to the model over a websocket, and
    plays the model's audio replies through the speakers. Three coroutines run
    concurrently: capture_audio (mic -> websocket), stream_audio (websocket ->
    queue), and play_response (queue -> speakers).
    """

    def __init__(self):
        # Queue of raw PCM byte chunks received from the model;
        # produced by stream_audio(), consumed by play_response().
        self.audio_queue = asyncio.Queue()
        # API key is read from the environment; None if unset (the connect
        # call will then fail server-side — no local validation is done here).
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16  # 16-bit signed PCM
        self.CHANNELS = 1  # mono capture
        self.CHUNK = 512  # frames per microphone read
        self.RATE = 16000  # microphone sample rate (Hz) sent to the API

    async def start(self):
        """Open the websocket, send the setup message, then run the audio loops forever."""
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        # First message must be the session setup naming the model.
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        # Wait for (and discard) the server's setup acknowledgement;
        # decode=False returns raw bytes instead of str.
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        # TaskGroup cancels the sibling tasks if any one of them fails.
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.capture_audio())
            tg.create_task(self.stream_audio())
            tg.create_task(self.play_response())

    async def capture_audio(self):
        """Read microphone chunks and forward them to the websocket as base64 PCM.

        Runs until the task is cancelled; the PyAudio stream is never
        explicitly closed (acceptable for a demo script that runs for the
        process lifetime).
        """
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it in a worker thread to keep the
            # event loop responsive.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def stream_audio(self):
        """Receive server messages, enqueue model audio, and flush the queue on turn end."""
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                # Messages without audio raise KeyError and are skipped below.
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                self.audio_queue.put_nowait(base64.b64decode(audio_data))
            except KeyError:
                pass
            try:
                turn_complete = response["serverContent"]["turnComplete"]
            except KeyError:
                pass
            else:
                if turn_complete:
                    # If you interrupt the model, it sends an end_of_turn. For interruptions to work, we need to empty out the audio queue
                    print("\nEnd of turn")
                    while not self.audio_queue.empty():
                        self.audio_queue.get_nowait()

    async def play_response(self):
        """Dequeue PCM chunks and write them to the speaker stream forever."""
        audio = pyaudio.PyAudio()
        # NOTE(review): playback uses 24000 Hz while capture uses self.RATE
        # (16000 Hz) — presumably the model replies at 24 kHz; confirm against
        # the Live API docs before changing either constant.
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        while True:
            data = await self.audio_queue.get()
            # Blocking write is offloaded to a thread, same as capture.
            await asyncio.to_thread(stream.write, data)
if __name__ == "__main__":
    # Entry point: build the client and drive the async session to completion.
    asyncio.run(SimpleGeminiVoice().start())
import asyncio | |
import base64 | |
import json | |
import os | |
import pyaudio | |
from websockets.asyncio.client import connect | |
class SimpleGeminiVoice:
    """Simplified two-task voice client for the Gemini Live (BidiGenerateContent) API.

    One coroutine streams microphone audio to the model (send_user_audio);
    the other receives model audio and plays it immediately (recv_model_audio).
    Unlike the queue-based variant, playback is direct, so there is no
    turn-complete / interruption handling here.
    """

    def __init__(self):
        # API key is read from the environment; None if unset (the connect
        # call will then fail server-side — no local validation is done here).
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16  # 16-bit signed PCM
        self.CHANNELS = 1  # mono capture
        self.CHUNK = 512  # frames per microphone read
        # BUG FIX: the capture stream was opened with rate="16000" (a string);
        # pyaudio requires an int sample rate and raises a TypeError otherwise.
        # Restored as an int constant, matching the original 3-task version.
        self.RATE = 16000

    async def start(self):
        """Open the websocket, send the setup message, then run both audio loops."""
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        # First message must be the session setup naming the model.
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        # Wait for (and discard) the server's setup acknowledgement;
        # decode=False returns raw bytes instead of str.
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        # TaskGroup cancels the sibling task if either one fails.
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.send_user_audio())
            tg.create_task(self.recv_model_audio())

    async def send_user_audio(self):
        """Read microphone chunks and forward them to the websocket as base64 PCM."""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,  # was the string "16000" — see __init__ note
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            # stream.read blocks, so run it in a worker thread to keep the
            # event loop responsive.
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def recv_model_audio(self):
        """Receive server messages and play any model audio immediately."""
        audio = pyaudio.PyAudio()
        # NOTE(review): playback uses 24000 Hz while capture uses self.RATE
        # (16000 Hz) — presumably the model replies at 24 kHz; confirm against
        # the Live API docs before changing either constant.
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                # Messages without audio raise KeyError and are skipped.
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                # Blocking write is offloaded to a thread, same as capture.
                await asyncio.to_thread(stream.write, base64.b64decode(audio_data))
            except KeyError:
                pass
if __name__ == "__main__":
    # Script entry point: start the bidirectional voice session.
    voice_client = SimpleGeminiVoice()
    asyncio.run(voice_client.start())
Yeah keeps happening with me as well. Is there a fix for this ?
@lakshmanok @cheesecake100201 You have to make sure you have a good microphone. Any noise, even a very quiet one, will trigger an interruption. I want to work on improving this with some better detection. But for now every noise coming in as input will trigger an interruption.
Any way we can stop these interruptions for now ??
@philschmid
Any way we can stop these interruptions for now ?? @philschmid
you can add a noise gate to your microphone setup so sounds below a threshold get canceled.
Is it possible to add config, system instructions?
am I the only one who can't start the script? it just seems to crash (the black command prompt box closes on run)
Any way we can stop these interruptions for now ?? @philschmid
I commented out the turn complete code and it works better now ... however Gemini sometimes fills silence by repeating the last phrase until you say something :)
Here for the system instruction shouldn't the role be system rather than user?
@2187Nick
Any way we can stop these interruptions for now ?? @philschmid
I commented out the turn complete code and it works better now ... however Gemini sometimes fills silence by repeating the last phrase until you say something :)
with this code i can not interrupt his speech. If you ask him to speak for 1 hour about nature, the only way to interrupt is ctrl+c
And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.
And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.
Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.
And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.
Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.
Просто везде пишут, что он добавлен уже. И я подумал может это политика гугла такая.
Here for the system instruction shouldn't the role be system rather than user? @2187Nick
Im just winging it. Based it off of python-genai/google/genai/tests/live/live_test.py
But now the responses have slowed way down. Basically unusable until they get it back running faster.
I would like to use it in a voice channel through discord.py, but I can't seem to get it to work.
I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground
You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk
I've created a playground repository you can run and fork easily https://github.com/saharmor/gemini-multimodal-playground You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk
Thank you
And he don't speak Russian language. He understand Russian, but answers Chinese or something like that instead of Russian.
Думаю русский добавят чуть позже, как было с LearnLM. Он поначалу тоже прикидывался в незнании русского.
Просто везде пишут, что он добавлен уже. И я подумал может это политика гугла такая.
Пока нет. Ждем 2025. Середина января/начало февраля.
Perhaps we can use VAD to filter the noise form the audio and only send voice? @philschmid
@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground
@jlia0 this is already implemented in the standalone script here https://github.com/saharmor/gemini-multimodal-playground
thanks so much 🫶
Something is wrong with the "turn_complete" logic -- the model interrupts itself in the middle of an answer