human (Dec 17, 2024, 02:21 PM)
Suggest changes we could make to this based on this documentation - don't change any code yet:
paste.txt
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenAI WebRTC Audio Session</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
}
.audio-indicator {
display: inline-block;
width: 20px;
height: 20px;
border-radius: 50%;
background: #ccc;
margin-right: 10px;
vertical-align: middle;
}
.audio-indicator.active {
background: #4CAF50;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.5; }
100% { opacity: 1; }
}
.controls {
margin: 20px 0;
}
.form-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
font-weight: bold;
}
input {
width: 100%;
padding: 8px;
font-size: 16px;
border: 1px solid #ddd;
border-radius: 4px;
}
button {
background: #007bff;
color: white;
border: none;
padding: 10px 20px;
font-size: 16px;
border-radius: 4px;
cursor: pointer;
}
button:disabled {
background: #ccc;
cursor: not-allowed;
}
.status {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
}
.error {
background: #fee;
color: #c00;
}
.success {
background: #efe;
color: #0a0;
}
</style>
</head>
<body>
<div class="container">
<h1>
<span id="audioIndicator" class="audio-indicator"></span>
OpenAI WebRTC Audio Session
</h1>
<div class="controls">
<div class="form-group">
<label for="tokenInput">OpenAI API Token</label>
<input type="password" id="tokenInput">
</div>
<button id="startButton">Start Session</button>
</div>
<div id="status" class="status"></div>
</div>
<script type="module">
// Text decoder for incoming messages
const decoder = new TextDecoder()
async function createRealtimeSession(inStream, token) {
const pc = new RTCPeerConnection()
// Handle incoming audio
pc.ontrack = e => {
const audio = new Audio()
audio.srcObject = e.streams[0]
audio.play()
}
// Handle incoming text messages via data channel
pc.ondatachannel = event => {
const channel = event.channel
channel.onmessage = msg => {
console.log(decoder.decode(msg.data))
}
}
pc.addTrack(inStream.getTracks()[0])
const offer = await pc.createOffer()
await pc.setLocalDescription(offer)
const headers = {
Authorization: `Bearer ${token}`,
'Content-Type': 'application/sdp'
}
const opts = {
method: 'POST',
body: offer.sdp,
headers
}
const resp = await fetch('https://api.openai.com/v1/realtime', opts)
await pc.setRemoteDescription({
type: 'answer',
sdp: await resp.text()
})
return pc
}
const startButton = document.getElementById('startButton')
const tokenInput = document.getElementById('tokenInput')
const status = document.getElementById('status')
const audioIndicator = document.getElementById('audioIndicator')
let peerConnection = null
let audioContext = null
let audioStream = null
// Load saved API key on page load
document.addEventListener('DOMContentLoaded', () => {
const savedToken = localStorage.getItem('openai_api_key')
if (savedToken) {
tokenInput.value = savedToken
}
})
// Audio visualization
function setupAudioVisualization(stream) {
audioContext = new AudioContext()
const source = audioContext.createMediaStreamSource(stream)
const analyzer = audioContext.createAnalyser()
analyzer.fftSize = 256
source.connect(analyzer)
const bufferLength = analyzer.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
function updateIndicator() {
if (!audioContext) return
analyzer.getByteFrequencyData(dataArray)
const average = dataArray.reduce((a, b) => a + b) / bufferLength
audioIndicator.classList.toggle('active', average > 30)
requestAnimationFrame(updateIndicator)
}
updateIndicator()
}
async function startSession() {
try {
// Save API key to localStorage
localStorage.setItem('openai_api_key', tokenInput.value)
status.className = 'status'
status.textContent = 'Requesting microphone access...'
audioStream = await navigator.mediaDevices.getUserMedia({
audio: true,
video: false
})
setupAudioVisualization(audioStream)
status.textContent = 'Establishing connection...'
peerConnection = await createRealtimeSession(
audioStream,
tokenInput.value
)
status.className = 'status success'
status.textContent = 'Session established successfully!'
startButton.textContent = 'Stop Session'
} catch (err) {
status.className = 'status error'
status.textContent = `Error: ${err.message}`
console.error('Session error:', err)
stopSession()
}
}
function stopSession() {
if (peerConnection) {
peerConnection.close()
peerConnection = null
}
if (audioContext) {
audioContext.close()
audioContext = null
}
if (audioStream) {
audioStream.getTracks().forEach(track => track.stop())
audioStream = null
}
audioIndicator.classList.remove('active')
startButton.textContent = 'Start Session'
}
startButton.addEventListener('click', () => {
if (peerConnection) {
stopSession()
} else {
if (!tokenInput.value) {
status.className = 'status error'
status.textContent = 'Please enter an API token'
return
}
startSession()
}
})
// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
</script>
</body>
</html>
paste-2.txt
Realtime API with WebRTC
Beta
================================
Use WebRTC to connect client-side applications to the Realtime API.
[WebRTC](https://webrtc.org/) is a powerful set of standard interfaces for building real-time applications. The OpenAI Realtime API supports connecting to realtime models through a WebRTC peer connection. Follow this guide to learn how to configure a WebRTC connection to the Realtime API.
Overview
--------
In scenarios where you would like to connect to a Realtime model from an insecure client over the network (like a web browser), we recommend using the WebRTC connection method. WebRTC is better equipped to handle variable connection states, and provides a number of convenient APIs for capturing user audio inputs and playing remote audio streams from the model.
Connecting to the Realtime API from the browser should be done with an ephemeral API key, [generated via the OpenAI REST API](/docs/api-reference/realtime-sessions). The process for initializing a WebRTC connection is as follows (assuming a web browser client):
1. A browser makes a request to a developer-controlled server to mint an ephemeral API key.
2. The developer's server uses a [standard API key](/settings/api-keys) to request an ephemeral key from the [OpenAI REST API](/docs/api-reference/realtime-sessions), and returns that new key to the browser. Note that ephemeral keys currently expire one minute after being issued.
3. The browser uses the ephemeral key to authenticate a session directly with the OpenAI Realtime API as a [WebRTC peer connection](https://developer.mozilla.org/en-US/docs/Web/API/RTCPeerConnection).

While it is technically possible to use a [standard API key](/settings/api-keys) to authenticate WebRTC sessions, **this is a dangerous and insecure practice**. Standard API keys grant access to your full OpenAI API account, and should only be used in secure server-side environments. You should use ephemeral keys in client-side applications whenever possible.
Connection details
------------------
Connecting via WebRTC requires the following connection information:
| Field | Value |
|---|---|
| URL | `https://api.openai.com/v1/realtime` |
| Query Parameters | `model`: Realtime model ID to connect to, like `gpt-4o-realtime-preview-2024-12-17` |
| Headers | `Authorization: Bearer EPHEMERAL_KEY` (substitute `EPHEMERAL_KEY` with an ephemeral API token; see below for details on how to generate one) |
The following example shows how to initialize a [WebRTC session](https://webrtc.org/getting-started/overview) (including the data channel to send and receive Realtime API events). It assumes you have already fetched an ephemeral API token (example server code for this can be found in the [next section](#creating-an-ephemeral-token)).
```javascript
async function init() {
// Get an ephemeral key from your server - see server code below
const tokenResponse = await fetch("/session");
const data = await tokenResponse.json();
const EPHEMERAL_KEY = data.client_secret.value;
// Create a peer connection
const pc = new RTCPeerConnection();
// Set up to play remote audio from the model
const audioEl = document.createElement("audio");
audioEl.autoplay = true;
pc.ontrack = e => audioEl.srcObject = e.streams[0];
// Add local audio track for microphone input in the browser
const ms = await navigator.mediaDevices.getUserMedia({
audio: true
});
pc.addTrack(ms.getTracks()[0]);
// Set up data channel for sending and receiving events
const dc = pc.createDataChannel("oai-events");
dc.addEventListener("message", (e) => {
// Realtime server events appear here!
console.log(e);
});
// Start the session using the Session Description Protocol (SDP)
const offer = await pc.createOffer();
await pc.setLocalDescription(offer);
const baseUrl = "https://api.openai.com/v1/realtime";
const model = "gpt-4o-realtime-preview-2024-12-17";
const sdpResponse = await fetch(`${baseUrl}?model=${model}`, {
method: "POST",
body: offer.sdp,
headers: {
Authorization: `Bearer ${EPHEMERAL_KEY}`,
"Content-Type": "application/sdp"
},
});
const answer = {
type: "answer",
sdp: await sdpResponse.text(),
};
await pc.setRemoteDescription(answer);
}
init();
```
The WebRTC APIs provide rich controls for handling media streams and input devices. For more guidance on building user interfaces on top of WebRTC, [refer to the docs on MDN](https://developer.mozilla.org/en-US/docs/Web/API/WebRTC_API).
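As one small illustration of those controls (not from the OpenAI docs), here is a sketch of listing the available microphone inputs with the standard MediaDevices API; something like this could back a device picker that feeds `getUserMedia`:
```javascript
// Sketch only: enumerate audio input devices so a UI can offer a microphone picker.
// Note: device labels are empty until the user has granted microphone permission.
async function listMicrophones() {
  const devices = await navigator.mediaDevices.enumerateDevices();
  return devices.filter((d) => d.kind === "audioinput");
}

// Example: request a specific microphone by its deviceId
// const stream = await navigator.mediaDevices.getUserMedia({
//   audio: { deviceId: { exact: chosenDevice.deviceId } }
// });
```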
Creating an ephemeral token
---------------------------
To create an ephemeral token to use on the client-side, you will need to build a small server-side application (or integrate with an existing one) to make an [OpenAI REST API](/docs/api-reference/realtime-sessions) request for an ephemeral key. You will use a [standard API key](/settings/api-keys) to authenticate this request on your backend server.
Below is an example of a simple Node.js [express](https://expressjs.com/) server which mints an ephemeral API key using the REST API:
```javascript
import express from "express";
const app = express();
// An endpoint which would work with the client code above - it returns
// the contents of a REST API request to this protected endpoint
app.get("/session", async (req, res) => {
const r = await fetch("https://api.openai.com/v1/realtime/sessions", {
method: "POST",
headers: {
"Authorization": `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2024-12-17",
voice: "verse",
}),
});
const data = await r.json();
// Send back the JSON we received from the OpenAI REST API
res.send(data);
});
app.listen(3000);
```
You can create a server endpoint like this one on any platform that can send and receive HTTP requests. Just ensure that **you only use standard OpenAI API keys on the server, not in the browser.**
Sending and receiving events
----------------------------
To interact with the Realtime models, you will send and receive messages over the WebRTC data channel, and send and receive audio over media streams with the Realtime API as a connected peer. The full list of messages that clients can send, and that will be sent from the server, are found in the [API reference](/docs/api-reference/realtime-client-events). Once connected, you'll send and receive events which represent text, audio, function calls, interruptions, configuration updates, and more.
Here is how you can send and receive events over the data channel:
```javascript
// Create a data channel from a peer connection
const dc = pc.createDataChannel("oai-events");
// Listen for server-sent events on the data channel - event data
// will need to be parsed from a JSON string
dc.addEventListener("message", (e) => {
const realtimeEvent = JSON.parse(e.data);
console.log(realtimeEvent);
});
// Send client events by serializing a valid client event to
// JSON, and sending it over the data channel
const responseCreate = {
type: "response.create",
response: {
modalities: ["text"],
instructions: "Write a haiku about code",
},
};
dc.send(JSON.stringify(responseCreate));
```
Next steps
----------
Now that you have a functioning WebRTC connection to the Realtime API, it's time to learn more about building applications with Realtime models.
[Realtime model capabilities](/docs/guides/realtime-model-capabilities): Learn about sessions with a Realtime model, where you can send and receive audio, manage conversations, make one-off requests to the model, and execute function calls.

[Event API reference](/docs/api-reference/realtime-client-events): A complete listing of client and server events in the Realtime API.
assistant (Dec 17, 2024, 02:21 PM)
Based on the documentation provided, here are several key improvements we could make to the code:
- **Model Specification**
  - The code currently doesn't specify which model to use in the API call
  - We should add the model parameter in the URL when making the request (`gpt-4o-realtime-preview-2024-12-17`)
  - We should also add voice configuration (e.g., "verse" as shown in the docs)
- **Data Channel Handling**
  - The current code only passively listens for the data channel via server events
  - We should add proper initialization of the data channel client-side rather than waiting for `ondatachannel`
  - We should add proper JSON parsing/stringifying of messages as shown in the docs
  - We should set up event sending capabilities to enable features like `response.create`
- **Error Handling and Session Management**
  - The code should handle the 1-minute expiration of ephemeral tokens
  - We might want to add reconnection logic when the token expires
  - Should add better error handling around peer connection state changes (see the sketch after this list)
- **Security Improvements**
  - The code currently accepts a direct API key input, which is not recommended
  - We should modify it to work with ephemeral tokens from a backend server
  - Should add a server endpoint integration example
- **Audio Handling**
  - Could improve the audio setup by using the documented best practices
  - Add proper cleanup of audio elements when the session ends
  - Add better handling of track events and stream management
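For the connection-state point, here is a minimal sketch of what that could look like. It is not wired into your page, and `restartSession` is a hypothetical callback you would supply:
```javascript
// Sketch only: watch the peer connection state and recover on failure.
// `restartSession` is a placeholder for whatever re-creates the session.
function monitorConnection(pc, restartSession) {
  pc.onconnectionstatechange = () => {
    console.log('Connection state:', pc.connectionState)
    if (pc.connectionState === 'failed' || pc.connectionState === 'disconnected') {
      // With ephemeral tokens (which expire about a minute after issue),
      // recovery generally means minting a fresh token and reconnecting.
      pc.close()
      restartSession()
    }
  }
}
```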
Would you like me to implement any of these changes in the code?
human (Dec 17, 2024, 02:22 PM)
I don't want to use ephemeral tokens at all - it turns out I can just use my regular openAI token instead, which I'm doing
Add a voice selection thing. The voices are ash, ballad, coral, sage, verse
Hook up that data channel and have anything received in that data channel display in an auto-scrolling area below the start button
assistant (Dec 17, 2024, 02:23 PM)
I'll help add those features. The voice selection and data channel display will be useful additions.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenAI WebRTC Audio Session</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
}
.audio-indicator {
display: inline-block;
width: 20px;
height: 20px;
border-radius: 50%;
background: #ccc;
margin-right: 10px;
vertical-align: middle;
}
.audio-indicator.active {
background: #4CAF50;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.5; }
100% { opacity: 1; }
}
.controls {
margin: 20px 0;
}
.form-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
font-weight: bold;
}
input, select {
width: 100%;
padding: 8px;
font-size: 16px;
border: 1px solid #ddd;
border-radius: 4px;
}
button {
background: #007bff;
color: white;
border: none;
padding: 10px 20px;
font-size: 16px;
border-radius: 4px;
cursor: pointer;
}
button:disabled {
background: #ccc;
cursor: not-allowed;
}
.status {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
}
.error {
background: #fee;
color: #c00;
}
.success {
background: #efe;
color: #0a0;
}
#messageLog {
margin-top: 20px;
padding: 15px;
background: #fff;
border: 1px solid #ddd;
border-radius: 4px;
height: 300px;
overflow-y: auto;
font-family: monospace;
}
.message {
margin: 5px 0;
padding: 5px;
border-bottom: 1px solid #eee;
}
</style>
</head>
<body>
<div class="container">
<h1>
<span id="audioIndicator" class="audio-indicator"></span>
OpenAI WebRTC Audio Session
</h1>
<div class="controls">
<div class="form-group">
<label for="tokenInput">OpenAI API Token</label>
<input type="password" id="tokenInput">
</div>
<div class="form-group">
<label for="voiceSelect">Voice</label>
<select id="voiceSelect">
<option value="ash">Ash</option>
<option value="ballad">Ballad</option>
<option value="coral">Coral</option>
<option value="sage">Sage</option>
<option value="verse">Verse</option>
</select>
</div>
<button id="startButton">Start Session</button>
</div>
<div id="status" class="status"></div>
<div id="messageLog"></div>
</div>
<script type="module">
// Text decoder for incoming messages
const decoder = new TextDecoder()
async function createRealtimeSession(inStream, token, voice) {
const pc = new RTCPeerConnection()
// Handle incoming audio
pc.ontrack = e => {
const audio = new Audio()
audio.srcObject = e.streams[0]
audio.play()
}
// Create data channel for messages
const channel = pc.createDataChannel('oai-events')
channel.onmessage = msg => {
const messageLog = document.getElementById('messageLog')
const messageDiv = document.createElement('div')
messageDiv.className = 'message'
// Data channel payloads are usually strings; decode only if binary
messageDiv.textContent = typeof msg.data === 'string' ? msg.data : decoder.decode(msg.data)
messageLog.appendChild(messageDiv)
messageLog.scrollTop = messageLog.scrollHeight
}
pc.addTrack(inStream.getTracks()[0])
const offer = await pc.createOffer()
await pc.setLocalDescription(offer)
const headers = {
Authorization: `Bearer ${token}`,
'Content-Type': 'application/sdp'
}
const opts = {
method: 'POST',
body: offer.sdp,
headers
}
const model = 'gpt-4o-realtime-preview-2024-12-17'
const resp = await fetch(`https://api.openai.com/v1/realtime?model=${model}&voice=${voice}`, opts)
await pc.setRemoteDescription({
type: 'answer',
sdp: await resp.text()
})
return pc
}
const startButton = document.getElementById('startButton')
const tokenInput = document.getElementById('tokenInput')
const voiceSelect = document.getElementById('voiceSelect')
const status = document.getElementById('status')
const audioIndicator = document.getElementById('audioIndicator')
const messageLog = document.getElementById('messageLog')
let peerConnection = null
let audioContext = null
let audioStream = null
// Load saved API key on page load
document.addEventListener('DOMContentLoaded', () => {
const savedToken = localStorage.getItem('openai_api_key')
if (savedToken) {
tokenInput.value = savedToken
}
})
// Audio visualization
function setupAudioVisualization(stream) {
audioContext = new AudioContext()
const source = audioContext.createMediaStreamSource(stream)
const analyzer = audioContext.createAnalyser()
analyzer.fftSize = 256
source.connect(analyzer)
const bufferLength = analyzer.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
function updateIndicator() {
if (!audioContext) return
analyzer.getByteFrequencyData(dataArray)
const average = dataArray.reduce((a, b) => a + b) / bufferLength
audioIndicator.classList.toggle('active', average > 30)
requestAnimationFrame(updateIndicator)
}
updateIndicator()
}
async function startSession() {
try {
// Save API key to localStorage
localStorage.setItem('openai_api_key', tokenInput.value)
status.className = 'status'
status.textContent = 'Requesting microphone access...'
audioStream = await navigator.mediaDevices.getUserMedia({
audio: true,
video: false
})
setupAudioVisualization(audioStream)
status.textContent = 'Establishing connection...'
peerConnection = await createRealtimeSession(
audioStream,
tokenInput.value,
voiceSelect.value
)
status.className = 'status success'
status.textContent = 'Session established successfully!'
startButton.textContent = 'Stop Session'
} catch (err) {
status.className = 'status error'
status.textContent = `Error: ${err.message}`
console.error('Session error:', err)
stopSession()
}
}
function stopSession() {
if (peerConnection) {
peerConnection.close()
peerConnection = null
}
if (audioContext) {
audioContext.close()
audioContext = null
}
if (audioStream) {
audioStream.getTracks().forEach(track => track.stop())
audioStream = null
}
audioIndicator.classList.remove('active')
startButton.textContent = 'Start Session'
messageLog.innerHTML = ''
}
startButton.addEventListener('click', () => {
if (peerConnection) {
stopSession()
} else {
if (!tokenInput.value) {
status.className = 'status error'
status.textContent = 'Please enter an API token'
return
}
startSession()
}
})
// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
</script>
</body>
</html>
I've made the following key changes:
- Added a voice selection dropdown with all the available voices
- Added a message log div that displays data channel messages
- Created a client-side data channel instead of waiting for ondatachannel
- Added proper message display with auto-scrolling
- Added the model parameter to the API URL
- Added proper cleanup of the message log when stopping the session
- Improved the styling to accommodate the new elements
The messages will now appear in a scrollable area below the controls, with each message in its own div. The voice selection is a dropdown menu that will be passed to the API when establishing the connection.
Would you like me to make any adjustments to these changes?
human (Dec 17, 2024, 02:24 PM)
I don't really understand the data channel bit, at what point does the model send me back text and what happens with that?
assistant (Dec 17, 2024, 02:25 PM)
Ah, let me explain the data channel and model interaction:
The documentation shows that the model will send various events through the data channel, including text responses. However, we need to actually trigger the model to respond first.
The current code just sets up the channel and listens passively. For the model to actually start generating responses, we need to send it a "response.create" event with instructions. Let me modify the code to include a simple text input that will let us send messages to the model and see its responses.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenAI WebRTC Audio Session</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
}
.audio-indicator {
display: inline-block;
width: 20px;
height: 20px;
border-radius: 50%;
background: #ccc;
margin-right: 10px;
vertical-align: middle;
}
.audio-indicator.active {
background: #4CAF50;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.5; }
100% { opacity: 1; }
}
.controls {
margin: 20px 0;
}
.form-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
font-weight: bold;
}
input, select {
width: 100%;
padding: 8px;
font-size: 16px;
border: 1px solid #ddd;
border-radius: 4px;
}
button {
background: #007bff;
color: white;
border: none;
padding: 10px 20px;
font-size: 16px;
border-radius: 4px;
cursor: pointer;
margin-right: 10px;
}
button:disabled {
background: #ccc;
cursor: not-allowed;
}
.status {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
}
.error {
background: #fee;
color: #c00;
}
.success {
background: #efe;
color: #0a0;
}
#messageLog {
margin-top: 20px;
padding: 15px;
background: #fff;
border: 1px solid #ddd;
border-radius: 4px;
height: 300px;
overflow-y: auto;
font-family: monospace;
}
.message {
margin: 5px 0;
padding: 5px;
border-bottom: 1px solid #eee;
white-space: pre-wrap;
}
.chat-input {
margin-top: 20px;
display: flex;
gap: 10px;
}
.chat-input input {
flex-grow: 1;
}
.chat-input button {
flex-shrink: 0;
}
</style>
</head>
<body>
<div class="container">
<h1>
<span id="audioIndicator" class="audio-indicator"></span>
OpenAI WebRTC Audio Session
</h1>
<div class="controls">
<div class="form-group">
<label for="tokenInput">OpenAI API Token</label>
<input type="password" id="tokenInput">
</div>
<div class="form-group">
<label for="voiceSelect">Voice</label>
<select id="voiceSelect">
<option value="ash">Ash</option>
<option value="ballad">Ballad</option>
<option value="coral">Coral</option>
<option value="sage">Sage</option>
<option value="verse">Verse</option>
</select>
</div>
<button id="startButton">Start Session</button>
</div>
<div id="status" class="status"></div>
<div id="messageLog"></div>
<div class="chat-input">
<input type="text" id="messageInput" placeholder="Type a message..." disabled>
<button id="sendButton" disabled>Send</button>
</div>
</div>
<script type="module">
// Text decoder for incoming messages
const decoder = new TextDecoder()
let activeDataChannel = null;
async function createRealtimeSession(inStream, token, voice) {
const pc = new RTCPeerConnection()
// Handle incoming audio
pc.ontrack = e => {
const audio = new Audio()
audio.srcObject = e.streams[0]
audio.play()
}
// Create data channel for messages
const channel = pc.createDataChannel('oai-events')
channel.onopen = () => {
console.log('Data channel opened')
activeDataChannel = channel
document.getElementById('messageInput').disabled = false
document.getElementById('sendButton').disabled = false
}
channel.onmessage = msg => {
const messageLog = document.getElementById('messageLog')
const messageDiv = document.createElement('div')
messageDiv.className = 'message'
// Data channel payloads are usually strings; decode only if binary
const raw = typeof msg.data === 'string' ? msg.data : decoder.decode(msg.data)
try {
// Try to parse and pretty print JSON messages
messageDiv.textContent = JSON.stringify(JSON.parse(raw), null, 2)
} catch {
// If not JSON, just show the raw message
messageDiv.textContent = raw
}
messageLog.appendChild(messageDiv)
messageLog.scrollTop = messageLog.scrollHeight
}
pc.addTrack(inStream.getTracks()[0])
const offer = await pc.createOffer()
await pc.setLocalDescription(offer)
const headers = {
Authorization: `Bearer ${token}`,
'Content-Type': 'application/sdp'
}
const opts = {
method: 'POST',
body: offer.sdp,
headers
}
const model = 'gpt-4o-realtime-preview-2024-12-17'
const resp = await fetch(`https://api.openai.com/v1/realtime?model=${model}&voice=${voice}`, opts)
await pc.setRemoteDescription({
type: 'answer',
sdp: await resp.text()
})
return pc
}
const startButton = document.getElementById('startButton')
const sendButton = document.getElementById('sendButton')
const messageInput = document.getElementById('messageInput')
const tokenInput = document.getElementById('tokenInput')
const voiceSelect = document.getElementById('voiceSelect')
const status = document.getElementById('status')
const audioIndicator = document.getElementById('audioIndicator')
const messageLog = document.getElementById('messageLog')
let peerConnection = null
let audioContext = null
let audioStream = null
// Load saved API key on page load
document.addEventListener('DOMContentLoaded', () => {
const savedToken = localStorage.getItem('openai_api_key')
if (savedToken) {
tokenInput.value = savedToken
}
})
// Audio visualization
function setupAudioVisualization(stream) {
audioContext = new AudioContext()
const source = audioContext.createMediaStreamSource(stream)
const analyzer = audioContext.createAnalyser()
analyzer.fftSize = 256
source.connect(analyzer)
const bufferLength = analyzer.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
function updateIndicator() {
if (!audioContext) return
analyzer.getByteFrequencyData(dataArray)
const average = dataArray.reduce((a, b) => a + b) / bufferLength
audioIndicator.classList.toggle('active', average > 30)
requestAnimationFrame(updateIndicator)
}
updateIndicator()
}
async function startSession() {
try {
// Save API key to localStorage
localStorage.setItem('openai_api_key', tokenInput.value)
status.className = 'status'
status.textContent = 'Requesting microphone access...'
audioStream = await navigator.mediaDevices.getUserMedia({
audio: true,
video: false
})
setupAudioVisualization(audioStream)
status.textContent = 'Establishing connection...'
peerConnection = await createRealtimeSession(
audioStream,
tokenInput.value,
voiceSelect.value
)
status.className = 'status success'
status.textContent = 'Session established successfully!'
startButton.textContent = 'Stop Session'
} catch (err) {
status.className = 'status error'
status.textContent = `Error: ${err.message}`
console.error('Session error:', err)
stopSession()
}
}
function stopSession() {
if (peerConnection) {
peerConnection.close()
peerConnection = null
}
if (audioContext) {
audioContext.close()
audioContext = null
}
if (audioStream) {
audioStream.getTracks().forEach(track => track.stop())
audioStream = null
}
activeDataChannel = null
messageInput.disabled = true
sendButton.disabled = true
audioIndicator.classList.remove('active')
startButton.textContent = 'Start Session'
messageLog.innerHTML = ''
}
function sendMessage() {
if (!activeDataChannel) return;
const message = messageInput.value.trim()
if (!message) return;
// Create a response.create event as shown in the docs
const event = {
type: "response.create",
response: {
modalities: ["text", "audio"],
instructions: message
}
}
// Send the event through the data channel
activeDataChannel.send(JSON.stringify(event))
// Log our sent message
const messageDiv = document.createElement('div')
messageDiv.className = 'message'
messageDiv.textContent = `Sent: ${message}`
messageLog.appendChild(messageDiv)
messageLog.scrollTop = messageLog.scrollHeight
// Clear input
messageInput.value = ''
}
startButton.addEventListener('click', () => {
if (peerConnection) {
stopSession()
} else {
if (!tokenInput.value) {
status.className = 'status error'
status.textContent = 'Please enter an API token'
return
}
startSession()
}
})
sendButton.addEventListener('click', sendMessage)
messageInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') sendMessage()
})
// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
</script>
</body>
</html>
I've added a proper chat interface and the necessary code to interact with the model. Here's how it works:
- When you start a session, a data channel is created between your browser and the OpenAI server
- Once connected, the text input and send button become enabled
- When you send a message, it creates a "response.create" event that tells the model:
  - To respond with both text and audio (`modalities: ["text", "audio"]`)
  - What your instructions/prompt is
- The model will then:
  - Send text responses through the data channel (which will appear in the message log)
  - Send audio through the WebRTC audio stream (which will play through your speakers)
The message log now shows both:
- Messages you send (prefixed with "Sent: ")
- All responses from the model (which come as JSON events, now properly formatted)
You can now have an interactive conversation with the model - type something in the input box and press Enter or click Send. The model will respond with both text (in the log) and speech (through your speakers).
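If you later want to pull just the model's text out of those JSON events instead of logging them raw, a rough sketch would look like this. The event type names reflect my understanding of the Realtime API event reference and should be checked against it, and `appendToLog` is a hypothetical helper standing in for the message-log code above:
```javascript
// Sketch: route server events by type instead of dumping raw JSON.
channel.onmessage = msg => {
  const raw = typeof msg.data === 'string' ? msg.data : decoder.decode(msg.data)
  const event = JSON.parse(raw)
  switch (event.type) {
    case 'response.text.delta':
      // Incremental text chunks as the model writes its reply
      appendToLog(event.delta)
      break
    case 'response.done':
      // Fired once the full response has finished
      console.log('Response complete', event.response)
      break
    default:
      // Many other events arrive too (session updates, audio deltas, ...)
      console.log(event.type)
  }
}
```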
Is this clearer? Would you like me to explain any part in more detail?
human (Dec 17, 2024, 02:26 PM)
Drop all of the text stuff and the debug panel, just go back to the original program but add the voice selection
assistant (Dec 17, 2024, 02:26 PM)
Sure, I'll simplify it back to just the audio interface with voice selection added.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenAI WebRTC Audio Session</title>
<style>
* {
box-sizing: border-box;
}
body {
font-family: Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background: #f5f5f5;
}
.container {
max-width: 800px;
margin: 0 auto;
}
.audio-indicator {
display: inline-block;
width: 20px;
height: 20px;
border-radius: 50%;
background: #ccc;
margin-right: 10px;
vertical-align: middle;
}
.audio-indicator.active {
background: #4CAF50;
animation: pulse 1s infinite;
}
@keyframes pulse {
0% { opacity: 1; }
50% { opacity: 0.5; }
100% { opacity: 1; }
}
.controls {
margin: 20px 0;
}
.form-group {
margin-bottom: 15px;
}
label {
display: block;
margin-bottom: 5px;
font-weight: bold;
}
input, select {
width: 100%;
padding: 8px;
font-size: 16px;
border: 1px solid #ddd;
border-radius: 4px;
}
button {
background: #007bff;
color: white;
border: none;
padding: 10px 20px;
font-size: 16px;
border-radius: 4px;
cursor: pointer;
}
button:disabled {
background: #ccc;
cursor: not-allowed;
}
.status {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
}
.error {
background: #fee;
color: #c00;
}
.success {
background: #efe;
color: #0a0;
}
</style>
</head>
<body>
<div class="container">
<h1>
<span id="audioIndicator" class="audio-indicator"></span>
OpenAI WebRTC Audio Session
</h1>
<div class="controls">
<div class="form-group">
<label for="tokenInput">OpenAI API Token</label>
<input type="password" id="tokenInput">
</div>
<div class="form-group">
<label for="voiceSelect">Voice</label>
<select id="voiceSelect">
<option value="ash">Ash</option>
<option value="ballad">Ballad</option>
<option value="coral">Coral</option>
<option value="sage">Sage</option>
<option value="verse">Verse</option>
</select>
</div>
<button id="startButton">Start Session</button>
</div>
<div id="status" class="status"></div>
</div>
<script type="module">
async function createRealtimeSession(inStream, token, voice) {
const pc = new RTCPeerConnection()
// Handle incoming audio
pc.ontrack = e => {
const audio = new Audio()
audio.srcObject = e.streams[0]
audio.play()
}
pc.addTrack(inStream.getTracks()[0])
const offer = await pc.createOffer()
await pc.setLocalDescription(offer)
const headers = {
Authorization: `Bearer ${token}`,
'Content-Type': 'application/sdp'
}
const opts = {
method: 'POST',
body: offer.sdp,
headers
}
const model = 'gpt-4o-realtime-preview-2024-12-17'
const resp = await fetch(`https://api.openai.com/v1/realtime?model=${model}&voice=${voice}`, opts)
await pc.setRemoteDescription({
type: 'answer',
sdp: await resp.text()
})
return pc
}
const startButton = document.getElementById('startButton')
const tokenInput = document.getElementById('tokenInput')
const voiceSelect = document.getElementById('voiceSelect')
const status = document.getElementById('status')
const audioIndicator = document.getElementById('audioIndicator')
let peerConnection = null
let audioContext = null
let audioStream = null
// Load saved API key on page load
document.addEventListener('DOMContentLoaded', () => {
const savedToken = localStorage.getItem('openai_api_key')
if (savedToken) {
tokenInput.value = savedToken
}
})
// Audio visualization
function setupAudioVisualization(stream) {
audioContext = new AudioContext()
const source = audioContext.createMediaStreamSource(stream)
const analyzer = audioContext.createAnalyser()
analyzer.fftSize = 256
source.connect(analyzer)
const bufferLength = analyzer.frequencyBinCount
const dataArray = new Uint8Array(bufferLength)
function updateIndicator() {
if (!audioContext) return
analyzer.getByteFrequencyData(dataArray)
const average = dataArray.reduce((a, b) => a + b) / bufferLength
audioIndicator.classList.toggle('active', average > 30)
requestAnimationFrame(updateIndicator)
}
updateIndicator()
}
async function startSession() {
try {
// Save API key to localStorage
localStorage.setItem('openai_api_key', tokenInput.value)
status.className = 'status'
status.textContent = 'Requesting microphone access...'
audioStream = await navigator.mediaDevices.getUserMedia({
audio: true,
video: false
})
setupAudioVisualization(audioStream)
status.textContent = 'Establishing connection...'
peerConnection = await createRealtimeSession(
audioStream,
tokenInput.value,
voiceSelect.value
)
status.className = 'status success'
status.textContent = 'Session established successfully!'
startButton.textContent = 'Stop Session'
} catch (err) {
status.className = 'status error'
status.textContent = `Error: ${err.message}`
console.error('Session error:', err)
stopSession()
}
}
function stopSession() {
if (peerConnection) {
peerConnection.close()
peerConnection = null
}
if (audioContext) {
audioContext.close()
audioContext = null
}
if (audioStream) {
audioStream.getTracks().forEach(track => track.stop())
audioStream = null
}
audioIndicator.classList.remove('active')
startButton.textContent = 'Start Session'
}
startButton.addEventListener('click', () => {
if (peerConnection) {
stopSession()
} else {
if (!tokenInput.value) {
status.className = 'status error'
status.textContent = 'Please enter an API token'
return
}
startSession()
}
})
// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
</script>
</body>
</html>
I've simplified it back to just the audio interface, keeping only:
- The API token input
- The voice selection dropdown
- The start/stop button
- The audio indicator
- The status messages
The program now just handles the audio connection with your selected voice, without any text display or debugging panel. Is this more what you were looking for?