Skip to content

Instantly share code, notes, and snippets.

@simonw
Created December 17, 2024 22:06
Show Gist options
  • Select an option

  • Save simonw/69151091f7672adb9b42f5b17bd45d44 to your computer and use it in GitHub Desktop.

Select an option

Save simonw/69151091f7672adb9b42f5b17bd45d44 to your computer and use it in GitHub Desktop.

WebRTC Realtime Session Creator

human (Dec 17, 2024, 02:02 PM)

Turn this example code into a full artifact

async function createRealtimeSession(inStream, outEl, token) { const pc = new RTCPeerConnection(); pc.ontrack = e => outEl.srcObject = e.streams[0]; pc.addTrack(inStream.getTracks()[0]); const offer = await pc.createOffer(); await pc.setLocalDescription(offer); const headers = { Authorization: `Bearer ${token}`, 'Content-Type': 'application/sdp' }; const opts = { method: 'POST', body: offer.sdp, headers }; const resp = await fetch('https://api.openai.com/v1/realtime', opts); await pc.setRemoteDescription({ type: 'answer', sdp: await resp.text() }); return pc; }

assistant (Dec 17, 2024, 02:02 PM)

I'll help create a complete artifact with this WebRTC code using vanilla HTML, JavaScript, and CSS.

create Real-time WebRTC Session

<!-- Single-file demo page: a local-preview <video>, an API-token input,
     a start/stop button, and a status line. All logic lives in the
     module <script> that follows this markup. -->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Real-time Session</title>
  <style>
  /* Border-box sizing so padding/borders never overflow declared widths. */
  * {
    box-sizing: border-box;
  }

  body {
    font-family: Helvetica, Arial, sans-serif;
    margin: 0;
    padding: 20px;
    background: #f5f5f5;
  }

  /* Center the page content and cap its width. */
  .container {
    max-width: 800px;
    margin: 0 auto;
  }

  /* Black letterboxed frame around the video element. */
  .video-container {
    position: relative;
    width: 100%;
    background: #000;
    border-radius: 8px;
    overflow: hidden;
    margin: 20px 0;
  }

  video {
    width: 100%;
    display: block;
  }

  .controls {
    margin: 20px 0;
  }

  /* 16px font size prevents mobile-Safari zoom-on-focus. */
  input {
    width: 100%;
    padding: 8px;
    font-size: 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
    margin-bottom: 10px;
  }

  button {
    background: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    font-size: 16px;
    border-radius: 4px;
    cursor: pointer;
  }

  button:disabled {
    background: #ccc;
    cursor: not-allowed;
  }

  /* Base status box; .error / .success variants recolor it. */
  .status {
    margin-top: 10px;
    padding: 10px;
    border-radius: 4px;
  }

  .error {
    background: #fee;
    color: #c00;
  }

  .success {
    background: #efe;
    color: #0a0;
  }
  </style>
</head>
<body>
  <div class="container">
    <h1>Real-time Session</h1>

    <!-- muted avoids local-audio feedback; playsinline keeps iOS from
         forcing fullscreen playback. -->
    <div class="video-container">
      <video id="localVideo" autoplay playsinline muted></video>
    </div>

    <div class="controls">
      <input type="text" id="tokenInput" placeholder="Enter your API token">
      <button id="startButton">Start Session</button>
    </div>

    <!-- Populated by the script with connection progress / errors. -->
    <div id="status" class="status"></div>
  </div>

  <script type="module">
// Establish a WebRTC session with the OpenAI Realtime API.
//
// Sends the local stream's tracks to the server, routes any remote
// stream into outEl, and performs the SDP offer/answer exchange over
// HTTPS.
//
// inStream - MediaStream obtained from getUserMedia
// outEl    - media element (<video>/<audio>) that plays the remote stream
// token    - OpenAI API key, sent as a Bearer token
//
// Returns the RTCPeerConnection; throws an Error when the HTTP
// negotiation fails.
async function createRealtimeSession(inStream, outEl, token) {
  const pc = new RTCPeerConnection()

  // Play whatever the remote side sends back.
  pc.ontrack = e => outEl.srcObject = e.streams[0]

  // Send every captured track — the page requests both audio and video,
  // so adding only getTracks()[0] would silently drop one of them.
  for (const track of inStream.getTracks()) {
    pc.addTrack(track, inStream)
  }

  const offer = await pc.createOffer()
  await pc.setLocalDescription(offer)

  const resp = await fetch('https://api.openai.com/v1/realtime', {
    method: 'POST',
    body: offer.sdp,
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/sdp'
    }
  })

  // Surface HTTP failures (e.g. a bad token) directly instead of
  // handing an error page to setRemoteDescription, which would fail
  // with a cryptic SDP parse error.
  if (!resp.ok) {
    throw new Error(`Realtime API error ${resp.status}: ${await resp.text()}`)
  }

  await pc.setRemoteDescription({
    type: 'answer',
    sdp: await resp.text()
  })

  return pc
}

// Cached element handles used throughout the script.
const startButton = document.getElementById('startButton')
const tokenInput = document.getElementById('tokenInput')
const localVideo = document.getElementById('localVideo')
const status = document.getElementById('status')

// Active RTCPeerConnection, or null when no session is running; also
// doubles as the "is a session active?" flag for the start/stop button.
let peerConnection = null

// Acquire the camera/microphone, open the realtime session, and report
// progress through the status element. Any failure is displayed and all
// partially-acquired resources are released via stopSession().
async function startSession() {
  try {
    status.className = 'status'
    status.textContent = 'Requesting camera access...'

    const constraints = { video: true, audio: true }
    const stream = await navigator.mediaDevices.getUserMedia(constraints)
    localVideo.srcObject = stream

    status.textContent = 'Establishing connection...'
    peerConnection = await createRealtimeSession(stream, localVideo, tokenInput.value)

    status.className = 'status success'
    status.textContent = 'Session established successfully!'
    startButton.textContent = 'Stop Session'
  } catch (err) {
    status.className = 'status error'
    status.textContent = `Error: ${err.message}`
    console.error('Session error:', err)
    stopSession()
  }
}

// Tear down the peer connection and stop every captured media track,
// then restore the button label. Safe to call when nothing is running.
function stopSession() {
  peerConnection?.close()
  peerConnection = null

  const stream = localVideo.srcObject
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop()
    }
    localVideo.srcObject = null
  }

  startButton.textContent = 'Start Session'
}

// One button toggles the session: stop when one is active, otherwise
// validate the token field and start.
startButton.addEventListener('click', () => {
  if (peerConnection) {
    stopSession()
  } else {
    if (!tokenInput.value) {
      status.className = 'status error'
      status.textContent = 'Please enter an API token'
      return
    }
    startSession()
  }
})

// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
  </script>
</body>
</html>

I've created a complete HTML artifact that:

  1. Includes a clean, responsive layout with proper styling
  2. Has error handling and status messages
  3. Manages the WebRTC lifecycle properly
  4. Includes cleanup on page unload
  5. Has proper camera/mic permission handling
  6. Provides visual feedback about the connection state
  7. Allows stopping/starting the session

The code follows the requested formatting:

  • CSS uses 2-space indentation and starts with the box-sizing reset
  • JavaScript uses 2-space indentation and is in a module script tag
  • Font sizes for inputs are 16px
  • Helvetica is the preferred font
  • Uses vanilla HTML/JS/CSS with no external dependencies

You can test it by:

  1. Entering an API token
  2. Clicking "Start Session" to begin
  3. The video feed will show locally
  4. Connection status is displayed below
  5. Click "Stop Session" to end the connection and cleanup resources

Would you like me to make any adjustments to the implementation?

human (Dec 17, 2024, 02:04 PM)

This is for audio only. Add a thing where any text messages coming in on the stream are shown in an auto-scrolling textarea. Audio returned by that stream should play too

assistant (Dec 17, 2024, 02:04 PM)

I'll modify the artifact to handle audio-only streaming and add a text message display area.

rewrite Real-time Audio Session with Messages

<!-- Audio-only variant of the demo page: a pulsing mic-activity dot in
     the header, a scrolling message log, token input, start/stop button,
     and a status line. All logic lives in the module <script> below. -->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Real-time Audio Session</title>
  <style>
  /* Border-box sizing so padding/borders never overflow declared widths. */
  * {
    box-sizing: border-box;
  }

  body {
    font-family: Helvetica, Arial, sans-serif;
    margin: 0;
    padding: 20px;
    background: #f5f5f5;
  }

  .container {
    max-width: 800px;
    margin: 0 auto;
  }

  /* Scrollable log for incoming text messages; monospace for readability. */
  .messages {
    width: 100%;
    height: 300px;
    background: white;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 10px;
    margin: 20px 0;
    font-size: 16px;
    font-family: monospace;
    resize: vertical;
    overflow-y: scroll;
  }

  /* Round dot next to the heading; grey when idle. */
  .audio-indicator {
    display: inline-block;
    width: 20px;
    height: 20px;
    border-radius: 50%;
    background: #ccc;
    margin-right: 10px;
    vertical-align: middle;
  }

  /* Green + pulsing while the analyser detects sound. */
  .audio-indicator.active {
    background: #4CAF50;
    animation: pulse 1s infinite;
  }

  @keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
  }

  .controls {
    margin: 20px 0;
  }

  /* 16px font size prevents mobile-Safari zoom-on-focus. */
  input {
    width: 100%;
    padding: 8px;
    font-size: 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
    margin-bottom: 10px;
  }

  button {
    background: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    font-size: 16px;
    border-radius: 4px;
    cursor: pointer;
  }

  button:disabled {
    background: #ccc;
    cursor: not-allowed;
  }

  /* Base status box; .error / .success variants recolor it. */
  .status {
    margin-top: 10px;
    padding: 10px;
    border-radius: 4px;
  }

  .error {
    background: #fee;
    color: #c00;
  }

  .success {
    background: #efe;
    color: #0a0;
  }
  </style>
</head>
<body>
  <div class="container">
    <h1>
      <span id="audioIndicator" class="audio-indicator"></span>
      Real-time Audio Session
    </h1>

    <!-- Filled by appendMessage() with one <div> per incoming message. -->
    <div class="messages" id="messages"></div>

    <div class="controls">
      <input type="text" id="tokenInput" placeholder="Enter your API token">
      <button id="startButton">Start Session</button>
    </div>

    <!-- Populated by the script with connection progress / errors. -->
    <div id="status" class="status"></div>
  </div>

  <script type="module">
// Text decoder for incoming *binary* data-channel payloads.
const decoder = new TextDecoder()

// Establish an audio-only WebRTC session with the OpenAI Realtime API.
//
// Remote audio plays through a detached <audio> element; text arriving
// on a server-created data channel is appended to the on-page log.
//
// inStream - microphone MediaStream from getUserMedia
// token    - OpenAI API key, sent as a Bearer token
//
// Returns the RTCPeerConnection; throws an Error when the HTTP
// negotiation fails.
async function createRealtimeSession(inStream, token) {
  const pc = new RTCPeerConnection()

  // Play whatever audio the server sends back.
  pc.ontrack = e => {
    const audio = new Audio()
    audio.srcObject = e.streams[0]
    audio.play()
  }

  // Handle incoming text messages via data channel. Payloads can arrive
  // as strings or as binary buffers; TextDecoder.decode() accepts only
  // BufferSource and throws a TypeError on strings, so branch on type.
  pc.ondatachannel = event => {
    event.channel.onmessage = msg => {
      const text = typeof msg.data === 'string'
        ? msg.data
        : decoder.decode(msg.data)
      appendMessage(text)
    }
  }

  // Audio-only capture yields a single track.
  pc.addTrack(inStream.getTracks()[0])

  const offer = await pc.createOffer()
  await pc.setLocalDescription(offer)

  const resp = await fetch('https://api.openai.com/v1/realtime', {
    method: 'POST',
    body: offer.sdp,
    headers: {
      Authorization: `Bearer ${token}`,
      'Content-Type': 'application/sdp'
    }
  })

  // Surface HTTP failures (e.g. a bad token) directly instead of
  // handing an error page to setRemoteDescription.
  if (!resp.ok) {
    throw new Error(`Realtime API error ${resp.status}: ${await resp.text()}`)
  }

  await pc.setRemoteDescription({
    type: 'answer',
    sdp: await resp.text()
  })

  return pc
}

// Cached element handles used throughout the script.
const startButton = document.getElementById('startButton')
const tokenInput = document.getElementById('tokenInput')
const messages = document.getElementById('messages')
const status = document.getElementById('status')
const audioIndicator = document.getElementById('audioIndicator')

// Session state: peerConnection doubles as the "is a session active?"
// flag; audioContext drives the activity indicator; audioStream holds
// the captured microphone tracks so they can be stopped on teardown.
let peerConnection = null
let audioContext = null
let audioStream = null

// Add one line of incoming text to the message log and keep the log
// scrolled to the newest entry. textContent is used so message bodies
// are never interpreted as HTML.
function appendMessage(text) {
  const line = document.createElement('div')
  line.textContent = text
  messages.append(line)
  messages.scrollTop = messages.scrollHeight
}

// Audio visualization
function setupAudioVisualization(stream) {
  audioContext = new AudioContext()
  const source = audioContext.createMediaStreamSource(stream)
  const analyzer = audioContext.createAnalyser()
  analyzer.fftSize = 256
  
  source.connect(analyzer)
  
  const bufferLength = analyzer.frequencyBinCount
  const dataArray = new Uint8Array(bufferLength)
  
  function updateIndicator() {
    if (!audioContext) return
    
    analyzer.getByteFrequencyData(dataArray)
    const average = dataArray.reduce((a, b) => a + b) / bufferLength
    
    audioIndicator.classList.toggle('active', average > 30)
    requestAnimationFrame(updateIndicator)
  }
  
  updateIndicator()
}

// Acquire the microphone, wire up the activity indicator, open the
// realtime session, and report progress through the status element.
// Any failure is displayed and all partially-acquired resources are
// released via stopSession().
async function startSession() {
  try {
    status.className = 'status'
    status.textContent = 'Requesting microphone access...'

    const constraints = { audio: true, video: false }
    audioStream = await navigator.mediaDevices.getUserMedia(constraints)
    setupAudioVisualization(audioStream)

    status.textContent = 'Establishing connection...'
    peerConnection = await createRealtimeSession(audioStream, tokenInput.value)

    status.className = 'status success'
    status.textContent = 'Session established successfully!'
    startButton.textContent = 'Stop Session'
  } catch (err) {
    status.className = 'status error'
    status.textContent = `Error: ${err.message}`
    console.error('Session error:', err)
    stopSession()
  }
}

// Release everything startSession() acquired — the peer connection, the
// Web Audio context (which also halts the indicator loop), and the
// microphone tracks — then reset the UI. Safe to call repeatedly.
function stopSession() {
  peerConnection?.close()
  peerConnection = null

  audioContext?.close()
  audioContext = null

  if (audioStream) {
    for (const track of audioStream.getTracks()) {
      track.stop()
    }
    audioStream = null
  }

  audioIndicator.classList.remove('active')
  startButton.textContent = 'Start Session'
}

// One button toggles the session: stop when one is active, otherwise
// validate the token field and start.
startButton.addEventListener('click', () => {
  if (peerConnection) {
    stopSession()
  } else {
    if (!tokenInput.value) {
      status.className = 'status error'
      status.textContent = 'Please enter an API token'
      return
    }
    startSession()
  }
})

// Cleanup on page unload
window.addEventListener('beforeunload', stopSession)
  </script>
</body>
</html>

I've updated the artifact with these key changes:

  1. Removed video elements and made it audio-only
  2. Added a scrolling message display area that automatically scrolls to new messages
  3. Added an audio visualization indicator that pulses when sound is detected
  4. Added handling for incoming text messages via WebRTC data channel
  5. Set up proper audio output routing for the remote audio stream
  6. Added proper cleanup for audio context and streams

Key features:

  • Messages appear in a scrolling div with monospace font for better readability
  • Audio activity is shown with a pulsing indicator dot
  • Incoming audio plays automatically
  • All audio resources are properly cleaned up on stop/unload
  • Messages persist until the session is stopped

The interface includes:

  1. A pulsing indicator to show audio activity
  2. A scrollable message area showing incoming text
  3. Token input and start/stop controls
  4. Status messages for connection state

Would you like me to modify anything about how the messages or audio are handled?

@simonw
Copy link
Author

simonw commented Dec 17, 2024

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment