Skip to content

Instantly share code, notes, and snippets.

@simonw
Created December 18, 2024 15:35
Show Gist options
  • Save simonw/d7dcea0263fc9bee86d6bd738cae1d9d to your computer and use it in GitHub Desktop.
Save simonw/d7dcea0263fc9bee86d6bd738cae1d9d to your computer and use it in GitHub Desktop.

Displaying Decoded JSON Events from WebRTC Data Channel

human (Dec 17, 2024, 09:01 PM)

After this line: pc.addTrack(inStream.getTracks()[0])

Add this: const dc = pc.createDataChannel("oai-events"); dc.addEventListener("message", (e) => { console.log(e); });

But do better than that. e.data will always be JSON - JSON decode it and then log it in an area at the bottom of the page, always putting the latest event pretty-printed at the top of that area

paste.txt

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>OpenAI WebRTC Audio Session</title>
  <style>
  * {
    box-sizing: border-box;
  }

  body {
    font-family: Helvetica, Arial, sans-serif;
    margin: 0;
    padding: 20px;
    background: #f5f5f5;
  }

  .container {
    max-width: 800px;
    margin: 0 auto;
  }

  .audio-indicator {
    display: inline-block;
    width: 20px;
    height: 20px;
    border-radius: 50%;
    background: #ccc;
    margin-right: 10px;
    vertical-align: middle;
  }

  .audio-indicator.active {
    background: #4CAF50;
    animation: pulse 1s infinite;
  }

  @keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
  }

  .controls {
    margin: 20px 0;
  }

  .form-group {
    margin-bottom: 15px;
  }

  label {
    display: block;
    margin-bottom: 5px;
    font-weight: bold;
  }

  input, select {
    width: 100%;
    padding: 8px;
    font-size: 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
  }

  button {
    background: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    font-size: 16px;
    border-radius: 4px;
    cursor: pointer;
  }

  button:disabled {
    background: #ccc;
    cursor: not-allowed;
  }

  .status {
    margin-top: 10px;
    padding: 10px;
    border-radius: 4px;
  }

  .error {
    background: #fee;
    color: #c00;
  }

  .success {
    background: #efe;
    color: #0a0;
  }
  </style>
</head>
<body>
  <div class="container">
    <h1>
      <span id="audioIndicator" class="audio-indicator"></span>
      OpenAI WebRTC Audio Session
    </h1>

    <div class="controls">
      <div class="form-group">
        <label for="tokenInput">OpenAI API Token</label>
        <input type="password" id="tokenInput">
      </div>
      <div class="form-group">
        <label for="voiceSelect">Voice</label>
        <select id="voiceSelect">
          <option value="ash">Ash</option>
          <option value="ballad">Ballad</option>
          <option value="coral">Coral</option>
          <option value="sage">Sage</option>
          <option value="verse">Verse</option>
        </select>
      </div>
      <button id="startButton">Start Session</button>
    </div>

    <div id="status" class="status"></div>
  </div>

  <script type="module">
    /**
     * Negotiate a WebRTC session with the OpenAI Realtime endpoint.
     *
     * Creates a peer connection, sends the first track of `inStream`, POSTs the
     * local SDP offer to the API and applies the returned SDP answer.
     *
     * @param {MediaStream} inStream - Stream whose first track is added to the connection.
     * @param {string} token - API token, sent as a Bearer Authorization header.
     * @param {string} voice - Value of the `voice` query parameter.
     * @returns {Promise<RTCPeerConnection>} The connection once the answer has been applied.
     * @throws {Error} If negotiation fails or the API answers with a non-2xx status;
     *   the peer connection is closed before the error propagates.
     */
    async function createRealtimeSession(inStream, token, voice) {
      const pc = new RTCPeerConnection()

      try {
        // Handle incoming audio
        pc.ontrack = e => {
          const audio = new Audio()
          audio.srcObject = e.streams[0]
          // play() may reject (e.g. autoplay blocked); report it instead of
          // leaving an unhandled rejection
          Promise.resolve(audio.play()).catch(err => {
            console.error('Audio playback failed:', err)
          })
        }

        pc.addTrack(inStream.getTracks()[0])

        const offer = await pc.createOffer()
        await pc.setLocalDescription(offer)

        const headers = {
          Authorization: `Bearer ${token}`,
          'Content-Type': 'application/sdp'
        }

        const opts = {
          method: 'POST',
          body: offer.sdp,
          headers
        }

        const model = 'gpt-4o-realtime-preview-2024-12-17'
        const resp = await fetch(`https://api.openai.com/v1/realtime?model=${model}&voice=${voice}`, opts)
        const answerSdp = await resp.text()

        // An error response carries an error body, not SDP: surface it directly
        // rather than letting setRemoteDescription fail with a parse error
        if (!resp.ok) {
          throw new Error(`Realtime API request failed (${resp.status}): ${answerSdp}`)
        }

        await pc.setRemoteDescription({
          type: 'answer',
          sdp: answerSdp
        })

        return pc
      } catch (err) {
        // The caller never receives pc on failure, so release it here
        pc.close()
        throw err
      }
    }

    // Page elements the script reads from and writes to
    const [startButton, tokenInput, voiceSelect, status, audioIndicator] = [
      'startButton',
      'tokenInput',
      'voiceSelect',
      'status',
      'audioIndicator'
    ].map(id => document.getElementById(id))

    // Live session resources; all null while no session is running
    let peerConnection = null
    let audioContext = null
    let audioStream = null

    // Pre-fill the token field with the API key stored by a previous session
    document.addEventListener('DOMContentLoaded', () => {
      const savedToken = localStorage.getItem('openai_api_key')
      if (!savedToken) return
      tokenInput.value = savedToken
    })

    /**
     * Audio visualization: toggle the `active` class on the audio indicator
     * according to the average frequency level of `stream`.
     *
     * Creates a new AudioContext, stores it in the shared `audioContext`
     * variable and starts a requestAnimationFrame loop that samples an analyser.
     *
     * @param {MediaStream} stream - Stream to analyse.
     */
    function setupAudioVisualization(stream) {
      // Keep a local handle so the loop below can tell when its own context has
      // been torn down or replaced by a later session
      const context = new AudioContext()
      audioContext = context
      const source = context.createMediaStreamSource(stream)
      const analyzer = context.createAnalyser()
      analyzer.fftSize = 256

      source.connect(analyzer)

      const bufferLength = analyzer.frequencyBinCount
      const dataArray = new Uint8Array(bufferLength)

      function updateIndicator() {
        // Stop as soon as this context is no longer the active one; checking
        // only for "some context exists" would keep a stale loop alive after
        // a restart
        if (audioContext !== context) return

        analyzer.getByteFrequencyData(dataArray)
        const average = dataArray.reduce((a, b) => a + b, 0) / bufferLength

        audioIndicator.classList.toggle('active', average > 30)
        requestAnimationFrame(updateIndicator)
      }

      updateIndicator()
    }

    /**
     * Start a session: store the API key, request the microphone, start the
     * level indicator and open the Realtime connection.
     *
     * Progress and errors are written to the status element. On failure the
     * error is logged and stopSession() releases whatever was acquired.
     *
     * @returns {Promise<void>} Settles once the attempt has finished; never rejects.
     */
    async function startSession() {
      // peerConnection stays null until the connection is established, so the
      // click handler would start a second, parallel session if the button
      // remained usable while connecting
      startButton.disabled = true

      try {
        // Save API key to localStorage
        localStorage.setItem('openai_api_key', tokenInput.value)

        status.className = 'status'
        status.textContent = 'Requesting microphone access...'

        audioStream = await navigator.mediaDevices.getUserMedia({
          audio: true,
          video: false
        })

        setupAudioVisualization(audioStream)

        status.textContent = 'Establishing connection...'

        peerConnection = await createRealtimeSession(
          audioStream,
          tokenInput.value,
          voiceSelect.value
        )

        status.className = 'status success'
        status.textContent = 'Session established successfully!'
        startButton.textContent = 'Stop Session'

      } catch (err) {
        status.className = 'status error'
        status.textContent = `Error: ${err.message}`
        console.error('Session error:', err)
        stopSession()
      } finally {
        startButton.disabled = false
      }
    }

    /**
     * Release every session resource that is currently held and put the
     * indicator and button back into their idle state. Resources that are
     * already null are skipped.
     */
    function stopSession() {
      // Connection first, then the audio graph, then the microphone tracks
      peerConnection && peerConnection.close()
      peerConnection = null

      audioContext && audioContext.close()
      audioContext = null

      if (audioStream) {
        for (const track of audioStream.getTracks()) {
          track.stop()
        }
      }
      audioStream = null

      audioIndicator.classList.remove('active')
      startButton.textContent = 'Start Session'
    }

    // The single button toggles the session: stop when one is running,
    // otherwise validate the token and start
    startButton.addEventListener('click', () => {
      if (peerConnection) {
        stopSession()
        return
      }

      if (!tokenInput.value) {
        status.className = 'status error'
        status.textContent = 'Please enter an API token'
        return
      }

      startSession()
    })

    // Release session resources when the page is being unloaded
    window.addEventListener('beforeunload', stopSession)
  </script>
</body>
</html>


assistant (Dec 17, 2024, 09:01 PM)

I'll help modify the code to add a data channel that displays JSON events in a nicely formatted area at the bottom of the page. I'll need to:

  1. Add a container for the events in the HTML
  2. Add some CSS to style it
  3. Modify the data channel code to parse and display the events

create OpenAI WebRTC Audio Session with Events

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>OpenAI WebRTC Audio Session</title>
  <style>
  * {
    box-sizing: border-box;
  }

  body {
    font-family: Helvetica, Arial, sans-serif;
    margin: 0;
    padding: 20px;
    background: #f5f5f5;
  }

  .container {
    max-width: 800px;
    margin: 0 auto;
  }

  .audio-indicator {
    display: inline-block;
    width: 20px;
    height: 20px;
    border-radius: 50%;
    background: #ccc;
    margin-right: 10px;
    vertical-align: middle;
  }

  .audio-indicator.active {
    background: #4CAF50;
    animation: pulse 1s infinite;
  }

  @keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
  }

  .controls {
    margin: 20px 0;
  }

  .form-group {
    margin-bottom: 15px;
  }

  label {
    display: block;
    margin-bottom: 5px;
    font-weight: bold;
  }

  input, select {
    width: 100%;
    padding: 8px;
    font-size: 16px;
    border: 1px solid #ddd;
    border-radius: 4px;
  }

  button {
    background: #007bff;
    color: white;
    border: none;
    padding: 10px 20px;
    font-size: 16px;
    border-radius: 4px;
    cursor: pointer;
  }

  button:disabled {
    background: #ccc;
    cursor: not-allowed;
  }

  .status {
    margin-top: 10px;
    padding: 10px;
    border-radius: 4px;
  }

  .error {
    background: #fee;
    color: #c00;
  }

  .success {
    background: #efe;
    color: #0a0;
  }

  .events-container {
    margin-top: 20px;
    background: #fff;
    border-radius: 4px;
    padding: 15px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
  }

  .event-entry {
    font-family: monospace;
    /* pre-wrap keeps the pretty-printed indentation but lets long values
       (such as transcripts) wrap instead of overflowing the panel */
    white-space: pre-wrap;
    overflow-wrap: anywhere;
    padding: 10px;
    margin: 10px 0;
    background: #f8f9fa;
    border-radius: 4px;
    border-left: 4px solid #007bff;
  }

  .event-timestamp {
    color: #666;
    font-size: 0.9em;
    margin-bottom: 5px;
  }
  </style>
</head>
<body>
  <div class="container">
    <h1>
      <span id="audioIndicator" class="audio-indicator"></span>
      OpenAI WebRTC Audio Session
    </h1>

    <div class="controls">
      <div class="form-group">
        <label for="tokenInput">OpenAI API Token</label>
        <input type="password" id="tokenInput">
      </div>
      <div class="form-group">
        <label for="voiceSelect">Voice</label>
        <select id="voiceSelect">
          <option value="ash">Ash</option>
          <option value="ballad">Ballad</option>
          <option value="coral">Coral</option>
          <option value="sage">Sage</option>
          <option value="verse">Verse</option>
        </select>
      </div>
      <button id="startButton">Start Session</button>
    </div>

    <div id="status" class="status"></div>
    
    <div class="events-container">
      <h2>Session Events</h2>
      <div id="events"></div>
    </div>
  </div>

  <script type="module">
    /**
     * Negotiate a WebRTC session with the OpenAI Realtime endpoint.
     *
     * Creates a peer connection, sends the first track of `inStream`, opens the
     * "oai-events" data channel (each message is JSON-decoded and passed to
     * addEventToLog), POSTs the local SDP offer to the API and applies the
     * returned SDP answer.
     *
     * @param {MediaStream} inStream - Stream whose first track is added to the connection.
     * @param {string} token - API token, sent as a Bearer Authorization header.
     * @param {string} voice - Value of the `voice` query parameter.
     * @returns {Promise<RTCPeerConnection>} The connection once the answer has been applied.
     * @throws {Error} If negotiation fails or the API answers with a non-2xx status;
     *   the peer connection is closed before the error propagates.
     */
    async function createRealtimeSession(inStream, token, voice) {
      const pc = new RTCPeerConnection()

      try {
        // Handle incoming audio
        pc.ontrack = e => {
          const audio = new Audio()
          audio.srcObject = e.streams[0]
          // play() may reject (e.g. autoplay blocked); report it instead of
          // leaving an unhandled rejection
          Promise.resolve(audio.play()).catch(err => {
            console.error('Audio playback failed:', err)
          })
        }

        pc.addTrack(inStream.getTracks()[0])

        // Create data channel for events
        const dc = pc.createDataChannel("oai-events");
        dc.addEventListener("message", (e) => {
          try {
            const eventData = JSON.parse(e.data);
            addEventToLog(eventData);
          } catch (err) {
            console.error('Error parsing event data:', err);
          }
        });

        const offer = await pc.createOffer()
        await pc.setLocalDescription(offer)

        const headers = {
          Authorization: `Bearer ${token}`,
          'Content-Type': 'application/sdp'
        }

        const opts = {
          method: 'POST',
          body: offer.sdp,
          headers
        }

        const model = 'gpt-4o-realtime-preview-2024-12-17'
        const resp = await fetch(`https://api.openai.com/v1/realtime?model=${model}&voice=${voice}`, opts)
        const answerSdp = await resp.text()

        // An error response carries an error body, not SDP: surface it directly
        // rather than letting setRemoteDescription fail with a parse error
        if (!resp.ok) {
          throw new Error(`Realtime API request failed (${resp.status}): ${answerSdp}`)
        }

        await pc.setRemoteDescription({
          type: 'answer',
          sdp: answerSdp
        })

        return pc
      } catch (err) {
        // The caller never receives pc on failure, so release it here
        pc.close()
        throw err
      }
    }

    /**
     * Put a decoded event at the top of the #events list.
     *
     * Each entry holds an ISO timestamp taken when the entry is created and the
     * event pretty-printed as JSON with two-space indentation. Entries beyond
     * the newest 50 are removed from the bottom.
     *
     * @param {*} eventData - Decoded event to display.
     */
    function addEventToLog(eventData) {
      const MAX_ENTRIES = 50;

      // Build a <div>, setting only the properties that were supplied
      const makeDiv = (className, text) => {
        const node = document.createElement('div');
        if (className !== undefined) node.className = className;
        if (text !== undefined) node.textContent = text;
        return node;
      };

      const list = document.getElementById('events');
      const entry = makeDiv('event-entry');
      entry.appendChild(makeDiv('event-timestamp', new Date().toISOString()));
      entry.appendChild(makeDiv(undefined, JSON.stringify(eventData, null, 2)));

      // Newest entry goes first
      list.insertBefore(entry, list.firstChild);

      // Drop the oldest entries so the DOM does not grow without bound
      while (list.children.length > MAX_ENTRIES) {
        list.removeChild(list.lastChild);
      }
    }

    // Page elements the script reads from and writes to
    const [startButton, tokenInput, voiceSelect, status, audioIndicator] = [
      'startButton',
      'tokenInput',
      'voiceSelect',
      'status',
      'audioIndicator'
    ].map(id => document.getElementById(id))

    // Live session resources; all null while no session is running
    let peerConnection = null
    let audioContext = null
    let audioStream = null

    // Pre-fill the token field with the API key stored by a previous session
    document.addEventListener('DOMContentLoaded', () => {
      const savedToken = localStorage.getItem('openai_api_key')
      if (!savedToken) return
      tokenInput.value = savedToken
    })

    /**
     * Audio visualization: toggle the `active` class on the audio indicator
     * according to the average frequency level of `stream`.
     *
     * Creates a new AudioContext, stores it in the shared `audioContext`
     * variable and starts a requestAnimationFrame loop that samples an analyser.
     *
     * @param {MediaStream} stream - Stream to analyse.
     */
    function setupAudioVisualization(stream) {
      // Keep a local handle so the loop below can tell when its own context has
      // been torn down or replaced by a later session
      const context = new AudioContext()
      audioContext = context
      const source = context.createMediaStreamSource(stream)
      const analyzer = context.createAnalyser()
      analyzer.fftSize = 256

      source.connect(analyzer)

      const bufferLength = analyzer.frequencyBinCount
      const dataArray = new Uint8Array(bufferLength)

      function updateIndicator() {
        // Stop as soon as this context is no longer the active one; checking
        // only for "some context exists" would keep a stale loop alive after
        // a restart
        if (audioContext !== context) return

        analyzer.getByteFrequencyData(dataArray)
        const average = dataArray.reduce((a, b) => a + b, 0) / bufferLength

        audioIndicator.classList.toggle('active', average > 30)
        requestAnimationFrame(updateIndicator)
      }

      updateIndicator()
    }

    /**
     * Start a session: store the API key, request the microphone, start the
     * level indicator and open the Realtime connection.
     *
     * Progress and errors are written to the status element. On failure the
     * error is logged and stopSession() releases whatever was acquired.
     *
     * @returns {Promise<void>} Settles once the attempt has finished; never rejects.
     */
    async function startSession() {
      // peerConnection stays null until the connection is established, so the
      // click handler would start a second, parallel session if the button
      // remained usable while connecting
      startButton.disabled = true

      try {
        // Save API key to localStorage
        localStorage.setItem('openai_api_key', tokenInput.value)

        status.className = 'status'
        status.textContent = 'Requesting microphone access...'

        audioStream = await navigator.mediaDevices.getUserMedia({
          audio: true,
          video: false
        })

        setupAudioVisualization(audioStream)

        status.textContent = 'Establishing connection...'

        peerConnection = await createRealtimeSession(
          audioStream,
          tokenInput.value,
          voiceSelect.value
        )

        status.className = 'status success'
        status.textContent = 'Session established successfully!'
        startButton.textContent = 'Stop Session'

      } catch (err) {
        status.className = 'status error'
        status.textContent = `Error: ${err.message}`
        console.error('Session error:', err)
        stopSession()
      } finally {
        startButton.disabled = false
      }
    }

    /**
     * Release every session resource that is currently held and put the
     * indicator and button back into their idle state. Resources that are
     * already null are skipped.
     */
    function stopSession() {
      // Connection first, then the audio graph, then the microphone tracks
      peerConnection && peerConnection.close()
      peerConnection = null

      audioContext && audioContext.close()
      audioContext = null

      if (audioStream) {
        for (const track of audioStream.getTracks()) {
          track.stop()
        }
      }
      audioStream = null

      audioIndicator.classList.remove('active')
      startButton.textContent = 'Start Session'
    }

    // The single button toggles the session: stop when one is running,
    // otherwise validate the token and start
    startButton.addEventListener('click', () => {
      if (peerConnection) {
        stopSession()
        return
      }

      if (!tokenInput.value) {
        status.className = 'status error'
        status.textContent = 'Please enter an API token'
        return
      }

      startSession()
    })

    // Release session resources when the page is being unloaded
    window.addEventListener('beforeunload', stopSession)
  </script>
</body>
</html>

I've made the following key changes to the code:

  1. Added a new events container section in the HTML with appropriate styling
  2. Created the data channel as requested and added proper event handling
  3. Added an addEventToLog() function that:
    • Creates timestamped entries for each event
    • Pretty-prints the JSON data
    • Adds new events at the top of the list
    • Limits the number of displayed events to prevent memory issues
  4. Added styling for the events container and individual event entries

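The events are now displayed in a panel at the bottom of the page, with the most recent events at the top (the panel grows with its content; the CSS above sets no height limit or overflow, so it does not scroll on its own). Each event entry includes a timestamp and properly formatted JSON data.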

Would you like me to make any adjustments to the formatting or behavior of the event display?

human (Dec 17, 2024, 09:06 PM)

The most interesting events look like this:

{
  "type": "response.done",
  "event_id": "event_AfgWkPhRWwbXtoLJBBmQ1",
  "response": {
    "object": "realtime.response",
    "id": "resp_AfgWihozG6XgsCB9UWrHv",
    "status": "completed",
    "status_details": null,
    "output": [
      {
        "id": "item_AfgWiWpUEFzniPgzvnUMG",
        "object": "realtime.item",
        "type": "message",
        "status": "completed",
        "role": "assistant",
        "content": [
          {
            "type": "audio",
            "transcript": "Alright, here's another one for you: Why was the math book sad? Because it had too many problems! "
          }
        ]
      }
    ],
    "usage": {
      "total_tokens": 3456,
      "input_tokens": 3263,
      "output_tokens": 193,
      "input_token_details": {
        "text_tokens": 702,
        "audio_tokens": 2561,
        "cached_tokens": 3136,
        "cached_tokens_details": {
          "text_tokens": 640,
          "audio_tokens": 2496
        }
      },
      "output_token_details": {
        "text_tokens": 40,
        "audio_tokens": 153
      }
    },
    "metadata": null
  }
}

If there is a response.usage block I want to do something clever: I want to calculate the cost of the session so far and display that above the Session Events panel

Here's how pricing works. Any questions before you implement this?

We’re releasing gpt-4o-realtime-preview-2024-12-17 as part of the Realtime API beta with improved voice quality, more reliable input (especially for dictated numbers), and reduced costs. Due to our efficiency improvements, we’re dropping the audio token price by 60% to $40/1M input tokens and $80/1M output tokens. Cached audio input costs are reduced by 87.5% to $2.50/1M input tokens. We’re also bringing GPT-4o mini to the Realtime API beta as gpt-4o-mini-realtime-preview-2024-12-17. GPT-4o mini is our most cost-efficient small model and brings the same rich voice experiences to the Realtime API as GPT-4o. GPT-4o mini audio price is $10/1M input tokens and $20/1M output tokens. Text tokens are priced at $0.60/1M input tokens and $2.40/1M output tokens. Cached audio and text both cost $0.30/1M tokens.

assistant (Dec 17, 2024, 09:06 PM)

Let me help implement a cost calculator for the Realtime API sessions. From the pricing information:

For gpt-4o-realtime-preview-2024-12-17:

  • Audio input: $40/1M tokens ($0.00004 per token)
  • Audio output: $80/1M tokens ($0.00008 per token)
  • Cached audio input: $2.50/1M tokens ($0.0000025 per token)
  • Text input/output pricing for this model isn't specified in the quote (the text token prices it lists are for GPT-4o mini)

I'll add a cost display panel and update it whenever we receive usage information. Let me modify the code:

update webrtc-events

Find this:

    <div id="status" class="status"></div>
    
    <div class="events-container">

Replace with this:

    <div id="status" class="status"></div>
    
    <div class="cost-container">
      <h2>Session Costs</h2>
      <div class="cost-details">
        <div>Input Cost: <span id="inputCost">$0.00</span></div>
        <div>Output Cost: <span id="outputCost">$0.00</span></div>
        <div>Total Cost: <span id="totalCost">$0.00</span></div>
      </div>
    </div>

    <div class="events-container">

update webrtc-events

Find this:

.event-timestamp {
    color: #666;
    font-size: 0.9em;
    margin-bottom: 5px;
  }

Replace with this:

.event-timestamp {
    color: #666;
    font-size: 0.9em;
    margin-bottom: 5px;
  }

  .cost-container {
    margin: 20px 0;
    padding: 15px;
    background: #fff;
    border-radius: 4px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
  }

  .cost-details {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 15px;
    font-size: 1.1em;
    margin-top: 10px;
  }

  .cost-details span {
    font-weight: bold;
    color: #007bff;
  }

update webrtc-events

Find this:

function addEventToLog(eventData) {

Replace with this:

function calculateCosts(usage) {
  const AUDIO_INPUT_COST = 0.00004;    // $40/1M tokens
  const AUDIO_OUTPUT_COST = 0.00008;   // $80/1M tokens
  const CACHED_AUDIO_COST = 0.0000025; // $2.50/1M tokens

  const inputTokenDetails = usage.input_token_details;
  const outputTokenDetails = usage.output_token_details;
  const cachedDetails = inputTokenDetails.cached_tokens_details;

  // Calculate input costs
  const freshAudioInputCost = (inputTokenDetails.audio_tokens - cachedDetails.audio_tokens) * AUDIO_INPUT_COST;
  const cachedAudioInputCost = cachedDetails.audio_tokens * CACHED_AUDIO_COST;
  const totalInputCost = freshAudioInputCost + cachedAudioInputCost;

  // Calculate output costs
  const audioOutputCost = outputTokenDetails.audio_tokens * AUDIO_OUTPUT_COST;
  const totalOutputCost = audioOutputCost;

  return {
    inputCost: totalInputCost,
    outputCost: totalOutputCost,
    totalCost: totalInputCost + totalOutputCost
  };
}

function updateCostDisplay(costs) {
  document.getElementById('inputCost').textContent = `$${costs.inputCost.toFixed(4)}`;
  document.getElementById('outputCost').textContent = `$${costs.outputCost.toFixed(4)}`;
  document.getElementById('totalCost').textContent = `$${costs.totalCost.toFixed(4)}`;
}

function addEventToLog(eventData) {

update webrtc-events

Find this:

      try {
          const eventData = JSON.parse(e.data);
          addEventToLog(eventData);

Replace with this:

      try {
          const eventData = JSON.parse(e.data);
          
          // Update costs if this is a response.done event with usage info
          if (eventData.type === 'response.done' && 
              eventData.response && 
              eventData.response.usage) {
            const costs = calculateCosts(eventData.response.usage);
            updateCostDisplay(costs);
          }
          
          addEventToLog(eventData);

I've made the following changes:

  1. Added a new cost display panel above the events container showing input, output, and total costs
  2. Created a calculateCosts function that:
    • Handles the different pricing tiers for audio input (fresh vs cached)
    • Calculates output costs for audio
    • Returns a breakdown of costs
  3. Added real-time cost updates whenever a response.done event with usage information is received
  4. Styled the cost display to be clear and prominent

Note: I didn't include text token costs since they weren't specified in the pricing information provided. If you'd like to add those, just let me know the rates and I can update the calculator.

The costs are displayed with 4 decimal places since these are typically very small amounts per message. Would you like me to adjust any of the formatting or add any additional cost breakdown details?

human (Dec 17, 2024, 09:07 PM)

Text is $2.50 / 1M input tokens and $10.00 / 1M output tokens

assistant (Dec 17, 2024, 09:07 PM)

I'll update the cost calculator to include text token pricing.

update webrtc-events

Find this:

function calculateCosts(usage) {
  const AUDIO_INPUT_COST = 0.00004;    // $40/1M tokens
  const AUDIO_OUTPUT_COST = 0.00008;   // $80/1M tokens
  const CACHED_AUDIO_COST = 0.0000025; // $2.50/1M tokens

  const inputTokenDetails = usage.input_token_details;
  const outputTokenDetails = usage.output_token_details;
  const cachedDetails = inputTokenDetails.cached_tokens_details;

  // Calculate input costs
  const freshAudioInputCost = (inputTokenDetails.audio_tokens - cachedDetails.audio_tokens) * AUDIO_INPUT_COST;
  const cachedAudioInputCost = cachedDetails.audio_tokens * CACHED_AUDIO_COST;
  const totalInputCost = freshAudioInputCost + cachedAudioInputCost;

  // Calculate output costs
  const audioOutputCost = outputTokenDetails.audio_tokens * AUDIO_OUTPUT_COST;
  const totalOutputCost = audioOutputCost;

  return {
    inputCost: totalInputCost,
    outputCost: totalOutputCost,
    totalCost: totalInputCost + totalOutputCost
  };
}

Replace with this:

/**
 * Estimate the cost in USD of a single `usage` object, as found on
 * `response.usage` of a `response.done` event.
 *
 * Audio input is split into fresh and cached tokens, each billed at its own
 * rate. Missing detail objects or counts are treated as 0 so that a partial
 * usage payload yields a partial cost instead of a TypeError.
 *
 * @param {object} usage - Usage object with `input_token_details` (including
 *   `cached_tokens_details`) and `output_token_details`.
 * @param {object} [rates] - Per-token USD rates overriding the defaults below.
 * @param {number} [rates.audioInput=0.00004] - Fresh audio input ($40/1M tokens).
 * @param {number} [rates.audioOutput=0.00008] - Audio output ($80/1M tokens).
 * @param {number} [rates.cachedAudioInput=0.0000025] - Cached audio input ($2.50/1M tokens).
 * @param {number} [rates.textInput=0.0000025] - Text input ($2.50/1M tokens).
 * @param {number} [rates.textOutput=0.00001] - Text output ($10.00/1M tokens).
 * @param {number} [rates.cachedTextInput=rates.textInput] - Cached text input;
 *   defaults to the text input rate, i.e. no cache discount for text.
 * @returns {{inputCost: number, outputCost: number, totalCost: number}} Costs in USD.
 */
function calculateCosts(usage, rates = {}) {
  const {
    audioInput = 0.00004,
    audioOutput = 0.00008,
    cachedAudioInput = 0.0000025,
    textInput = 0.0000025,
    textOutput = 0.00001,
    cachedTextInput = textInput,
  } = rates;

  const inputTokenDetails = (usage && usage.input_token_details) || {};
  const outputTokenDetails = (usage && usage.output_token_details) || {};
  const cachedDetails = inputTokenDetails.cached_tokens_details || {};

  // Absent or non-numeric counts contribute nothing
  const count = (value) => (Number.isFinite(value) ? value : 0);

  const cachedAudioTokens = count(cachedDetails.audio_tokens);
  const cachedTextTokens = count(cachedDetails.text_tokens);
  // The per-type input totals include the cached tokens; clamp so inconsistent
  // data can never produce a negative charge
  const freshAudioTokens = Math.max(count(inputTokenDetails.audio_tokens) - cachedAudioTokens, 0);
  const freshTextTokens = Math.max(count(inputTokenDetails.text_tokens) - cachedTextTokens, 0);

  // Calculate input costs
  const totalInputCost =
    freshAudioTokens * audioInput +
    cachedAudioTokens * cachedAudioInput +
    freshTextTokens * textInput +
    cachedTextTokens * cachedTextInput;

  // Calculate output costs
  const totalOutputCost =
    count(outputTokenDetails.audio_tokens) * audioOutput +
    count(outputTokenDetails.text_tokens) * textOutput;

  return {
    inputCost: totalInputCost,
    outputCost: totalOutputCost,
    totalCost: totalInputCost + totalOutputCost
  };
}

I've updated the calculateCosts function to include:

  • Text input cost at $2.50/1M tokens ($0.0000025 per token)
  • Text output cost at $10.00/1M tokens ($0.00001 per token)

The calculator now accounts for all token types:

  1. Fresh audio input tokens
  2. Cached audio input tokens
  3. Text input tokens
  4. Audio output tokens
  5. Text output tokens

Would you like me to add a more detailed breakdown of the costs in the UI, showing the split between text and audio costs?

human (Dec 17, 2024, 09:08 PM)

Also display total text/audio/cached input tokens and total text/audio output tokens

assistant (Dec 17, 2024, 09:08 PM)

I'll add a token count display alongside the costs.

update webrtc-events

Find this:

    <div id="status" class="status"></div>
    
    <div class="cost-container">
      <h2>Session Costs</h2>
      <div class="cost-details">
        <div>Input Cost: <span id="inputCost">$0.00</span></div>
        <div>Output Cost: <span id="outputCost">$0.00</span></div>
        <div>Total Cost: <span id="totalCost">$0.00</span></div>
      </div>
    </div>

    <div class="events-container">

Replace with this:

    <div id="status" class="status"></div>
    
    <div class="cost-container">
      <h2>Session Stats</h2>
      <div class="stats-grid">
        <div class="stats-section">
          <h3>Input Tokens</h3>
          <div>Fresh Audio: <span id="freshAudioTokens">0</span></div>
          <div>Cached Audio: <span id="cachedAudioTokens">0</span></div>
          <div>Text: <span id="textInputTokens">0</span></div>
        </div>
        <div class="stats-section">
          <h3>Output Tokens</h3>
          <div>Audio: <span id="audioOutputTokens">0</span></div>
          <div>Text: <span id="textOutputTokens">0</span></div>
        </div>
        <div class="stats-section">
          <h3>Costs</h3>
          <div>Input Cost: <span id="inputCost">$0.00</span></div>
          <div>Output Cost: <span id="outputCost">$0.00</span></div>
          <div>Total Cost: <span id="totalCost">$0.00</span></div>
        </div>
      </div>
    </div>

    <div class="events-container">

update webrtc-events

Find this:

.cost-container {
    margin: 20px 0;
    padding: 15px;
    background: #fff;
    border-radius: 4px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
  }

  .cost-details {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 15px;
    font-size: 1.1em;
    margin-top: 10px;
  }

  .cost-details span {
    font-weight: bold;
    color: #007bff;
  }

Replace with this:

.cost-container {
    margin: 20px 0;
    padding: 15px;
    background: #fff;
    border-radius: 4px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
  }

  .stats-grid {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 20px;
    margin-top: 15px;
  }

  .stats-section {
    padding: 10px;
    background: #f8f9fa;
    border-radius: 4px;
  }

  .stats-section h3 {
    margin: 0 0 10px 0;
    font-size: 1.1em;
    color: #495057;
  }

  .stats-section div {
    margin: 5px 0;
    font-size: 0.95em;
  }

  .stats-section span {
    font-weight: bold;
    color: #007bff;
  }

update webrtc-events

Find this:

function updateCostDisplay(costs) {
  document.getElementById('inputCost').textContent = `$${costs.inputCost.toFixed(4)}`;
  document.getElementById('outputCost').textContent = `$${costs.outputCost.toFixed(4)}`;
  document.getElementById('totalCost').textContent = `$${costs.totalCost.toFixed(4)}`;
}

function addEventToLog(eventData) {

Replace with this:

/**
 * Write the token counts of `usage` and the given costs into the stats panel.
 *
 * Every call overwrites the displayed values with the ones passed in; nothing
 * is accumulated here. Missing detail objects or counts are shown as 0 rather
 * than raising a TypeError.
 *
 * @param {object} usage - Usage object with `input_token_details` (including
 *   `cached_tokens_details`) and `output_token_details`.
 * @param {{inputCost: number, outputCost: number, totalCost: number}} costs -
 *   Costs in USD, displayed with four decimal places.
 */
function updateStatsDisplay(usage, costs) {
  const inputDetails = (usage && usage.input_token_details) || {};
  const outputDetails = (usage && usage.output_token_details) || {};
  const cachedDetails = inputDetails.cached_tokens_details || {};

  const count = (value) => (Number.isFinite(value) ? value : 0);
  const setText = (id, text) => {
    document.getElementById(id).textContent = text;
  };
  const formatCost = (value) => `$${value.toFixed(4)}`;

  // Fresh audio is the audio input that was not served from the cache
  const cachedAudioTokens = count(cachedDetails.audio_tokens);
  const freshAudioTokens = count(inputDetails.audio_tokens) - cachedAudioTokens;

  // Update token counts
  setText('freshAudioTokens', freshAudioTokens.toLocaleString());
  setText('cachedAudioTokens', cachedAudioTokens.toLocaleString());
  setText('textInputTokens', count(inputDetails.text_tokens).toLocaleString());
  setText('audioOutputTokens', count(outputDetails.audio_tokens).toLocaleString());
  setText('textOutputTokens', count(outputDetails.text_tokens).toLocaleString());

  // Update costs
  setText('inputCost', formatCost(costs.inputCost));
  setText('outputCost', formatCost(costs.outputCost));
  setText('totalCost', formatCost(costs.totalCost));
}

function addEventToLog(eventData) {

update webrtc-events

Find this:

          if (eventData.type === 'response.done' && 
              eventData.response && 
              eventData.response.usage) {
            const costs = calculateCosts(eventData.response.usage);
            updateCostDisplay(costs);
          }

Replace with this:

          if (eventData.type === 'response.done' && 
              eventData.response && 
              eventData.response.usage) {
            const costs = calculateCosts(eventData.response.usage);
            updateStatsDisplay(eventData.response.usage, costs);
          }

I've updated the interface to show a comprehensive stats panel with three sections:

  1. Input Tokens:

    • Fresh Audio Tokens
    • Cached Audio Tokens
    • Text Tokens
  2. Output Tokens:

    • Audio Tokens
    • Text Tokens
  3. Costs:

    • Input Cost
    • Output Cost
    • Total Cost

The token counts are formatted with thousands separators for better readability (e.g., "1,234,567" instead of "1234567"). The costs continue to be shown with 4 decimal places.

The stats panel updates in real-time whenever a response.done event is received. Would you like me to make any adjustments to the layout or formatting?

human (Dec 17, 2024, 09:12 PM)

updateStatsDisplay is missing

assistant (Dec 17, 2024, 09:12 PM)

Ah sorry - I'll add the missing function. I need to update where we process the events in the data channel handler:

update webrtc-events

Find this:

      dc.addEventListener("message", (e) => {
        try {
          const eventData = JSON.parse(e.data);
          
          // Update costs if this is a response.done event with usage info
          if (eventData.type === 'response.done' && 
              eventData.response && 
              eventData.response.usage) {
            const costs = calculateCosts(eventData.response.usage);
            updateStatsDisplay(eventData.response.usage, costs);
          }
          
          addEventToLog(eventData);

Replace with this:

      dc.addEventListener("message", (e) => {
        try {
          const eventData = JSON.parse(e.data);
          
          // Update costs if this is a response.done event with usage info
          if (eventData.type === 'response.done' && 
              eventData.response && 
              eventData.response.usage) {
            const usage = eventData.response.usage;
            const costs = calculateCosts(usage);
            const inputDetails = usage.input_token_details;
            const outputDetails = usage.output_token_details;
            const cachedDetails = inputDetails.cached_tokens_details;

            // Calculate fresh audio tokens
            const freshAudioTokens = inputDetails.audio_tokens - cachedDetails.audio_tokens;

            // Update display elements
            document.getElementById('freshAudioTokens').textContent = freshAudioTokens.toLocaleString();
            document.getElementById('cachedAudioTokens').textContent = cachedDetails.audio_tokens.toLocaleString();
            document.getElementById('textInputTokens').textContent = inputDetails.text_tokens.toLocaleString();
            document.getElementById('audioOutputTokens').textContent = outputDetails.audio_tokens.toLocaleString();
            document.getElementById('textOutputTokens').textContent = outputDetails.text_tokens.toLocaleString();

            // Update costs
            document.getElementById('inputCost').textContent = `$${costs.inputCost.toFixed(4)}`;
            document.getElementById('outputCost').textContent = `$${costs.outputCost.toFixed(4)}`;
            document.getElementById('totalCost').textContent = `$${costs.totalCost.toFixed(4)}`;
          }
          
          addEventToLog(eventData);

I've moved the display update logic directly into the event handler rather than having a separate function. This should now properly update all the token counts and costs whenever a response.done event is received. Let me know if you'd like me to make any other changes!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment