Skip to content

Instantly share code, notes, and snippets.

@sarchak
Created November 24, 2023 00:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sarchak/931c5427aefb572f9ad92b2825f924b7 to your computer and use it in GitHub Desktop.
Save sarchak/931c5427aefb572f9ad92b2825f924b7 to your computer and use it in GitHub Desktop.
const fs = require("fs");
const os = require("os");
const path = require("path");
const axios = require("axios");
const express = require("express");
const ffmpeg = require("fluent-ffmpeg");
const WaveFile = require("wavefile").WaveFile;
const WebSocket = require("ws");
// Express app and the HTTP server that hosts it; the WebSocket server below
// is attached to the same HTTP server.
const app = express();
const server = require("http").createServer(app);
const wss = new WebSocket.Server({ server });
// Outbound WebSocket to AssemblyAI. Assigned by the POST "/" handler and read
// by the WebSocket message handler; module-level, so shared by every connection.
let assembly;
// Decoded audio buffers accumulated between sends to AssemblyAI. Also
// module-level, so shared by every connection.
let chunks = [];
/**
 * Downloads an MP3 and converts it into Twilio Media Stream "media" messages.
 *
 * The MP3 is transcoded by ffmpeg to an 8 kHz mono 16-bit PCM WAV, encoded to
 * µ-law with wavefile, and the raw samples (no WAV header) are split into
 * 20 ms frames. Each frame becomes
 * `{ event: "media", streamSid, media: { payload: <base64> } }`.
 *
 * @param {string} mp3Url - URL of the MP3 to download.
 * @param {string} streamSid - Stream SID copied into every message.
 * @returns {Promise<object[]>} The media messages, in playback order.
 * @throws Rejects with the underlying error when the download, the transcode
 *   or the µ-law encoding fails (the error is logged first).
 */
async function convertMP3toMuLawAndPrepareForTwilio(mp3Url, streamSid) {
  const SAMPLE_RATE_HZ = 8000;
  // µ-law stores one byte per sample, so 20 ms at 8 kHz is 8000 * 0.02 = 160 bytes.
  const CHUNK_SIZE_BYTES = 160;

  let workDir;
  try {
    // A private scratch directory per call keeps overlapping conversions from
    // overwriting each other's intermediate files.
    workDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "twilio-audio-"));
    const mp3Path = path.join(workDir, "input.mp3");
    const wavPath = path.join(workDir, "output.wav");

    // Download MP3
    const response = await axios({
      url: mp3Url,
      method: "GET",
      responseType: "arraybuffer",
    });
    await fs.promises.writeFile(mp3Path, Buffer.from(response.data));

    // Force 8 kHz mono 16-bit PCM: without this the WAV keeps the MP3's own
    // sample rate and channel count, and the µ-law bytes would not be the
    // 8 kHz stream the chunking below assumes.
    await new Promise((resolve, reject) => {
      ffmpeg(mp3Path)
        .audioFrequency(SAMPLE_RATE_HZ)
        .audioChannels(1)
        .audioCodec("pcm_s16le")
        .toFormat("wav")
        .on("end", () => resolve())
        .on("error", (err) => reject(err))
        .save(wavPath);
    });

    const wav = new WaveFile(await fs.promises.readFile(wavPath));
    wav.toMuLaw();
    // Extract only the raw µ-law audio data, excluding headers
    const rawMuLawData = wav.data.samples;

    const twilioMessages = [];
    for (
      let offset = 0;
      offset < rawMuLawData.length;
      offset += CHUNK_SIZE_BYTES
    ) {
      const chunk = rawMuLawData.slice(offset, offset + CHUNK_SIZE_BYTES);
      twilioMessages.push({
        event: "media",
        streamSid: streamSid,
        media: {
          payload: Buffer.from(chunk).toString("base64"),
        },
      });
    }
    return twilioMessages;
  } catch (error) {
    console.error("Error in convertMP3toMuLawAndPrepareForTwilio:", error);
    // Re-throw so the caller's rejection handler runs instead of receiving
    // `undefined` in place of the message list.
    throw error;
  } finally {
    // Clean up temporary files; a cleanup failure must not mask the result.
    if (workDir) {
      await fs.promises
        .rm(workDir, { recursive: true, force: true })
        .catch((cleanupError) => {
          console.error("Failed to remove temporary audio files:", cleanupError);
        });
    }
  }
}
// Handle Web Socket Connection
/**
 * Per-connection handler for the WebSocket server.
 *
 * Incoming frames are JSON objects dispatched on `event`:
 *  - "connected": installs the AssemblyAI `onerror` / `onmessage` handlers.
 *    Each AssemblyAI message updates the running transcript, broadcasts it to
 *    every open client as "interim-transcription", then converts a fixed MP3
 *    and streams it back on this socket followed by a "mark" message.
 *  - "start": remembers the stream SID.
 *  - "media": decodes the µ-law payload to 16-bit PCM and forwards it to
 *    AssemblyAI in batches of five frames.
 *  - "stop": asks AssemblyAI to terminate the session.
 *
 * Frames are ignored (with an error log) until the module-level `assembly`
 * socket has been created.
 */
wss.on("connection", async function connection(ws) {
  console.log("New Connection Initiated");
  let streamSid;

  ws.on("message", async function incoming(message) {
    if (!assembly)
      return console.error("AssemblyAI's WebSocket must be initialized.");

    let msg;
    try {
      msg = JSON.parse(message);
    } catch (error) {
      // A malformed frame must not throw out of the event listener.
      return console.error("Ignoring malformed WebSocket message:", error);
    }
    if (!msg) return;

    // Each case is wrapped in braces so its const/let declarations stay
    // scoped to that case instead of the whole switch.
    switch (msg.event) {
      case "connected": {
        console.log(`A new call has connected.`);
        assembly.onerror = console.error;
        // Transcript fragments keyed by their audio_start offset.
        const texts = {};
        // NOTE(review): this runs for every AssemblyAI message, so the MP3 is
        // converted and replayed on each one — confirm whether it should be
        // limited to final transcripts.
        assembly.onmessage = (assemblyMsg) => {
          const res = JSON.parse(assemblyMsg.data);
          texts[res.audio_start] = res.text;
          const keys = Object.keys(texts);
          keys.sort((a, b) => a - b);
          let transcript = "";
          for (const key of keys) {
            if (texts[key]) {
              transcript += ` ${texts[key]}`;
            }
          }
          console.log(transcript);
          wss.clients.forEach((client) => {
            if (client.readyState === WebSocket.OPEN) {
              client.send(
                JSON.stringify({
                  event: "interim-transcription",
                  text: transcript,
                })
              );
            }
          });
          console.log("==============");
          console.log(transcript, streamSid);
          const mp3Url =
            "https://eleven-public-cdn.elevenlabs.io/audio-native/d65e433fd8a560cbba1b7dd26809dd48ea2b408c5b6c0a3e42e5b83c43957f5b/fUQGnaXq583QFBHJUhLY.mp3";
          convertMP3toMuLawAndPrepareForTwilio(mp3Url, streamSid)
            .then((twilioMessages) => {
              // The conversion is asynchronous; the socket may have closed
              // in the meantime, and sending on a closed socket throws.
              if (ws.readyState !== WebSocket.OPEN) return;
              twilioMessages.forEach((message1) => {
                console.log(message1);
                ws.send(JSON.stringify(message1));
              });
              ws.send(
                JSON.stringify({
                  event: "mark",
                  streamSid: streamSid,
                  mark: {
                    name: "my label",
                  },
                })
              );
            })
            .catch((error) => {
              console.error("Error processing MP3 to MuLaw:", error);
            });
        };
        break;
      }
      case "start": {
        console.log(`Starting Media Stream ${msg.streamSid}`);
        streamSid = msg.streamSid;
        break;
      }
      case "media": {
        const twilioData = msg.media.payload;
        // Build the wav file from scratch since it comes in as raw data
        const wav = new WaveFile();
        // Twilio uses MuLaw so we have to encode for that
        wav.fromScratch(1, 8000, "8m", Buffer.from(twilioData, "base64"));
        // This library has a handy method to decode MuLaw straight to 16-bit PCM
        wav.fromMuLaw();
        // Get the raw audio data in base64
        const twilio64Encoded = wav.toDataURI().split("base64,")[1];
        // Create our audio buffer
        const twilioAudioBuffer = Buffer.from(twilio64Encoded, "base64");
        // Send data starting at byte 44 to remove wav headers so our model sees only audio data
        chunks.push(twilioAudioBuffer.slice(44));
        // We have to chunk data b/c twilio sends audio durations of ~20ms and AAI needs a min of 100ms
        if (chunks.length >= 5) {
          const audioBuffer = Buffer.concat(chunks);
          chunks = [];
          // Sending while the AssemblyAI socket is still connecting (or already
          // closed) throws; drop the batch in that case.
          if (assembly.readyState === WebSocket.OPEN) {
            const encodedAudio = audioBuffer.toString("base64");
            assembly.send(JSON.stringify({ audio_data: encodedAudio }));
          }
        }
        break;
      }
      case "stop": {
        console.log(`Call Has Ended`);
        // Discard any partial batch so it is not prepended to the next call.
        chunks = [];
        if (assembly.readyState === WebSocket.OPEN) {
          assembly.send(JSON.stringify({ terminate_session: true }));
        }
        break;
      }
    }
  });
});
//Handle HTTP Request
// GET "/" serves index.html from this file's directory.
app.get("/", (req, res) => res.sendFile(path.join(__dirname, "/index.html")));

/**
 * POST "/": opens a fresh AssemblyAI realtime socket (stored in the
 * module-level `assembly`) and answers with TwiML that starts a media stream
 * to the WebSocket server, says "Start", then pauses for 30 seconds.
 *
 * The AssemblyAI key is read from the ASSEMBLYAI_API_KEY environment variable,
 * falling back to the original placeholder value when it is not set.
 */
app.post("/", async (req, res) => {
  console.log("Post called");
  console.log(`${req.headers.host}`);

  // Close the socket left over from a previous request; otherwise it is
  // orphaned when `assembly` is reassigned below.
  if (assembly && assembly.readyState !== WebSocket.CLOSED) {
    assembly.close();
  }

  assembly = new WebSocket(
    "wss://api.assemblyai.com/v2/realtime/ws?sample_rate=8000",
    { headers: { authorization: process.env.ASSEMBLYAI_API_KEY || "key" } }
  );
  // Attach an error listener immediately: a failed handshake emits "error",
  // and an "error" event with no listener terminates the process.
  assembly.on("error", (error) => {
    console.error("AssemblyAI WebSocket error:", error);
  });

  res.set("Content-Type", "text/xml");
  res.send(
    `<Response>
<Start>
<Stream url='wss://assistant.loca.lt' />
</Start>
<Say>
Start
</Say>
<Pause length='30' />
</Response>`
  );
});
// Start server
// Log from the listen callback so the message is printed only once the port
// is actually bound, not before the bind has been attempted.
server.listen(8080, () => {
  console.log("Listening at Port 8080");
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment