Skip to content

Instantly share code, notes, and snippets.

@niuniulla
Created February 21, 2024 05:06
Show Gist options
  • Save niuniulla/e876a0924971b47fa26d2589b23385a2 to your computer and use it in GitHub Desktop.
Save niuniulla/e876a0924971b47fa26d2589b23385a2 to your computer and use it in GitHub Desktop.
A minimal use case of SDL Audio and whisper.cpp
/*
A minimal use case of SDL Audio and whisper.cpp
*/
/*
On launch, the program records up to 1 minute of audio and
prints its transcription to the terminal.
For more info on whisper.cpp: https://github.com/ggerganov/whisper.cpp/blob/master/CMakeLists.txt
*/
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include <SDL2/SDL.h>
#include "whisper.h"
// Settings for this demo. Nothing in this file parses a command line, so the
// in-class defaults below are the values that are actually used.
struct whisper_params {
    // Inference thread count: at most 4 and never less than 1.
    // std::thread::hardware_concurrency() may return 0 when the core count
    // cannot be determined, so the result is clamped from below.
    int32_t n_threads = std::max<int32_t>(
        1, std::min<int32_t>(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
    int32_t n_processors = 1;      // passed to whisper_full_parallel in main
    int32_t step_ms      = 3000;   // not read anywhere in this file
    int32_t length_ms    = 10000;  // not read anywhere in this file
    int32_t keep_ms      = 200;    // not read anywhere in this file
    int32_t capture_id   = -1;     // not read anywhere in this file
    int32_t max_tokens   = 32;     // copied to whisper_full_params::max_tokens in main
    int32_t audio_ctx    = 0;      // copied to whisper_full_params::audio_ctx in main

    float vad_thold  = 0.6f;       // not read anywhere in this file
    float freq_thold = 100.0f;     // not read anywhere in this file

    bool speed_up      = false;    // copied to whisper_full_params::speed_up in main
    bool translate     = false;    // copied to whisper_full_params::translate in main
    bool no_fallback   = false;    // when true, main sets temperature_inc to 0
    bool print_special = false;    // copied to whisper_full_params::print_special in main
    bool no_context    = true;     // when true, main passes no prompt tokens
    bool no_timestamps = false;    // main sets print_timestamps to the negation of this
    bool tinydiarize   = false;    // copied to whisper_full_params::tdrz_enable in main
    bool save_audio    = false;    // save audio to wav file (not read anywhere in this file)
    bool use_gpu       = true;     // GPU preference for the whisper context

    std::string language = "en";                       // copied to whisper_full_params::language in main
    std::string model    = "models/ggml-base.en.bin";  // model file path given to whisper_init_from_file_with_params
    std::string fname_out;                             // not read anywhere in this file
};
// Capture buffer of float samples: main() sizes it, callback() fills it, and
// main() hands it to whisper_full_parallel once recording is done.
static std::vector<float> pcmf32;
// NOTE(review): not referenced anywhere else in this file.
static SDL_AudioDeviceID input_dev;
// NOTE(review): not referenced anywhere else in this file.
static std::vector<float> floatBufffer;
// Position (in floats) inside pcmf32; callback() advances it by the size of
// every chunk it receives and main() polls it to detect a full buffer.
static size_t in_floatPos = 0;
/*
Audio capture callback (installed through SDL_AudioSpec::callback in main).
Appends the captured samples to pcmf32 at in_floatPos.

userdata : unused.
stream   : captured audio bytes, interpreted as float samples.
len      : size of stream in bytes.

Only the samples that still fit into pcmf32 are copied, so the buffer is never
written past its end. in_floatPos keeps advancing by the full chunk size even
when samples are dropped, so code that polls it against pcmf32.size() can
still see that the buffer has filled up.
*/
void callback(void *userdata, Uint8 *stream, int len)
{
    (void) userdata;

    if (stream == nullptr || len <= 0)
    {
        return;
    }

    const size_t floatLen = static_cast<size_t>(len) / sizeof(float);
    const size_t capacity = pcmf32.size();

    if (in_floatPos < capacity)
    {
        // Clamp the copy to the space that is left in the buffer.
        const size_t room   = capacity - in_floatPos;
        const size_t toCopy = (floatLen < room) ? floatLen : room;
        SDL_memcpy(&pcmf32[in_floatPos], stream, toCopy * sizeof(float));
    }

    in_floatPos += floatLen;
}
int main() {
// test for whisper
std::cout << "Starting test" << std::endl;
std::cout << "set params" << std::endl;
whisper_params params;
// init whisper
std::cout << "Init whisper" << std::endl;
struct whisper_context_params cparams = whisper_context_default_params();
params.use_gpu = params.use_gpu;
std::cout << "create context" << std::endl;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
std::vector<whisper_token> prompt_tokens;
// init
int lenghtOfRecordingSecond = 60; // 5s
std::cout << "init SDL audio" << std::endl;
SDL_Init(SDL_INIT_AUDIO);
//Default recording spec
SDL_AudioSpec desiredRecordingSpec, receivedRecordingSpec;
SDL_zero(desiredRecordingSpec);
desiredRecordingSpec.freq = WHISPER_SAMPLE_RATE;
desiredRecordingSpec.format = AUDIO_F32; //float32 of little endian byte order
desiredRecordingSpec.channels = 1;
desiredRecordingSpec.samples = 1024;
desiredRecordingSpec.callback = callback;
// select audio device
SDL_AudioDeviceID recordingDeviceId = 0;
//Open recording device
recordingDeviceId = SDL_OpenAudioDevice( SDL_GetAudioDeviceName(0, SDL_TRUE),
SDL_TRUE,
&desiredRecordingSpec,
&receivedRecordingSpec,
SDL_AUDIO_ALLOW_FORMAT_CHANGE
);
if( recordingDeviceId == 0 )
{
std::cout << "ERR - Failed to open recording device :" << SDL_GetError() << std::endl;
return -1;
}
// compte buffer size
int bufferSize = receivedRecordingSpec.freq * lenghtOfRecordingSecond;
pcmf32.resize(bufferSize);
std::fill(pcmf32.begin(), pcmf32.end(), 0);
std::cout << "INFO - pcmf32 size :" << bufferSize << std::endl;
bool is_running = true;
bool bReadyForInference = false;
std::cout << "INFO - Start to record." << std::endl;
while (is_running)
{
SDL_PauseAudioDevice(recordingDeviceId, SDL_FALSE);// unpause device
SDL_LockAudioDevice(recordingDeviceId);
// fill the buffer until full
if (in_floatPos > pcmf32.size())
{
std::cout << "INFO - recording done: " << std::endl;
SDL_PauseAudioDevice(recordingDeviceId, SDL_TRUE); // pause recording
bReadyForInference = true;
is_running = false;
}
SDL_UnlockAudioDevice(recordingDeviceId);
SDL_Delay(100); // pause a bit for loop
// do inference
if (bReadyForInference)
{
std::cout << "INFO - Start to transcript." << std::endl;
// do inference
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.single_segment = false;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
// disable temperature fallback
//wparams.temperature_inc = -1.0f;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0)
{
std::cout << "failed to process audio." << std::endl;
return 1;
}
std::cout << "INFO - end of inference." << std::endl;
// output text
int n_segments = whisper_full_n_segments(ctx);
for (int i=0; i<n_segments; i++)
{
const char *text = whisper_full_get_segment_text(ctx, i);
std::cout << "transcript: " << text;
}
SDL_Delay(5000);
std::cout << "INFO - finished. " << std::endl;
}
}
whisper_free(ctx);
SDL_CloseAudioDevice(recordingDeviceId);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment