niuniulla/sdl_whispercpp.cpp

## sdl_whispercpp.cpp
/*
    A minimal use case of SDL Audio and whisper.cpp
*/
/*
    On launch, the program records up to 1 minute of audio and then
    transcribes it into text on the terminal.
    For more info on whisper.cpp: https://github.com/ggerganov/whisper.cpp/blob/master/CMakeLists.txt
*/

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include <SDL2/SDL.h>

#include "whisper.h"


// Run-time settings with their default values.
// main() takes no command-line arguments, so the defaults below are what is used.
struct whisper_params {
    // Worker thread count, at most 4. hardware_concurrency() is allowed to
    // return 0 when the core count cannot be determined, so clamp to >= 1.
    int32_t n_threads  = std::max(1, std::min(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
    int32_t n_processors = 1;     // passed to whisper_full_parallel()
    int32_t step_ms    = 3000;    // not read anywhere in this file
    int32_t length_ms  = 10000;   // not read anywhere in this file
    int32_t keep_ms    = 200;     // not read anywhere in this file
    int32_t capture_id = -1;      // not read anywhere in this file
    int32_t max_tokens = 32;      // copied into whisper_full_params::max_tokens
    int32_t audio_ctx  = 0;       // copied into whisper_full_params::audio_ctx

    float vad_thold    = 0.6f;    // not read anywhere in this file
    float freq_thold   = 100.0f;  // not read anywhere in this file

    bool speed_up      = false;   // copied into whisper_full_params::speed_up
    bool translate     = false;   // copied into whisper_full_params::translate
    bool no_fallback   = false;   // true -> temperature_inc is forced to 0
    bool print_special = false;   // copied into whisper_full_params::print_special
    bool no_context    = true;    // true -> no prompt tokens are handed to whisper
    bool no_timestamps = false;   // inverted into whisper_full_params::print_timestamps
    bool tinydiarize   = false;   // copied into whisper_full_params::tdrz_enable
    bool save_audio    = false;   // save audio to wav file (not read anywhere in this file)
    bool use_gpu       = true;

    std::string language  = "en";                       // language handed to whisper
    std::string model     = "models/ggml-base.en.bin";  // path of the model file to load
    std::string fname_out;                              // not read anywhere in this file
};

// File-local state shared between the SDL capture callback and main().
namespace {

// Destination buffer for the captured float samples; main() sizes it.
std::vector<float> pcmf32;

// Not referenced anywhere else in this file.
SDL_AudioDeviceID input_dev;

// Not referenced anywhere else in this file.
std::vector<float> floatBufffer;

// Sample offset into pcmf32; advanced by the capture callback.
size_t in_floatPos{0};

} // namespace

/*
    SDL capture callback: appends the chunk delivered in `stream` to pcmf32.

    userdata - unused.
    stream   - captured audio bytes, copied as packed float samples.
    len      - size of `stream` in bytes.

    Only as many samples as still fit in pcmf32 are copied, so the buffer is
    never written past its end; anything beyond that is dropped. in_floatPos
    is nevertheless advanced by the full chunk size, so it reaches or exceeds
    pcmf32.size() once the buffer has been filled.
*/
void callback(void *userdata, Uint8 *stream, int len)
{
    (void) userdata;

    if (len <= 0)
    {
        return;
    }

    const size_t floatLen = static_cast<size_t>(len) / sizeof(float);

    // remaining room in the destination buffer, in samples
    const size_t capacity = pcmf32.size();
    const size_t room     = in_floatPos < capacity ? capacity - in_floatPos : 0;
    const size_t toCopy   = floatLen < room ? floatLen : room;

    if (toCopy > 0)
    {
        SDL_memcpy(&pcmf32[in_floatPos], stream, toCopy * sizeof(float));
    }

    in_floatPos += floatLen;
}


int main() {

    // test for whisper
    std::cout << "Starting test" << std::endl;

    std::cout << "set params" << std::endl;
    whisper_params params;

    // init whisper
    std::cout << "Init whisper" << std::endl;
    struct whisper_context_params cparams = whisper_context_default_params();
    params.use_gpu = params.use_gpu;


    std::cout << "create context" << std::endl;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

    std::vector<whisper_token> prompt_tokens;

    // init
    int lenghtOfRecordingSecond = 60; // 5s

    std::cout << "init SDL audio" << std::endl;
    SDL_Init(SDL_INIT_AUDIO);

    //Default recording spec
    SDL_AudioSpec desiredRecordingSpec, receivedRecordingSpec;
    SDL_zero(desiredRecordingSpec);
    desiredRecordingSpec.freq = WHISPER_SAMPLE_RATE;
    desiredRecordingSpec.format = AUDIO_F32; //float32 of little endian byte order
    desiredRecordingSpec.channels = 1;
    desiredRecordingSpec.samples = 1024;
    desiredRecordingSpec.callback = callback;

    // select audio device
    SDL_AudioDeviceID recordingDeviceId = 0;

    //Open recording device
	recordingDeviceId = SDL_OpenAudioDevice( SDL_GetAudioDeviceName(0, SDL_TRUE),
                                             SDL_TRUE,
                                             &desiredRecordingSpec,
                                             &receivedRecordingSpec,
                                             SDL_AUDIO_ALLOW_FORMAT_CHANGE
                                            );
	if( recordingDeviceId == 0 )
    {
        std::cout << "ERR - Failed to open recording device :" << SDL_GetError() << std::endl;
        return -1;
    }

    // compte buffer size
    int bufferSize = receivedRecordingSpec.freq * lenghtOfRecordingSecond;

    pcmf32.resize(bufferSize);
    std::fill(pcmf32.begin(), pcmf32.end(), 0);

    std::cout << "INFO - pcmf32 size :" << bufferSize << std::endl;


    bool is_running = true;
    bool bReadyForInference = false;

    std::cout << "INFO - Start to record." << std::endl;

    while (is_running)
    {
        SDL_PauseAudioDevice(recordingDeviceId, SDL_FALSE);// unpause device

        SDL_LockAudioDevice(recordingDeviceId);

        // fill the buffer until full
        if (in_floatPos > pcmf32.size())
        {
            std::cout << "INFO - recording done: " << std::endl;

            SDL_PauseAudioDevice(recordingDeviceId, SDL_TRUE); // pause recording

            bReadyForInference = true;

            is_running = false;
        }

        SDL_UnlockAudioDevice(recordingDeviceId);

        SDL_Delay(100); // pause a bit for loop

        // do inference
        if (bReadyForInference)
        {
            std::cout << "INFO - Start to transcript." << std::endl;

            // do inference
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            wparams.print_progress   = false;
            wparams.print_special    = params.print_special;
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
            wparams.single_segment   = false;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;

            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;

            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]

            // disable temperature fallback
            //wparams.temperature_inc  = -1.0f;
            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;

            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();

            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0)
            {
                std::cout << "failed to process audio." << std::endl;
                return 1;
            }
            std::cout << "INFO - end of inference." << std::endl;

            // output text
            int n_segments = whisper_full_n_segments(ctx);
            for (int i=0; i<n_segments; i++)
            {
                const char *text = whisper_full_get_segment_text(ctx, i);
                std::cout << "transcript: " << text;
            }
            SDL_Delay(5000);
            std::cout << "INFO - finished. " << std::endl;
        }
    }

    whisper_free(ctx);
    SDL_CloseAudioDevice(recordingDeviceId);

    return 0;

}
	/*
	A minimal use case of SDL Audio and whisper.cpp
	*/
	/*
	On launch, the program records up to 1 minute of audio and then
	transcribes it into text on the terminal.
	For more info on whisper.cpp: https://github.com/ggerganov/whisper.cpp/blob/master/CMakeLists.txt
	*/

	#include <SDL2/SDL.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <iostream>
	#include <thread>
	#include <vector>

	#include "whisper.h"


	// Run-time settings with their default values.
	// main() takes no command-line arguments, so the defaults below are what is used.
	struct whisper_params {
	    // Worker thread count, at most 4. hardware_concurrency() is allowed to
	    // return 0 when the core count cannot be determined, so clamp to >= 1.
	    int32_t n_threads = std::max(1, std::min(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
	    int32_t n_processors = 1;   // passed to whisper_full_parallel()
	    int32_t step_ms = 3000;     // not read anywhere in this file
	    int32_t length_ms = 10000;  // not read anywhere in this file
	    int32_t keep_ms = 200;      // not read anywhere in this file
	    int32_t capture_id = -1;    // not read anywhere in this file
	    int32_t max_tokens = 32;    // copied into whisper_full_params::max_tokens
	    int32_t audio_ctx = 0;      // copied into whisper_full_params::audio_ctx

	    float vad_thold = 0.6f;     // not read anywhere in this file
	    float freq_thold = 100.0f;  // not read anywhere in this file

	    bool speed_up = false;      // copied into whisper_full_params::speed_up
	    bool translate = false;     // copied into whisper_full_params::translate
	    bool no_fallback = false;   // true -> temperature_inc is forced to 0
	    bool print_special = false; // copied into whisper_full_params::print_special
	    bool no_context = true;     // true -> no prompt tokens are handed to whisper
	    bool no_timestamps = false; // inverted into whisper_full_params::print_timestamps
	    bool tinydiarize = false;   // copied into whisper_full_params::tdrz_enable
	    bool save_audio = false;    // save audio to wav file (not read anywhere in this file)
	    bool use_gpu = true;

	    std::string language = "en";                    // language handed to whisper
	    std::string model = "models/ggml-base.en.bin";  // path of the model file to load
	    std::string fname_out;                          // not read anywhere in this file
	};

	// File-local state shared between the SDL capture callback and main().
	namespace {

	// Destination buffer for the captured float samples; main() sizes it.
	std::vector<float> pcmf32;

	// Not referenced anywhere else in this file.
	SDL_AudioDeviceID input_dev;

	// Not referenced anywhere else in this file.
	std::vector<float> floatBufffer;

	// Sample offset into pcmf32; advanced by the capture callback.
	size_t in_floatPos{0};

	} // namespace

	/*
	    SDL capture callback: appends the chunk delivered in `stream` to pcmf32.

	    userdata - unused.
	    stream   - captured audio bytes, copied as packed float samples.
	    len      - size of `stream` in bytes.

	    The parameters are pointers (void*, Uint8*), as required for the
	    function to be assignable to SDL_AudioSpec::callback.

	    Only as many samples as still fit in pcmf32 are copied, so the buffer is
	    never written past its end; anything beyond that is dropped. in_floatPos
	    is nevertheless advanced by the full chunk size, so it reaches or exceeds
	    pcmf32.size() once the buffer has been filled.
	*/
	void callback(void *userdata, Uint8 *stream, int len)
	{
	    (void) userdata;

	    if (len <= 0)
	    {
	        return;
	    }

	    const size_t floatLen = static_cast<size_t>(len) / sizeof(float);

	    // remaining room in the destination buffer, in samples
	    const size_t capacity = pcmf32.size();
	    const size_t room     = in_floatPos < capacity ? capacity - in_floatPos : 0;
	    const size_t toCopy   = floatLen < room ? floatLen : room;

	    if (toCopy > 0)
	    {
	        SDL_memcpy(&pcmf32[in_floatPos], stream, toCopy * sizeof(float));
	    }

	    in_floatPos += floatLen;
	}


	int main() {

	// test for whisper
	std::cout << "Starting test" << std::endl;

	std::cout << "set params" << std::endl;
	whisper_params params;

	// init whisper
	std::cout << "Init whisper" << std::endl;
	struct whisper_context_params cparams = whisper_context_default_params();
	params.use_gpu = params.use_gpu;


	std::cout << "create context" << std::endl;
	struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

	std::vector<whisper_token> prompt_tokens;

	// init
	int lenghtOfRecordingSecond = 60; // 5s

	std::cout << "init SDL audio" << std::endl;
	SDL_Init(SDL_INIT_AUDIO);

	//Default recording spec
	SDL_AudioSpec desiredRecordingSpec, receivedRecordingSpec;
	SDL_zero(desiredRecordingSpec);
	desiredRecordingSpec.freq = WHISPER_SAMPLE_RATE;
	desiredRecordingSpec.format = AUDIO_F32; //float32 of little endian byte order
	desiredRecordingSpec.channels = 1;
	desiredRecordingSpec.samples = 1024;
	desiredRecordingSpec.callback = callback;

	// select audio device
	SDL_AudioDeviceID recordingDeviceId = 0;

	//Open recording device
	recordingDeviceId = SDL_OpenAudioDevice( SDL_GetAudioDeviceName(0, SDL_TRUE),
	SDL_TRUE,
	&desiredRecordingSpec,
	&receivedRecordingSpec,
	SDL_AUDIO_ALLOW_FORMAT_CHANGE
	);
	if( recordingDeviceId == 0 )
	{
	std::cout << "ERR - Failed to open recording device :" << SDL_GetError() << std::endl;
	return -1;
	}

	// compte buffer size
	int bufferSize = receivedRecordingSpec.freq * lenghtOfRecordingSecond;

	pcmf32.resize(bufferSize);
	std::fill(pcmf32.begin(), pcmf32.end(), 0);

	std::cout << "INFO - pcmf32 size :" << bufferSize << std::endl;


	bool is_running = true;
	bool bReadyForInference = false;

	std::cout << "INFO - Start to record." << std::endl;

	while (is_running)
	{
	SDL_PauseAudioDevice(recordingDeviceId, SDL_FALSE);// unpause device

	SDL_LockAudioDevice(recordingDeviceId);

	// fill the buffer until full
	if (in_floatPos > pcmf32.size())
	{
	std::cout << "INFO - recording done: " << std::endl;

	SDL_PauseAudioDevice(recordingDeviceId, SDL_TRUE); // pause recording

	bReadyForInference = true;

	is_running = false;
	}

	SDL_UnlockAudioDevice(recordingDeviceId);

	SDL_Delay(100); // pause a bit for loop

	// do inference
	if (bReadyForInference)
	{
	std::cout << "INFO - Start to transcript." << std::endl;

	// do inference
	whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
	wparams.print_progress = false;
	wparams.print_special = params.print_special;
	wparams.print_realtime = false;
	wparams.print_timestamps = !params.no_timestamps;
	wparams.translate = params.translate;
	wparams.single_segment = false;
	wparams.max_tokens = params.max_tokens;
	wparams.language = params.language.c_str();
	wparams.n_threads = params.n_threads;

	wparams.audio_ctx = params.audio_ctx;
	wparams.speed_up = params.speed_up;

	wparams.tdrz_enable = params.tinydiarize; // [TDRZ]

	// disable temperature fallback
	//wparams.temperature_inc = -1.0f;
	wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;

	wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
	wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();

	if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0)
	{
	std::cout << "failed to process audio." << std::endl;
	return 1;
	}
	std::cout << "INFO - end of inference." << std::endl;

	// output text
	int n_segments = whisper_full_n_segments(ctx);
	for (int i=0; i<n_segments; i++)
	{
	const char *text = whisper_full_get_segment_text(ctx, i);
	std::cout << "transcript: " << text;
	}
	SDL_Delay(5000);
	std::cout << "INFO - finished. " << std::endl;
	}
	}

	whisper_free(ctx);
	SDL_CloseAudioDevice(recordingDeviceId);

	return 0;

	}