@mmozeiko
Last active March 10, 2024 22:48
simple WASAPI wrapper to play audio samples
#include "win32_wasapi.h"
//
// example
//
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <mfapi.h>
#include <mfidl.h>
#include <mfreadwrite.h>
#pragma comment (lib, "mfplat")
#pragma comment (lib, "mfreadwrite")
typedef struct {
short* samples;
size_t count;
size_t pos;
bool loop;
} Sound;
// loads any supported sound file and resamples it to mono 16-bit audio at the specified sample rate
static Sound S_Load(const WCHAR* path, size_t sampleRate)
{
Sound sound = { NULL, 0, 0, false };
HR(MFStartup(MF_VERSION, MFSTARTUP_LITE));
IMFSourceReader* reader;
HR(MFCreateSourceReaderFromURL(path, NULL, &reader));
// read only first audio stream
HR(IMFSourceReader_SetStreamSelection(reader, (DWORD)MF_SOURCE_READER_ALL_STREAMS, FALSE));
HR(IMFSourceReader_SetStreamSelection(reader, (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, TRUE));
const size_t kChannelCount = 1;
const WAVEFORMATEXTENSIBLE format =
{
.Format =
{
.wFormatTag = WAVE_FORMAT_EXTENSIBLE,
.nChannels = (WORD)kChannelCount,
.nSamplesPerSec = (DWORD)sampleRate,
.nAvgBytesPerSec = (DWORD)(sampleRate * kChannelCount * sizeof(short)),
.nBlockAlign = (WORD)(kChannelCount * sizeof(short)),
.wBitsPerSample = (WORD)(8 * sizeof(short)),
.cbSize = sizeof(format) - sizeof(format.Format),
},
.Samples.wValidBitsPerSample = 8 * sizeof(short),
.dwChannelMask = SPEAKER_FRONT_CENTER,
.SubFormat = MEDIASUBTYPE_PCM,
};
// Media Foundation in Windows 8+ allows the reader to convert output to a format different from the stream's native one
IMFMediaType* type;
HR(MFCreateMediaType(&type));
HR(MFInitMediaTypeFromWaveFormatEx(type, &format.Format, sizeof(format)));
HR(IMFSourceReader_SetCurrentMediaType(reader, (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, NULL, type));
IMFMediaType_Release(type);
size_t used = 0;
size_t capacity = 0;
for (;;)
{
IMFSample* sample;
DWORD flags = 0;
HRESULT hr = IMFSourceReader_ReadSample(reader, (DWORD)MF_SOURCE_READER_FIRST_AUDIO_STREAM, 0, NULL, &flags, NULL, &sample);
if (FAILED(hr))
{
break;
}
if (flags & MF_SOURCE_READERF_ENDOFSTREAM)
{
break;
}
assert(flags == 0);
IMFMediaBuffer* buffer;
HR(IMFSample_ConvertToContiguousBuffer(sample, &buffer));
BYTE* data;
DWORD size;
HR(IMFMediaBuffer_Lock(buffer, &data, NULL, &size));
{
size_t avail = capacity - used;
if (avail < size)
{
// grow in 64KB steps, always enough to fit the incoming block
while (capacity - used < size) capacity += 64 * 1024;
sound.samples = realloc(sound.samples, capacity);
}
memcpy((char*)sound.samples + used, data, size);
used += size;
}
HR(IMFMediaBuffer_Unlock(buffer));
IMFMediaBuffer_Release(buffer);
IMFSample_Release(sample);
}
IMFSourceReader_Release(reader);
HR(MFShutdown());
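// note: pos starts at count, so a non-looping sound stays silent until pos is reset to 0 (a looping sound wraps back immediately)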
sound.pos = sound.count = used / format.Format.nBlockAlign;
return sound;
}
static void S_Update(Sound* sound, size_t samples)
{
sound->pos += samples;
if (sound->loop)
{
sound->pos %= sound->count;
}
else
{
sound->pos = min(sound->pos, sound->count);
}
}
static void S_Mix(float* outSamples, size_t outSampleCount, float volume, const Sound* sound)
{
const short* inSamples = sound->samples;
size_t inPos = sound->pos;
size_t inCount = sound->count;
bool inLoop = sound->loop;
for (size_t i = 0; i < outSampleCount; i++)
{
if (inLoop)
{
if (inPos == inCount)
{
// reset looping sound back to start
inPos = 0;
}
}
else
{
if (inPos >= inCount)
{
// a non-looping sound stops playback when done
break;
}
}
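// convert mono 16-bit sample to float and write it into both channels of the interleaved stereo output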
float sample = inSamples[inPos++] * (1.f / 32768.f);
outSamples[0] += volume * sample;
outSamples[1] += volume * sample;
outSamples += 2;
}
}
int main()
{
WasapiAudio audio;
WA_Start(&audio, 48000, 2, SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT);
size_t sampleRate = audio.bufferFormat->nSamplesPerSec;
size_t bytesPerSample = audio.bufferFormat->nBlockAlign;
// background "music" that will be looping
Sound background = S_Load(L"C:/Windows/Media/Ring10.wav", sampleRate);
background.loop = true;
// simple sound effect, won't be looping
Sound effect = S_Load(L"C:/Windows/Media/tada.wav", sampleRate);
printf("Press SPACE for sound effect, D for small delay, or ESC to stop\n");
HANDLE input = GetStdHandle(STD_INPUT_HANDLE);
for (;;)
{
bool escPressed = false;
bool spacePressed = false;
bool delayPressed = false;
while (WaitForSingleObject(input, 0) == WAIT_OBJECT_0)
{
INPUT_RECORD record;
DWORD read;
if (ReadConsoleInputW(input, &record, 1, &read)
&& read == 1
&& record.EventType == KEY_EVENT
&& record.Event.KeyEvent.bKeyDown)
{
switch (record.Event.KeyEvent.wVirtualKeyCode)
{
case VK_ESCAPE: escPressed = true; break;
case VK_SPACE: spacePressed = true; break;
case 'D': delayPressed = true; break;
}
}
}
if (escPressed)
{
printf("stop!\n");
break;
}
if (spacePressed)
{
printf("tada!\n");
effect.pos = 0;
}
{
WA_LockBuffer(&audio);
// write up to 100 msec of samples into the buffer (or whatever space is available, whichever is smaller)
// this is the maximum amount of time you expect the code to take until the next iteration of the loop
// if the code takes longer, you'll hear a discontinuity as the buffer is filled with silence
size_t writeCount = min(sampleRate/10, audio.sampleCount);
// alternatively you can write as much as "audio.sampleCount" to fully fill the buffer (~1 second)
// then you can try to increase the delay below to 900+ msec and it should still sound fine
//writeCount = audio.sampleCount;
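// e.g. at 48000 Hz, sampleRate/10 = 4800 samples, which is 4800 * 8 = 38400 bytes of interleaved stereo float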
// advance sound playback positions
size_t playCount = audio.playCount;
S_Update(&background, playCount);
S_Update(&effect, playCount);
// initialize output with 0.0f
float* output = audio.sampleBuffer;
memset(output, 0, writeCount * bytesPerSample);
// mix sounds into output
S_Mix(output, writeCount, 0.3f, &background);
S_Mix(output, writeCount, 0.8f, &effect);
WA_UnlockBuffer(&audio, writeCount);
}
if (delayPressed)
{
printf("delay!\n");
Sleep(5 * 17); // large delay for ~5 frames = ~85 msec
//Sleep(900);
}
else
{
// just a small delay, pretend this is your normal rendering code
Sleep(17); // "60" fps
}
printf(".");
fflush(stdout);
}
WA_Stop(&audio);
printf("Done!\n");
return 0;
}
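// build sketch, assuming MSVC and this example saved next to win32_wasapi.h (the filename below is illustrative):
//   cl /W3 /O2 example.c
// required libraries are pulled in by the #pragma comment(lib, ...) directives above and in win32_wasapi.h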
//
// win32_wasapi.h
//
#pragma once
#define COBJMACROS
#define WIN32_LEAN_AND_MEAN
#include <initguid.h>
#include <windows.h>
#include <objbase.h>
#include <uuids.h>
#include <avrt.h>
#include <audioclient.h>
#include <mmdeviceapi.h>
#include <stddef.h>
// "count" means sample count (for example, 1 sample = 2 floats for stereo)
// "offset" or "size" means byte count
typedef struct {
// public part
// describes sampleBuffer format
WAVEFORMATEX* bufferFormat;
// use these values only between LockBuffer/UnlockBuffer calls
void* sampleBuffer; // ringbuffer for interleaved samples, no need to handle wrapping
size_t sampleCount; // how big is buffer in samples
size_t playCount; // how many samples were actually used for playback since previous LockBuffer call
// private
IAudioClient* client;
HANDLE event;
HANDLE thread;
LONG stop;
LONG lock;
BYTE* buffer1;
BYTE* buffer2;
UINT32 outSize; // output buffer size in bytes
UINT32 rbSize; // ringbuffer size, always power of 2
UINT32 bufferUsed; // how many samples are used from buffer
BOOL bufferFirstLock; // true when BufferLock is used at least once
volatile LONG rbReadOffset; // offset to read from buffer
volatile LONG rbLockOffset; // offset up to what buffer is currently being used
volatile LONG rbWriteOffset; // offset up to what buffer is filled
} WasapiAudio;
//
// interface
//
// pass 0 for rate/count/mask to get default format of output device (use audio->bufferFormat)
// channelMask is bitmask of values from table here: https://learn.microsoft.com/en-us/windows/win32/api/mmreg/ns-mmreg-waveformatextensible#remarks
static void WA_Start(WasapiAudio* audio, size_t sampleRate, size_t channelCount, DWORD channelMask);
// stops the playback and releases resources
static void WA_Stop(WasapiAudio* audio);
// once locked, you're allowed to write samples into the ringbuffer
// use only the sampleBuffer, sampleCount and playCount members
static void WA_LockBuffer(WasapiAudio* audio);
static void WA_UnlockBuffer(WasapiAudio* audio, size_t writtenCount);
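//
// minimal usage sketch (mirrors the example at the top of this gist; error handling and mixing omitted):
//
//   WasapiAudio audio;
//   WA_Start(&audio, 48000, 2, SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT);
//   for (;;) // once per frame
//   {
//       WA_LockBuffer(&audio);
//       size_t writeCount = min(audio.bufferFormat->nSamplesPerSec / 10, audio.sampleCount);
//       float* output = audio.sampleBuffer;
//       memset(output, 0, writeCount * audio.bufferFormat->nBlockAlign);
//       // ... mix samples into output here, advance sounds by audio.playCount ...
//       WA_UnlockBuffer(&audio, writeCount);
//   }
//   WA_Stop(&audio);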
//
// implementation
//
// TODO: what's missing here:
// * proper error handling, like when no audio device is present (currently asserts)
// * automatically switch to new device when default audio device changes (IMMNotificationClient)
#pragma comment (lib, "avrt")
#pragma comment (lib, "ole32")
#pragma comment (lib, "onecore")
#include <assert.h>
#define HR(stmt) do { HRESULT _hr = stmt; assert(SUCCEEDED(_hr)); } while (0)
// why are these missing from windows libs? :(
DEFINE_GUID(CLSID_MMDeviceEnumerator, 0xbcde0395, 0xe52f, 0x467c, 0x8e, 0x3d, 0xc4, 0x57, 0x92, 0x91, 0x69, 0x2e);
DEFINE_GUID(IID_IMMDeviceEnumerator, 0xa95664d2, 0x9614, 0x4f35, 0xa7, 0x46, 0xde, 0x8d, 0xb6, 0x36, 0x17, 0xe6);
DEFINE_GUID(IID_IAudioClient, 0x1cb9ad4c, 0xdbfa, 0x4c32, 0xb1, 0x78, 0xc2, 0xf5, 0x68, 0xa7, 0x03, 0xb2);
DEFINE_GUID(IID_IAudioClient3, 0x7ed4ee07, 0x8e67, 0x4cd4, 0x8c, 0x1a, 0x2b, 0x7a, 0x59, 0x87, 0xad, 0x42);
DEFINE_GUID(IID_IAudioRenderClient, 0xf294acfc, 0x3146, 0x4483, 0xa7, 0xbf, 0xad, 0xdc, 0xa7, 0xc2, 0x60, 0xe2);
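// WA__Lock/WA__Unlock implement a tiny mutex on the "lock" flag: InterlockedCompareExchange
// tries to flip it FALSE -> TRUE, and on contention WaitOnAddress puts the thread to sleep
// until WakeByAddressSingle signals that the value may have changed (WaitOnAddress requires Windows 8+)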
static void WA__Lock(WasapiAudio* audio)
{
// loop while audio->lock != FALSE
while (InterlockedCompareExchange(&audio->lock, TRUE, FALSE) != FALSE)
{
// wait while audio->lock == locked
LONG locked = FALSE;
WaitOnAddress(&audio->lock, &locked, sizeof(locked), INFINITE);
}
// now audio->lock == TRUE
}
static void WA__Unlock(WasapiAudio* audio)
{
// audio->lock = FALSE
InterlockedExchange(&audio->lock, FALSE);
WakeByAddressSingle(&audio->lock);
}
static DWORD CALLBACK WA__AudioThread(LPVOID arg)
{
WasapiAudio* audio = arg;
DWORD task = 0;
HANDLE handle = AvSetMmThreadCharacteristicsW(L"Pro Audio", &task);
assert(handle);
IAudioClient* client = audio->client;
IAudioRenderClient* playback;
HR(IAudioClient_GetService(client, &IID_IAudioRenderClient, (LPVOID*)&playback));
// get audio buffer size in samples
UINT32 bufferSamples;
HR(IAudioClient_GetBufferSize(client, &bufferSamples));
// start the playback
HR(IAudioClient_Start(client));
UINT32 bytesPerSample = audio->bufferFormat->nBlockAlign;
UINT32 rbMask = audio->rbSize - 1;
BYTE* input = audio->buffer1;
while (WaitForSingleObject(audio->event, INFINITE) == WAIT_OBJECT_0)
{
if (InterlockedExchange(&audio->stop, FALSE))
{
break;
}
UINT32 paddingSamples;
HR(IAudioClient_GetCurrentPadding(client, &paddingSamples));
// get output buffer from WASAPI
BYTE* output;
UINT32 maxOutputSamples = bufferSamples - paddingSamples;
HR(IAudioRenderClient_GetBuffer(playback, maxOutputSamples, &output));
WA__Lock(audio);
UINT32 readOffset = audio->rbReadOffset;
UINT32 writeOffset = audio->rbWriteOffset;
// how many bytes available to read from ringbuffer
UINT32 availableSize = writeOffset - readOffset;
// how many samples available
UINT32 availableSamples = availableSize / bytesPerSample;
// will use up to max that's possible to output
UINT32 useSamples = min(availableSamples, maxOutputSamples);
// how many bytes to use
UINT32 useSize = useSamples * bytesPerSample;
// lock range [read, lock) that memcpy will read from below
audio->rbLockOffset = readOffset + useSize;
// submit whatever samples are available; if there are none at all, submit a full buffer of silence
UINT32 submitCount = useSamples ? useSamples : maxOutputSamples;
DWORD flags = useSamples ? 0 : AUDCLNT_BUFFERFLAGS_SILENT;
// remember how many samples are submitted
audio->bufferUsed += submitCount;
WA__Unlock(audio);
// copy bytes to output
// safe to do it outside WA__Lock/Unlock, because nobody will overwrite [read, lock) interval
memcpy(output, input + (readOffset & rbMask), useSize);
// advance read offset up to lock position, allows writing to [read, lock) interval
InterlockedAdd(&audio->rbReadOffset, useSize);
// submit output buffer to WASAPI
HR(IAudioRenderClient_ReleaseBuffer(playback, submitCount, flags));
}
// stop the playback
HR(IAudioClient_Stop(client));
IAudioRenderClient_Release(playback);
AvRevertMmThreadCharacteristics(handle);
return 0;
}
static DWORD RoundUpPow2(DWORD value)
{
unsigned long index;
_BitScanReverse(&index, value - 1);
assert(index < 31);
return 1U << (index + 1);
}
static void WA_Start(WasapiAudio* audio, size_t sampleRate, size_t channelCount, DWORD channelMask)
{
// initialize COM
HR(CoInitializeEx(NULL, COINIT_APARTMENTTHREADED));
// create enumerator to get audio device
IMMDeviceEnumerator* enumerator;
HR(CoCreateInstance(&CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL, &IID_IMMDeviceEnumerator, (LPVOID*)&enumerator));
// get default playback device
IMMDevice* device;
HR(IMMDeviceEnumerator_GetDefaultAudioEndpoint(enumerator, eRender, eConsole, &device));
IMMDeviceEnumerator_Release(enumerator);
// create audio client for device
HR(IMMDevice_Activate(device, &IID_IAudioClient, CLSCTX_ALL, NULL, (LPVOID*)&audio->client));
IMMDevice_Release(device);
WAVEFORMATEXTENSIBLE formatEx =
{
.Format =
{
.wFormatTag = WAVE_FORMAT_EXTENSIBLE,
.nChannels = (WORD)channelCount,
.nSamplesPerSec = (DWORD)sampleRate,
.nAvgBytesPerSec = (DWORD)(sampleRate * channelCount * sizeof(float)),
.nBlockAlign = (WORD)(channelCount * sizeof(float)),
.wBitsPerSample = (WORD)(8 * sizeof(float)),
.cbSize = sizeof(formatEx) - sizeof(formatEx.Format),
},
.Samples.wValidBitsPerSample = 8 * sizeof(float),
.dwChannelMask = channelMask,
.SubFormat = MEDIASUBTYPE_IEEE_FLOAT,
};
WAVEFORMATEX* wfx;
if (sampleRate == 0 || channelCount == 0 || channelMask == 0)
{
// use native mixing format
HR(IAudioClient_GetMixFormat(audio->client, &wfx));
audio->bufferFormat = wfx;
}
else
{
// will use our format
wfx = &formatEx.Format;
}
BOOL clientInitialized = FALSE;
// try to initialize client with newer functionality in Windows 10, no AUTOCONVERTPCM allowed
IAudioClient3* client3;
if (SUCCEEDED(IAudioClient_QueryInterface(audio->client, &IID_IAudioClient3, (LPVOID*)&client3)))
{
// minimum buffer size will typically be 480 samples (10msec @ 48khz)
// but it can be 128 samples (2.66 msec @ 48khz) if driver is properly installed
// see bullet-point instructions here: https://learn.microsoft.com/en-us/windows-hardware/drivers/audio/low-latency-audio#measurement-tools
UINT32 defaultPeriodSamples, fundamentalPeriodSamples, minPeriodSamples, maxPeriodSamples;
HR(IAudioClient3_GetSharedModeEnginePeriod(client3, wfx, &defaultPeriodSamples, &fundamentalPeriodSamples, &minPeriodSamples, &maxPeriodSamples));
const DWORD flags = AUDCLNT_STREAMFLAGS_EVENTCALLBACK;
if (SUCCEEDED(IAudioClient3_InitializeSharedAudioStream(client3, flags, minPeriodSamples, wfx, NULL)))
{
clientInitialized = TRUE;
}
IAudioClient3_Release(client3);
}
if (!clientInitialized)
{
// get duration for shared-mode streams, this will typically be 480 samples (10msec @ 48khz)
REFERENCE_TIME duration;
HR(IAudioClient_GetDevicePeriod(audio->client, &duration, NULL));
// initialize audio playback
const DWORD flags = AUDCLNT_STREAMFLAGS_EVENTCALLBACK | AUDCLNT_STREAMFLAGS_AUTOCONVERTPCM | AUDCLNT_STREAMFLAGS_SRC_DEFAULT_QUALITY;
HR(IAudioClient_Initialize(audio->client, AUDCLNT_SHAREMODE_SHARED, flags, duration, 0, wfx, NULL));
}
if (wfx == &formatEx.Format)
{
HR(IAudioClient_GetMixFormat(audio->client, &wfx));
audio->bufferFormat = wfx;
}
UINT32 bufferSamples;
HR(IAudioClient_GetBufferSize(audio->client, &bufferSamples));
audio->outSize = bufferSamples * audio->bufferFormat->nBlockAlign;
// setup event handle to wait on
audio->event = CreateEventW(NULL, FALSE, FALSE, NULL);
HR(IAudioClient_SetEventHandle(audio->client, audio->event));
// use 64KB or 1 second of audio, whichever is larger, and round up to a power of 2 for the ringbuffer size
DWORD rbSize = RoundUpPow2(max(64 * 1024, audio->bufferFormat->nAvgBytesPerSec));
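// e.g. for 48000 Hz stereo float output nAvgBytesPerSec is 384000, which rounds up to 524288 bytes (512 KB)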
// reserve virtual address placeholder for 2x size for magic ringbuffer
char* placeholder1 = VirtualAlloc2(NULL, NULL, 2 * rbSize, MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS, NULL, 0);
char* placeholder2 = placeholder1 + rbSize;
assert(placeholder1);
// split allocated address space in half
BOOL ok = VirtualFree(placeholder1, rbSize, MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER);
assert(ok);
// create page-file backed section for buffer
HANDLE section = CreateFileMappingW(INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, rbSize, NULL);
assert(section);
// map same section into both addresses
void* view1 = MapViewOfFile3(section, NULL, placeholder1, 0, rbSize, MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, NULL, 0);
void* view2 = MapViewOfFile3(section, NULL, placeholder2, 0, rbSize, MEM_REPLACE_PLACEHOLDER, PAGE_READWRITE, NULL, 0);
assert(view1 && view2);
audio->sampleBuffer = NULL;
audio->sampleCount = 0;
audio->playCount = 0;
audio->buffer1 = view1;
audio->buffer2 = view2;
audio->rbSize = rbSize;
audio->bufferUsed = 0;
audio->bufferFirstLock = TRUE;
audio->rbReadOffset = 0;
audio->rbLockOffset = 0;
audio->rbWriteOffset = 0;
InterlockedExchange(&audio->stop, FALSE);
InterlockedExchange(&audio->lock, FALSE);
audio->thread = CreateThread(NULL, 0, &WA__AudioThread, audio, 0, NULL);
// this is ok, actual memory will be freed only when it is unmapped
VirtualFree(placeholder1, 0, MEM_RELEASE);
VirtualFree(placeholder2, 0, MEM_RELEASE);
CloseHandle(section);
}
static void WA_Stop(WasapiAudio* audio)
{
// notify thread to stop
InterlockedExchange(&audio->stop, TRUE);
SetEvent(audio->event);
// wait for thread to finish
WaitForSingleObject(audio->thread, INFINITE);
CloseHandle(audio->thread);
CloseHandle(audio->event);
// release ringbuffer
UnmapViewOfFileEx(audio->buffer1, 0);
UnmapViewOfFileEx(audio->buffer2, 0);
// release audio client
CoTaskMemFree(audio->bufferFormat);
IAudioClient_Release(audio->client);
// done with COM
CoUninitialize();
}
static void WA_LockBuffer(WasapiAudio* audio)
{
UINT32 bytesPerSample = audio->bufferFormat->nBlockAlign;
UINT32 rbSize = audio->rbSize;
UINT32 outSize = audio->outSize;
WA__Lock(audio);
UINT32 readOffset = audio->rbReadOffset;
UINT32 lockOffset = audio->rbLockOffset;
UINT32 writeOffset = audio->rbWriteOffset;
// how many bytes are used in buffer by reader = [read, lock) range
UINT32 usedSize = lockOffset - readOffset;
// make sure there are samples available for one wasapi buffer submission
// so in case audio thread needs samples before UnlockBuffer is called, it can get some
if (usedSize < outSize)
{
// how many bytes available in current buffer = [read, write) range
UINT32 availSize = writeOffset - readOffset;
// if [read, lock) is smaller than outSize buffer, then increase lock to [read, read+outSize) range
usedSize = min(outSize, availSize);
audio->rbLockOffset = lockOffset = readOffset + usedSize;
}
// how many bytes can be written to buffer
UINT32 writeSize = rbSize - usedSize;
// reset write marker to beginning of lock offset (can start writing there)
audio->rbWriteOffset = lockOffset;
// reset play sample count, use 0 for playCount when LockBuffer is called first time
audio->playCount = audio->bufferFirstLock ? 0 : audio->bufferUsed;
audio->bufferFirstLock = FALSE;
audio->bufferUsed = 0;
WA__Unlock(audio);
// buffer offset/size where to write
// safe to write in [write, read) range, because reading happen in [read, lock) range (lock==write)
audio->sampleBuffer = audio->buffer1 + (lockOffset & (rbSize - 1));
audio->sampleCount = writeSize / bytesPerSample;
}
static void WA_UnlockBuffer(WasapiAudio* audio, size_t writtenSamples)
{
UINT32 bytesPerSample = audio->bufferFormat->nBlockAlign;
size_t writeSize = writtenSamples * bytesPerSample;
// advance write offset to allow reading new samples
InterlockedAdd(&audio->rbWriteOffset, (LONG)writeSize);
}