@MikuAuahDark
Last active June 13, 2024 04:03
NPad Audio Video Decode library example.
// clang++ -Inav/include -Lnav/lib -std=c++17 program.cpp lodepng.cpp -lnav
// Get lodepng.cpp from https://github.com/lvandeve/lodepng
// See https://github.com/MikuAuahDark/nav for more information about NAV.
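// The program takes a mode, an input media file and (for the audio/video modes) an output
// target, as parsed in main() below. Assuming the binary is named "program" (a placeholder,
// as are the file names), invocations would look roughly like:
//   ./program enum  input.mp4              (list all streams)
//   ./program audio input.mp4 output.wav   (decode the first audio stream to a WAV file)
//   ./program video input.mp4 frames/      (decode the first video stream to numbered PNGs)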
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <stdexcept>
#include <string>
#include <sstream>
#include <vector>
#include <type_traits>
#include "lodepng.h"
#include "nav/nav.h"
constexpr size_t MINUS_1 = -1;
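// RAII guard that closes a populated nav_input when it goes out of scope, so the early
// returns in main() don't leave the underlying input open.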
struct NavInputGuard
{
NavInputGuard(nav_input &in)
: input(&in)
{}
~NavInputGuard()
{
if (input->close)
input->closef();
}
nav_input *input;
};
template<typename T>
struct binary_data
{
static_assert(std::is_integral<T>::value, "binary_data requires an integral type");
using remove_signed = std::make_unsigned_t<T>;
static constexpr size_t size = sizeof(T);
binary_data(T v): value(v) {}
binary_data(const binary_data<T> &) = default;
binary_data(binary_data<T> &&) = default;
std::array<uint8_t, size> bytes() const
{
std::array<uint8_t, size> b {};
remove_signed temp = (remove_signed) value;
for (size_t i = 0; i < size && temp; i++)
{
b[i] = temp & 0xFF;
temp >>= 8;
}
return b;
}
T value;
};
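// binary_data, together with the operator<< overload further below, writes an integer to a
// stream in little-endian byte order, which is what the RIFF/WAV header fields written in
// main() require.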
static std::vector<std::string> convertArgs(int argc, char *argv[])
{
std::vector<std::string> result;
for (int i = 0; i < argc; i++)
result.emplace_back(argv[i]);
return result;
}
static void closeInput(nav_input *input)
{
input->closef();
}
static void usage(const std::vector<std::string> &args, bool hasout)
{
std::cout << "Usage: " << args[0] << " <audio|video|enum> <input file>";
if (hasout)
std::cout << " <output file/dir>";
else
std::cout << " [output file/dir]";
std::cout << std::endl;
}
static std::string parseAudioFormat(nav_audioformat fmt)
{
std::stringstream ss;
if (NAV_AUDIOFORMAT_ISFLOAT(fmt))
ss << "pcm_f" << NAV_AUDIOFORMAT_BITSIZE(fmt) << (NAV_AUDIOFORMAT_ISLITTLEENDIAN(fmt) ? "le" : "be");
else
ss << "pcm_" << (NAV_AUDIOFORMAT_ISUNSIGNED(fmt) ? "u" : "s") << NAV_AUDIOFORMAT_BITSIZE(fmt) << (NAV_AUDIOFORMAT_ISLITTLEENDIAN(fmt) ? "le" : "be");
return ss.str();
}
static const char *pixelFormatToString(nav_pixelformat pixfmt)
{
switch (pixfmt)
{
case NAV_PIXELFORMAT_RGB8:
return "rgb8";
case NAV_PIXELFORMAT_YUV420:
return "yuv420p";
case NAV_PIXELFORMAT_YUV444:
return "yuv444p";
case NAV_PIXELFORMAT_NV12:
return "nv12";
default:
return "unknown";
}
}
// https://learn.microsoft.com/en-us/windows/win32/medfound/recommended-8-bit-yuv-formats-for-video-rendering#converting-420-yuv-to-422-yuv
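// The four cases below blend the four nearest chroma samples (a, b, c, d) with bilinear
// weights 9:3:3:1 (divided by 16), choosing the weighting that matches the luma pixel's
// position within its 2x2 block; this appears to mirror libwebp's "fancy upsampling".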
static uint8_t simplewebp__do_uv_fancy_upsampling(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t x, uint8_t y)
{
switch (y * 2 + x)
{
case 0:
return (9u*a + 3u*b + 3u*c + d + 8u) / 16u;
case 1:
return (3u*a + 9u*b + c + 3u*d + 8u) / 16u;
case 2:
return (3u*a + b + 9u*c + 3u*d + 8u) / 16u;
case 3:
return (a + 3u*b + 3u*c + 9u*d + 8u) / 16u;
default:
return 0;
}
}
static int simplewebp__multhi(int v, int coeff)
{
return (v * coeff) >> 8;
}
static uint8_t simplewebp__yuv2rgb_clip8(int v)
{
return ((v & ~16383) == 0) ? ((uint8_t) (v >> 6)) : (v < 0) ? 0 : 255;
}
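// Fixed-point YUV -> RGB conversion: multhi shifts by 8 and clip8 by a further 6, so the
// coefficients are Q14. They appear to be the usual BT.601 limited-range constants as used
// by libwebp, e.g. 19077/16384 ~= 1.164 (= 255/219) and 26149/16384 ~= 1.596.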
static void simplewebp__yuv2rgb_plain(uint8_t y, uint8_t u, uint8_t v, uint8_t *rgb)
{
int yhi = simplewebp__multhi(y, 19077);
rgb[0] = simplewebp__yuv2rgb_clip8(yhi + simplewebp__multhi(v, 26149) - 14234);
rgb[1] = simplewebp__yuv2rgb_clip8(yhi - simplewebp__multhi(u, 6419) - simplewebp__multhi(v, 13320) + 8708);
rgb[2] = simplewebp__yuv2rgb_clip8(yhi + simplewebp__multhi(u, 33050) - 17685);
}
template<typename T>
constexpr const T &clamp(const T &value, const T &min, const T &max)
{
return std::min<T>(std::max<T>(value, min), max);
}
static std::vector<uint8_t> convertPixelFormat(nav_pixelformat pixfmt, uint32_t width, uint32_t height, const uint8_t *buf)
{
if (pixfmt != NAV_PIXELFORMAT_UNKNOWN)
{
size_t dimension = ((size_t) width) * height;
if (pixfmt == NAV_PIXELFORMAT_RGB8)
return std::vector<uint8_t>(buf, buf + dimension * 3);
std::vector<uint8_t> result(dimension * 3);
const uint8_t *uv = buf + dimension;
uint8_t *dest = result.data();
if (pixfmt == NAV_PIXELFORMAT_YUV420 || pixfmt == NAV_PIXELFORMAT_NV12)
{
size_t uvw = ((size_t) width + 1) / 2;
size_t uvh = ((size_t) height + 1) / 2;
for (size_t i = 0; i < dimension; i++)
{
size_t xp = i % width;
size_t yp = i / width;
uint8_t y = buf[i];
uint8_t ut[4], vt[4];
size_t xpp = (xp + 1) / 2;
size_t ypp = (yp + 1) / 2;
xpp = xpp == 0 ? 0 : (xpp - 1); // NOTE: Can't use std::max(xpp - 1, 0) here because size_t is unsigned and xpp - 1 would underflow.
ypp = ypp == 0 ? 0 : (ypp - 1);
size_t xppm = std::min(xpp + 1, uvw - 1);
size_t yppm = std::min(ypp + 1, uvh - 1);
if (pixfmt == NAV_PIXELFORMAT_YUV420)
{
// UV planar
size_t udim = uvw * uvh;
ut[0] = uv[ypp * uvw + xpp]; // a
ut[1] = uv[ypp * uvw + xppm]; // b
ut[2] = uv[yppm * uvw + xpp]; // c
ut[3] = uv[yppm * uvw + xppm]; // d
vt[0] = uv[udim + ypp * uvw + xpp]; // a
vt[1] = uv[udim + ypp * uvw + xppm]; // b
vt[2] = uv[udim + yppm * uvw + xpp]; // c
vt[3] = uv[udim + yppm * uvw + xppm]; // d
}
else
{
// UV interleaved (NV12)
ut[0] = uv[(ypp * uvw + xpp) * 2]; // a
ut[1] = uv[(ypp * uvw + xppm) * 2]; // b
ut[2] = uv[(yppm * uvw + xpp) * 2]; // c
ut[3] = uv[(yppm * uvw + xppm) * 2]; // d
vt[0] = uv[1 + (ypp * uvw + xpp) * 2]; // a
vt[1] = uv[1 + (ypp * uvw + xppm) * 2]; // b
vt[2] = uv[1 + (yppm * uvw + xpp) * 2]; // c
vt[3] = uv[1 + (yppm * uvw + xppm) * 2]; // d
}
uint8_t u = simplewebp__do_uv_fancy_upsampling(ut[0], ut[1], ut[2], ut[3], (~xp) & 1, (~yp) & 1);
uint8_t v = simplewebp__do_uv_fancy_upsampling(vt[0], vt[1], vt[2], vt[3], (~xp) & 1, (~yp) & 1);
simplewebp__yuv2rgb_plain(y, u, v, dest + i * 3);
}
return result;
}
else if (pixfmt == NAV_PIXELFORMAT_YUV444)
{
for (size_t i = 0; i < dimension; i++)
{
size_t xp = i % width;
size_t yp = i / width;
uint8_t y = buf[i];
uint8_t u = buf[i + dimension];
uint8_t v = buf[i + dimension * 2];
simplewebp__yuv2rgb_plain(y, u, v, dest + i * 3);
}
return result;
}
}
return std::vector<uint8_t>();
}
static std::string joinPath(const std::string &p1, const std::string &p2)
{
std::string newp1 = p1;
std::transform(p1.begin(), p1.end(), newp1.begin(), [](char c) { return c == '\\' ? '/' : c; });
return newp1.back() == '/' ? (newp1 + p2) : (newp1 + "/" + p2);
}
template<typename T>
std::ostream &operator<<(std::ostream &ostr, const binary_data<T> &bd)
{
const auto array = bd.bytes();
return ostr.write((const char*) array.data(), array.size());
}
int main(int argc, char *argv[])
{
using UniqueNAV = std::unique_ptr<nav_t, decltype(&nav_close)>;
std::vector<std::string> args = convertArgs(argc, argv);
std::ios_base::sync_with_stdio(false);
if (args.size() < 3)
{
usage(args, false);
return 1;
}
int mode = -1;
if (args[1] == "audio" || args[1] == "a")
mode = 1;
else if (args[1] == "video" || args[1] == "v")
mode = 2;
else if (args[1] == "enum" || args[1] == "e")
mode = 0;
if (mode == -1)
{
usage(args, false);
return 1;
}
else if (mode > 0 && args.size() < 4)
{
usage(args, true);
return 1;
}
nav_input mediaInput = {}; // Zero-initialize so the guard sees a null close pointer if population fails.
NavInputGuard _g(mediaInput);
if (!nav_input_populate_from_file(&mediaInput, args[2].c_str()))
{
std::cerr << "nav_input_populate_from_file(): " << nav_error() << std::endl;
return 1;
}
UniqueNAV navInst(nav_open(&mediaInput, args[2].c_str()), nav_close);
if (!navInst)
{
std::cerr << "nav_open(): " << nav_error() << std::endl;
return 1;
}
size_t nstreams = nav_nstreams(navInst.get());
size_t streamIndex = MINUS_1;
nav_audioformat audioFormat = 0;
nav_pixelformat pixelFormat = NAV_PIXELFORMAT_UNKNOWN;
uint32_t width = 0, height = 0, sampleRate = 0, nchannels = 0;
if (mode == 0)
{
// Enumerate only
std::cout << "List of streams" << std::endl;
for (size_t i = 0; i < nstreams; i++)
{
nav_streaminfo_t *sinfo = nav_stream_info(navInst.get(), i);
switch (nav_streaminfo_type(sinfo))
{
case NAV_STREAMTYPE_AUDIO:
{
std::cout << i << " audio stream ";
std::cout << nav_audio_sample_rate(sinfo) << "Hz ";
std::cout << nav_audio_nchannels(sinfo) << "ch ";
std::cout << parseAudioFormat(nav_audio_format(sinfo)) << std::endl;
break;
}
case NAV_STREAMTYPE_VIDEO:
{
uint32_t w, h;
nav_video_dimensions(sinfo, &w, &h);
std::cout << i << " video stream " << w << "x" << h;
std::cout << " " << nav_video_fps(sinfo) << " FPS ";
std::cout << pixelFormatToString(nav_video_pixel_format(sinfo)) << std::endl;
break;
}
default:
{
std::cout << i << " unknown stream" << std::endl;
break;
}
}
}
return 0;
}
else
{
for (size_t i = 0; i < nstreams; i++)
{
nav_streaminfo_t *sinfo = nav_stream_info(navInst.get(), i);
nav_streamtype type = nav_streaminfo_type(sinfo);
if (streamIndex == MINUS_1)
{
if (mode == 1 && type == NAV_STREAMTYPE_AUDIO)
{
streamIndex = i;
audioFormat = nav_audio_format(sinfo);
sampleRate = nav_audio_sample_rate(sinfo);
nchannels = nav_audio_nchannels(sinfo);
}
else if (mode == 2 && type == NAV_STREAMTYPE_VIDEO)
{
streamIndex = i;
pixelFormat = nav_video_pixel_format(sinfo);
nav_video_dimensions(sinfo, &width, &height);
}
else
nav_stream_enable(navInst.get(), i, false);
}
else
nav_stream_enable(navInst.get(), i, false);
}
if (streamIndex == MINUS_1)
{
std::cerr << "Cannot find " << (mode == 1 ? "audio" : "video") << " stream in file." << std::endl;
return 1;
}
}
std::list<std::vector<uint8_t>> audioSamples;
size_t totalAudioSamples = 0;
size_t frameCount = 0;
while (true)
{
using UniqueNAVFrame = std::unique_ptr<nav_frame_t, decltype(&nav_frame_free)>;
UniqueNAVFrame frame(nav_read(navInst.get()), nav_frame_free);
if (!frame)
{
const char *err = nav_error();
if (err)
{
std::cerr << "Cannot read stream: " << err << std::endl;
return 1;
}
break;
}
if (nav_frame_streamindex(frame.get()) == streamIndex)
{
if (mode == 1)
{
// Audio frame
const uint8_t *buf = (const uint8_t*) nav_frame_buffer(frame.get());
size_t size = nav_frame_size(frame.get());
audioSamples.emplace_back(buf, buf + size);
totalAudioSamples += size;
std::cout << "Total sample " << totalAudioSamples << std::endl;
if (sizeof(size_t) > 4 && totalAudioSamples > UINT32_MAX)
{
std::cerr << "Cannot write file larger than 4GB for now" << std::endl;
return 1;
}
}
else if (mode == 2)
{
// Video frame
const uint8_t *buf = (const uint8_t*) nav_frame_buffer(frame.get());
try
{
std::stringstream ss;
ss << ++frameCount << "-" << nav_frame_tell(frame.get()) << ".png";
std::string path = joinPath(args[3], ss.str());
std::vector<uint8_t> rgb = convertPixelFormat(pixelFormat, width, height, buf);
unsigned lodepngerr = lodepng::encode(path.c_str(), rgb, width, height, LCT_RGB);
if (lodepngerr)
throw std::runtime_error(lodepng_error_text(lodepngerr));
std::cout << "Frame " << frameCount << std::endl;
}
catch (const std::exception &e)
{
std::cerr << "Cannot save: " << e.what() << std::endl;
return 1;
}
}
}
}
if (mode == 1)
{
// Encode to WAV
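// The output is a canonical 44-byte WAVE header followed by the raw sample data:
//   "RIFF" <riff size> "WAVE" "fmt " <16> <format tag> <nchannels> <sample rate>
//   <byte rate> <block align> <bits per sample> "data" <data size> <samples...>
// where <riff size> = 36 + <data size>, and format tag 1 means integer PCM, 3 means IEEE float.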
uint32_t size =
12 /* "WAVE" + "fmt " + <fmt chunk size> */
+ 2 /* format tag */
+ 2 /* nchannels */
+ 4 /* sample rate */
+ 4 /* byte rate = sample rate * block align */
+ 2 /* block align = nchannels * bps / 8 */
+ 2 /* bps */
+ 8 /* "data" + <data size> */
+ (uint32_t) totalAudioSamples;
uint32_t sampleSize = nchannels * ((NAV_AUDIOFORMAT_BITSIZE(audioFormat) + 7) / 8);
uint32_t smp = sampleRate * sampleSize;
try
{
std::ofstream f(args[3], std::ios_base::out | std::ios_base::binary);
f.exceptions(std::ios_base::failbit | std::ios_base::badbit); // Make I/O failures throw so the catch below reports them.
f << "RIFF" << binary_data<uint32_t>(size)
<< "WAVEfmt " << binary_data<uint32_t>(16)
<< binary_data<uint16_t>(NAV_AUDIOFORMAT_ISFLOAT(audioFormat) ? 3 : 1)
<< binary_data<uint16_t>(nchannels)
<< binary_data<uint32_t>(sampleRate)
<< binary_data<uint32_t>(smp)
<< binary_data<uint16_t>((uint16_t) sampleSize)
<< binary_data<uint16_t>(NAV_AUDIOFORMAT_BITSIZE(audioFormat))
<< "data"
<< binary_data<uint32_t>((uint32_t) totalAudioSamples);
for (const std::vector<uint8_t> &samples: audioSamples)
f.write((const char*) samples.data(), samples.size());
}
catch (const std::exception &e)
{
std::cerr << "Cannot save WAV: " << e.what() << std::endl;
return 1;
}
}
return 0;
}