Skip to content

Instantly share code, notes, and snippets.

@MaPePeR
Created July 31, 2020 19:58
Show Gist options
  • Save MaPePeR/4bf437e5057f8b9f5a88f2a7b301ef5c to your computer and use it in GitHub Desktop.
Save MaPePeR/4bf437e5057f8b9f5a88f2a7b301ef5c to your computer and use it in GitHub Desktop.
ffmpeg filter to trim silence from beginning and end of an audio file.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index 0dc74f8b70..18ac1d989f 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -133,6 +133,7 @@ OBJS-$(CONFIG_SIDECHAINCOMPRESS_FILTER) += af_sidechaincompress.o
OBJS-$(CONFIG_SIDECHAINGATE_FILTER) += af_agate.o
OBJS-$(CONFIG_SILENCEDETECT_FILTER) += af_silencedetect.o
OBJS-$(CONFIG_SILENCEREMOVE_FILTER) += af_silenceremove.o
+OBJS-$(CONFIG_SILENCETRIM_FILTER) += af_silenceremove.o
OBJS-$(CONFIG_SOFALIZER_FILTER) += af_sofalizer.o
OBJS-$(CONFIG_STEREOTOOLS_FILTER) += af_stereotools.o
OBJS-$(CONFIG_STEREOWIDEN_FILTER) += af_stereowiden.o
diff --git a/libavfilter/af_silenceremove.c b/libavfilter/af_silenceremove.c
index 7dd8c5a1d8..0de1a956b4 100644
--- a/libavfilter/af_silenceremove.c
+++ b/libavfilter/af_silenceremove.c
@@ -25,6 +25,7 @@
#include "libavutil/opt.h"
#include "libavutil/timestamp.h"
+#include "libavutil/audio_fifo.h"
#include "audio.h"
#include "formats.h"
#include "avfilter.h"
@@ -45,7 +46,10 @@ enum SilenceMode {
SILENCE_TRIM_FLUSH,
SILENCE_COPY,
SILENCE_COPY_FLUSH,
- SILENCE_STOP
+ SILENCE_STOP,
+ SILENCETRIM_START,
+ SILENCETRIM_COPY,
+ SILENCETRIM_BUFFER,
};
typedef struct SilenceRemoveContext {
@@ -98,6 +102,8 @@ typedef struct SilenceRemoveContext {
int detection;
void (*update)(struct SilenceRemoveContext *s, double sample);
double(*compute)(struct SilenceRemoveContext *s, double sample);
+
+ AVAudioFifo *fifo;
} SilenceRemoveContext;
#define OFFSET(x) offsetof(SilenceRemoveContext, x)
@@ -679,3 +685,214 @@ AVFilter ff_af_silenceremove = {
.inputs = silenceremove_inputs,
.outputs = silenceremove_outputs,
};
+
+
+
+
+/*
+ * User-visible options of the silencetrim filter. Every option writes into
+ * a field of the SilenceRemoveContext shared with silenceremove.
+ * "detection" accepts the named constants "peak" (0) and "rms" (1).
+ */
+static const AVOption silencetrim_options[] = {
+    { "start_threshold", "set the level a sample must exceed to end the leading silence",   OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0},    0, DBL_MAX, AF },
+    { "stop_threshold",  "set the level a sample must exceed to be kept before the end",    OFFSET(stop_threshold),  AV_OPT_TYPE_DOUBLE, {.dbl=0},    0, DBL_MAX, AF },
+    { "detection",       "set how the level of a sample is measured",                       OFFSET(detection),       AV_OPT_TYPE_INT,    {.i64=1},    0, 1,       AF, "detection" },
+    {   "peak",          "peak detection",                                                  0,                       AV_OPT_TYPE_CONST,  {.i64=0},    0, 0,       AF, "detection" },
+    {   "rms",           "RMS detection",                                                   0,                       AV_OPT_TYPE_CONST,  {.i64=1},    0, 0,       AF, "detection" },
+    /* Multiplied by the sample rate in trim_config_input() to size the window. */
+    { "window",          "set the duration of the detection window in seconds",             OFFSET(window_ratio),    AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10,      AF },
+    { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(silencetrim);
+
+
+/**
+ * Configure the input link of the silencetrim filter.
+ *
+ * Allocates the detection window and the FIFO used to hold back samples
+ * that may turn out to be trailing silence, and puts the filter into the
+ * SILENCETRIM_START state.
+ *
+ * @param inlink input link being configured
+ * @return 0 on success, AVERROR(ENOMEM) if an allocation fails
+ */
+static int trim_config_input(AVFilterLink *inlink)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SilenceRemoveContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+
+    /* window_ratio scales the sample rate; never less than one sample per channel. */
+    s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels;
+    s->window = av_malloc_array(s->window_size, sizeof(*s->window));
+    if (!s->window)
+        return AVERROR(ENOMEM);
+
+    clear_window(s);
+
+    s->mode = SILENCETRIM_START;
+
+    /* Initial FIFO allocation: four seconds worth of samples. */
+    s->fifo = av_audio_fifo_alloc(outlink->format, outlink->channels, inlink->sample_rate * 4);
+    if (!s->fifo) {
+        /* Do not leave the window allocated when configuration fails. */
+        av_freep(&s->window);
+        return AVERROR(ENOMEM);
+    }
+
+    return 0;
+}
+
+/**
+ * Send the samples start_index..end_index (both inclusive) of the input
+ * frame downstream in a newly obtained output frame.
+ *
+ * The output frame is stamped with the running s->next_pts, which is then
+ * advanced by the number of samples sent. The input frame is not freed.
+ * The copy treats the data as interleaved doubles in data[0].
+ *
+ * @return the result of ff_filter_frame(), or AVERROR(ENOMEM) if no
+ *         output buffer could be obtained
+ */
+static int filter_subframe(AVFilterLink *inlink, AVFrame *in, int start_index, int end_index)
+{
+    AVFilterContext *filter = inlink->dst;
+    SilenceRemoveContext *priv = filter->priv;
+    AVFilterLink *dst_link = filter->outputs[0];
+    AVFrame *sub;
+
+    sub = ff_get_audio_buffer(dst_link, end_index - start_index + 1);
+    if (!sub)
+        return AVERROR(ENOMEM);
+
+    /* data[0] is a byte pointer, hence the explicit sizeof(double) scaling. */
+    memcpy(sub->data[0],
+           in->data[0] + start_index * inlink->channels * sizeof(double),
+           sub->nb_samples * inlink->channels * sizeof(double));
+
+    sub->pts = priv->next_pts;
+    priv->next_pts += av_rescale_q(sub->nb_samples,
+                                   (AVRational){1, dst_link->sample_rate},
+                                   dst_link->time_base);
+
+    return ff_filter_frame(dst_link, sub);
+}
+
+/**
+ * Append the samples of the input frame from start_index up to its end to
+ * the FIFO of held-back samples (candidate trailing silence).
+ * The data is treated as interleaved doubles in data[0].
+ *
+ * @return 0 on success, a negative AVERROR code if the FIFO write fails
+ */
+static int buffer_frame_end(AVFilterLink *inlink, AVFrame *in, int start_index)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SilenceRemoveContext *s = ctx->priv;
+    int nb_samples = in->nb_samples - start_index;
+    double *sub_frame_data = (double *)in->data[0] + start_index * inlink->channels;
+    int ret;
+
+    if (nb_samples <= 0)
+        return 0;
+
+    ret = av_audio_fifo_write(s->fifo, (void **)&sub_frame_data, nb_samples);
+    return ret < 0 ? ret : 0;
+}
+
+/**
+ * Send everything currently held in the FIFO downstream as one frame.
+ * Does nothing when the FIFO is empty.
+ *
+ * @return 0 or the result of ff_filter_frame() on success, a negative
+ *         AVERROR code on failure
+ */
+static int flush_buffered_samples(AVFilterLink *outlink, SilenceRemoveContext *s)
+{
+    int nb_samples = av_audio_fifo_size(s->fifo);
+    AVFrame *out;
+    int ret;
+
+    if (nb_samples <= 0)
+        return 0;
+
+    out = ff_get_audio_buffer(outlink, nb_samples);
+    if (!out)
+        return AVERROR(ENOMEM);
+
+    ret = av_audio_fifo_read(s->fifo, (void **)out->extended_data, out->nb_samples);
+    if (ret < 0) {
+        av_frame_free(&out);
+        return ret;
+    }
+
+    out->pts = s->next_pts;
+    s->next_pts += av_rescale_q(out->nb_samples,
+                                (AVRational){1, outlink->sample_rate},
+                                outlink->time_base);
+    return ff_filter_frame(outlink, out);
+}
+
+/**
+ * Send samples start_index..last_index (inclusive) downstream, then hold
+ * back whatever follows last_index in the frame.
+ *
+ * Leaves the filter in SILENCETRIM_BUFFER if samples were held back and in
+ * SILENCETRIM_COPY if the frame was sent up to its last sample.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int send_and_buffer_tail(AVFilterLink *inlink, AVFrame *in,
+                                int start_index, int last_index)
+{
+    AVFilterContext *ctx = inlink->dst;
+    SilenceRemoveContext *s = ctx->priv;
+    int ret;
+
+    ret = filter_subframe(inlink, in, start_index, last_index);
+    if (ret < 0)
+        return ret;
+
+    if (last_index < in->nb_samples - 1) {
+        /* Silence (re)started inside this frame: keep the tail for later. */
+        ret = buffer_frame_end(inlink, in, last_index + 1);
+        if (ret < 0)
+            return ret;
+        s->mode = SILENCETRIM_BUFFER;
+    } else {
+        s->mode = SILENCETRIM_COPY;
+    }
+    return 0;
+}
+
+/**
+ * filter_frame callback of the silencetrim input pad.
+ *
+ * Scans the frame for the first sample above start_threshold and the last
+ * sample above stop_threshold, then acts according to the current mode:
+ *  - SILENCETRIM_START:  drop data until the start threshold is exceeded.
+ *  - SILENCETRIM_COPY:   pass data through up to the last non-silent sample.
+ *  - SILENCETRIM_BUFFER: hold data back until the start threshold is
+ *                        exceeded again, then release what was held.
+ *
+ * The input frame is always consumed, also on error.
+ *
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+static int trim_filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    AVFilterLink *outlink = ctx->outputs[0];
+    SilenceRemoveContext *s = ctx->priv;
+    const double *ibuf = (const double *)in->data[0];
+    int first_non_silence_sample_in_frame = -1;
+    int last_non_silence_sample_in_frame = -1;
+    int ret = 0;
+    int i, j;
+
+    /* An empty frame carries no information and must not change the mode. */
+    if (in->nb_samples <= 0) {
+        av_frame_free(&in);
+        return 0;
+    }
+
+    for (i = 0; i < in->nb_samples; i++) {
+        int above_start_threshold = 0;
+        int above_stop_threshold = 0;
+
+        for (j = 0; j < inlink->channels; j++) {
+            double sample_volume = s->compute(s, ibuf[i * inlink->channels + j]);
+            /* NaN compares false against both thresholds, i.e. counts as silence. */
+            above_start_threshold |= sample_volume > s->start_threshold;
+            above_stop_threshold  |= sample_volume > s->stop_threshold;
+        }
+        if (above_start_threshold && first_non_silence_sample_in_frame == -1) {
+            first_non_silence_sample_in_frame = i;
+            last_non_silence_sample_in_frame = i;
+        }
+        if (above_stop_threshold)
+            last_non_silence_sample_in_frame = i;
+
+        /* The detector state is fed only after sample i has been evaluated. */
+        for (j = 0; j < inlink->channels; j++)
+            s->update(s, ibuf[i * inlink->channels + j]);
+    }
+
+    switch (s->mode) {
+    case SILENCETRIM_START:
+        /* Still in the leading silence unless the start threshold was hit. */
+        if (first_non_silence_sample_in_frame != -1)
+            ret = send_and_buffer_tail(inlink, in,
+                                       first_non_silence_sample_in_frame,
+                                       last_non_silence_sample_in_frame);
+        break;
+    case SILENCETRIM_COPY:
+        if (last_non_silence_sample_in_frame < 0) {
+            /* The whole frame is silent: hold all of it back. */
+            ret = buffer_frame_end(inlink, in, 0);
+            if (ret >= 0)
+                s->mode = SILENCETRIM_BUFFER;
+        } else {
+            ret = send_and_buffer_tail(inlink, in, 0,
+                                       last_non_silence_sample_in_frame);
+        }
+        break;
+    case SILENCETRIM_BUFFER:
+        if (first_non_silence_sample_in_frame == -1) {
+            /* Silence did not end in this frame: hold the whole frame back. */
+            ret = buffer_frame_end(inlink, in, 0);
+        } else {
+            /* Silence ended, so the held-back samples were not the end of the audio. */
+            ret = flush_buffered_samples(outlink, s);
+            if (ret >= 0)
+                ret = send_and_buffer_tail(inlink, in, 0,
+                                           last_non_silence_sample_in_frame);
+        }
+        break;
+    default:
+        break;
+    }
+
+    av_frame_free(&in);
+    return ret < 0 ? ret : 0;
+}
+
+/**
+ * Release the resources allocated by trim_config_input(): the FIFO of
+ * held-back samples and the detection window.
+ */
+static av_cold void trim_uninit(AVFilterContext *ctx)
+{
+    SilenceRemoveContext *s = ctx->priv;
+
+    av_audio_fifo_free(s->fifo);
+    s->fifo = NULL;
+    /* The window is allocated with av_malloc_array() in trim_config_input(). */
+    av_freep(&s->window);
+}
+
+/* Single audio input pad; the list is terminated by an entry with a NULL name. */
+static const AVFilterPad silencetrim_inputs[] = {
+    {
+        .type         = AVMEDIA_TYPE_AUDIO,
+        .name         = "default",
+        .filter_frame = trim_filter_frame,  /* called for every incoming frame */
+        .config_props = trim_config_input,  /* allocates the window and the FIFO */
+    },
+    { NULL }
+};
+
+/* Single audio output pad; the list is terminated by an entry with a NULL name. */
+static const AVFilterPad silencetrim_outputs[] = {
+    {
+        .type = AVMEDIA_TYPE_AUDIO,
+        .name = "default",
+    },
+    { NULL }
+};
+
+/* Filter definition for "silencetrim"; shares init() and query_formats() with silenceremove. */
+AVFilter ff_af_silencetrim = {
+    .name          = "silencetrim",
+    .description   = NULL_IF_CONFIG_SMALL("Trim silence from start and end of audio."),
+    .priv_size     = sizeof(SilenceRemoveContext),
+    /* Use the class generated by AVFILTER_DEFINE_CLASS(silencetrim) so that
+     * silencetrim_options, not the silenceremove option table, is exposed. */
+    .priv_class    = &silencetrim_class,
+    .init          = init,
+    .uninit        = trim_uninit,
+    .query_formats = query_formats,
+    .inputs        = silencetrim_inputs,
+    .outputs       = silencetrim_outputs,
+};
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 3f70153986..565c475ef4 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -127,6 +127,7 @@ extern AVFilter ff_af_sidechaincompress;
extern AVFilter ff_af_sidechaingate;
extern AVFilter ff_af_silencedetect;
extern AVFilter ff_af_silenceremove;
+extern AVFilter ff_af_silencetrim;
extern AVFilter ff_af_sofalizer;
extern AVFilter ff_af_stereotools;
extern AVFilter ff_af_stereowiden;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment