Created
April 25, 2020 13:31
-
-
Save davidavdav/a85cde3ae7e0c7bf1b904e0da6deecac to your computer and use it in GitHub Desktop.
Some routines for computing energy levels in speech
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env julia | |
## (c) David A. van Leeuwen | |
## routines for adding noise at the right level | |
import MFCC | |
import WAV | |
import ProgressMeter | |
import StatsBase | |
import LinearAlgebra | |
import DelimitedFiles | |
import ArgParse | |
import Random | |
import JSON | |
## conversions from energy to dB and vice versa
function undb(x)
    return 10 ^ (x / 10)
end
function db(x)
    return 10 * log10(x)
end
## A-weighting factors, converted from dB to energy ratios, for the seven
## octave bands from 125 Hz up to 8000 Hz
const Aweight = map(undb, [-16.0, -9.0, -3.0, 0.0, 1.0, 1.0, -1.0])
""" | |
Compute the energy per frame, summing over bands first and weighting these using the A-weighting | |
""" | |
function aWeightedEnergy(x::Vector, sr; steptime=0.1) | |
dur = length(x) / sr | |
wintime = min(dur, 2steptime) | |
if dur < 0.025 | |
return [0.0] | |
end | |
p = MFCC.powspec(x, sr, wintime=wintime, steptime=0.5wintime) | |
nfreq, nframes = size(p) | |
freqstep = sr / 2 / nfreq | |
freq = collect(0 : nfreq-1) * freqstep ## fequency of bins | |
bands = exp10.(range(log10(125), log10(8000), length=7)) | |
energy = zeros(nframes) | |
for (midfreq, aweight) in zip(bands, Aweight) | |
lowi = ceil(Int, midfreq / √2 / freqstep) + 1 | |
highi = min(floor(Int, midfreq * √2 / freqstep) + 1, nfreq) | |
## println(p[lowi:highi, :], " ", aweight) | |
energy += vec(sum(p[lowi:highi, :], dims=1)) * aweight | |
end | |
return energy / nfreq | |
end | |
""" | |
Compute the A-weighted level of a speech file, in relation to a reference level `ref`. | |
By default the reference is a 1000 Hz sine tone at full aplitude (-1.0, 1.0) | |
with `sad==true`, a speech-level-meter speech activity detection is carried out, | |
effectively not accounting the silent parts of the speech in computing the level. | |
""" | |
function alevel(x::Vector; sr=16000.0, ref=reflevel, sad=true) | |
energy = aWeightedEnergy(x, sr) | |
if sad | |
energydb = db.(energy) | |
maxlevel = StatsBase.percentile(energydb, 99) | |
energy = energy[energydb .> maxlevel - 14] | |
## println(stderr, length(energy) / length(energydb)) | |
return db(StatsBase.mean(energy)) - ref | |
else | |
## medenergy = StatsBase.percentile(energy, 50) | |
return db(StatsBase.mean(energy)) - ref | |
end | |
end | |
## Reference level: the A-weighted level of a full-amplitude sine tone with a period
## of 16 samples (1000 Hz at the default 16 kHz sample rate), measured without
## speech activity detection and against a zero reference.
const reflevel = alevel([sin(2π * i / 16) for i in 0:16000]; ref=0.0, sad=false)
## Little helper: take the tuple returned by a wav reader and return the samples
## flattened to a vector together with the sample rate as a Float64.
## NOTE(review): `vec` flattens a multi-column sample matrix column after column --
## presumably only single-channel files are expected; confirm against callers.
function wavnorm(x::Tuple)
    samples = x[1]
    rate = x[2]
    return vec(samples), Float64(rate)
end
## A-weighted level of the wav file `file`; the sample rate is taken from the file
## and any keyword arguments are passed on to the method that works on samples.
function alevel(file::AbstractString; kwargs...)
    samples, rate = wavnorm(WAV.wavread(file))
    return alevel(samples; sr=rate, kwargs...)
end
## Read the wav file `file` and return its duration in seconds together with its
## A-weighted level (computed with the default settings of `alevel`).
function stats(file::AbstractString)
    samples, rate = wavnorm(WAV.wavread(file))
    duration = length(samples) / rate
    level = alevel(samples; sr=rate)
    return duration, level
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment