Created
October 11, 2014 15:46
-
-
Save kotarou3/12d9a7db32a141489b73 to your computer and use it in GitHub Desktop.
Cleans small snippets of speech, and attempts to equate loudness
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/nodejs --harmony | |
"use strict"; | |
require("prfun"); | |
/**
 * Reads a WAV file and decodes it into floating-point samples.
 *
 * Only mono, little-endian, signed files with a bit depth of 16 or 32 are accepted.
 * 16-bit samples are read as signed integers and divided by 0x8000; 32-bit samples
 * are read as little-endian floats without scaling.
 *
 * @param {string} filename Path of the WAV file to read.
 * @returns {Promise} Resolves with {buffer: Float32Array, sampleRate: number}; rejects on
 *     stream errors, on an unsupported format, or when no format header was seen.
 */
function readAudio(filename) {
    return new Promise(function (resolve, reject) {
        let reader = new (require("wav").Reader)();
        // Chunks are collected and concatenated once at the end; concatenating on every
        // "data" event re-copies everything read so far and is quadratic in the file size.
        let chunks = [];
        let format = null;
        let isSupported = false;

        require("fs").createReadStream(filename).on("error", reject).pipe(reader).on("format", function (f) {
            format = f;
            isSupported = f.endianness === "LE" && f.channels === 1 && (f.bitDepth === 16 || f.bitDepth === 32) && !!f.signed;
            if (!isSupported)
                reject(new Error("Don't know what to do with format: " + JSON.stringify(f)));
        }).on("data", function (data) {
            chunks.push(data);
        }).on("end", function () {
            if (!format) {
                reject(new Error("No format header found in " + filename));
                return;
            }
            if (!isSupported)
                return; // Already rejected in the "format" handler

            let rawBuffer = Buffer.concat(chunks);
            let bytesPerSample = format.bitDepth / 8;
            // Ignore a trailing partial sample instead of reading past the end of the data
            let buffer = new Float32Array(Math.floor(rawBuffer.length / bytesPerSample));
            if (format.bitDepth === 16) {
                for (let b = 0; b < buffer.length; ++b)
                    buffer[b] = rawBuffer.readInt16LE(b * 2) / 0x8000;
            } else {
                // NOTE(review): 32-bit data is assumed to be IEEE float; integer PCM is not detected here
                for (let b = 0; b < buffer.length; ++b)
                    buffer[b] = rawBuffer.readFloatLE(b * 4);
            }

            resolve({buffer: buffer, sampleRate: format.sampleRate});
        }).on("error", reject);
    });
}
/**
 * Writes mono samples to a WAV file.
 *
 * @param {Float32Array|number[]} buffer Samples to write.
 * @param {number} sampleRate Sample rate stored in the WAV header.
 * @param {string} filename Path of the file to create.
 * @param {boolean} [isFloat] When truthy, writes 32-bit floats (format 3); otherwise writes
 *     16-bit signed integers (format 1) scaled by 0x7fff.
 * @returns {Promise} Resolves with `filename` once the writer emits "done". Rejects with an
 *     Error before any file is created if a sample is NaN or its magnitude exceeds 1 in
 *     16-bit mode, and rejects on writer errors.
 */
function writeAudio(buffer, sampleRate, filename, isFloat) {
    let rawBuffer;
    if (isFloat) {
        rawBuffer = Buffer.alloc(buffer.length * 4);
        for (let b = 0; b < buffer.length; ++b)
            rawBuffer.writeFloatLE(buffer[b], b * 4);
    } else {
        // Validate everything first so that nothing is encoded or written for a bad buffer
        let maxClip = 0;
        for (let b = 0; b < buffer.length; ++b) {
            if (Number.isNaN(buffer[b]))
                return Promise.reject(new Error("Sample " + b + " is NaN"));
            if (Math.abs(buffer[b]) > 1)
                maxClip = Math.max(maxClip, Math.abs(buffer[b]));
        }
        if (maxClip !== 0)
            return Promise.reject(new Error("Clipped by " + maxClip));

        rawBuffer = Buffer.alloc(buffer.length * 2);
        for (let b = 0; b < buffer.length; ++b)
            rawBuffer.writeInt16LE(Math.round(buffer[b] * 0x7fff), b * 2);
    }

    let writer = new (require("wav").FileWriter)(filename, {format: isFloat ? 3 : 1, channels: 1, sampleRate: sampleRate, bitDepth: isFloat ? 32 : 16});
    return new Promise(function (resolve, reject) {
        writer.on("done", resolve.bind(null, filename)).on("error", reject);
        // The stream has to be ended, not just written to, for the writer to flush and
        // finalise the file. NOTE(review): confirm "done" semantics against the wav package.
        writer.end(rawBuffer);
    });
}
// Calculated from ISO226:2003 at 50 phons
const equalLoudnessContour = {
    freq: [20, 25, 31.5, 40, 50, 63, 80, 100, 125, 160, 200, 250, 315, 400, 500, 630, 800, 1000, 1250, 1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000, 10000, 12500],
    spl: [104.720, 99.145, 93.694, 88.485, 83.963, 79.606, 75.362, 71.609, 68.170, 64.679, 61.721, 59.043, 56.550, 54.265, 52.590, 51.101, 49.983, 50.011, 51.989, 52.875, 49.618, 46.906, 46.050, 47.146, 50.479, 56.112, 61.756, 63.780, 60.135]
};

// Maps a queried frequency to the value equalLoudnessContourAt() returned for it
let equalLoudnessContourAtCache = {};

/**
 * Interpolates the equal-loudness contour at a frequency with a cubic spline and converts
 * the resulting SPL into this script's "power" scale.
 *
 * @param {number} frequency Frequency to look up, in the same unit as `equalLoudnessContour.freq`.
 * @returns {number} The converted value, or NaN when `frequency` is NaN or lies outside the
 *     range covered by the contour table.
 */
function equalLoudnessContourAt(frequency) {
    let freqs = equalLoudnessContour.freq;
    // Written as a negated "in range" test so that NaN is rejected as well
    if (!(frequency >= freqs[0] && frequency <= freqs[freqs.length - 1]))
        return NaN;

    if (Object.prototype.hasOwnProperty.call(equalLoudnessContourAtCache, frequency))
        return equalLoudnessContourAtCache[frequency];

    let spl = require('cubic-spline')(frequency, freqs, equalLoudnessContour.spl);
    // Hacked-together formula that converts SPL to power; works reasonably well for speech
    let power = spl / 8 - 17;
    equalLoudnessContourAtCache[frequency] = power;
    return power;
}
/**
 * Converts an RMS value to the logarithmic "power" scale used in this file: 10 * log10(rms).
 *
 * @param {number} rms RMS value; 0 yields -Infinity and negative values yield NaN.
 * @returns {number} 10 * log10(rms).
 */
function rmsToPower(rms) {
    // Math.log10 avoids the rounding error of Math.log(rms) / Math.log(10) at powers of ten
    return 10 * Math.log10(rms);
}
/**
 * Converts a value on this file's logarithmic "power" scale back to RMS: 10^(power / 10).
 *
 * @param {number} power Value as produced by rmsToPower().
 * @returns {number} The corresponding RMS value.
 */
function powerToRms(power) {
    let exponent = power / 10;
    return Math.pow(10, exponent);
}
/**
 * Subtracts the mean of all samples from every sample, in place.
 *
 * @param {Float32Array|number[]} buffer Samples to modify.
 * @returns {Float32Array|number[]} The same `buffer` instance.
 */
function removeDcOffset(buffer) {
    if (buffer.length === 0)
        return buffer; // Nothing to centre, and avoids computing 0 / 0

    let mean = 0;
    for (let b = 0; b < buffer.length; ++b)
        mean += buffer[b];
    mean /= buffer.length;

    for (let b = 0; b < buffer.length; ++b)
        buffer[b] -= mean;

    return buffer;
}
/**
 * Computes the single-sided spectrum of a real signal.
 *
 * The input is zero-padded to the next power of two if necessary. The first half of the
 * transform (plus one bin) is kept and every kept bin is divided by the number of kept
 * bins minus one.
 *
 * @param {Float32Array} buffer Time-domain samples; not modified by the padding step.
 * @returns {{real: Float32Array, imag: Float32Array}} Views onto the transform's own
 *     real/imaginary arrays, scaled in place.
 */
function fft(buffer) {
    // Math.log2 is exact for powers of two; Math.log(n) / Math.log(2) is not guaranteed to
    // be, and an error just above an integer would make Math.ceil double the transform size.
    let length = Math.pow(2, Math.ceil(Math.log2(buffer.length)));
    let rawBuffer = buffer;
    if (length !== buffer.length) {
        rawBuffer = new Float32Array(length);
        rawBuffer.set(buffer);
    }

    let transform = new (require("digitalsignals").FFT)(rawBuffer.length, 1);
    transform.forward(rawBuffer);

    let spectrum = {
        real: transform.real.subarray(0, transform.real.length / 2 + 1),
        imag: transform.imag.subarray(0, transform.imag.length / 2 + 1)
    };
    let scale = spectrum.real.length - 1;
    for (let f = 0; f < spectrum.real.length; ++f) {
        spectrum.real[f] /= scale;
        spectrum.imag[f] /= scale;
    }

    return spectrum;
}
/**
 * Inverse of fft(): rebuilds a real time-domain signal from a single-sided spectrum.
 *
 * The spectrum is rescaled by the factor fft() divided by, mirrored into the upper half as
 * complex conjugates, and passed to the inverse transform.
 *
 * @param {Float32Array} real Real parts of the single-sided spectrum.
 * @param {Float32Array} imag Imaginary parts of the single-sided spectrum.
 * @param {number} length Number of time-domain samples to return.
 * @returns {Float32Array} The first `length` samples of the inverse transform.
 */
function ifft(real, imag, length) {
    let rawReal = new Float32Array((real.length - 1) * 2);
    let rawImag = new Float32Array((imag.length - 1) * 2);
    rawReal.set(real);
    rawImag.set(imag);

    // fft() scales every bin including DC, so DC must be restored too. It is handled
    // outside the loop because it has no mirrored counterpart.
    rawReal[0] *= real.length - 1;
    rawImag[0] *= imag.length - 1;
    for (let f = 1; f < real.length; ++f) {
        rawReal[f] *= real.length - 1;
        rawImag[f] *= imag.length - 1;
        rawReal[rawReal.length - f] = rawReal[f];
        rawImag[rawImag.length - f] = -rawImag[f];
    }

    // Math.log2 is exact for powers of two, unlike Math.log(n) / Math.log(2)
    let transform = new (require("digitalsignals").FFT)(Math.pow(2, Math.ceil(Math.log2(length))), 1);
    let rawBuffer = transform.inverse(rawReal, rawImag);
    return rawBuffer.subarray(0, length);
}
/**
 * Runs `callback` over overlapping, windowed frames of `buffer` and overlap-adds the result.
 *
 * Frames are `length` samples long and advance by `length / 4`. The signal is padded with
 * `length` zeros at the front and at least `length` at the back. For each frame the
 * callback receives (spectrum, rms, middleSample): the frame's spectrum (an empty
 * Float32Array when `isRmsOnly` is truthy), the RMS of the un-windowed frame, and the
 * frame's start offset in the padded signal minus `length / 2`. If the callback returns a
 * truthy {real, imag} object, the frame is replaced by its inverse transform; otherwise the
 * windowed frame is kept as is.
 *
 * @param {Float32Array} buffer Samples; overwritten in place only if the callback returned
 *     a spectrum for at least one frame.
 * @param {number} length Frame length; must be a multiple of 4.
 * @param {function} callback Per-frame callback as described above.
 * @param {boolean} [isRmsOnly] Skip the forward transform.
 * @returns {Float32Array} `buffer`.
 * @throws {Error} If `length` is not a multiple of 4.
 */
function doSpectrumFiltering(buffer, length, callback, isRmsOnly) {
    if (length % 4 !== 0)
        throw new Error("length is not a multiple of 4.");

    // Four-term cosine-sum window coefficients
    const windowA = [0.3635819, -0.4891775, 0.1365995, -0.0106411];
    const hop = length / 4;

    let window = new Float32Array(length);
    for (let w = 0; w < window.length; ++w)
        for (let a = 0; a < windowA.length; ++a)
            window[w] += windowA[a] * Math.cos(2 * a * Math.PI * w / length);

    let paddedLength = Math.ceil(buffer.length / length + 2) * length;
    let inBuffer = new Float32Array(paddedLength);
    let outBuffer = new Float32Array(paddedLength);
    inBuffer.set(buffer, length);

    let frame = new Float32Array(length);
    let isChanged = false;
    for (let startSample = 0; startSample + length < inBuffer.length; startSample += hop) {
        frame.set(inBuffer.subarray(startSample, startSample + length));

        // RMS is measured before the window is applied
        let sumOfSquares = 0;
        for (let b = 0; b < frame.length; ++b)
            sumOfSquares += frame[b] * frame[b];
        let rms = Math.sqrt(sumOfSquares / frame.length);

        for (let w = 0; w < window.length; ++w)
            frame[w] *= window[w];

        let analysed = isRmsOnly ? new Float32Array(0) : fft(frame);
        let filtered = callback(analysed, rms, startSample - length / 2);

        let processed = frame;
        if (filtered) {
            processed = ifft(filtered.real, filtered.imag, frame.length);
            isChanged = true;
        }

        for (let b = 0; b < processed.length; ++b)
            outBuffer[startSample + b] += processed[b];
    }

    if (!isChanged)
        return buffer;

    // Drop the leading padding, then undo the gain of four overlapping windows
    buffer.set(outBuffer.subarray(length, buffer.length + length));
    const overlapGain = 4 * windowA[0];
    for (let b = 0; b < buffer.length; ++b)
        buffer[b] /= overlapGain;

    return buffer;
}
/**
 * Collects what doSpectrumFiltering() reports for every overlapping frame of `buffer`.
 *
 * The recording callback returns nothing, so no frame is replaced.
 *
 * @param {Float32Array} buffer Samples to analyse.
 * @param {number} length Frame length, passed through to doSpectrumFiltering().
 * @param {boolean} [isRmsOnly] Passed through; when truthy the spectra are empty arrays.
 * @returns {{sample: number[], spectrum: Array, rms: number[]}} Parallel arrays with one
 *     entry per frame: middle sample position, spectrum and RMS.
 */
function getOverlappingSpectrumAndRms(buffer, length, isRmsOnly) {
    let analysis = {
        sample: [],
        spectrum: [],
        rms: []
    };

    let record = function (spectrum, rms, middleSample) {
        analysis.sample.push(middleSample);
        analysis.spectrum.push(spectrum);
        analysis.rms.push(rms);
    };
    doSpectrumFiltering(buffer, length, record, isRmsOnly);

    return analysis;
}
/**
 * Averages the magnitudes of several spectra bin by bin.
 *
 * @param {Array<{real: Float32Array, imag: Float32Array}>} spectrums Spectra to average; the
 *     first one determines the number of bins.
 * @returns {Float32Array} Mean magnitude per bin.
 */
function averageSpectrums(spectrums) {
    let average = new Float32Array(spectrums[0].real.length);

    for (let index = 0; index < spectrums.length; ++index) {
        let magnitudes = realiseSpectrum(spectrums[index]);
        for (let bin = 0; bin < average.length; ++bin)
            average[bin] += magnitudes[bin] / spectrums.length;
    }

    return average;
}
/**
 * Maps a frequency to the nearest bin index of a single-sided spectrum.
 *
 * @param {number} frequency Frequency, in the same unit as `sampleRate`.
 * @param {number} length Number of bins in the single-sided spectrum.
 * @param {number} sampleRate Sample rate of the analysed signal.
 * @returns {number} Rounded bin index; not clamped to the spectrum's bounds.
 */
function frequencyToIndex(frequency, length, sampleRate) {
    let scaled = 2 * frequency * (length - 1);
    return Math.round(scaled / sampleRate);
}
/**
 * Maps a bin index of a single-sided spectrum back to its frequency.
 *
 * @param {number} index Bin index.
 * @param {number} length Number of bins in the single-sided spectrum.
 * @param {number} sampleRate Sample rate of the analysed signal.
 * @returns {number} Frequency of the bin, in the same unit as `sampleRate`.
 */
function indexToFrequency(index, length, sampleRate) {
    let fullTransformSize = 2 * (length - 1);
    return index * sampleRate / fullTransformSize;
}
/**
 * Converts a complex spectrum into per-bin magnitudes.
 *
 * @param {{real: ArrayLike<number>, imag: ArrayLike<number>}} spectrum Complex spectrum.
 * @returns {Float32Array} sqrt(real^2 + imag^2) for every bin of `spectrum.real`.
 */
function realiseSpectrum(spectrum) {
    let bins = spectrum.real.length;
    let magnitudes = new Float32Array(bins);

    for (let bin = 0; bin < magnitudes.length; ++bin) {
        let re = spectrum.real[bin];
        let im = spectrum.imag[bin];
        magnitudes[bin] = Math.sqrt(re * re + im * im);
    }

    return magnitudes;
}
/**
 * Finds strict local maxima, i.e. elements greater than both direct neighbours.
 * The first and last elements are never reported.
 *
 * @param {ArrayLike<number>} array Values to search.
 * @returns {Array<{index: number, value: number}>} Peaks sorted by descending value.
 */
function findPeaks(array) {
    let peaks = [];
    let lastCandidate = array.length - 1;

    for (let i = 1; i < lastCandidate; ++i) {
        let current = array[i];
        let risesFromLeft = current > array[i - 1];
        let fallsToRight = current > array[i + 1];
        if (risesFromLeft && fallsToRight)
            peaks.push({index: i, value: current});
    }

    peaks.sort(function (first, second) {
        return second.value - first.value;
    });
    return peaks;
}
/**
 * Zeroes every bin of `spectrum` outside the given frequency band, in place.
 *
 * Bins below the index of `startFrequency` and bins from the index of `endFrequency`
 * upwards are cleared. The start index is capped at the number of bins.
 *
 * @param {number} startFrequency Lower edge of the band.
 * @param {number} endFrequency Upper edge of the band.
 * @param {{real: Float32Array, imag: Float32Array}} spectrum Spectrum to modify.
 * @param {number} sampleRate Sample rate used to map frequencies to bins.
 * @returns {{real: Float32Array, imag: Float32Array}} The same `spectrum` object.
 */
function bandPass(startFrequency, endFrequency, spectrum, sampleRate) {
    let firstKept = frequencyToIndex(startFrequency, spectrum.real.length, sampleRate);
    if (firstKept > spectrum.real.length)
        firstKept = spectrum.real.length;
    let firstDropped = frequencyToIndex(endFrequency, spectrum.real.length, sampleRate);

    let clearBins = function (from, to) {
        for (let bin = from; bin < to; ++bin)
            spectrum.real[bin] = spectrum.imag[bin] = 0;
    };
    clearBins(0, firstKept);
    clearBins(firstDropped, spectrum.real.length);

    return spectrum;
}
/**
 * Zeroes every bin from the index of `cutoffFrequency` upwards, in place.
 *
 * @param {number} cutoffFrequency Upper edge of the pass band.
 * @param {{real: Float32Array, imag: Float32Array}} spectrum Spectrum to modify.
 * @param {number} sampleRate Sample rate used to map the frequency to a bin.
 * @returns {{real: Float32Array, imag: Float32Array}} The same `spectrum` object.
 */
function lowPass(cutoffFrequency, spectrum, sampleRate) {
    let lowestFrequency = 0;
    return bandPass(lowestFrequency, cutoffFrequency, spectrum, sampleRate);
}
/**
 * Zeroes every bin below the index of `cutoffFrequency`, in place.
 *
 * @param {number} cutoffFrequency Lower edge of the pass band.
 * @param {{real: Float32Array, imag: Float32Array}} spectrum Spectrum to modify.
 * @param {number} sampleRate Sample rate used to map the frequency to a bin.
 * @returns {{real: Float32Array, imag: Float32Array}} The same `spectrum` object.
 */
function highPass(cutoffFrequency, spectrum, sampleRate) {
    let highestFrequency = Infinity;
    return bandPass(cutoffFrequency, highestFrequency, spectrum, sampleRate);
}
/**
 * Calculates the population standard deviation of `values`, i.e. the square root of the
 * mean squared deviation from the mean.
 *
 * @param {ArrayLike<number>} values Values to measure.
 * @returns {number} The standard deviation, or NaN for an empty input.
 */
function calculateStandardDeviation(values) {
    let mean = 0;
    for (let e = 0; e < values.length; ++e)
        mean += values[e];
    mean /= values.length;

    let sd = 0;
    for (let e = 0; e < values.length; ++e) {
        let deviation = values[e] - mean;
        sd += deviation * deviation;
    }
    return Math.sqrt(sd / values.length);
}
/**
 * Computes mean and population standard deviation while iteratively discarding outliers.
 *
 * Non-finite values are dropped and the rest is sorted in descending order. Then, as long
 * as the largest remaining value lies more than `cutoffZ` standard deviations above the
 * mean, or the smallest more than `cutoffZ` below it, that value is removed and the
 * statistics are recomputed. The largest value is checked first. If the standard deviation
 * becomes NaN, it is reported as 0 and the search stops.
 *
 * @param {number[]} values Input values; the caller's array is not modified.
 * @param {number} cutoffZ Number of standard deviations beyond which a value is an outlier.
 * @returns {Object} `values` (filtered and sorted descending), `mean`, `sd`, `sum`,
 *     `squaredSum`, `elements` (count of retained values), and `valuesStart` / `valuesEnd`
 *     (the retained values are `values.slice(valuesStart, valuesEnd)`).
 */
function getMeanAndSdWithoutOutliers(values, cutoffZ) {
    let isFiniteValue = function (v) { return -Infinity < v && v < Infinity; };
    let byDescending = function (a, b) { return b - a; };
    values = values.filter(isFiniteValue).sort(byDescending);

    let sum = 0;
    let squaredSum = 0;
    for (let v = 0; v < values.length; ++v) {
        sum += values[v];
        squaredSum += values[v] * values[v];
    }

    let valuesStart = 0;
    let valuesEnd = values.length;
    let elements = values.length;
    let mean;
    let sd;

    for (;;) {
        mean = sum / elements;
        sd = Math.sqrt(elements * squaredSum - sum * sum) / elements;
        if (isNaN(sd)) {
            sd = 0;
            break;
        }

        let discarded;
        if (values[valuesStart] - mean > cutoffZ * sd) {
            // The largest remaining value is too far above the mean
            discarded = values[valuesStart];
            ++valuesStart;
        } else if (mean - values[valuesEnd - 1] > cutoffZ * sd) {
            // The smallest remaining value is too far below the mean
            discarded = values[valuesEnd - 1];
            --valuesEnd;
        } else {
            break;
        }

        sum -= discarded;
        squaredSum -= discarded * discarded;
        --elements;
    }

    return {values: values, mean: mean, sd: sd, sum: sum, squaredSum: squaredSum, valuesStart: valuesStart, valuesEnd: valuesEnd, elements: elements};
}
/**
 * Reduces noise in `buffer`, in place, by subtracting an estimated noise magnitude from
 * every frequency bin of every frame.
 *
 * The estimate is taken from a zero-padded copy of the signal, analysed in 4096-sample
 * frames. For each bin it is the mean plus one standard deviation of the bin's log
 * magnitudes over time, after discarding outliers beyond 2 standard deviations. Bins whose
 * magnitude does not exceed the estimate are zeroed; the rest keep their phase.
 *
 * @param {Float32Array} buffer Samples to clean.
 * @returns {Float32Array} `buffer`.
 */
function removeNoise(buffer) {
    let paddedBuffer = new Float32Array(Math.floor(1.5 * buffer.length));
    paddedBuffer.set(buffer, Math.floor(buffer.length / 4));
    let analysis = getOverlappingSpectrumAndRms(paddedBuffer, 4096);

    let spectrumMagnitudes = analysis.spectrum.map(realiseSpectrum); // [time][frequency] -> magnitude
    let transposedSpectrumMagnitudes = new Array(spectrumMagnitudes[0].length); // [frequency][time] -> magnitude
    for (let f = 0; f < transposedSpectrumMagnitudes.length; ++f)
        transposedSpectrumMagnitudes[f] = new Float32Array(spectrumMagnitudes.length);
    for (let t = 0; t < spectrumMagnitudes.length; ++t)
        for (let f = 0; f < transposedSpectrumMagnitudes.length; ++f)
            transposedSpectrumMagnitudes[f][t] = spectrumMagnitudes[t][f];

    let noiseSpectrum = transposedSpectrumMagnitudes.map(function (frequencyOverTime) {
        let stats = getMeanAndSdWithoutOutliers(Array.prototype.map.call(frequencyOverTime, rmsToPower), 2);
        let noise = powerToRms(stats.mean + stats.sd);
        // A bin that is zero in every frame has no finite log magnitude, which makes the
        // estimate NaN. Subtract nothing then, rather than writing NaN into the spectrum.
        return noise >= 0 && noise < Infinity ? noise : 0;
    });

    return doSpectrumFiltering(buffer, 4096, function (spectrum) {
        for (let f = 0; f < spectrum.real.length; ++f) {
            let mod = Math.sqrt(spectrum.real[f] * spectrum.real[f] + spectrum.imag[f] * spectrum.imag[f]);
            let reduced = mod - noiseSpectrum[f];
            if (reduced <= 0) {
                spectrum.real[f] = spectrum.imag[f] = 0;
            } else {
                // Scaling both parts by the same factor shrinks the magnitude and keeps the phase
                let scale = reduced / mod;
                spectrum.real[f] *= scale;
                spectrum.imag[f] *= scale;
            }
        }
        return spectrum;
    });
}
/**
 * Estimates the pitch of `buffer` from its magnitude spectrum.
 *
 * Each bin's magnitude is multiplied by the magnitudes at 2, 3 and 4 times its index
 * (treated as 0 beyond the end of the spectrum), and the frequency of the strongest local
 * maximum of that product is returned.
 *
 * @param {Float32Array} buffer Time-domain samples.
 * @param {number} sampleRate Sample rate of `buffer`.
 * @returns {number} The detected frequency, or NaN when the product has no local maximum.
 */
function detectPitch(buffer, sampleRate) {
    let spectrum = realiseSpectrum(fft(buffer));
    let correlatedSpectrum = new Float32Array(spectrum);
    for (let n = 2; n < 5; ++n)
        for (let f = 0; f < spectrum.length; ++f)
            correlatedSpectrum[f] *= spectrum[f * n] || 0;

    let peaks = findPeaks(correlatedSpectrum);
    if (peaks.length === 0)
        return NaN; // e.g. a silent or constant spectrum

    return indexToFrequency(peaks[0].index, correlatedSpectrum.length, sampleRate);
}
/**
 * Returns a view of `buffer` without its leading and trailing runs of exact zeros.
 *
 * Up to two samples before the first non-zero sample and one sample after the last non-zero
 * sample are kept, limited to the buffer's bounds.
 *
 * @param {Float32Array} buffer Samples to trim.
 * @returns {Float32Array} A subarray sharing memory with `buffer`.
 */
function trimSilence(buffer) {
    let firstLoud = 0;
    while (buffer[firstLoud] === 0)
        ++firstLoud;

    let lastLoud = buffer.length - 1;
    while (buffer[lastLoud] === 0)
        --lastLoud;

    // Keep a little of the surrounding silence without leaving the buffer
    let from = Math.max(firstLoud - 2, 0);
    let to = Math.min(lastLoud + 2, buffer.length);
    return buffer.subarray(from, to);
}
/**
 * Finds runs of exact zeros that are longer than `minimumSamples`.
 *
 * @param {ArrayLike<number>} buffer Samples to scan.
 * @param {number} minimumSamples A run is reported only if it is longer than this.
 * @returns {Array<{start: number, end: number}>} Runs in order of appearance; `start` is the
 *     index of the first zero and `end` the index just past the last zero.
 */
function findSilences(buffer, minimumSamples) {
    let silences = [];
    let start = 0;
    for (let end = 0; end < buffer.length; ++end)
        if (buffer[end] !== 0) {
            if (end - start > minimumSamples)
                silences.push({start: start, end: end});
            start = end + 1;
        }

    // A run that reaches the end of the buffer is not closed by a non-zero sample
    if (buffer.length - start > minimumSamples)
        silences.push({start: start, end: buffer.length});

    return silences;
}
// Usage: <script> <output prefix> <input.wav>...
// Each input is cleaned, loudness-adjusted and written to
// <output prefix><rounded pitch>-<input filename>.

// NOTE(review): Promise.guard presumably comes from prfun and limits readAudio to one
// concurrent call - confirm against the prfun documentation.
let guardedReadAudio = Promise.guard(1, readAudio);

for (let a = 3; a < process.argv.length; ++a) {
    let filename = process.argv[a];
    guardedReadAudio(filename).then(function (data) {
        let buffer = removeDcOffset(data.buffer);
        let sampleRate = data.sampleRate;

        removeNoise(buffer);

        doSpectrumFiltering(buffer, 4096, function (spectrum) {
            highPass(120, spectrum, sampleRate);
            return spectrum;
        });

        let rmsPeaks = findPeaks(getOverlappingSpectrumAndRms(buffer, 2048, true).rms);
        if (rmsPeaks.length === 0)
            return Promise.reject(new Error("no RMS peak found"));
        let peakRms = rmsPeaks[0].value;

        // Silence every frame that is more than 17 below the loudest frame on the power scale
        doSpectrumFiltering(buffer, 2048, function (spectrum, rms) {
            if (rmsToPower(peakRms) - rmsToPower(rms) > 17) {
                highPass(Infinity, spectrum, 1); // An infinite cutoff zeroes every bin
                return spectrum;
            }
        });

        buffer = trimSilence(buffer);
        if (buffer.length <= 2)
            return Promise.reject(new Error("buffer empty"));

        let loudestFreq = detectPitch(buffer, sampleRate);
        let ratio = powerToRms(equalLoudnessContourAt(loudestFreq)) / peakRms;
        // The contour lookup yields NaN outside its frequency table; without this check
        // every sample would be multiplied by NaN.
        if (!(ratio > 0 && ratio < Infinity))
            return Promise.reject(new Error("cannot equalise loudness for detected pitch " + loudestFreq));
        for (let b = 0; b < buffer.length; ++b)
            buffer[b] *= ratio;

        let silences = findSilences(buffer, 0.1 * sampleRate);
        if (silences.length > 0)
            console.warn(filename, "has silence at:", silences.map(function (silence) { return (silence.end - silence.start) / sampleRate; }).join(", "));

        // Manual switches for dealing with embedded silences:
        //if (silences.length !== 2) // keep between
        //    return Promise.reject(silences);
        //else
        //    buffer = trimSilence(buffer.subarray(silences[0].start, silences[1].end));
        //buffer = trimSilence(buffer.subarray(0, silences[0].end)); // remove end
        //buffer = trimSilence(buffer.subarray(silences[silences.length - 1].start)); // remove start

        return writeAudio(buffer, sampleRate, process.argv[2] + Math.round(loudestFreq) + "-" + filename);
    }).catch(function (e) {
        console.warn(filename, e.stack || e);
    }).done();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment