Created
January 31, 2022 22:34
-
-
Save schollz/cde86c4038764f7a461a577aa5319a32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python3 depop.py file1.wav file1_depopped.wav | |
__author__ = "ezra buchla, zack scholl" | |
import shutil | |
from scipy import signal | |
from scipy.io import wavfile | |
import numpy as np | |
debug_depop = False | |
if debug_depop: | |
import matplotlib.pyplot as plt | |
# (adapted from ezra's excise.m, all mistakes are from zack) | |
# %% excise a portion of given (audio) signal with crossfade | |
# %% | |
# %% it is assumed that the exact final duration is flexible, | |
# %% so a fade time is programmatically determined
# %% | |
# %% parameters: | |
# %% a, b: first and last indices of region to excise | |
# %% nw: window length (maximum crossfade duration in samples) | |
# %% | |
# %% returns:
# %% Y: new signal
# %% (this Python port returns only Y; the correlation peak value c and the
# %% actual count of crossfade samples nf are not returned)
def excise(X, a, b, nw=1000):
    """Excise samples ``a``..``b`` of a 1-D signal, crossfading across the cut.

    The window of up to ``nw`` samples just before ``a`` is cross-correlated
    with the window of the same length starting at ``b``. The non-negative
    lag with the highest correlation decides how long the crossfade is, so
    the exact length of the result is not fixed in advance.

    Parameters:
        X: 1-D signal (anything ``np.asarray`` accepts).
        a, b: first and last indices of the region to excise.
        nw: window length (maximum crossfade duration in samples).

    Returns:
        The new, shorter signal as a 1-D array. Only the signal is returned;
        the correlation peak and the crossfade length are not.
    """
    X = np.asarray(X)

    # Restrict the window to what is available on each side of the cut.
    # ``a - 1`` (rather than ``a``) is the bound the original code used; it
    # leaves one sample of headroom before the window.
    nw = int(min(a - 1, len(X) - b, nw))
    if nw < 1:
        # No room for a correlation window (the region touches the start of
        # the signal, or nw <= 0): cross-correlating empty windows would
        # fail, so fall back to a plain cut of a..b with no crossfade.
        return np.concatenate((X[:a], X[b + 1:]))

    # Windows preceding and following the excision, examined for correlation.
    # The following window starts at index b itself.
    W = X[a - nw:a]
    V = X[b:b + nw]

    # Find the non-negative lag that maximizes the cross-correlation.
    C = signal.correlate(W, V, "full")
    lags = signal.correlation_lags(W.size, V.size, mode="full")
    keep = lags >= 0
    r = int(lags[keep][np.argmax(C[keep])])

    # Crossfade duration: the window length shortened by the best lag.
    # r is in [0, nw - 1], so nf is always at least 1 here.
    nf = nw - r

    # FIXME:
    # should choose interpolation type based on degree of correlation
    # (linear if totally correlated, equal-power if uncorrelated)
    # for now, just use linear
    phi = np.linspace(0, 1, nf)
    Yf = X[b:b + nf] * phi + X[a - nf:a] * (1 - phi)

    # initial segment + crossfaded segment + ending segment
    return np.concatenate((X[:a - nf], Yf, X[b + nf:]))
def depop(filename, newfilename, channel=0):
    """Find one pop in a WAV file and write a copy with it excised.

    Pops are detected on ``channel`` only: the short-time spectrum is
    averaged over its quieter frequency bands and searched for prominent
    peaks. The first peak that qualifies is excised, over the same sample
    range, from every channel, and the result is written to ``newfilename``.
    At most one pop is removed per call, so callers repeat the call until an
    empty string comes back.

    Parameters:
        filename: path of the WAV file to read.
        newfilename: path of the WAV file to write (may equal ``filename``).
        channel: index of the channel used for detection.

    Returns:
        ``"removed a pop at <time>"`` when a pop was removed and the file was
        written, or ``""`` when nothing was removed (no file is written).
        ``""`` is also returned when ``channel`` does not exist in the file,
        e.g. channel 1 of a mono file.
    """
    # read in file
    samplerate, data = wavfile.read(filename)

    # collect data-type information and normalize; float WAVs have no
    # integer range to query, so they are left at their own scale
    original_type = data.dtype
    is_integer = np.issubdtype(original_type, np.integer)
    data_max = np.iinfo(original_type).max if is_integer else 1.0
    data = data.astype(float) / data_max

    # mono files are read as 1-D arrays; work on a (samples, channels) view
    mono = data.ndim == 1
    if mono:
        data = data[:, np.newaxis]
    num_channels = data.shape[1]
    if channel >= num_channels:
        return ""

    # compute basic properties
    length = data.shape[0] / samplerate
    time = np.linspace(0., length, data.shape[0])

    # stft of the channel
    # go through each band and, if its soft,
    # collect it to be averaged later
    _, t, Zxx = signal.stft(
        data[:, channel],
        samplerate,
        window=signal.get_window("hann", 128),
        nperseg=128,
    )
    zmean = np.zeros(Zxx.shape[1])
    znum = 0
    for band in Zxx:
        z = 20 * np.log10(np.abs(band))
        # reference each band to the mean of its last 500 frames
        z = z - np.mean(z[-500:])
        # find soft audio regions and use them for the average
        # arbitrary but works okay
        if np.sum(z) < 10000:
            zmean = zmean + z
            znum += 1
    if znum == 0:
        # no soft band to average, so there is nothing to search for peaks
        return ""

    # average all the soft bands
    zmean = zmean / znum
    zmean = zmean - np.mean(zmean[-500:])
    z_std = np.std(zmean[-500:])
    peaks, _ = signal.find_peaks(zmean, height=z_std * 6, prominence=10)
    if debug_depop:
        print(peaks)
        plt.plot(t, zmean)
        plt.plot(t[peaks], zmean[peaks], 'o')
        plt.title("ch {}".format(channel))
        plt.show()

    for ind in peaks:
        # ignore peaks within 5 ms of either end of the file
        if t[ind] < 0.005 or t[ind] > length - 0.005:
            continue
        # assume a pop width of 6 ms
        region = np.flatnonzero(
            np.logical_and(time > t[ind] - 0.003, time < t[ind] + 0.003)
        )
        if region.size == 0:
            continue
        score = zmean[ind] / z_std
        # anything under 5 sd's of the tail is not treated as a pop
        if score < 5:
            if debug_depop:
                print("too low score: {}".format(score))
            continue

        # excise the same sample range from every channel; the crossfade
        # length is chosen per channel, so trim all to the shortest result
        start, stop = region[0], region[-1]
        channels = [
            excise(data[:, c], start, stop) for c in range(num_channels)
        ]
        newlen = min(len(c) for c in channels)
        newdata = np.column_stack([c[:newlen] for c in channels]) * data_max
        if mono:
            newdata = newdata[:, 0]
        if is_integer:
            # round instead of truncating so untouched samples survive the
            # float round trip exactly
            newdata = np.rint(newdata)
        wavfile.write(newfilename, samplerate, newdata.astype(original_type))
        return "removed a pop at {}".format(t[ind])
    return ""
def depop_file(filename, newfilename="", max_passes=10):
    """Repeatedly remove pops from a WAV file.

    The input is copied to the output path, then ``depop`` is run on that
    copy up to ``max_passes`` times for each of the first two channels,
    stopping early for a channel once no further pop is reported. Each
    removal is printed.

    Parameters:
        filename: path of the WAV file to clean.
        newfilename: path to write the cleaned file to. When empty, the
            work is done in a temporary file and ``filename`` itself is
            overwritten with the result.
        max_passes: maximum number of pops removed per channel.
    """
    import os
    import tempfile

    # An explicit flag (rather than a magic scratch filename) marks in-place
    # mode, so no real output name can be mistaken for it.
    in_place = newfilename == ""
    if in_place:
        handle, newfilename = tempfile.mkstemp(suffix=".wav")
        os.close(handle)
    try:
        shutil.copy(filename, newfilename)
        for channel in range(2):
            for _ in range(max_passes):
                result = depop(newfilename, newfilename, channel)
                if result == "":
                    break
                print(result + " in file " + filename + ", channel {}".format(channel + 1))
        if in_place:
            shutil.copy(newfilename, filename)
    finally:
        # never leave the scratch file behind, even if processing failed
        if in_place and os.path.exists(newfilename):
            os.remove(newfilename)
# Command-line entry point:
#   python3 depop.py input.wav [output.wav]
# When the output path is omitted, the input file is cleaned in place.
if __name__ == "__main__":
    import sys

    if len(sys.argv) not in (2, 3):
        sys.exit("usage: python3 depop.py input.wav [output.wav]")
    filename = sys.argv[1]
    filename2 = sys.argv[2] if len(sys.argv) == 3 else ""
    depop_file(filename, filename2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment