Created February 14, 2020 23:50
audio transcription poc
pip install adtlib librosa spleeter cython adtlib matplotlib
Music IR from
# Download song
import IPython
import IPython.display as ipd
import librosa
import numpy
from ADTLib import ADT
# Base name shared by the downloaded file and every artifact derived from it.
song_name = "give_you_up"
# Shell out to youtube-dl: extract the audio track as mp3, keep the video, and
# name the output after song_name.
# NOTE(review): no URL argument is visible in this command — presumably it was
# lost when the notebook was exported. TODO confirm and restore it.
%time !youtube-dl --extract-audio --audio-format mp3 --keep-video -o "{song_name}.%(ext)s"
# Embed an HTML audio player whose source is /files/{song_name}.mp3.
ipd.HTML(f'<audio controls src="/files/{song_name}.mp3"></audio>')
# Split song
# Separate the mp3 into stems with spleeter's 5-stem model, writing the
# results under the current directory (-o .).
%time !spleeter separate -i {song_name}.mp3 -p spleeter:5stems -o .
"outputs": []
# Paths of the five stem files, each inside a directory named after the song.
piano_filename = f"{song_name}/piano.wav"
bass_filename = f"{song_name}/bass.wav"
other_filename = f"{song_name}/other.wav"
vocals_filename = f"{song_name}/vocals.wav"
drums_filename = f"{song_name}/drums.wav"
# Audio player for the isolated drum stem.
ipd.HTML(f'<audio controls src="/files/{drums_filename}"></audio>')
# Build a drumless backing track: delete any earlier output, then mix the
# piano, bass, other and vocals stems with ffmpeg's amix filter
# (duration=longest).
partial_song_filename = f"{song_name}_drumless.mp3"
!rm -f {partial_song_filename}
%time !ffmpeg -i {song_name}/piano.wav -i {song_name}/bass.wav -i {song_name}/other.wav -i {song_name}/vocals.wav -filter_complex amix=inputs=4:duration=longest {partial_song_filename}
# Audio players for the bass stem, the piano stem and the drumless mix.
ipd.HTML(f'<audio controls src="/files/{bass_filename}"></audio>')
ipd.HTML(f'<audio controls src="/files/{piano_filename}"></audio>')
ipd.HTML(f'<audio controls src="/files/{partial_song_filename}"></audio>')
# Transcribe drums
# Load the drum stem; x holds the samples and sr the sampling rate.
%time x, sr = librosa.load(drums_filename)
"outputs": []
# Run ADTLib's transcription on the drum stem and keep the first result
# (one file was passed in, so index 0 is the entry for that file).
%time drum_onsets = ADT([drums_filename])[0]
drum_onsets
# For each of the 'Kick', 'Snare' and 'Hihat' entries: synthesize clicks at
# the transcribed times, padded to the length of x, and play them mixed over
# the drum stem so the transcription can be checked by ear.
clicks = librosa.clicks(times=drum_onsets['Kick'], sr=sr, length=len(x))
ipd.Audio(x + clicks, rate=sr)
clicks = librosa.clicks(times=drum_onsets['Snare'], sr=sr, length=len(x))
ipd.Audio(x + clicks, rate=sr)
clicks = librosa.clicks(times=drum_onsets['Hihat'], sr=sr, length=len(x))
ipd.Audio(x + clicks, rate=sr)
### Question: do we want to sample the kick, snare, and hi-hat sounds from the track and replay them when the user plays along on a MIDI drum kit?
# Transcribe vocals


It might also help to backtrack from the detected onsets (see the `backtrack` argument of `librosa.onset.onset_detect`, currently `False`)?
# Load the vocal stem; x holds the samples and sr the sampling rate.
# This rebinds the x and sr that were loaded from the drum stem earlier.
%time x, sr = librosa.load(vocals_filename)
"outputs": []
sr, x
"outputs": []
# Audio player for the vocal stem.
ipd.Audio(x, rate=sr)
"outputs": []
from librosa import amplitude_to_db
# Constant-Q transform of the vocals: 300 bins at 36 bins per octave,
# converted to decibels for display.
bins_per_octave = 36
cqt = librosa.cqt(x, sr=sr, n_bins=300, bins_per_octave=bins_per_octave)
log_cqt = amplitude_to_db(cqt)
import librosa.display
# Draw the dB-scaled CQT with a time x axis and the 'cqt_note' y axis.
librosa.display.specshow(log_cqt, sr=sr, x_axis='time', y_axis='cqt_note', 
                         bins_per_octave=bins_per_octave)
import matplotlib.pyplot as plt
# Onset-strength envelope of the vocal stem, computed every hop_length
# samples and plotted over its full length.
hop_length = 100
onset_env = librosa.onset.onset_strength(x, sr=sr, hop_length=hop_length)
plt.plot(onset_env)
plt.xlim(0, len(onset_env))
# Detect note onsets, reported as sample indices (units='samples'). The
# pre_max ... wait keyword arguments are forwarded to librosa's peak picking.
onset_samples = librosa.onset.onset_detect(x,
                                           sr=sr, units='samples',
                                           hop_length=hop_length,
                                           backtrack=False,
                                           pre_max=20,
                                           post_max=20,
                                           pre_avg=100,
                                           post_avg=100,
                                           delta=0.2,
                                           wait=0)
onset_samples
# Add the first and one-past-last sample indices so that each pair of
# consecutive boundaries delimits one segment of x.
onset_boundaries = numpy.concatenate([[0], onset_samples, [len(x)]])
onset_times = librosa.samples_to_time(onset_boundaries, sr=sr)
# Draw the waveform with a red vertical line at every boundary.
librosa.display.waveplot(x, sr=sr)
plt.vlines(onset_times, -1, 1, color='r')
def estimate_pitch(segment, sr, fmin=50.0, fmax=2000.0):
    """Estimate the fundamental frequency of ``segment`` by autocorrelation.

    The pitch is read from the lag with the strongest autocorrelation among
    the lags whose frequencies lie between ``fmin`` and ``fmax``.

    Args:
        segment: One-dimensional sequence of audio samples.
        sr: Sampling rate of ``segment`` in Hz.
        fmin: Lowest frequency, in Hz, that may be reported.
        fmax: Highest frequency, in Hz, that may be reported.

    Returns:
        The estimated frequency in Hz, or ``0.0`` when no pitch can be
        estimated: the segment is empty, it is too short to contain any
        admissible lag, or no admissible lag has a positive autocorrelation
        (for example a silent segment). ``generate_sine`` renders 0 Hz as
        silence.
    """
    if len(segment) == 0:
        return 0.0

    # Compute autocorrelation of input segment.
    r = librosa.autocorrelate(segment)

    # A period of i samples corresponds to sr/i Hz, so fmax bounds the
    # shortest admissible lag and fmin the longest. Lag 0 is never admissible:
    # it is the trivial peak and would make the division below blow up.
    i_min = max(1, int(sr/fmax))
    i_max = min(len(r), int(sr/fmin))
    if i_min >= i_max:
        return 0.0

    # Search only the admissible lags instead of zeroing the rest of ``r``;
    # otherwise a window without a positive peak lets a zeroed entry at
    # lag 0 win the argmax.
    window = r[i_min:i_max]
    peak = int(window.argmax())
    if window[peak] <= 0:
        return 0.0
    return float(sr)/(i_min + peak)

def generate_sine(f0, sr, n_duration):
    """Return ``n_duration`` samples of a sine wave at ``f0`` Hz.

    The wave is sampled at ``sr`` Hz, starts at phase zero and has a peak
    amplitude of 0.2.
    """
    sample_index = numpy.arange(n_duration)
    phase = 2*numpy.pi*f0*sample_index/float(sr)
    return 0.2*numpy.sin(phase)

def estimate_pitch_and_generate_sine(x, onset_samples, i, sr):
    """Synthesize a sine tone standing in for the ``i``-th segment of ``x``.

    The segment runs from ``onset_samples[i]`` up to (not including)
    ``onset_samples[i+1]``. Its pitch is estimated with ``estimate_pitch``
    and a sine of that pitch and of the segment's length is returned.
    """
    start, stop = onset_samples[i], onset_samples[i+1]
    pitch = estimate_pitch(x[start:stop], sr)
    return generate_sine(pitch, sr, stop - start)
# Replace every boundary-delimited segment of x with a sine tone at its
# estimated pitch, join the tones end to end, and play the result.
y = numpy.concatenate(
    [
        estimate_pitch_and_generate_sine(x, onset_boundaries, idx, sr=sr)
        for idx, _ in enumerate(onset_boundaries[:-1])
    ]
)
ipd.Audio(y, rate=sr)
# Transcribe bass
## Get sample
# File that will hold a short excerpt of the bass stem.
bass_sample_filename = "bass_sample.wav"
"outputs": []
# Remove any earlier excerpt, then have ffmpeg cut the excerpt out of the bass
# stem: start at the 6-second mark (-ss 6) and keep 4 seconds (-t 4).
!rm -f {bass_sample_filename}
!ffmpeg -i {bass_filename} -ss 6 -t 4 {bass_sample_filename}
"outputs": []
# Audio player for the excerpt.
ipd.HTML(f'<audio controls src="/files/{bass_sample_filename}"></audio>')
"outputs": []
input: x, sr
output:
# Load the bass excerpt; this rebinds x and sr, which previously held the
# vocal stem.
x, sr = librosa.load(bass_sample_filename)


def get_onset_boundaries(x, sr, hop_length=100):
    """Return the sample indices that split ``x`` into onset-delimited segments.

    Args:
        x: Audio samples to analyse.
        sr: Sampling rate of ``x``.
        hop_length: Hop length, in samples, passed to librosa's onset
            detector.

    Returns:
        An array holding 0, then every detected onset as a sample index,
        then ``len(x)``, so that each pair of consecutive entries delimits
        one segment of ``x``.
    """
    # The pre_max ... wait keyword arguments are forwarded to librosa's peak
    # picking.
    onset_samples = librosa.onset.onset_detect(x,
                                               sr=sr, units='samples',
                                               hop_length=hop_length,
                                               backtrack=False,
                                               pre_max=20,
                                               post_max=20,
                                               pre_avg=100,
                                               post_avg=100,
                                               delta=0.2,
                                               wait=0)
    return numpy.concatenate([[0], onset_samples, [len(x)]])


onset_boundaries = get_onset_boundaries(x, sr)
def chart_onset_times(x, sr, onset_boundaries):
    """Plot the waveform of ``x`` with a red vertical line at every boundary.

    ``onset_boundaries`` holds sample indices; they are converted to seconds
    using ``sr`` before being drawn from -1 to 1 on the y axis.
    """
    librosa.display.waveplot(x, sr=sr)
    boundary_seconds = librosa.samples_to_time(onset_boundaries, sr=sr)
    plt.vlines(boundary_seconds, -1, 1, color='r')
# Show the detected segment boundaries on top of the waveform.
chart_onset_times(x, sr, onset_boundaries)
# Replace every boundary-delimited segment of x with a sine tone at its
# estimated pitch, join the tones end to end, and play the result.
y = numpy.concatenate(
    [
        estimate_pitch_and_generate_sine(x, onset_boundaries, idx, sr=sr)
        for idx, _ in enumerate(onset_boundaries[:-1])
    ]
)
ipd.Audio(y, rate=sr)
