Skip to content

Instantly share code, notes, and snippets.

@kylemcdonald
Created December 15, 2015 05:50
Show Gist options
  • Star 18 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save kylemcdonald/c8e62ef8cb9515d64df4 to your computer and use it in GitHub Desktop.
Save kylemcdonald/c8e62ef8cb9515d64df4 to your computer and use it in GitHub Desktop.
Split an audio file into multiple files based on detected onsets from librosa.
#!/usr/bin/env python
import argparse
import matplotlib.pyplot as plt
import librosa
import numpy as np
import os
from progressbar import ProgressBar
parser = argparse.ArgumentParser(
description='Split audio into multiple files and save analysis.')
parser.add_argument('-i', '--input', type=str)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()
y, sr = librosa.load(args.input, sr=args.sr)
o_env = librosa.onset.onset_strength(y, sr=sr, feature=librosa.cqt)
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
def prepare(y, sr=22050):
y = librosa.to_mono(y)
y = librosa.util.fix_length(y, sr) # 1 second of audio
y = librosa.util.normalize(y)
return y
def get_fingerprint(y, sr=22050):
y = prepare(y, sr)
cqt = librosa.cqt(y, sr=sr, hop_length=2048)
return cqt.flatten('F')
def normalize(x):
x -= x.min(axis=0)
x /= x.max(axis=0)
return x
def basename(file):
file = os.path.basename(file)
return os.path.splitext(file)[0]
vectors = []
words = []
filenames = []
onset_samples = list(librosa.frames_to_samples(onset_frames))
onset_samples = np.concatenate(onset_samples, len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]
analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
try:
os.makedirs(samples_folder)
except:
pass
pbar = ProgressBar()
for i, (start, stop) in enumerate(pbar(zip(starts, stops))):
audio = y[start:stop]
filename = os.path.join(samples_folder, str(i) + '.wav')
librosa.output.write_wav(filename, audio, sr)
vector = get_fingerprint(audio, sr=sr)
word = basename(filename)
vectors.append(vector)
words.append(word)
filenames.append(filename)
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')
@Cmefteh
Copy link

Cmefteh commented May 29, 2019

Hello! i'm trying to split an audio into parts using librosa, but when i run your code, especially the for loop, i get this error :

ValueError: cannot convert float NaN to integer

I haven't understand where is the problem! can you help me please to solve it?

@usworked
Copy link

usworked commented Jun 6, 2019

Hello! i'm trying to split an audio into parts using librosa, but when i run your code, especially the for loop, i get this error :

ValueError: cannot convert float NaN to integer

I haven't understand where is the problem! can you help me please to solve it?

Get rid of the progress bar and the float issue will go away.

Here's how I wrote the bottom part:

for i, (start, stop) in enumerate(zip(starts, stops)):
        audio = y[start:stop]
        filename = str(i) + '.wav'
        librosa.output.write_wav(filename, audio, sr)

@Cmefteh
Copy link

Cmefteh commented Jun 11, 2019

It works now! thank you very much ^^

@cri5Castro
Copy link

which version of librosa are you using? I'm getting the following error

 librosa/onset.py", line 538, in onset_strength_multi
    S = np.abs(feature(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, **kwargs))
TypeError: cqt() got an unexpected keyword argument 'n_fft'
(ppdataenv)

@rintala
Copy link

rintala commented Feb 3, 2020

@cri5Castro, I am getting the same error. Guessing it is caused by updates of Librosa. Did you have any progress on this matter?

@rintala
Copy link

rintala commented Feb 3, 2020

@cri5Castro - Update: I did some investigation, and the syntax is indeed a bit updated. If you want to use the Constant-Q spectrogram, just replace the code for the variable o_env with:

C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))

Reference to documentation: https://librosa.github.io/librosa/generated/librosa.onset.onset_strength.html

@stackoverflowsam93
Copy link

I ran this and got


Traceback (most recent call last):
  File "/Users/sam/Documents/Dev/mp3-sound-detection/src/./split_transients.py", line 57, in <module>
    for i, (start, stop) in enumerate(pbar(zip(starts, stops))):
  File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 152, in __next__
    self.start()
  File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 291, in start
    self.update(0)
  File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 262, in update
    self.fd.write(self._format_line() + '\r')
  File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 222, in _format_line
    widgets = ''.join(self._format_widgets())
  File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 202, in _format_widgets
    widget = widgets.format_updatable(widget, self)
  File "/usr/local/lib/python3.9/site-packages/progressbar/widgets.py", line 39, in format_updatable
    if hasattr(updatable, 'update'): return updatable.update(pbar)
  File "/usr/local/lib/python3.9/site-packages/progressbar/widgets.py", line 230, in update
    return '%3d%%' % pbar.percentage()
ValueError: cannot convert float NaN to integer

@stackoverflowsam93
Copy link

The following code works, I had to ditch the progress bar though

#!/usr/bin/env python

import argparse
import matplotlib.pyplot as plt
import librosa
import numpy as np
import os
import soundfile as sf

parser = argparse.ArgumentParser(
	description='Split audio into multiple files and save analysis.')
parser.add_argument('-i', '--input', type=str)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

def prepare(y, sr=22050):
    y = librosa.to_mono(y)
    y = librosa.util.fix_length(y, sr) # 1 second of audio
    y = librosa.util.normalize(y)
    return y

def get_fingerprint(y, sr=22050):
    y = prepare(y, sr)
    cqt = librosa.cqt(y, sr=sr, hop_length=2048)
    return cqt.flatten('F')

def normalize(x):
    x -= x.min(axis=0)
    x /= x.max(axis=0)
    return x

def basename(file):
    file = os.path.basename(file)
    return os.path.splitext(file)[0]

vectors = []
words = []
filenames = []

y, sr = librosa.load(args.input, sr=args.sr)
C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)

onset_samples = list(librosa.frames_to_samples(onset_frames))
onset_samples = np.concatenate(onset_samples, len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]
analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
try:
	os.makedirs(samples_folder)
except:
	pass
enumeration = enumerate(zip(starts, stops))
#print(list(enumeration))
for i, (start, stop) in enumeration:
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    sf.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')

@elgiano
Copy link

elgiano commented Dec 8, 2021

Here is a working version with ProgressBar and librosa 0.8,1

#!/usr/bin/env python

import argparse
import soundfile
import librosa
import numpy as np
import os
from progressbar import ProgressBar, Percentage, Bar

parser = argparse.ArgumentParser(
	description='Split audio into multiple files and save analysis.')
parser.add_argument('-i', '--input', type=str)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

print(f'Loading {args.input}')
y, sr = librosa.load(args.input, sr=args.sr)
print('Calculating CQT')
C = np.abs(librosa.cqt(y=y, sr=sr))
print('Extracting onsets')
o_env = librosa.onset.onset_strength(y, sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)

def prepare(y, sr=22050):
    y = librosa.to_mono(y)
    y = librosa.util.fix_length(y, sr) # 1 second of audio
    y = librosa.util.normalize(y)
    return y

def get_fingerprint(y, sr=22050):
    y = prepare(y, sr)
    cqt = librosa.cqt(y, sr=sr, hop_length=2048)
    return cqt.flatten('F')

def normalize(x):
    x -= x.min(axis=0)
    x /= x.max(axis=0)
    return x

def basename(file):
    file = os.path.basename(file)
    return os.path.splitext(file)[0]

vectors = []
words = []
filenames = []

onset_samples = list(librosa.frames_to_samples(onset_frames))
onset_samples = np.concatenate(onset_samples, len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]
analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
num_segments = len(onset_samples)
print(f'Writing {num_segments} segments to {samples_folder}')
try:
	os.makedirs(samples_folder)
except:
	pass
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=num_segments).start()
for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    soundfile.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
    pbar.update(i+1)
pbar.finish()
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')

@RichardJohnn
Copy link

thank you @elgiano !

@ni-dschiller
Copy link

This works with librosa==0.10.1 and numpy==1.26.4:

#!/usr/bin/env python

import argparse
import soundfile
import librosa
import numpy as np
import os
from progressbar import ProgressBar, Percentage, Bar

parser = argparse.ArgumentParser(
	description='Split audio into multiple files and save analysis.')
parser.add_argument('-i', '--input', type=str)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

print(f'Loading {args.input}')
y, sr = librosa.load(args.input, sr=args.sr)
print('Calculating CQT')
C = np.abs(librosa.cqt(y=y, sr=sr))
print('Extracting onsets')
o_env = librosa.onset.onset_strength(y=y, sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)

def prepare(y, sr=22050):
    y = librosa.to_mono(y)
    y = librosa.util.fix_length(y, size=sr) # 1 second of audio
    y = librosa.util.normalize(y)
    return y

def get_fingerprint(y, sr=22050):
    y = prepare(y, sr)
    cqt = librosa.cqt(y, sr=sr, hop_length=2048)
    return cqt.flatten('F')

def normalize(x):
    x -= x.min(axis=0)
    x /= x.max(axis=0)
    return x

def basename(file):
    file = os.path.basename(file)
    return os.path.splitext(file)[0]

vectors = []
words = []
filenames = []

onset_samples = list(librosa.frames_to_samples(onset_frames))
onset_samples = np.concatenate(onset_samples, len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]
analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
num_segments = len(onset_samples)
print(f'Writing {num_segments} segments to {samples_folder}')
try:
	os.makedirs(samples_folder)
except:
	pass
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=num_segments).start()
for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    soundfile.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
    pbar.update(i+1)
pbar.finish()
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')

Usage:

python Scripts/split2.py -i "Voices/Alexine Dreams 24-02-17.mp3"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment