Skip to content

Instantly share code, notes, and snippets.

@lzqlzzq
Created January 18, 2024 07:33
Show Gist options
  • Save lzqlzzq/9f332bb56805c8e1444e1549f1b63085 to your computer and use it in GitHub Desktop.
Save lzqlzzq/9f332bb56805c8e1444e1549f1b63085 to your computer and use it in GitHub Desktop.
A script for labeling which parts of an audio file contain sound
"""
A script for labeling which parts of an audio file contain sound.
Dependencies:
pip install torch torchaudio
"""
import torch
from torch.nn import functional as F
import torchaudio
def amp_to_db(amp):
    """Convert a linear amplitude tensor to decibels.

    Computes ``20 * log10(amp)`` element-wise. An amplitude of 0 maps to
    ``-inf``, which compares as "below any threshold".
    """
    return 20 * torch.log10(amp)
def detect(path: str,  # filepath of audio
           threshold: float = -30,  # threshold of "sounding", in decibel
           window_size: int = 300,  # window size of RMS, in millisecond
           hop_size: int = None,  # hop size of window, in millisecond, (window_size // 2) by default
           device: str = None,  # cuda can be used to accelerate
           ):
    """Find the regions of an audio file that are "sounding".

    The audio is mixed down to mono and cut into (possibly overlapping)
    windows. Each window is zero-centered, its RMS level is converted to
    decibels, and the window counts as sounding when that level is strictly
    greater than ``threshold``. Consecutive sounding windows are merged into
    one region.

    Args:
        path: Path of the audio file, passed to ``torchaudio.load``.
        threshold: Loudness threshold in dB.
        window_size: Length of the RMS window in milliseconds.
        hop_size: Step between consecutive windows in milliseconds. ``None``
            (or 0) selects half of the window length.
        device: Device the samples are moved to before processing.

    Returns:
        A list of ``(start, end)`` tuples in seconds, one per sounding region.
        A frame index ``i`` is reported as ``(i + 0.5) * hop / sample_rate``;
        ``start`` uses the first sounding frame of the region and ``end`` the
        first frame after it. The list is empty when the audio is shorter than
        a single window.
    """
    audio, sr = torchaudio.load(path)
    audio = audio.to(device).mean(dim=0)  # convert to mono

    # Convert the millisecond arguments into sample counts.
    window_size = int((window_size / 1000) * sr)
    hop_size = int((hop_size / 1000) * sr) if hop_size else window_size // 2
    # A step of 0 samples is rejected by unfold; advance by at least one.
    hop_size = max(hop_size, 1)

    # Not even one full window available: nothing can be measured.
    if audio.shape[0] < window_size:
        return []

    windows = audio.unfold(0, window_size, hop_size)  # split RMS windows
    # Subtract out of place: unfold returns a view, and overlapping windows
    # share the same underlying samples. An in-place "-=" would write through
    # that shared memory and remove more than one frame mean from the same
    # sample, corrupting the levels computed below.
    centered = windows - windows.mean(dim=-1, keepdim=True)
    frames_rms = (centered ** 2).mean(dim=-1) ** .5  # calculate RMS
    loudness = amp_to_db(frames_rms)  # convert to db

    # Pad one silent frame on each side so that a region touching the first or
    # last frame still produces both a rising and a falling edge.
    sounding = F.pad((loudness > threshold) * 1, (1, 1))
    vary_points = sounding[1:] - sounding[:-1]  # +1 rising edge, -1 falling edge

    def frame_to_second(frame):
        return (frame + .5) * hop_size / sr

    # Return in seconds
    starts = frame_to_second(torch.where(vary_points == 1)[0]).cpu().tolist()
    ends = frame_to_second(torch.where(vary_points == -1)[0]).cpu().tolist()
    return list(zip(starts, ends))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment