lzqlzzq/sound_detect.py

## sound_detect.py
"""
A script for labeling which parts of an audio file contain sound.

Dependencies:
pip install torch torchaudio
"""

import torch
from torch.nn import functional as F
import torchaudio


def amp_to_db(amp: torch.Tensor) -> torch.Tensor:
    """Convert linear amplitude to decibels, computed as ``20 * log10(amp)``.

    An amplitude of zero maps to ``-inf``.
    """
    return 20 * torch.log10(amp)

def detect(path: str,             # filepath of audio
        threshold: float = -30,   # threshold of "sounding", in decibel
        window_size: int = 300,   # window size of RMS, in millisecond
        hop_size: int = None,     # hop size of window, in millisecond, (window_size // 2) by default
        device: str = None,       # cuda can be used to accelerate
        ):
    """Locate the spans of an audio file whose loudness exceeds ``threshold``.

    The file is loaded with torchaudio, averaged over channels to mono and cut
    into windows of ``window_size`` taken every ``hop_size``.  Each window is
    zero-centered, its RMS is converted to decibels with ``amp_to_db`` and the
    window counts as "sounding" when that value is strictly above
    ``threshold``.  Runs of consecutive sounding windows form one span.

    Returns:
        A list of ``(start, end)`` tuples in seconds.  With ``i`` the index of
        the first sounding window of a run and ``j`` the index of the first
        window after it (or the window count when the run reaches the end),
        the tuple is ``((i + .5) * hop / sr, (j + .5) * hop / sr)`` where
        ``hop`` is the hop size in samples.
    """
    audio, sr = torchaudio.load(path)
    audio = audio.to(device).mean(dim=0)                      # convert to mono

    window_size = int((window_size / 1000) * sr)              # millisecond -> sample num
    hop_size = int((hop_size / 1000) * sr) if hop_size else window_size // 2

    def frame_to_second(frame):
        return (frame + .5) * hop_size / sr

    frames = audio.unfold(0, window_size, hop_size)           # split RMS window
    # unfold yields a view in which overlapping windows share samples, so the
    # mean must be subtracted out of place; an in-place update would subtract
    # several window means from the shared samples and also modify `audio`.
    frames = frames - frames.mean(dim=-1, keepdim=True)       # zero-center frames
    frames_rms = (frames ** 2).mean(dim=-1) ** .5             # calculate RMS
    loudness = amp_to_db(frames_rms)                          # convert to db

    # Pad with a silent frame on both sides so that a run touching either end
    # of the file still produces a rising and a falling edge.
    sounding = F.pad((loudness > threshold) * 1, (1, 1))
    vary_points = sounding[1:] - sounding[:-1]                # +1 rising edge, -1 falling edge

    # Return in seconds
    starts = frame_to_second(torch.where(vary_points == 1)[0]).cpu().tolist()
    ends = frame_to_second(torch.where(vary_points == -1)[0]).cpu().tolist()

    return list(zip(starts, ends))
	"""
	A script for labeling which parts of an audio file contain sound.

	Dependencies:
	pip install torch torchaudio
	"""

	import torch
	from torch.nn import functional as F
	import torchaudio


	def amp_to_db(amp: torch.Tensor) -> torch.Tensor:
	    """Convert linear amplitude to decibels, computed as ``20 * log10(amp)``.

	    An amplitude of zero maps to ``-inf``.
	    """
	    return 20 * torch.log10(amp)

	def detect(path: str,             # filepath of audio
	        threshold: float = -30,   # threshold of "sounding", in decibel
	        window_size: int = 300,   # window size of RMS, in millisecond
	        hop_size: int = None,     # hop size of window, in millisecond, (window_size // 2) by default
	        device: str = None,       # cuda can be used to accelerate
	        ):
	    """Locate the spans of an audio file whose loudness exceeds ``threshold``.

	    The file is loaded with torchaudio, averaged over channels to mono and cut
	    into windows of ``window_size`` taken every ``hop_size``.  Each window is
	    zero-centered, its RMS is converted to decibels with ``amp_to_db`` and the
	    window counts as "sounding" when that value is strictly above
	    ``threshold``.  Runs of consecutive sounding windows form one span.

	    Returns:
	        A list of ``(start, end)`` tuples in seconds.  With ``i`` the index of
	        the first sounding window of a run and ``j`` the index of the first
	        window after it (or the window count when the run reaches the end),
	        the tuple is ``((i + .5) * hop / sr, (j + .5) * hop / sr)`` where
	        ``hop`` is the hop size in samples.
	    """
	    audio, sr = torchaudio.load(path)
	    audio = audio.to(device).mean(dim=0)                      # convert to mono

	    window_size = int((window_size / 1000) * sr)              # millisecond -> sample num
	    hop_size = int((hop_size / 1000) * sr) if hop_size else window_size // 2

	    def frame_to_second(frame):
	        return (frame + .5) * hop_size / sr

	    frames = audio.unfold(0, window_size, hop_size)           # split RMS window
	    # unfold yields a view in which overlapping windows share samples, so the
	    # mean must be subtracted out of place; an in-place update would subtract
	    # several window means from the shared samples and also modify `audio`.
	    frames = frames - frames.mean(dim=-1, keepdim=True)       # zero-center frames
	    frames_rms = (frames ** 2).mean(dim=-1) ** .5             # calculate RMS
	    loudness = amp_to_db(frames_rms)                          # convert to db

	    # Pad with a silent frame on both sides so that a run touching either end
	    # of the file still produces a rising and a falling edge.
	    sounding = F.pad((loudness > threshold) * 1, (1, 1))
	    vary_points = sounding[1:] - sounding[:-1]                # +1 rising edge, -1 falling edge

	    # Return in seconds
	    starts = frame_to_second(torch.where(vary_points == 1)[0]).cpu().tolist()
	    ends = frame_to_second(torch.where(vary_points == -1)[0]).cpu().tolist()

	    return list(zip(starts, ends))