Last active
April 9, 2024 12:58
-
-
Save dreness/a060def3e2748faf51ad821ceab7538c to your computer and use it in GitHub Desktop.
Extract "long notes" from audio, optimized for speech.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import parselmouth | |
# import IPython | |
import os | |
import sys | |
import pandas as pd | |
# Usage: pass one or more audio files as command-line arguments.
# Uncomment to stop pandas from truncating its repr()s:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
#
# A clip is considered "interesting" only if all three criteria hold:
# 1) pitch stability — the rolling standard deviation of the pitch track
#    (pandas Series.rolling(...).std() over a short window) stays below this.
max_std = 1.5
# 2) voicing strength — voiced pitches have a strength ("loudness") above this.
min_strength = 0.5
# 3) duration — the clip lasts at least this many seconds.
min_length = 0.75
# For each input file: compute a pitch track, tag frames that meet the
# stability/strength criteria, group tagged frames into contiguous runs,
# and save each sufficiently long run as its own wav clip.
for f in sys.argv[1:]:
    fname = os.path.basename(f)
    print(f"Processing {fname}")

    sound = parselmouth.read(f)
    po = sound.to_pitch()
    pitch_track = po.selected_array["frequency"]
    strength_track = po.selected_array["strength"]

    df = pd.DataFrame(
        {"pitch": pd.Series(pitch_track), "strength": pd.Series(strength_track)}
    )
    # Rolling standard deviation of the pitch over a 4-frame window;
    # low values indicate a stable (non-erratic) pitch contour.
    cs = df["pitch"].rolling(window=4).std()
    df = df.assign(dev=cs)
    # Map each pitch frame back to its timestamp in the audio.
    pt = pd.DataFrame({"pstd": cs, "ptime": pd.Series(po.xs())})

    # https://stackoverflow.com/questions/24281936/delimiting-contiguous-regions-with-values-above-a-certain-threshold-in-pandas-da
    # Tag the frames that satisfy our pitch and strength criteria, then find
    # the first (fst) and last (lst) frame index of each contiguous tagged run.
    df["tag"] = (df["dev"] < max_std) & (df["strength"] > min_strength)
    # shift(..., fill_value=False) keeps the column boolean; the previous
    # .shift().fillna(False) upcasts to object dtype and trips the pandas
    # silent-downcasting deprecation in recent releases.
    fst = df.index[df["tag"] & ~df["tag"].shift(1, fill_value=False)]
    lst = df.index[df["tag"] & ~df["tag"].shift(-1, fill_value=False)]

    # Find sufficiently long sections of audio.
    # fst/lst hold frame numbers; convert the minimum clip length from
    # seconds to frames by dividing by the pitch-tracking time step.
    min_frame_offset = min_length / po.get_time_step()
    pr = [(a, b) for a, b in zip(fst, lst) if b > a + min_frame_offset]

    for i, (first, last) in enumerate(pr):
        # Emergency brake, useful when tuning settings to avoid writing
        # tens of thousands of wav files.
        if i > 1000:
            print("emergency brakes!")
            break
        ftime = pt.ptime[first]
        ttime = pt.ptime[last]
        # Snap each clip boundary to the nearest zero crossing to avoid
        # audible clicks at the cut points.
        ftime = sound.get_nearest_zero_crossing(ftime)
        ttime = sound.get_nearest_zero_crossing(ttime)
        print(f"extracting {ttime-ftime:.2f} seconds from {ftime:.2f} to {ttime:.2f}")
        e = sound.extract_part(
            from_time=ftime,
            to_time=ttime,
        )
        e.save(f"{fname}-{i}.wav", "WAV")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
... then concatenate the clips together and you get:
sample.mov