Skip to content

Instantly share code, notes, and snippets.

@dreness
Last active April 9, 2024 12:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dreness/a060def3e2748faf51ad821ceab7538c to your computer and use it in GitHub Desktop.
Save dreness/a060def3e2748faf51ad821ceab7538c to your computer and use it in GitHub Desktop.
Extract "long notes" from audio, optimized for speech.
import parselmouth
# import IPython
import os
import sys
import pandas as pd
# Feed me audio files as command line arguments
# uncomment this to stop pandas from being so coy in its repr()s
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Tuning knobs. A stretch of audio is extracted only when it passes all
# three of the tests below.
# 1) Pitch stability: the rolling standard deviation of the pitch track
# (computed with pd.core.series.Series.std over a 4-frame window) must be
# below this value.
max_std = 1.5
# 2) Strength: the strength ("loudness") reported for the frame's pitch
# must be greater than this value.
min_strength = 0.5
# 3) Duration: the stretch must be longer than this, in seconds.
min_length = 0.75
def find_long_runs(tag, min_frames):
    """Return the long runs of consecutive True values in *tag*.

    Args:
        tag: Boolean pandas Series with one entry per pitch frame.
        min_frames: A run is kept only when ``last > first + min_frames``.

    Returns:
        A list of ``(first, last)`` index labels (both inclusive), in the
        order the runs occur.
    """
    # https://stackoverflow.com/questions/24281936/delimiting-contiguous-regions-with-values-above-a-certain-threshold-in-pandas-da
    # fill_value=False keeps the shifted Series boolean, so `~` is a plain
    # logical NOT. shift() followed by fillna(False) goes through an
    # object-dtype intermediate, and the silent downcast back to bool that
    # made it work is deprecated in recent pandas releases.
    prev_tag = tag.shift(1, fill_value=False)
    next_tag = tag.shift(-1, fill_value=False)
    # A run starts where a frame is tagged but its predecessor is not, and
    # ends where a frame is tagged but its successor is not.
    firsts = tag.index[tag & ~prev_tag]
    lasts = tag.index[tag & ~next_tag]
    return [(i, j) for i, j in zip(firsts, lasts) if j > i + min_frames]


def extract_long_notes(path):
    """Write every sufficiently long, steady, strong section of *path* to WAV.

    The selection thresholds are the module-level ``max_std``,
    ``min_strength`` and ``min_length`` settings. Each clip is saved in the
    current directory as ``<basename of path>-<n>.wav``.

    Args:
        path: Audio file to analyse.
    """
    fname = os.path.basename(path)
    print(f"Processing {fname}")
    sound = parselmouth.read(path)
    po = sound.to_pitch()
    frames = pd.DataFrame(
        {
            "pitch": po.selected_array["frequency"],
            "strength": po.selected_array["strength"],
        }
    )
    # Time of each pitch frame, indexed by frame number.
    frame_times = po.xs()

    # Tag the frames that satisfy our pitch and strength criteria.
    dev = frames["pitch"].rolling(window=4).std()
    tag = (dev < max_std) & (frames["strength"] > min_strength)

    # Find sufficiently long sections of audio.
    # Run boundaries are frame numbers. To allow specifying an extraction
    # duration in seconds, we need to multiply by the inverse of the time
    # step used for the pitch tracking.
    min_frame_offset = min_length * (1.0 / po.get_time_step())

    for i, (first, last) in enumerate(find_long_runs(tag, min_frame_offset)):
        # Emergency brake, useful when tuning settings to avoid tens of
        # thousands of wav files.
        if i > 1000:
            print("emergency breaks!")
            break
        # Move each clip boundary to the nearest zero crossing.
        ftime = sound.get_nearest_zero_crossing(frame_times[first])
        ttime = sound.get_nearest_zero_crossing(frame_times[last])
        print(f"extracting {ttime-ftime:.2f} seconds from {ftime:.2f} to {ttime:.2f}")
        clip = sound.extract_part(
            from_time=ftime,
            to_time=ttime,
        )
        clip.save(f"{fname}-{i}.wav", "WAV")


def main():
    """Process every audio file named on the command line."""
    for f in sys.argv[1:]:
        extract_long_notes(f)


if __name__ == "__main__":
    main()
@dreness
Copy link
Author

dreness commented Oct 18, 2021

% python longtone_finder.py The\ Exorcism\ of\ Cake\ Boss.mp3  
Processing 289  The Exorcism of Cake Boss.mp3
extracting 0.83 seconds from 46.03 to 46.86
extracting 1.56 seconds from 489.47 to 491.03
extracting 0.82 seconds from 517.63 to 518.45
extracting 1.42 seconds from 1038.00 to 1039.42
extracting 0.93 seconds from 1040.53 to 1041.46
extracting 0.78 seconds from 1436.46 to 1437.24
extracting 1.15 seconds from 4300.93 to 4302.08
extracting 0.98 seconds from 4757.88 to 4758.86
extracting 0.76 seconds from 5354.36 to 5355.12
extracting 2.61 seconds from 5608.27 to 5610.88

... then concatenate the clips together and you get:

sample.mov

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment