Last active
April 9, 2024 12:58
-
-
Save dreness/a060def3e2748faf51ad821ceab7538c to your computer and use it in GitHub Desktop.
Extract "long notes" from audio, optimized for speech.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import parselmouth | |
# import IPython | |
import os | |
import sys | |
import pandas as pd | |
# Usage: pass one or more audio files as command-line arguments.
# Uncomment to stop pandas from truncating its repr()s:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
#
# A clip is considered "interesting" only if all three criteria hold:
# 1) pitch stability — the rolling standard deviation of the pitch track
#    (pandas Series.rolling(...).std() over a short window) stays below this.
max_std = 1.5
# 2) voicing strength — voiced pitches have a strength ("loudness") above this.
min_strength = 0.5
# 3) duration — the clip lasts at least this many seconds.
min_length = 0.75
# For each input file: compute a pitch track, tag frames that meet the
# stability/strength criteria, group tagged frames into contiguous runs,
# and save each sufficiently long run as its own wav clip.
for f in sys.argv[1:]:
    fname = os.path.basename(f)
    print(f"Processing {fname}")

    sound = parselmouth.read(f)
    po = sound.to_pitch()
    pitch_track = po.selected_array["frequency"]
    strength_track = po.selected_array["strength"]

    df = pd.DataFrame(
        {"pitch": pd.Series(pitch_track), "strength": pd.Series(strength_track)}
    )
    # Rolling standard deviation of the pitch over a 4-frame window;
    # low values indicate a stable (non-erratic) pitch contour.
    cs = df["pitch"].rolling(window=4).std()
    df = df.assign(dev=cs)
    # Map each pitch frame back to its timestamp in the audio.
    pt = pd.DataFrame({"pstd": cs, "ptime": pd.Series(po.xs())})

    # https://stackoverflow.com/questions/24281936/delimiting-contiguous-regions-with-values-above-a-certain-threshold-in-pandas-da
    # Tag the frames that satisfy our pitch and strength criteria, then find
    # the first (fst) and last (lst) frame index of each contiguous tagged run.
    df["tag"] = (df["dev"] < max_std) & (df["strength"] > min_strength)
    # shift(..., fill_value=False) keeps the column boolean; the previous
    # .shift().fillna(False) upcasts to object dtype and trips the pandas
    # silent-downcasting deprecation in recent releases.
    fst = df.index[df["tag"] & ~df["tag"].shift(1, fill_value=False)]
    lst = df.index[df["tag"] & ~df["tag"].shift(-1, fill_value=False)]

    # Find sufficiently long sections of audio.
    # fst/lst hold frame numbers; convert the minimum clip length from
    # seconds to frames by dividing by the pitch-tracking time step.
    min_frame_offset = min_length / po.get_time_step()
    pr = [(a, b) for a, b in zip(fst, lst) if b > a + min_frame_offset]

    for i, (first, last) in enumerate(pr):
        # Emergency brake, useful when tuning settings to avoid writing
        # tens of thousands of wav files.
        if i > 1000:
            print("emergency brakes!")
            break
        ftime = pt.ptime[first]
        ttime = pt.ptime[last]
        # Snap each clip boundary to the nearest zero crossing to avoid
        # audible clicks at the cut points.
        ftime = sound.get_nearest_zero_crossing(ftime)
        ttime = sound.get_nearest_zero_crossing(ttime)
        print(f"extracting {ttime-ftime:.2f} seconds from {ftime:.2f} to {ttime:.2f}")
        e = sound.extract_part(
            from_time=ftime,
            to_time=ttime,
        )
        e.save(f"{fname}-{i}.wav", "WAV")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
... then concatenate the clips together and you get:
sample.mov