Cut a video file into silent and noisy clips (two output files), or into segments containing exactly one face and the remaining (not-one-face) segments
# requires https://github.com/Zulko/moviepy
# requires https://github.com/jiaaro/pydub
# requires https://github.com/ageitgey/face_recognition
from pydub import AudioSegment
from moviepy.editor import VideoFileClip, concatenate_videoclips
import face_recognition
import itertools
import numpy as np
# Output is a list of (start, end) tuples in milliseconds.
def silent_segments(seg):
    slice_size = 100  # analysis window, in milliseconds
    gap_tolerance = 1500  # how many milliseconds of silence is OK
    gap_segments_tol = gap_tolerance / slice_size
    # Loudness (dBFS) of each slice_size-ms slice of the audio.
    db_levels = [s.dBFS for s in seg[::slice_size]]
    # Estimate the loudness distribution of actual sound, ignoring
    # near-silence below -55 dBFS, and set the cutoff two standard
    # deviations below the mean.
    sound_dist = [d for d in db_levels if d > -55]
    mu = np.mean(sound_dist)
    sigma = np.std(sound_dist)
    trim_threshold = mu - 2 * sigma
    quiet_segs = []
    running_count = 0
    for i, level in enumerate(db_levels):
        if level > trim_threshold:
            # A loud slice ends any quiet run; keep the run only if it
            # lasted at least as long as the tolerance.
            if running_count >= gap_segments_tol:
                quiet_segs.append(((i - running_count) * slice_size, i * slice_size))
            running_count = 0
            continue
        running_count += 1
    # Capture a quiet run that extends to the end of the audio.
    if running_count >= gap_segments_tol:
        n = len(db_levels)
        quiet_segs.append(((n - running_count) * slice_size, n * slice_size))
    return quiet_segs
# Given a sorted list of (start, end) segments, return the gaps between
# them, i.e. the complement of the segments over [0, duration).
def infer_segments_complement(input_segments, duration):
    complement_segments = []
    offset = 0
    for seg in input_segments:
        if seg[0] > offset:
            complement_segments.append((offset, seg[0]))
        offset = seg[1]
    if offset < duration:
        complement_segments.append((offset, duration))
    return complement_segments
# Extract the given (start, end) segments from a clip and splice them
# together into a single video. Segment times are in milliseconds by
# default; pass units='seconds' to skip the conversion.
def cut_together(clip, segments, units='milliseconds'):
    clips = []
    for segment in segments:
        t_start = float(segment[0])
        t_stop = float(segment[1])
        if units == 'milliseconds':
            t_start /= 1000.0
            t_stop /= 1000.0
        clips.append(clip.subclip(t_start, t_stop))
    return concatenate_videoclips(clips)
# Split a video into a "quiet" file and a "noisy" file based on the
# loudness of its audio track.
def cut_to_relevant_video(input_file, output_name):
    audio_from_file = AudioSegment.from_file(input_file, "mp4")
    quiet_segments = silent_segments(audio_from_file)
    # len(AudioSegment) is its duration in milliseconds.
    noisy_segments = infer_segments_complement(quiet_segments, len(audio_from_file))
    full_file = VideoFileClip(input_file)
    quiet_clip = cut_together(full_file, quiet_segments)
    noisy_clip = cut_together(full_file, noisy_segments)
    quiet_clip.write_videofile("./%s_quiet.mp4" % output_name, temp_audiofile="temp-audio.m4a", remove_temp=True, codec="libx264", audio_codec="aac")
    noisy_clip.write_videofile("./%s_noisy.mp4" % output_name, temp_audiofile="temp-audio.m4a", remove_temp=True, codec="libx264", audio_codec="aac")
# Sample one frame every slice_size_in_seconds and record the (start, end)
# intervals, in seconds, during which exactly one face is visible.
def find_one_face_segments(video_clip, slice_size_in_seconds=0.5):
    one_face_segs = []
    offset = 0
    currently_single_face = False
    start_point = 0
    # Stop before the exact end; get_frame() can fail at t == duration.
    while offset < video_clip.duration:
        print(offset, "seconds.....")
        frame = video_clip.get_frame(offset)
        face_count = len(face_recognition.face_locations(frame))
        if currently_single_face and face_count != 1:  # close the interval
            currently_single_face = False
            one_face_segs.append((start_point, offset))
        elif not currently_single_face and face_count == 1:  # open an interval
            currently_single_face = True
            start_point = offset
        offset += slice_size_in_seconds
    # Close an interval that extends to the end of the clip.
    if currently_single_face:
        one_face_segs.append((start_point, video_clip.duration))
    return one_face_segs
# Split a video into a file containing the exactly-one-face footage and
# a file containing everything else.
def cut_to_single_face_video(input_file, output_name):
    video_clip = VideoFileClip(input_file)
    one_face_segments = find_one_face_segments(video_clip)
    other_segments = infer_segments_complement(one_face_segments, video_clip.duration)
    one_face_clip = cut_together(video_clip, one_face_segments, units='seconds')
    others_clip = cut_together(video_clip, other_segments, units='seconds')
    one_face_clip.write_videofile("./%s_one_face.mp4" % output_name, temp_audiofile="temp-audio.m4a", remove_temp=True, codec="libx264", audio_codec="aac")
    others_clip.write_videofile("./%s_NOT_one_face.mp4" % output_name, temp_audiofile="temp-audio.m4a", remove_temp=True, codec="libx264", audio_codec="aac")
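
# A minimal usage sketch, not part of the original gist: "input.mp4" and
# the "example" output prefix are placeholder names -- substitute any MP4
# with an audio track.
if __name__ == "__main__":
    # Writes ./example_quiet.mp4 and ./example_noisy.mp4
    cut_to_relevant_video("input.mp4", "example")
    # Writes ./example_one_face.mp4 and ./example_NOT_one_face.mp4
    cut_to_single_face_video("input.mp4", "example")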