Skip to content

Instantly share code, notes, and snippets.

@vivekhaldar
Last active February 5, 2023 09:54
Show Gist options
  • Star 56 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save vivekhaldar/92368f35da2d8bb8f12734d840d11cfa to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import math
import sys
from moviepy.editor import AudioClip, VideoFileClip, concatenate_videoclips
# Get average RGB of part of a frame. Frame is H * W * 3 (rgb)
# Assumes x1 < x2, y1 < y2
def avg_rgb(frame, x1, y1, x2, y2):
    """Return the average (r, g, b) of the frame region [x1:x2) x [y1:y2).

    frame is an H x W x 3 RGB array (as yielded by moviepy's iter_frames),
    so x indexes rows and y indexes columns. Assumes x1 < x2 and y1 < y2.
    """
    r, g, b = 0, 0, 0
    for x in range(x1, x2):
        for y in range(y1, y2):
            # Cast to int before accumulating: frame samples are numpy
            # uint8 scalars, and summing them directly can wrap around at
            # 255 instead of producing the true total.
            r += int(frame[x, y, 0])
            g += int(frame[x, y, 1])
            b += int(frame[x, y, 2])
    total_pixels = (x2 - x1) * (y2 - y1)
    avg_r = r / total_pixels
    avg_g = g / total_pixels
    avg_b = b / total_pixels
    return avg_r, avg_g, avg_b
# Look for colors in frame, edit based on that.
# Returns list of (start, end) tuples of time intervals we want to keep.
def color_edit(video):
    """Scan each frame's marker patch and decide which intervals to keep.

    Samples the small patch at rows/cols 100-110 of every frame and tags
    the frame:
      'n' (red patch)   -> drop the interval that precedes it
      'y' (green patch) -> keep the interval that precedes it
      'c' (neither)     -> ordinary content
    Returns a list of [start, end] times in seconds.
    """
    frame_marker = []  # 'c': content; 'y': keep prior interval; 'n': drop prior interval.
    # Iterate over every frame.
    for frame in video.iter_frames():
        avg_r, avg_g, avg_b = avg_rgb(frame, 100, 100, 110, 110)
        is_red = (avg_r > 120) and (avg_g < 50) and (avg_b < 50)
        is_green = (avg_r < 50) and (avg_g > 120) and (avg_b < 50)
        marker = 'c'
        if is_red:
            marker = 'n'
        elif is_green:
            marker = 'y'
        frame_marker.append(marker)
    keep_start, keep_end = 0, 0
    keep_intervals = []
    start_of_last_green = 0
    for i in range(1, len(frame_marker)):
        m1 = frame_marker[i - 1]
        m2 = frame_marker[i]
        # Content followed by green, take note.
        if m1 == 'c' and m2 == 'y':
            start_of_last_green = i
        # Green followed by content. Keep previous interval. Start a (possible) new interval.
        if m1 == 'y' and m2 == 'c':
            keep_end = start_of_last_green / video.fps
            keep_intervals.append([keep_start, keep_end])
            keep_start = (i + 1) / video.fps
        # Red followed by content. Drop the previous interval. Start a (possible) new interval.
        if m1 == 'n' and m2 == 'c':
            keep_start = i / video.fps
    # Ending on content or green with no following content: close the
    # trailing interval. Use the explicit last index rather than the leaked
    # loop variable `i`, which is undefined for videos with < 2 frames.
    last_index = len(frame_marker) - 1
    if last_index >= 0 and frame_marker[last_index] in ('c', 'y'):
        keep_end = last_index / video.fps
        keep_intervals.append([keep_start, keep_end])
    return keep_intervals
# Iterate over audio to find the non-silent parts. Outputs a list of
# (speaking_start, speaking_end) intervals.
# Args:
#   window_size: (in seconds) hunt for silence in windows of this size
#   volume_threshold: volume below this threshold is considered to be silence
#   ease_in: (in seconds) add this much silence around speaking intervals
def find_speaking(audio_clip, window_size=0.1, volume_threshold=0.02, ease_in=0.1, audio_fps=44100):
    """Return [start, end] intervals (seconds) where the audio is non-silent.

    Splits the clip into windows of window_size seconds, classifies each
    window as silent via its max volume, then converts silence->speech and
    speech->silence transitions into padded speaking intervals.
    """
    # First, iterate over audio to find all silent windows.
    num_windows = math.floor(audio_clip.end / window_size)
    window_is_silent = []
    for i in range(num_windows):
        s = audio_clip.subclip(i * window_size, (i + 1) * window_size).set_fps(audio_fps)
        v = s.max_volume()
        window_is_silent.append(v < volume_threshold)
    # Find speaking intervals.
    speaking_start = 0
    speaking_end = 0
    speaking_intervals = []
    for i in range(1, len(window_is_silent)):
        e1 = window_is_silent[i - 1]
        e2 = window_is_silent[i]
        # silence -> speaking
        if e1 and not e2:
            speaking_start = i * window_size
        # speaking -> silence, now have a speaking interval
        if not e1 and e2:
            speaking_end = i * window_size
            # Clamp both ends: the ease_in padding must not push the
            # interval below 0 or past the end of the clip (an out-of-range
            # interval would make the downstream subclip call fail).
            new_speaking_interval = [max(0, speaking_start - ease_in),
                                     min(audio_clip.end, speaking_end + ease_in)]
            # With tiny windows, this can sometimes overlap the previous window, so merge.
            need_to_merge = len(speaking_intervals) > 0 and speaking_intervals[-1][1] > new_speaking_interval[0]
            if need_to_merge:
                merged_interval = [speaking_intervals[-1][0], new_speaking_interval[1]]
                speaking_intervals[-1] = merged_interval
            else:
                speaking_intervals.append(new_speaking_interval)
    return speaking_intervals
def main():
    """Entry point: color-edit the input video, trim dead air, write output.

    Usage: script.py <input_path> <output_path>
    """
    # Positional args: input video path, then output video path.
    source_path = sys.argv[1]
    dest_path = sys.argv[2]
    clip = VideoFileClip(source_path)
    # Pass 1: cut according to the red/green color markers.
    color_intervals = color_edit(clip)
    print("Keeping color edit intervals: " + str(color_intervals))
    color_cut = concatenate_videoclips(
        [clip.subclip(start, end) for [start, end] in color_intervals])
    # Pass 2: drop the silent stretches from the color-edited result.
    talk_intervals = find_speaking(color_cut.audio, audio_fps=clip.audio.fps)
    print("Keeping speaking intervals: " + str(talk_intervals))
    talk_clips = [color_cut.subclip(start, end) for [start, end] in talk_intervals]
    final_video = concatenate_videoclips(talk_clips)
    final_video.write_videofile(
        dest_path,
        preset='ultrafast',
        codec='libx264',
        temp_audiofile='temp-audio.m4a',
        remove_temp=True,
        audio_codec="aac",
        threads=6,
    )
    clip.close()


if __name__ == '__main__':
    main()
@Tenetri
Copy link

Tenetri commented May 3, 2022

really cool code! well done!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment