@rolux · Last active December 22, 2023
# git clone https://github.com/NVlabs/stylegan2
import os
import numpy as np
from scipy.interpolate import interp1d
from scipy.io import wavfile
import matplotlib.pyplot as plt
import PIL.Image
import moviepy.editor
import dnnlib
import dnnlib.tflib as tflib
import pretrained_networks
audio = {}
fps = 60
# https://www.google.com/search?q=death+grips+black+google+download
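
# Build one control signal per stem: convert each mp3 in data/ to wav,
# average the two channels to mono, and reduce the absolute signal to one
# mean amplitude value per video frame, normalized to [0, 1].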
for mp3_filename in [f for f in os.listdir('data') if f.endswith('.mp3')]:
    mp3_filename = f'data/{mp3_filename}'
    wav_filename = mp3_filename[:-4] + '.wav'
    if not os.path.exists(wav_filename):
        audio_clip = moviepy.editor.AudioFileClip(mp3_filename)
        audio_clip.write_audiofile(wav_filename, fps=44100, nbytes=2, codec='pcm_s16le')
    track_name = os.path.basename(wav_filename)[15:-5]  # e.g. 'Culture Shock (Drums).wav' -> 'Drums'
    rate, signal = wavfile.read(wav_filename)
    signal = np.mean(signal, axis=1)  # to mono
    signal = np.abs(signal)
    seed = signal.shape[0]  # number of samples, reused below as a deterministic seed
    duration = signal.shape[0] / rate
    frames = int(np.ceil(duration * fps))
    samples_per_frame = signal.shape[0] / frames
    audio[track_name] = np.zeros(frames, dtype=signal.dtype)
    for frame in range(frames):
        start = int(round(frame * samples_per_frame))
        stop = int(round((frame + 1) * samples_per_frame))
        audio[track_name][frame] = np.mean(signal[start:stop], axis=0)
    audio[track_name] /= max(audio[track_name])
for track in sorted(audio.keys()):
    plt.figure(figsize=(8, 3))
    plt.title(track)
    plt.plot(audio[track])
    plt.savefig(f'data/{track}.png')
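
# Load the pre-trained FFHQ generator. Gs is the moving-average generator
# used for inference; _G and _D are unused here.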
network_pkl = 'gdrive:networks/stylegan2-ffhq-config-f.pkl'
_G, _D, Gs = pretrained_networks.load_networks(network_pkl)
Gs_kwargs = dnnlib.EasyDict()
Gs_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_kwargs.randomize_noise = False
Gs_syn_kwargs = dnnlib.EasyDict()
Gs_syn_kwargs.output_transform = dict(func=tflib.convert_images_to_uint8, nchw_to_nhwc=True)
Gs_syn_kwargs.randomize_noise = False
Gs_syn_kwargs.minibatch_size = 4
noise_vars = [var for name, var in Gs.components.synthesis.vars.items() if name.startswith('noise')]
w_avg = Gs.get_var('dlatent_avg')
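
# get_ws returns a sequence of `frames` 512-d latent vectors, obtained by
# quadratic interpolation between `n` seeded random keyframes. Results are
# cached on disk, so repeated runs reuse the same trajectory.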
def get_ws(n, frames, seed):
    filename = f'data/ws_{n}_{frames}_{seed}.npy'
    if not os.path.exists(filename):
        src_ws = np.random.RandomState(seed).randn(n, 512)
        ws = np.empty((frames, 512))
        for i in range(512):
            # FIXME: crude, but works: tile three copies so the quadratic
            # interpolation is smooth across the loop seam, keep the middle copy
            x = np.linspace(0, 3*frames, 3*len(src_ws), endpoint=False)
            y = np.tile(src_ws[:, i], 3)
            x_ = np.linspace(0, 3*frames, 3*frames, endpoint=False)
            y_ = interp1d(x, y, kind='quadratic', fill_value='extrapolate')(x_)
            ws[:, i] = y_[frames:2*frames]
        np.save(filename, ws)
    else:
        ws = np.load(filename)
    return ws
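
# mix_styles blends two (18, 512) dlatents layer range by layer range: for
# each (range, value) pair, the rows in `range` are interpolated from wa
# towards wb by `value`.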
def mix_styles(wa, wb, ivs):
    w = np.copy(wa)
    for i, v in ivs:
        w[i] = wa[i] * (1 - v) + wb[i] * v
    return w
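
# Rescale a stored direction vector to roughly match the mean and standard
# deviation of w_avg, so it can be added to dlatents at a sensible magnitude.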
def normalize_vector(v):
    return v * np.std(w_avg) / np.std(v) + np.mean(w_avg) - np.mean(v)
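
# Render one video frame at time t: walk through base_ws at a speed driven
# by the squared Instrumental envelope, truncate towards w_avg with FX,
# style-mix coarse/middle/fine layers with Drums, E Drums and Synth, and
# open the mouth with the Vocal envelope.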
def render_frame(t):
    global base_index
    frame = np.clip(int(np.round(t * fps)), 0, frames - 1)
    base_index += base_speed * audio['Instrumental'][frame]**2
    base_w = base_ws[int(round(base_index)) % len(base_ws)]
    base_w = np.tile(base_w, (18, 1))
    psi = 0.5 + audio['FX'][frame] / 2
    base_w = w_avg + (base_w - w_avg) * psi
    mix_w = np.tile(mix_ws[frame], (18, 1))
    mix_w = w_avg + (mix_w - w_avg) * 0.75
    ranges = [range(0, 4), range(4, 8), range(8, 18)]
    values = [audio[track][frame] for track in ['Drums', 'E Drums', 'Synth']]
    w = mix_styles(base_w, mix_w, zip(ranges, values))
    w += mouth_open * audio['Vocal'][frame] * 1.5
    image = Gs.components.synthesis.run(np.stack([w]), **Gs_syn_kwargs)[0]
    image = PIL.Image.fromarray(image).resize((size, size), PIL.Image.LANCZOS)
    return np.array(image)
size = 1080
seconds = int(np.ceil(duration))
resolution = 10
base_frames = resolution * frames  # oversample the base walk 10x so playback speed can vary
base_ws = get_ws(seconds, base_frames, seed)
base_speed = base_frames / sum(audio['Instrumental']**2)  # traverses all of base_ws over the full track
base_index = 0
mix_ws = get_ws(seconds, frames, seed + 1)
# https://rolux.org/media/stylegan2/vectors/mouth_ratio.npy
mouth_open = normalize_vector(-np.load('data/mouth_ratio.npy'))
mp4_filename = 'data/Culture Shock.mp4'
video_clip = moviepy.editor.VideoClip(render_frame, duration=duration)
audio_clip_i = moviepy.editor.AudioFileClip('data/Culture Shock (Instrumental).wav')
audio_clip_v = moviepy.editor.AudioFileClip('data/Culture Shock (Vocal).wav')
audio_clip = moviepy.editor.CompositeAudioClip([audio_clip_i, audio_clip_v])
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(mp4_filename, fps=fps, codec='libx264', audio_codec='aac', bitrate='8M')
@doppiaeffe commented:

If I want to combine another parameter with the drums, for example, how should I do it? I have already downloaded other vectors, but I can't seem to make them work properly.

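For what it's worth, the pattern in render_frame generalizes to any direction vector: load it, rescale it with normalize_vector, and add it to w scaled by the envelope of whichever track should drive it. A minimal sketch, assuming a hypothetical data/eyes_open.npy direction obtained the same way as mouth_ratio.npy:

    # hypothetical example: drive an extra latent direction with the Drums
    # envelope, mirroring how mouth_open is driven by the Vocal track above
    eyes_open = normalize_vector(np.load('data/eyes_open.npy'))  # hypothetical vector file

    # inside render_frame, next to the mouth_open line:
    w += eyes_open * audio['Drums'][frame] * 1.5  # 1.5 is just a starting scale to tune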