Skip to content

Instantly share code, notes, and snippets.

@CookieBox26
Created June 30, 2023 14:49
Show Gist options
  • Save CookieBox26/13abd3b6bb02054ef25c749235af51d7 to your computer and use it in GitHub Desktop.
動画を作成する (20230630)
import toml
from synthesis import synthesis
import hashlib
from pydub import AudioSegment
from PIL import Image, ImageFont, ImageDraw
from moviepy.editor import ImageClip, concatenate_videoclips
import math
import os
def str_to_hash(s):
    """Return the hexadecimal MD5 digest of string ``s``."""
    digest = hashlib.md5(s.encode())
    return digest.hexdigest()
def file_to_hash(file):
    """Return the hexadecimal MD5 digest of a file's contents."""
    with open(file, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()
def dict_to_str(d):
    """Serialize a dict deterministically for use in cache filenames.

    Keys are sorted, then each key is concatenated with its value,
    e.g. ``{'b': 1, 'a': 2}`` -> ``'a2b1'``.

    Improvement: builds the string with ``''.join`` instead of the
    original quadratic ``+=`` loop; output is unchanged.
    """
    return ''.join(str(key) + str(d[key]) for key in sorted(d.keys()))
def format_duration(duration):
    """Format a duration in seconds as an ``HH:MM:SS.mmm`` timestamp
    string (a form accepted by moviepy's ``set_duration``).

    Fixes: the original hard-coded ``00:00:`` and so emitted invalid
    timestamps such as ``00:00:75.000`` for durations of a minute or
    more, and truncated milliseconds (losing 1 ms to float error).
    Sub-minute, exactly-representable durations (the 1/2^n komas used
    in this script) produce identical output to the original.
    """
    total_msec = int(round(duration * 1000))
    sec, msec = divmod(total_msec, 1000)
    minutes, sec = divmod(sec, 60)
    hours, minutes = divmod(minutes, 60)
    return f'{hours:02}:{minutes:02}:{sec:02}.{msec:03}'
def _add_text(img, text, text_settings):
    """Draw the given text onto the background image.

    Rows come from explicit ``<br/>`` markers when present; otherwise
    the text is hard-wrapped every ``width`` characters.
    """
    font_size = text_settings['font_size']
    font = ImageFont.truetype(text_settings['font_path'], font_size)
    draw = ImageDraw.Draw(img)
    coord = text_settings['coordinate']
    width = text_settings['width']
    color = tuple(text_settings['font_color'])
    row_step = int(font_size * 1.4)  # 1.4x line spacing
    if '<br/>' in text:
        rows = text.split('<br/>')
    else:
        rows = [text[i:i + width] for i in range(0, len(text), width)]
    for row, line in enumerate(rows):
        draw.text((coord[0], coord[1] + row * row_step), line,
                  color, font=font)
def _paste(img, chara_id, mode, character_images, mouse=0):
    """Paste a character sprite onto the background image.

    Args:
        img: background PIL image, modified in place.
        chara_id: key into ``character_images``.
        mode: expression type key (e.g. 'normal', 'mattari').
        character_images: per-character settings dict from the script.
        mouse: mouth variant index — 0 = closed, 1 = open.

    Improvement: removed the unused locals ``w`` and ``h`` that were
    computed but never read in the original.
    """
    settings = character_images[chara_id]
    sprite = Image.open(settings[mode][mouse]).convert('RGBA')
    scale = settings['scale']
    size_new = (int(scale * sprite.width), int(scale * sprite.height))
    sprite = sprite.resize(size_new)
    # The sprite doubles as the paste mask so its alpha channel is kept.
    img.paste(sprite, settings['coordinate'], sprite)
def generate_image(root, shot, character_images,
                   text_settings, regenerate=False):
    """Compose and cache the still image(s) needed for one shot.

    Output filenames are derived from hashes of the background, the
    character setup and the text, so existing files are reused unless
    ``regenerate`` is True.  Returns the list of PNG paths: one image,
    plus an open-mouth variant when the shot has a speaker.
    """
    back = shot['back']
    characters = shot['characters']
    text = shot['text']
    if text == '':
        # Tentative: caption the dialogue line when no text is given.
        text = shot['serifu']
    filebody = (root + file_to_hash(back) + '_' + dict_to_str(characters)
                + '_' + str_to_hash(text))
    # A speaker needs an open-mouth version too, so compose two images.
    speakers = [-1]
    if shot['speaker'] > -1:
        speakers.append(shot['speaker'])
    out_files = []
    for speaker in speakers:
        suffix = '' if speaker == -1 else '_' + str(speaker)
        out_file = filebody + suffix + '.png'
        if os.path.isfile(out_file) and not regenerate:
            out_files.append(out_file)
            continue
        img = Image.open(back).convert('RGBA')
        for chara_id, mode in characters.items():
            mouse = int(chara_id == str(speaker))
            _paste(img, chara_id, mode, character_images, mouse)
        if text != '':
            _add_text(img, text, text_settings)
        img.save(out_file)
        out_files.append(out_file)
    return out_files
def main():
    """Read ./script.toml, synthesize per-shot audio, render per-shot
    images, and assemble the final mp4 (with the concatenated audio).
    """
    # Load the script
    with open('./script.toml', encoding='utf-8') as f:
        script = toml.load(f)
    root = script['root']
    voice_settings = script['voice_settings']
    # key: speaker style ID as a string
    character_images = {str(c['speaker']): c for c in script['character_images']}
    text_settings = script['text_settings']
    shot_default = script['shot_default']
    shots = script['shots']
    # Set to True only while adjusting sprite/text coordinates
    if False:
        shot = shot_default.copy()
        shot['text'] = 'ああああああああああいいいいいいいいいい'
        shot['text'] += 'ううううううううううええええええええええ'
        generate_image(root, shot, character_images,
                       text_settings, regenerate=True)
        return
    # moviepy has a floating point bug, so use 1/2^n seconds as the
    # minimum unit of time
    # https://github.com/Zulko/moviepy/issues/646
    SPK = 0.125  # Seconds Per Koma ("koma" = one frame unit)
    # Export each shot's dialogue as wav, and write out a single mp3
    # of the whole concatenated audio
    komas = []  # per shot: (voiced koma count, silent koma count)
    audio_concat = None
    for shot_ in shots:
        shot = shot_default.copy()
        shot.update(shot_)
        komasu = 0
        silent_komasu = 0
        audio = None
        if shot['speaker'] > -1:  # synthesize the line if there is a speaker
            # wav filename: speaker ID + first 3 chars + hash of the line
            out_file = root + str(shot['speaker']) + '_' + shot['serifu'][:3] + '_'
            out_file += str_to_hash(shot['serifu']) + '.wav'
            if not os.path.isfile(out_file):
                synthesis(shot['serifu'], out_file, speaker=shot['speaker'],
                          options=voice_settings.get(str(shot['speaker'])))
            audio = AudioSegment.from_wav(out_file)
            komasu = math.ceil(audio.duration_seconds / SPK)
            # pad with silence so the audio fills a whole number of komas
            minisilence_duration = komasu * SPK - audio.duration_seconds
            audio += AudioSegment.silent(duration=minisilence_duration * 1000)
        if shot['silence'] > 0:  # append trailing silence if requested
            silent_komasu = math.ceil(float(shot['silence']) / SPK)
            if audio is None:
                audio = AudioSegment.silent(duration=silent_komasu * SPK * 1000)
            else:
                audio += AudioSegment.silent(duration=silent_komasu * SPK * 1000)
        komas.append((komasu, silent_komasu))
        if audio_concat is None:
            audio_concat = audio
        else:
            audio_concat += audio
    audio_concat.export(f'{root}concat.mp3', format='mp3')
    # Build the video clips according to each shot's koma counts
    clips = []
    for (koma, shot_) in zip(komas, shots):
        shot = shot_default.copy()
        shot.update(shot_)
        img_files = generate_image(root, shot, character_images, text_settings)
        if koma[0] > 0:
            # alternate mouth-closed/mouth-open images while speaking
            for i_koma in range(koma[0]):
                image_file = img_files[(i_koma+1) % 2]
                clip = ImageClip(image_file).set_duration(format_duration(SPK))
                clips.append(clip)
        if koma[1] > 0:
            # one clip covering the whole silent tail of the shot
            duration = SPK * koma[1]
            clip = ImageClip(img_files[0]).set_duration(format_duration(duration))
            clips.append(clip)
    # Write the mp4
    video = concatenate_videoclips(clips)
    video.write_videofile(f'{root}out.mp4', fps=24, audio=f'{root}concat.mp3')
# Script entry point
if __name__ == '__main__':
    main()
# Output folder for the png, wav, mp3 and mp4 files
root = "./20230630/" # trailing slash required

# Options passed to the VOICEVOX FastAPI /synthesis endpoint.
# Keys are VOICEVOX style IDs.
# Here Zundamon's speaking rate is raised and intonation flattened,
# and both Zundamon and Tsumugi get a calmer (lower) pitch.
[voice_settings]
3 = { speedScale = 1.2, intonationScale = 0.9, pitchScale = -0.025 }
8 = { pitchScale = -0.025 }

# Sprite image file paths, scale, and display coordinates.
# Scale and coordinates take trial and error, so start with rough values.
# Each expression type lists two files: "mouth closed, mouth open".
# If only one image exists the same file may be listed twice
# (but then the mouth will not move during dialogue).
[[character_images]]
speaker = 3
scale = 0.59
coordinate = [460, 286]
normal = ["../sozai/zundamon_10.png", "../sozai/zundamon_11.png"]
mattari = ["../sozai/zundamon_00.png", "../sozai/zundamon_01.png"]
[[character_images]]
speaker = 8
scale = 0.555
coordinate = [-25, 262]
normal = ["../sozai/tsumugi_10.png", "../sozai/tsumugi_11.png"]
mattari = ["../sozai/tsumugi_00.png", "../sozai/tsumugi_01.png"]

# Settings for the text drawn on screen
[text_settings]
coordinate = [25, 25]
font_size = 24
font_path = '../../Downloads/M_PLUS_Rounded_1c/MPLUSRounded1c-Medium.ttf'
font_color = [0, 0, 0]
width = 24 # the background is 640 px wide, so about 24 chars fit; adjust as needed

# Default settings for a shot.
# Each shot only specifies its differences from these defaults.
# Also used when adjusting sprite coordinates.
[shot_default]
speaker = -1 # speaker's style ID (-1 if nobody speaks)
serifu = "" # dialogue line
silence = 0 # seconds of silence after the dialogue
back = "./20230630/back.png" # background image
characters = { 3 = "normal", 8 = "normal" } # expression type per character
text = ""

# The shots: dialogue, background image, expressions, etc.
[[shots]]
silence = 1
[[shots]]
speaker = 3
serifu = "定常時系列のあるステップとkステップ前の共分散をラグkの関数と考えて自己共分散関数というのだ。"
[[shots]]
speaker = 8
serifu = "定常時系列じゃないとだめなのー?"
[[shots]]
speaker = 3
serifu = "時系列が定常でない場合自己共分散はラグkのみでは決まらないのだ。時間ステップによって変わっていってしまうかもしれないのだ。"
[[shots]]
speaker = 8
serifu = "あーそっかー。"
[[shots]]
silence = 5
characters = { 3 = "mattari", 8 = "mattari" }
text = "[声の出演]<br/>VOICEVOX:ずんだもん<br/>VOICEVOX:春日部つむぎ"
import requests
import json
import time
# ほぼ以下のコードです
# https://blog.shikoan.com/voicevox-python/
def synthesis(text, filename, speaker=1, options=None, max_retry=20):
    """Synthesize speech for ``text`` via a local VOICEVOX engine and
    save the resulting wav to ``filename``.

    Based on https://blog.shikoan.com/voicevox-python/

    Args:
        text: the line to synthesize.
        filename: output wav path.
        speaker: VOICEVOX style ID.
        options: optional dict merged into the audio query
            (e.g. speedScale, pitchScale).
        max_retry: attempts allowed for each of the two API calls.

    Raises:
        ConnectionError: if either API call still fails after
            ``max_retry`` attempts.

    Fixes: the success message had a garbled "(unknown)" placeholder
    where the output filename belongs; the second ConnectionError now
    reports ``r.text`` (consistent with the first) instead of the raw
    Response object and the full input text.
    """
    # Step 1: build the audio query for the text
    query_payload = {"text": text, "speaker": speaker}
    for query_i in range(max_retry):
        r = requests.post("http://localhost:50021/audio_query",
                          params=query_payload, timeout=(10.0, 300.0))
        if r.status_code == 200:
            query_data = r.json()
            break
        time.sleep(1)
    else:
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 audio_query : ",
            filename, "/", text[:30], r.text)
    if options is not None:
        query_data.update(options)
    # Step 2: synthesis — POST the (possibly tweaked) query back
    synth_payload = {"speaker": speaker}
    for synth_i in range(max_retry):
        r = requests.post(
            "http://localhost:50021/synthesis",
            params=synth_payload,
            data=json.dumps(query_data), timeout=(10.0, 300.0))
        if r.status_code == 200:
            with open(filename, "wb") as fp:
                fp.write(r.content)
            print(
                f"{filename} は query={query_i+1}回, synthesis={synth_i+1}回"
                "のリトライで正常に保存されました")
            break
        time.sleep(1)
    else:
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 synthesis : ",
            filename, "/", text[:30], r.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment