Created
June 30, 2023 14:49
-
-
Save CookieBox26/13abd3b6bb02054ef25c749235af51d7 to your computer and use it in GitHub Desktop.
動画を作成する (20230630)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import toml | |
from synthesis import synthesis | |
import hashlib | |
from pydub import AudioSegment | |
from PIL import Image, ImageFont, ImageDraw | |
from moviepy.editor import ImageClip, concatenate_videoclips | |
import math | |
import os | |
def str_to_hash(s):
    """Return the hexadecimal MD5 digest of the string ``s``.

    The string is encoded with ``str.encode()`` defaults before hashing.
    """
    hasher = hashlib.md5()
    hasher.update(s.encode())
    return hasher.hexdigest()
def file_to_hash(file):
    """Return the hexadecimal MD5 digest of the contents of ``file``.

    ``file`` is a path; the file is opened in binary mode and read in full.
    """
    with open(file, 'rb') as stream:
        content = stream.read()
    return hashlib.md5(content).hexdigest()
def dict_to_str(d):
    """Flatten ``d`` into one string of ``key`` + ``value`` pairs.

    Pairs are emitted in sorted key order with no separators, so the result
    does not depend on the insertion order of the dictionary.
    """
    return ''.join(str(name) + str(d[name]) for name in sorted(d.keys()))
def format_duration(duration):
    """Format a duration in seconds as an ``HH:MM:SS.mmm`` string.

    The value is rounded to the nearest millisecond first, so binary
    floating-point error (e.g. ``2.3 - 2 == 0.2999...``) cannot shave a
    millisecond off the result, and durations of a minute or more carry
    into the minutes / hours fields instead of overflowing the seconds.

    Args:
        duration: Non-negative length in seconds (int or float).

    Returns:
        The duration formatted as ``HH:MM:SS.mmm``.
    """
    total_msec = round(duration * 1000)
    total_sec, msec = divmod(total_msec, 1000)
    total_min, sec = divmod(total_sec, 60)
    hours, minutes = divmod(total_min, 60)
    return f'{hours:02}:{minutes:02}:{sec:02}.{msec:03}'
def _add_text(img, text, text_settings):
    """Draw ``text`` onto the background image ``img`` in place.

    If ``text`` contains ``<br/>`` it is split into rows at those markers;
    otherwise it is wrapped into rows of ``text_settings['width']``
    characters. Rows are stacked downward from ``text_settings['coordinate']``
    with a line pitch of 1.4 times the font size.
    """
    font_size = text_settings['font_size']
    font = ImageFont.truetype(text_settings['font_path'], font_size)
    canvas = ImageDraw.Draw(img)
    origin = text_settings['coordinate']
    width = text_settings['width']

    # Decide the rows first, then draw them all with a single loop.
    if '<br/>' in text:
        rows = text.split('<br/>')
    else:
        rows = [text[start:(start + width)]
                for start in range(0, len(text), width)]

    line_pitch = int(font_size * 1.4)  # line spacing 1.4
    for index, row_text in enumerate(rows):
        position = (origin[0], origin[1] + index * line_pitch)
        canvas.text(
            position, row_text,
            tuple(text_settings['font_color']), font=font
        )
def _paste(img, chara_id, mode, character_images, mouse=0):
    """Paste a character's standing picture onto the background ``img`` in place.

    Args:
        img: Background image that receives the character picture.
        chara_id: Key of the character in ``character_images``.
        mode: Expression key whose value is the list of image paths to pick from.
        character_images: Per-character settings holding the image paths,
            ``scale`` and ``coordinate``.
        mouse: Index into the expression's image list (the caller passes 1
            for the speaking character and 0 otherwise).
    """
    chara = character_images[chara_id]
    sprite = Image.open(chara[mode][mouse]).convert('RGBA')
    scale = chara['scale']
    sprite = sprite.resize(
        (int(scale * sprite.width), int(scale * sprite.height)))
    w = chara['coordinate'][0]
    h = chara['coordinate'][1]
    # Pillow documents `box` as a tuple; build a fresh (w, h) tuple so the
    # coordinate list from the config is never handed to Pillow directly.
    # The sprite doubles as its own mask so transparent pixels stay transparent.
    img.paste(sprite, (w, h), sprite)
def generate_image(root, shot, character_images,
                   text_settings, regenerate=False):
    """Compose the frame image(s) needed for one shot and return their paths.

    The first returned path is the frame with nobody speaking. When the shot
    has a speaker (``shot['speaker'] > -1``) a second frame is composed in
    which that speaker uses the alternate (index 1) picture. File names are
    derived from hashes of the background file and the text plus the
    character settings; an existing file is reused unless ``regenerate`` is
    true.
    """
    background = shot['back']
    cast = shot['characters']
    caption = shot['text']
    if caption == '':
        # Provisional: with no explicit text, write the spoken line instead.
        caption = shot['serifu']

    stem = (root + file_to_hash(background) + '_' + dict_to_str(cast)
            + '_' + str_to_hash(caption))

    # -1 is the "nobody speaking" variant; a real speaker adds a second one.
    variants = [-1]
    if shot['speaker'] > -1:
        variants.append(shot['speaker'])

    paths = []
    for variant in variants:
        suffix = ('_' + str(variant)) if (variant != -1) else ''
        path = stem + suffix + '.png'
        reuse = (not regenerate) and os.path.isfile(path)
        if not reuse:
            frame = Image.open(background).convert('RGBA')
            for chara_id, mode in cast.items():
                picture_index = 1 if (chara_id == str(variant)) else 0
                _paste(frame, chara_id, mode, character_images, picture_index)
            if caption != '':
                _add_text(frame, caption, text_settings)
            frame.save(path)
        paths.append(path)
    return paths
def main():
    """Build the video described by ``./script.toml``.

    Steps:
      1. Load the script and its settings.
      2. For every shot, synthesize the spoken line to a wav file (reusing an
         existing file), pad it to a whole number of frames ("koma"), append
         the requested trailing silence, and export the concatenated audio
         as ``<root>concat.mp3``.
      3. For every shot, compose the frame images and create image clips
         matching the frame counts computed in step 2.
      4. Write ``<root>out.mp4`` with the exported audio.

    Raises:
        ValueError: If no shot has a speaker or a positive silence, so there
            is nothing to render.
    """
    # Load the script.
    with open('./script.toml', encoding='utf-8') as f:
        script = toml.load(f)
    root = script['root']
    voice_settings = script['voice_settings']
    character_images = {str(c['speaker']): c for c in script['character_images']}
    text_settings = script['text_settings']
    shot_default = script['shot_default']
    shots = script['shots']

    # `root` is used as a plain string prefix for every output path; make
    # sure its directory part exists before anything is written.
    out_dir = os.path.dirname(root)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Set to True only while adjusting picture / text coordinates.
    adjust_coordinates = False
    if adjust_coordinates:
        shot = shot_default.copy()
        shot['text'] = 'ああああああああああいいいいいいいいいい'
        shot['text'] += 'ううううううううううええええええええええ'
        generate_image(root, shot, character_images,
                       text_settings, regenerate=True)
        return

    # moviepy has a floating-point bug, so use 1/2^n seconds as the smallest
    # unit of time.
    # https://github.com/Zulko/moviepy/issues/646
    SPK = 0.125  # Seconds Per Koma

    # Each shot only lists its differences from the default shot; merge once
    # and reuse the merged shots for both the audio and the video pass.
    merged_shots = [{**shot_default, **shot_} for shot_ in shots]

    # Synthesize each shot's line to wav and export one mp3 for the whole video.
    komas = []  # per shot: (frames with speech, frames of silence)
    audio_concat = None
    for shot in merged_shots:
        komasu = 0
        silent_komasu = 0
        audio = None
        if shot['speaker'] > -1:  # a speaker exists: synthesize the line
            out_file = root + str(shot['speaker']) + '_' + shot['serifu'][:3] + '_'
            out_file += str_to_hash(shot['serifu']) + '.wav'
            if not os.path.isfile(out_file):
                synthesis(shot['serifu'], out_file, speaker=shot['speaker'],
                          options=voice_settings.get(str(shot['speaker'])))
            audio = AudioSegment.from_wav(out_file)
            # Pad the speech up to a whole number of frames.
            komasu = math.ceil(audio.duration_seconds / SPK)
            minisilence_duration = komasu * SPK - audio.duration_seconds
            audio += AudioSegment.silent(duration=minisilence_duration * 1000)
        if shot['silence'] > 0:  # trailing silence requested after the line
            silent_komasu = math.ceil(float(shot['silence']) / SPK)
            silence = AudioSegment.silent(duration=silent_komasu * SPK * 1000)
            audio = silence if audio is None else audio + silence
        komas.append((komasu, silent_komasu))
        if audio is None:
            # Neither speech nor silence: the shot contributes no audio (and,
            # with zero frames, no video clip either).
            continue
        audio_concat = audio if audio_concat is None else audio_concat + audio

    if audio_concat is None:
        raise ValueError(
            'script.toml contains no shot with a speaker or a positive silence')
    audio_path = f'{root}concat.mp3'
    audio_concat.export(audio_path, format='mp3')

    # Create the video clips according to each shot's frame counts.
    clips = []
    for (koma, shot) in zip(komas, merged_shots):
        img_files = generate_image(root, shot, character_images, text_settings)
        # While speaking, alternate between the speaking frame (index 1)
        # and the neutral frame (index 0), one frame each.
        for i_koma in range(koma[0]):
            image_file = img_files[(i_koma + 1) % 2]
            clip = ImageClip(image_file).set_duration(format_duration(SPK))
            clips.append(clip)
        if koma[1] > 0:
            duration = SPK * koma[1]
            clip = ImageClip(img_files[0]).set_duration(format_duration(duration))
            clips.append(clip)

    # Write the mp4.
    video = concatenate_videoclips(clips)
    video.write_videofile(f'{root}out.mp4', fps=24, audio=audio_path)
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# png, wav, mp3, mp4 出力フォルダを指定します | |
root = "./20230630/" # 要スラッシュ | |
# VOICEVOX FastAPI の /synthesis に渡すオプションを指定します | |
# キーは VOICEVOX のスタイルIDです | |
# ここではずんだもんの話速を速めて抑揚を抑え、 | |
# またずんだもんもつむぎも音高を落ち着かせています | |
[voice_settings] | |
3 = { speedScale = 1.2, intonationScale = 0.9, pitchScale = -0.025 } | |
8 = { pitchScale = -0.025 } | |
# 立ち絵の画像ファイルパスと縮尺と表示座標を指定します | |
# 縮尺と表示座標は試行錯誤するまでわからないので最初は適当にします | |
# 画像ファイルは各表情タイプに「口閉じ、口開き」の2ファイルを指定します | |
# なければ同じファイルでよいです (がセリフ中でも口パクしません) | |
[[character_images]] | |
speaker = 3 | |
scale = 0.59 | |
coordinate = [460, 286] | |
normal = ["../sozai/zundamon_10.png", "../sozai/zundamon_11.png"] | |
mattari = ["../sozai/zundamon_00.png", "../sozai/zundamon_01.png"] | |
[[character_images]] | |
speaker = 8 | |
scale = 0.555 | |
coordinate = [-25, 262] | |
normal = ["../sozai/tsumugi_10.png", "../sozai/tsumugi_11.png"] | |
mattari = ["../sozai/tsumugi_00.png", "../sozai/tsumugi_01.png"] | |
# 画面に描画するテキストの設定を指定します | |
[text_settings] | |
coordinate = [25, 25] | |
font_size = 24 | |
font_path = '../../Downloads/M_PLUS_Rounded_1c/MPLUSRounded1c-Medium.ttf' | |
font_color = [0, 0, 0] | |
width = 24 # 背景画像が 640 ピクセルなので 24 文字くらい入りますが調整します | |
# 場面のデフォルト設定を指定します | |
# 各場面はこの設定への差分のみ指定します | |
# 立ち絵の座標調整時にもこの設定を使用します | |
[shot_default] | |
speaker = -1 # 話者のスタイルID (誰も話さないなら -1) | |
serifu = "" # セリフ | |
silence = 0 # セリフ後無音秒数 | |
back = "./20230630/back.png" # 背景画像 | |
characters = { 3 = "normal", 8 = "normal" } # 各人物の表情タイプ | |
text = "" | |
# 各場面のセリフや背景画像や表情などを指定していきます | |
[[shots]] | |
silence = 1 | |
[[shots]] | |
speaker = 3 | |
serifu = "定常時系列のあるステップとkステップ前の共分散をラグkの関数と考えて自己共分散関数というのだ。" | |
[[shots]] | |
speaker = 8 | |
serifu = "定常時系列じゃないとだめなのー?" | |
[[shots]] | |
speaker = 3 | |
serifu = "時系列が定常でない場合自己共分散はラグkのみでは決まらないのだ。時間ステップによって変わっていってしまうかもしれないのだ。" | |
[[shots]] | |
speaker = 8 | |
serifu = "あーそっかー。" | |
[[shots]] | |
silence = 5 | |
characters = { 3 = "mattari", 8 = "mattari" } | |
text = "[声の出演]<br/>VOICEVOX:ずんだもん<br/>VOICEVOX:春日部つむぎ" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import time | |
# ほぼ以下のコードです | |
# https://blog.shikoan.com/voicevox-python/ | |
def synthesis(text, filename, speaker=1, options=None, max_retry=20):
    """Synthesize ``text`` through the local HTTP engine and save it to ``filename``.

    First posts to ``/audio_query`` to obtain the query data, optionally
    overrides entries of it with ``options``, then posts the query to
    ``/synthesis`` and writes the response body to ``filename``. Each request
    is retried once per second while the status code is not 200.

    Args:
        text: Text to synthesize.
        filename: Path of the output file (written in binary mode).
        speaker: Speaker id sent with both requests.
        options: Optional dict merged into the query data before synthesis.
        max_retry: Maximum number of attempts for each of the two requests.

    Raises:
        ConnectionError: If either request never returns status 200 within
            ``max_retry`` attempts.
    """
    # audio_query
    query_payload = {"text": text, "speaker": speaker}
    r = None  # stays None when max_retry is 0 and no request is ever sent
    for query_i in range(max_retry):
        r = requests.post("http://localhost:50021/audio_query",
                          params=query_payload, timeout=(10.0, 300.0))
        if r.status_code == 200:
            query_data = r.json()
            break
        time.sleep(1)
    else:
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 audio_query : ",
            filename, "/", text[:30], "" if r is None else r.text)

    if options is not None:
        query_data.update(options)

    # synthesis
    synth_payload = {"speaker": speaker}
    r = None
    for synth_i in range(max_retry):
        r = requests.post(
            "http://localhost:50021/synthesis",
            params=synth_payload,
            data=json.dumps(query_data), timeout=(10.0, 300.0))
        if r.status_code == 200:
            with open(filename, "wb") as fp:
                fp.write(r.content)
            print(
                f"{filename} は query={query_i+1}回, synthesis={synth_i+1}回"
                "のリトライで正常に保存されました")
            break
        time.sleep(1)
    else:
        # Report the response body, matching the audio_query error path.
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 synthesis : ",
            filename, "/", text[:30], "" if r is None else r.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment