CookieBox26/run.py

## run.py
import toml
from synthesis import synthesis
import hashlib
from pydub import AudioSegment
from PIL import Image, ImageFont, ImageDraw
from moviepy.editor import ImageClip, concatenate_videoclips
import math
import os


def str_to_hash(s):
    """Return the hexadecimal MD5 digest of the string ``s``.

    The string is converted to bytes with ``str.encode()`` defaults
    before it is hashed.
    """
    digest = hashlib.md5()
    digest.update(s.encode())
    return digest.hexdigest()


def file_to_hash(file):
    """Return the hexadecimal MD5 digest of the contents of ``file``.

    ``file`` is a path; it is opened in binary mode and read in full.
    """
    with open(file, 'rb') as stream:
        content = stream.read()
    return hashlib.md5(content).hexdigest()


def dict_to_str(d):
    """Flatten ``d`` into one string of ``key`` + ``value`` pairs.

    Pairs are emitted in sorted key order with no separators, so the
    result does not depend on the insertion order of ``d``.
    """
    return ''.join(str(key) + str(d[key]) for key in sorted(d))


def format_duration(duration):
    """Format a duration in seconds as an ``HH:MM:SS.mmm`` string.

    Args:
        duration: Non-negative duration in seconds (int or float).

    Returns:
        The duration rounded to the nearest millisecond, e.g. ``0.125``
        becomes ``'00:00:00.125'`` and ``75`` becomes ``'00:01:15.000'``.
    """
    # Round to whole milliseconds first: truncating the fractional part
    # turns values such as 2.3 into "...02.299" because of float error.
    total_msec = round(duration * 1000)
    total_sec, msec = divmod(total_msec, 1000)
    # Carry seconds into minutes and hours so every field stays in range.
    total_min, sec = divmod(total_sec, 60)
    hour, minute = divmod(total_min, 60)
    return f'{hour:02}:{minute:02}:{sec:02}.{msec:03}'


def _add_text(img, text, text_settings):
    """Draw ``text`` onto the image ``img`` in place.

    Text containing ``<br/>`` is split into rows at each marker; any other
    text is wrapped every ``text_settings['width']`` characters. Rows are
    drawn downwards starting at ``text_settings['coordinate']``.
    """
    size = text_settings['font_size']
    font = ImageFont.truetype(text_settings['font_path'], size)
    pen = ImageDraw.Draw(img)
    origin = text_settings['coordinate']
    wrap = text_settings['width']
    if '<br/>' in text:
        rows = text.split('<br/>')
    else:
        rows = [
            text[start:start + wrap]
            for start in range(0, len(text), wrap)
        ]
    line_height = int(size * 1.4)  # line spacing of 1.4x the font size
    for index, row in enumerate(rows):
        position = (origin[0], origin[1] + index * line_height)
        pen.text(position, row, tuple(text_settings['font_color']), font=font)


def _paste(img, chara_id, mode, character_images, mouse=0):
    """Paste a character's standing picture onto the image ``img`` in place.

    Args:
        img: Destination image; modified in place.
        chara_id: Key of the character in ``character_images``.
        mode: Expression key selecting the character's list of image paths.
        character_images: Per-character settings holding the image paths per
            expression, a ``scale`` factor and a ``coordinate`` pair.
        mouse: Index into the expression's image list. Callers in this file
            pass 0, or 1 for the character who is speaking.
    """
    settings = character_images[chara_id]
    sprite = Image.open(settings[mode][mouse]).convert('RGBA')
    scale = settings['scale']
    sprite = sprite.resize(
        (int(scale * sprite.width), int(scale * sprite.height)))
    # Pillow documents the paste position as a tuple, so build one instead of
    # handing over the list object stored in the settings.
    w = settings['coordinate'][0]
    h = settings['coordinate'][1]
    # The sprite is also passed as the mask argument, as in the original call.
    img.paste(sprite, (w, h), sprite)


def generate_image(root, shot, character_images,
                   text_settings, regenerate=False):
    """Compose the frame image(s) for one shot and return their paths.

    One image is always produced with no speaker selected. When
    ``shot['speaker']`` is greater than -1, a second image is produced in
    which that speaker's character uses image index 1. Files that already
    exist are reused unless ``regenerate`` is true.

    Returns:
        List of output PNG paths, the no-speaker image first.
    """
    background = shot['back']
    cast = shot['characters']
    # Provisionally draw the spoken line when the shot has no text.
    caption = shot['serifu'] if shot['text'] == '' else shot['text']
    stem = '_'.join([
        root + file_to_hash(background),
        dict_to_str(cast),
        str_to_hash(caption),
    ])
    # A speaking shot also needs the second image, so two are composed.
    variants = [-1] + ([shot['speaker']] if shot['speaker'] > -1 else [])
    paths = []
    for speaker in variants:
        suffix = ('_' + str(speaker)) if speaker != -1 else ''
        path = stem + suffix + '.png'
        if regenerate or not os.path.isfile(path):
            canvas = Image.open(background).convert('RGBA')
            for chara_id, mode in cast.items():
                mouth = int(chara_id == str(speaker))
                _paste(canvas, chara_id, mode, character_images, mouth)
            if caption != '':
                _add_text(canvas, caption, text_settings)
            canvas.save(path)
        paths.append(path)
    return paths


def main():
    """Build the video described by ``./script.toml``.

    Synthesizes (or reuses) one wav per spoken line, exports the concatenated
    audio as ``concat.mp3``, composes the frame images and writes ``out.mp4``.
    All outputs are written under the script's ``root`` prefix.

    Raises:
        ValueError: If no shot yields any audio (no speaker and no silence).
    """
    # Load the script.
    with open('./script.toml', encoding='utf-8') as f:
        script = toml.load(f)
    root = script['root']
    voice_settings = script['voice_settings']
    character_images = {str(c['speaker']): c for c in script['character_images']}
    text_settings = script['text_settings']
    shot_default = script['shot_default']
    shots = script['shots']

    # Flip to True only while tuning coordinates: renders one test image
    # from the default shot and exits.
    if False:
        shot = shot_default.copy()
        shot['text'] = 'ああああああああああいいいいいいいいいい'
        shot['text'] += 'ううううううううううええええええええええ'
        generate_image(root, shot, character_images,
                       text_settings, regenerate=True)
        return

    # moviepy has a floating-point bug, so 1/2^n seconds is used as the
    # smallest unit of time.
    # https://github.com/Zulko/moviepy/issues/646
    SPK = 0.125  # Seconds Per Koma (frame)

    # Write each shot's line to a wav and export one mp3 for the whole video.
    merged_shots = []  # each shot with the defaults applied
    komas = []  # (voiced frame count, silent frame count) per shot
    audio_concat = None
    for shot_ in shots:
        shot = shot_default.copy()
        shot.update(shot_)
        merged_shots.append(shot)
        komasu = 0
        silent_komasu = 0
        audio = None
        if shot['speaker'] > -1:  # a speaker is set: synthesize the line
            out_file = root + str(shot['speaker']) + '_' + shot['serifu'][:3] + '_'
            out_file += str_to_hash(shot['serifu']) + '.wav'
            if not os.path.isfile(out_file):
                synthesis(shot['serifu'], out_file, speaker=shot['speaker'],
                          options=voice_settings.get(str(shot['speaker'])))
            audio = AudioSegment.from_wav(out_file)
            # Pad the speech up to a whole number of frames.
            komasu = math.ceil(audio.duration_seconds / SPK)
            minisilence_duration = komasu * SPK - audio.duration_seconds
            audio += AudioSegment.silent(duration=minisilence_duration * 1000)
        if shot['silence'] > 0:  # trailing silence requested after the line
            silent_komasu = math.ceil(float(shot['silence']) / SPK)
            if audio is None:
                audio = AudioSegment.silent(duration=silent_komasu * SPK * 1000)
            else:
                audio += AudioSegment.silent(duration=silent_komasu * SPK * 1000)
        komas.append((komasu, silent_komasu))
        if audio is None:
            # Neither speech nor silence: there is nothing to append, and
            # adding None to the running audio would raise TypeError.
            continue
        if audio_concat is None:
            audio_concat = audio
        else:
            audio_concat += audio
    if audio_concat is None:
        raise ValueError('script.toml has no shot with speech or silence')
    audio_concat.export(f'{root}concat.mp3', format='mp3')

    # Create the video clips according to each shot's frame counts.
    clips = []
    for (komasu, silent_komasu), shot in zip(komas, merged_shots):
        img_files = generate_image(root, shot, character_images, text_settings)
        # Voiced frames alternate between the two images, starting with
        # index 1.
        for i_koma in range(komasu):
            image_file = img_files[(i_koma + 1) % 2]
            clip = ImageClip(image_file).set_duration(format_duration(SPK))
            clips.append(clip)
        if silent_komasu > 0:
            # Silent frames are one clip showing the first image.
            duration = SPK * silent_komasu
            clip = ImageClip(img_files[0]).set_duration(format_duration(duration))
            clips.append(clip)
    # Write the mp4.
    video = concatenate_videoclips(clips)
    video.write_videofile(f'{root}out.mp4', fps=24, audio=f'{root}concat.mp3')

# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()

## script.toml
# png, wav, mp3, mp4 出力フォルダを指定します
root = "./20230630/"  # 要スラッシュ

# VOICEVOX FastAPI の /synthesis に渡すオプションを指定します
# キーは VOICEVOX のスタイルIDです
# ここではずんだもんの話速を速めて抑揚を抑え、
# またずんだもんもつむぎも音高を落ち着かせています
[voice_settings]
3 = { speedScale = 1.2, intonationScale = 0.9, pitchScale = -0.025 }
8 = { pitchScale = -0.025 }

# 立ち絵の画像ファイルパスと縮尺と表示座標を指定します
# 縮尺と表示座標は試行錯誤するまでわからないので最初は適当にします
# 画像ファイルは各表情タイプに「口閉じ、口開き」の2ファイルを指定します
# 口開き画像がなければ2つとも同じファイルを指定してもよいです (その場合セリフ中も口パクしません)
[[character_images]]
speaker = 3
scale = 0.59
coordinate = [460, 286]
normal = ["../sozai/zundamon_10.png", "../sozai/zundamon_11.png"]
mattari = ["../sozai/zundamon_00.png", "../sozai/zundamon_01.png"]
[[character_images]]
speaker = 8
scale = 0.555
coordinate = [-25, 262]
normal = ["../sozai/tsumugi_10.png", "../sozai/tsumugi_11.png"]
mattari = ["../sozai/tsumugi_00.png", "../sozai/tsumugi_01.png"]

# 画面に描画するテキストの設定を指定します
[text_settings]
coordinate = [25, 25]
font_size = 24
font_path = '../../Downloads/M_PLUS_Rounded_1c/MPLUSRounded1c-Medium.ttf'
font_color = [0, 0, 0]
width = 24  # 背景画像が 640 ピクセルなので 24 文字くらい入りますが調整します

# 場面のデフォルト設定を指定します
# 各場面はこの設定への差分のみ指定します
# 立ち絵の座標調整時にもこの設定を使用します
[shot_default]
speaker = -1  # 話者のスタイルID (誰も話さないなら -1)
serifu = ""  # セリフ
silence = 0  # セリフ後無音秒数
back = "./20230630/back.png"  # 背景画像
characters = { 3 = "normal", 8 = "normal" }  # 各人物の表情タイプ
text = ""

# 各場面のセリフや背景画像や表情などを指定していきます
[[shots]]
silence = 1
[[shots]]
speaker = 3
serifu = "定常時系列のあるステップとkステップ前の共分散をラグkの関数と考えて自己共分散関数というのだ。"
[[shots]]
speaker = 8
serifu = "定常時系列じゃないとだめなのー？"
[[shots]]
speaker = 3
serifu = "時系列が定常でない場合自己共分散はラグkのみでは決まらないのだ。時間ステップによって変わっていってしまうかもしれないのだ。"
[[shots]]
speaker = 8
serifu = "あーそっかー。"
[[shots]]
silence = 5
characters = { 3 = "mattari", 8 = "mattari" }
text = "[声の出演]<br/>VOICEVOX:ずんだもん<br/>VOICEVOX:春日部つむぎ"

## synthesis.py
import requests
import json
import time


# ほぼ以下のコードです
# https://blog.shikoan.com/voicevox-python/
def synthesis(text, filename, speaker=1, options=None, max_retry=20):
    """Synthesize ``text`` via the local HTTP engine and save it to a file.

    First posts to ``/audio_query`` to obtain the query data, optionally
    overrides entries of it with ``options``, then posts the result to
    ``/synthesis`` and writes the response body to ``filename``. Each
    request is retried, one second apart, until it returns HTTP 200.

    Args:
        text: Text to synthesize.
        filename: Path the audio bytes are written to.
        speaker: Speaker id sent with both requests.
        options: Optional dict merged into the query data before synthesis.
        max_retry: Maximum number of attempts for each of the two requests.

    Raises:
        ConnectionError: If either request never returns HTTP 200 within
            ``max_retry`` attempts.
    """
    query_payload = {"text": text, "speaker": speaker}
    for query_i in range(max_retry):
        r = requests.post("http://localhost:50021/audio_query",
                        params=query_payload, timeout=(10.0, 300.0))
        if r.status_code == 200:
            query_data = r.json()
            break
        time.sleep(1)
    else:
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 audio_query : ",
            filename, "/", text[:30], r.text)
    if options is not None:
        query_data.update(options)

    # synthesis
    synth_payload = {"speaker": speaker}
    for synth_i in range(max_retry):
        r = requests.post(
            "http://localhost:50021/synthesis",
            params=synth_payload,
            data=json.dumps(query_data), timeout=(10.0, 300.0))
        if r.status_code == 200:
            with open(filename, "wb") as fp:
                fp.write(r.content)
            print(
                f"{filename} は query={query_i+1}回, synthesis={synth_i+1}回"
                "のリトライで正常に保存されました")
            break
        time.sleep(1)
    else:
        # Report the response body, matching the audio_query error above.
        raise ConnectionError(
            "リトライ回数が上限に到達しました。 synthesis : ",
            filename, "/", text[:30], r.text)
	import toml
	from synthesis import synthesis
	import hashlib
	from pydub import AudioSegment
	from PIL import Image, ImageFont, ImageDraw
	from moviepy.editor import ImageClip, concatenate_videoclips
	import math
	import os


	def str_to_hash(s):
	    """Return the hexadecimal MD5 digest of the string ``s``.

	    The string is converted to bytes with ``str.encode()`` defaults
	    before it is hashed.
	    """
	    digest = hashlib.md5()
	    digest.update(s.encode())
	    return digest.hexdigest()


	def file_to_hash(file):
	    """Return the hexadecimal MD5 digest of the contents of ``file``.

	    ``file`` is a path; it is opened in binary mode and read in full.
	    """
	    with open(file, 'rb') as stream:
	        content = stream.read()
	    return hashlib.md5(content).hexdigest()


	def dict_to_str(d):
	    """Flatten ``d`` into one string of ``key`` + ``value`` pairs.

	    Pairs are emitted in sorted key order with no separators, so the
	    result does not depend on the insertion order of ``d``.
	    """
	    return ''.join(str(key) + str(d[key]) for key in sorted(d))


	def format_duration(duration):
	    """Format a duration in seconds as an ``HH:MM:SS.mmm`` string.

	    Args:
	        duration: Non-negative duration in seconds (int or float).

	    Returns:
	        The duration rounded to the nearest millisecond, e.g. ``0.125``
	        becomes ``'00:00:00.125'`` and ``75`` becomes ``'00:01:15.000'``.
	    """
	    # Round to whole milliseconds first: truncating the fractional part
	    # turns values such as 2.3 into "...02.299" because of float error.
	    total_msec = round(duration * 1000)
	    total_sec, msec = divmod(total_msec, 1000)
	    # Carry seconds into minutes and hours so every field stays in range.
	    total_min, sec = divmod(total_sec, 60)
	    hour, minute = divmod(total_min, 60)
	    return f'{hour:02}:{minute:02}:{sec:02}.{msec:03}'


	def _add_text(img, text, text_settings):
	    """Draw ``text`` onto the image ``img`` in place.

	    Text containing ``<br/>`` is split into rows at each marker; any other
	    text is wrapped every ``text_settings['width']`` characters. Rows are
	    drawn downwards starting at ``text_settings['coordinate']``.
	    """
	    size = text_settings['font_size']
	    font = ImageFont.truetype(text_settings['font_path'], size)
	    pen = ImageDraw.Draw(img)
	    origin = text_settings['coordinate']
	    wrap = text_settings['width']
	    if '<br/>' in text:
	        rows = text.split('<br/>')
	    else:
	        rows = [
	            text[start:start + wrap]
	            for start in range(0, len(text), wrap)
	        ]
	    line_height = int(size * 1.4)  # line spacing of 1.4x the font size
	    for index, row in enumerate(rows):
	        position = (origin[0], origin[1] + index * line_height)
	        pen.text(position, row, tuple(text_settings['font_color']), font=font)


	def _paste(img, chara_id, mode, character_images, mouse=0):
	    """Paste a character's standing picture onto the image ``img`` in place.

	    Args:
	        img: Destination image; modified in place.
	        chara_id: Key of the character in ``character_images``.
	        mode: Expression key selecting the character's list of image paths.
	        character_images: Per-character settings holding the image paths per
	            expression, a ``scale`` factor and a ``coordinate`` pair.
	        mouse: Index into the expression's image list. Callers in this file
	            pass 0, or 1 for the character who is speaking.
	    """
	    settings = character_images[chara_id]
	    sprite = Image.open(settings[mode][mouse]).convert('RGBA')
	    scale = settings['scale']
	    sprite = sprite.resize(
	        (int(scale * sprite.width), int(scale * sprite.height)))
	    # Pillow documents the paste position as a tuple, so build one instead of
	    # handing over the list object stored in the settings.
	    w = settings['coordinate'][0]
	    h = settings['coordinate'][1]
	    # The sprite is also passed as the mask argument, as in the original call.
	    img.paste(sprite, (w, h), sprite)


	def generate_image(root, shot, character_images,
	                   text_settings, regenerate=False):
	    """Compose the frame image(s) for one shot and return their paths.

	    One image is always produced with no speaker selected. When
	    ``shot['speaker']`` is greater than -1, a second image is produced in
	    which that speaker's character uses image index 1. Files that already
	    exist are reused unless ``regenerate`` is true.

	    Returns:
	        List of output PNG paths, the no-speaker image first.
	    """
	    background = shot['back']
	    cast = shot['characters']
	    # Provisionally draw the spoken line when the shot has no text.
	    caption = shot['serifu'] if shot['text'] == '' else shot['text']
	    stem = '_'.join([
	        root + file_to_hash(background),
	        dict_to_str(cast),
	        str_to_hash(caption),
	    ])
	    # A speaking shot also needs the second image, so two are composed.
	    variants = [-1] + ([shot['speaker']] if shot['speaker'] > -1 else [])
	    paths = []
	    for speaker in variants:
	        suffix = ('_' + str(speaker)) if speaker != -1 else ''
	        path = stem + suffix + '.png'
	        if regenerate or not os.path.isfile(path):
	            canvas = Image.open(background).convert('RGBA')
	            for chara_id, mode in cast.items():
	                mouth = int(chara_id == str(speaker))
	                _paste(canvas, chara_id, mode, character_images, mouth)
	            if caption != '':
	                _add_text(canvas, caption, text_settings)
	            canvas.save(path)
	        paths.append(path)
	    return paths


	def main():
	    """Build the video described by ``./script.toml``.

	    Synthesizes (or reuses) one wav per spoken line, exports the concatenated
	    audio as ``concat.mp3``, composes the frame images and writes ``out.mp4``.
	    All outputs are written under the script's ``root`` prefix.

	    Raises:
	        ValueError: If no shot yields any audio (no speaker and no silence).
	    """
	    # Load the script.
	    with open('./script.toml', encoding='utf-8') as f:
	        script = toml.load(f)
	    root = script['root']
	    voice_settings = script['voice_settings']
	    character_images = {str(c['speaker']): c for c in script['character_images']}
	    text_settings = script['text_settings']
	    shot_default = script['shot_default']
	    shots = script['shots']

	    # Flip to True only while tuning coordinates: renders one test image
	    # from the default shot and exits.
	    if False:
	        shot = shot_default.copy()
	        shot['text'] = 'ああああああああああいいいいいいいいいい'
	        shot['text'] += 'ううううううううううええええええええええ'
	        generate_image(root, shot, character_images,
	                       text_settings, regenerate=True)
	        return

	    # moviepy has a floating-point bug, so 1/2^n seconds is used as the
	    # smallest unit of time.
	    # https://github.com/Zulko/moviepy/issues/646
	    SPK = 0.125  # Seconds Per Koma (frame)

	    # Write each shot's line to a wav and export one mp3 for the whole video.
	    merged_shots = []  # each shot with the defaults applied
	    komas = []  # (voiced frame count, silent frame count) per shot
	    audio_concat = None
	    for shot_ in shots:
	        shot = shot_default.copy()
	        shot.update(shot_)
	        merged_shots.append(shot)
	        komasu = 0
	        silent_komasu = 0
	        audio = None
	        if shot['speaker'] > -1:  # a speaker is set: synthesize the line
	            out_file = root + str(shot['speaker']) + '_' + shot['serifu'][:3] + '_'
	            out_file += str_to_hash(shot['serifu']) + '.wav'
	            if not os.path.isfile(out_file):
	                synthesis(shot['serifu'], out_file, speaker=shot['speaker'],
	                          options=voice_settings.get(str(shot['speaker'])))
	            audio = AudioSegment.from_wav(out_file)
	            # Pad the speech up to a whole number of frames.
	            komasu = math.ceil(audio.duration_seconds / SPK)
	            minisilence_duration = komasu * SPK - audio.duration_seconds
	            audio += AudioSegment.silent(duration=minisilence_duration * 1000)
	        if shot['silence'] > 0:  # trailing silence requested after the line
	            silent_komasu = math.ceil(float(shot['silence']) / SPK)
	            if audio is None:
	                audio = AudioSegment.silent(duration=silent_komasu * SPK * 1000)
	            else:
	                audio += AudioSegment.silent(duration=silent_komasu * SPK * 1000)
	        komas.append((komasu, silent_komasu))
	        if audio is None:
	            # Neither speech nor silence: there is nothing to append, and
	            # adding None to the running audio would raise TypeError.
	            continue
	        if audio_concat is None:
	            audio_concat = audio
	        else:
	            audio_concat += audio
	    if audio_concat is None:
	        raise ValueError('script.toml has no shot with speech or silence')
	    audio_concat.export(f'{root}concat.mp3', format='mp3')

	    # Create the video clips according to each shot's frame counts.
	    clips = []
	    for (komasu, silent_komasu), shot in zip(komas, merged_shots):
	        img_files = generate_image(root, shot, character_images, text_settings)
	        # Voiced frames alternate between the two images, starting with
	        # index 1.
	        for i_koma in range(komasu):
	            image_file = img_files[(i_koma + 1) % 2]
	            clip = ImageClip(image_file).set_duration(format_duration(SPK))
	            clips.append(clip)
	        if silent_komasu > 0:
	            # Silent frames are one clip showing the first image.
	            duration = SPK * silent_komasu
	            clip = ImageClip(img_files[0]).set_duration(format_duration(duration))
	            clips.append(clip)
	    # Write the mp4.
	    video = concatenate_videoclips(clips)
	    video.write_videofile(f'{root}out.mp4', fps=24, audio=f'{root}concat.mp3')


	# Run the pipeline only when this file is executed as a script.
	if __name__ == '__main__':
	    main()
	# png, wav, mp3, mp4 出力フォルダを指定します
	root = "./20230630/" # 要スラッシュ

	# VOICEVOX FastAPI の /synthesis に渡すオプションを指定します
	# キーは VOICEVOX のスタイルIDです
	# ここではずんだもんの話速を速めて抑揚を抑え、
	# またずんだもんもつむぎも音高を落ち着かせています
	[voice_settings]
	3 = { speedScale = 1.2, intonationScale = 0.9, pitchScale = -0.025 }
	8 = { pitchScale = -0.025 }

	# 立ち絵の画像ファイルパスと縮尺と表示座標を指定します
	# 縮尺と表示座標は試行錯誤するまでわからないので最初は適当にします
	# 画像ファイルは各表情タイプに「口閉じ、口開き」の2ファイルを指定します
	# 口開き画像がなければ2つとも同じファイルを指定してもよいです (その場合セリフ中も口パクしません)
	[[character_images]]
	speaker = 3
	scale = 0.59
	coordinate = [460, 286]
	normal = ["../sozai/zundamon_10.png", "../sozai/zundamon_11.png"]
	mattari = ["../sozai/zundamon_00.png", "../sozai/zundamon_01.png"]
	[[character_images]]
	speaker = 8
	scale = 0.555
	coordinate = [-25, 262]
	normal = ["../sozai/tsumugi_10.png", "../sozai/tsumugi_11.png"]
	mattari = ["../sozai/tsumugi_00.png", "../sozai/tsumugi_01.png"]

	# 画面に描画するテキストの設定を指定します
	[text_settings]
	coordinate = [25, 25]
	font_size = 24
	font_path = '../../Downloads/M_PLUS_Rounded_1c/MPLUSRounded1c-Medium.ttf'
	font_color = [0, 0, 0]
	width = 24 # 背景画像が 640 ピクセルなので 24 文字くらい入りますが調整します

	# 場面のデフォルト設定を指定します
	# 各場面はこの設定への差分のみ指定します
	# 立ち絵の座標調整時にもこの設定を使用します
	[shot_default]
	speaker = -1 # 話者のスタイルID (誰も話さないなら -1)
	serifu = "" # セリフ
	silence = 0 # セリフ後無音秒数
	back = "./20230630/back.png" # 背景画像
	characters = { 3 = "normal", 8 = "normal" } # 各人物の表情タイプ
	text = ""

	# 各場面のセリフや背景画像や表情などを指定していきます
	[[shots]]
	silence = 1
	[[shots]]
	speaker = 3
	serifu = "定常時系列のあるステップとkステップ前の共分散をラグkの関数と考えて自己共分散関数というのだ。"
	[[shots]]
	speaker = 8
	serifu = "定常時系列じゃないとだめなのー？"
	[[shots]]
	speaker = 3
	serifu = "時系列が定常でない場合自己共分散はラグkのみでは決まらないのだ。時間ステップによって変わっていってしまうかもしれないのだ。"
	[[shots]]
	speaker = 8
	serifu = "あーそっかー。"
	[[shots]]
	silence = 5
	characters = { 3 = "mattari", 8 = "mattari" }
	text = "[声の出演]<br/>VOICEVOX:ずんだもん<br/>VOICEVOX:春日部つむぎ"
	import requests
	import json
	import time


	# ほぼ以下のコードです
	# https://blog.shikoan.com/voicevox-python/
	def synthesis(text, filename, speaker=1, options=None, max_retry=20):
	    """Synthesize ``text`` via the local HTTP engine and save it to a file.

	    First posts to ``/audio_query`` to obtain the query data, optionally
	    overrides entries of it with ``options``, then posts the result to
	    ``/synthesis`` and writes the response body to ``filename``. Each
	    request is retried, one second apart, until it returns HTTP 200.

	    Args:
	        text: Text to synthesize.
	        filename: Path the audio bytes are written to.
	        speaker: Speaker id sent with both requests.
	        options: Optional dict merged into the query data before synthesis.
	        max_retry: Maximum number of attempts for each of the two requests.

	    Raises:
	        ConnectionError: If either request never returns HTTP 200 within
	            ``max_retry`` attempts.
	    """
	    query_payload = {"text": text, "speaker": speaker}
	    for query_i in range(max_retry):
	        r = requests.post("http://localhost:50021/audio_query",
	                        params=query_payload, timeout=(10.0, 300.0))
	        if r.status_code == 200:
	            query_data = r.json()
	            break
	        time.sleep(1)
	    else:
	        raise ConnectionError(
	            "リトライ回数が上限に到達しました。 audio_query : ",
	            filename, "/", text[:30], r.text)
	    if options is not None:
	        query_data.update(options)

	    # synthesis
	    synth_payload = {"speaker": speaker}
	    for synth_i in range(max_retry):
	        r = requests.post(
	            "http://localhost:50021/synthesis",
	            params=synth_payload,
	            data=json.dumps(query_data), timeout=(10.0, 300.0))
	        if r.status_code == 200:
	            with open(filename, "wb") as fp:
	                fp.write(r.content)
	            print(
	                f"{filename} は query={query_i+1}回, synthesis={synth_i+1}回"
	                "のリトライで正常に保存されました")
	            break
	        time.sleep(1)
	    else:
	        # Report the response body, matching the audio_query error above.
	        raise ConnectionError(
	            "リトライ回数が上限に到達しました。 synthesis : ",
	            filename, "/", text[:30], r.text)