takana-v/vv_core_wrapper.py

## vv_core_wrapper.py
# vv_core_wrapper.py
# Copyright (c) 2022 takana-v
#
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php

import faulthandler
import json

from ctypes import *
from pprint import pprint

# Debugging aid: have faulthandler dump the Python traceback if the process
# crashes (for example on a fault raised while running native library code).
faulthandler.enable()

# Placeholder paths -- edit them to point at the real files before running.
#   core_path: the VOICEVOX core shared library that gets loaded below.
#   dict_path: the Open JTalk dictionary directory; the demo at the bottom of
#              the file stores it in VoicevoxInitializeOptions.open_jtalk_dict_dir.
core_path = r"/path/to/voicevox_core.dll"
dict_path = r"/path/to/open_jtalk_dic_utf_8-1.11"
# Loaded eagerly at import time; every prototype declaration and every call in
# this file goes through this handle.
core = cdll.LoadLibrary(core_path)


class VoicevoxInitializeOptions(Structure):
    """ctypes mirror of the options struct consumed by ``voicevox_initialize``.

    The struct is exchanged with the native library by value: it is returned
    by ``voicevox_make_default_initialize_options`` and passed to
    ``voicevox_initialize``.
    """

    # Field order and C types define the memory layout shared with the native
    # library, so entries must not be reordered or retyped.
    _fields_ = [
        ("acceleration_mode", c_int32),     # C int32; value meanings are defined by the native library
        ("cpu_num_threads", c_uint16),      # C uint16
        ("load_all_models", c_bool),        # C bool
        ("open_jtalk_dict_dir", c_char_p),  # NUL-terminated byte string (directory path)
    ]


class VoicevoxAudioQueryOptions(Structure):
    """ctypes mirror of the options struct consumed by ``voicevox_audio_query``.

    Returned by value from ``voicevox_make_default_audio_query_options`` and
    passed by value to ``voicevox_audio_query``.
    """

    # Single C bool; the layout must stay in sync with the native library.
    _fields_ = [
        ("kana", c_bool),  # C bool
    ]


class VoicevoxSynthesisOptions(Structure):
    """ctypes mirror of the options struct consumed by ``voicevox_synthesis``.

    Returned by value from ``voicevox_make_default_synthesis_options`` and
    passed by value to ``voicevox_synthesis``.
    """

    # Single C bool; the layout must stay in sync with the native library.
    _fields_ = [
        ("enable_interrogative_upspeak", c_bool),  # C bool
    ]


class VoicevoxTtsOptions(Structure):
    """ctypes mirror of the options struct consumed by ``voicevox_tts``.

    Carries the two flags that ``VoicevoxAudioQueryOptions`` and
    ``VoicevoxSynthesisOptions`` hold individually. Returned by value from
    ``voicevox_make_default_tts_options`` and passed by value to
    ``voicevox_tts``.
    """

    # Two consecutive C bools; order must stay in sync with the native library.
    _fields_ = [
        ("kana", c_bool),                          # C bool
        ("enable_interrogative_upspeak", c_bool),  # C bool
    ]


# ---------------------------------------------------------------------------
# Native function prototypes.
#
# Declaring argtypes/restype makes ctypes convert and type-check every call
# instead of treating arguments and results as plain C ints.  Option structs
# are passed and returned by value.  Functions whose restype is c_int32 return
# a result code: 0 means success, and any other value can be turned into a
# message with voicevox_error_result_to_message.
# ---------------------------------------------------------------------------
core.voicevox_audio_query.argtypes = (c_char_p, c_uint32, VoicevoxAudioQueryOptions, POINTER(c_char_p))
core.voicevox_audio_query.restype = c_int32

core.voicevox_audio_query_json_free.argtypes = (c_char_p,)
core.voicevox_audio_query_json_free.restype = None

core.voicevox_error_result_to_message.argtypes = (c_int32,)
core.voicevox_error_result_to_message.restype = c_char_p

core.voicevox_finalize.argtypes = ()
core.voicevox_finalize.restype = None

core.voicevox_get_metas_json.argtypes = ()
core.voicevox_get_metas_json.restype = c_char_p

core.voicevox_get_supported_devices_json.argtypes = ()
core.voicevox_get_supported_devices_json.restype = c_char_p

core.voicevox_get_version.argtypes = ()
core.voicevox_get_version.restype = c_char_p

core.voicevox_initialize.argtypes = (VoicevoxInitializeOptions,)
core.voicevox_initialize.restype = c_int32

core.voicevox_is_gpu_mode.argtypes = ()
core.voicevox_is_gpu_mode.restype = c_bool

core.voicevox_is_model_loaded.argtypes = (c_uint32,)
core.voicevox_is_model_loaded.restype = c_bool

core.voicevox_load_model.argtypes = (c_uint32,)
core.voicevox_load_model.restype = c_int32

core.voicevox_make_default_audio_query_options.argtypes = ()
core.voicevox_make_default_audio_query_options.restype = VoicevoxAudioQueryOptions

core.voicevox_make_default_initialize_options.argtypes = ()
core.voicevox_make_default_initialize_options.restype = VoicevoxInitializeOptions

core.voicevox_make_default_synthesis_options.argtypes = ()
core.voicevox_make_default_synthesis_options.restype = VoicevoxSynthesisOptions

core.voicevox_make_default_tts_options.argtypes = ()
core.voicevox_make_default_tts_options.restype = VoicevoxTtsOptions

# The C API declares the output length as `uintptr_t *`, i.e. a pointer-sized
# unsigned integer.  It must therefore be a c_size_t: a 4-byte c_uint buffer
# would be overrun by the 8-byte store the core performs on 64-bit platforms.
core.voicevox_synthesis.argtypes = (c_char_p, c_uint32, VoicevoxSynthesisOptions, POINTER(c_size_t), POINTER(POINTER(c_uint8)))
core.voicevox_synthesis.restype = c_int32

core.voicevox_tts.argtypes = (c_char_p, c_uint32, VoicevoxTtsOptions, POINTER(c_size_t), POINTER(POINTER(c_uint8)))
core.voicevox_tts.restype = c_int32

core.voicevox_wav_free.argtypes = (POINTER(c_uint8),)
core.voicevox_wav_free.restype = None


def _raise_on_error(result_code):
    """Raise ``Exception`` with the core's own message unless *result_code* is 0."""
    if result_code != 0:
        raise Exception(core.voicevox_error_result_to_message(result_code).decode("utf-8"))


def _save_wav_and_free(path, wav, wav_length):
    """Write a core-allocated WAV buffer to *path*, then release the buffer.

    Args:
        path: Name of the file to create (opened in binary write mode).
        wav: ``POINTER(c_uint8)`` filled in by voicevox_tts / voicevox_synthesis.
        wav_length: ``c_size_t`` filled in by the same call (length in bytes).

    The buffer is handed back to ``voicevox_wav_free`` even if printing or
    writing the file raises.
    """
    try:
        print(f'output_wav_length: {wav_length.value}')
        with open(path, mode="wb") as f:
            # string_at copies the whole buffer in one native call; building
            # the bytes object one element at a time is quadratic.
            f.write(string_at(wav, wav_length.value))
    finally:
        core.voicevox_wav_free(wav)


if __name__ == "__main__":
    # Initialize the core.
    initialize_option = core.voicevox_make_default_initialize_options()
    initialize_option.open_jtalk_dict_dir = c_char_p(dict_path.encode())
    _raise_on_error(core.voicevox_initialize(initialize_option))

    # From here on the core is initialized, so always finalize it on the way
    # out, including when one of the steps below raises.
    try:
        # Print assorted information.
        print(f'version: {core.voicevox_get_version().decode("utf-8")}')
        print(f'is_gpu: {core.voicevox_is_gpu_mode()}')
        print('metas:')
        pprint(json.loads(core.voicevox_get_metas_json()))

        # Load the model.
        speaker_id = 8
        print(f'is_model_loaded: {core.voicevox_is_model_loaded(speaker_id)}')
        _raise_on_error(core.voicevox_load_model(speaker_id))
        print(f'is_model_loaded: {core.voicevox_is_model_loaded(speaker_id)}')

        # Try text-to-speech.
        tts_option = core.voicevox_make_default_tts_options()
        text = "これはテストです。".encode("utf-8")
        wav_length = c_size_t()
        wav = POINTER(c_uint8)()  # NULL until the core stores the buffer address
        _raise_on_error(core.voicevox_tts(text, speaker_id, tts_option, byref(wav_length), byref(wav)))
        _save_wav_and_free("output.wav", wav, wav_length)

        # Try creating an AudioQuery.
        audio_query_option = core.voicevox_make_default_audio_query_options()
        audio_query_json_out = c_char_p()
        _raise_on_error(core.voicevox_audio_query(text, speaker_id, audio_query_option, byref(audio_query_json_out)))
        try:
            # .value copies the C string into a Python bytes object, so the
            # parsed result stays valid after the core frees its buffer.
            audio_query = json.loads(audio_query_json_out.value)
        finally:
            core.voicevox_audio_query_json_free(audio_query_json_out)
        print('AudioQuery:')
        pprint(audio_query)

        # Tweak the AudioQuery a little.
        audio_query["pitch_scale"] = 0.1

        # Synthesize speech from the (modified) AudioQuery.
        synthesis_option = core.voicevox_make_default_synthesis_options()
        audio_query_json = json.dumps(audio_query).encode("utf-8")
        wav_length = c_size_t()
        wav = POINTER(c_uint8)()
        _raise_on_error(core.voicevox_synthesis(audio_query_json, speaker_id, synthesis_option, byref(wav_length), byref(wav)))
        _save_wav_and_free("output_2.wav", wav, wav_length)
    finally:
        # Clean up.
        core.voicevox_finalize()
	# vv_core_wrapper.py
	# Copyright (c) 2022 takana-v
	#
	# This software is released under the MIT License.
	# http://opensource.org/licenses/mit-license.php

	import faulthandler
	import json

	from ctypes import *
	from pprint import pprint

	# Debugging aid: have faulthandler dump the Python traceback if the process
	# crashes (for example on a fault raised while running native library code).
	faulthandler.enable()

	# Placeholder paths -- edit them to point at the real files before running.
	core_path = r"/path/to/voicevox_core.dll"
	dict_path = r"/path/to/open_jtalk_dic_utf_8-1.11"
	# Loaded eagerly; every declaration and call below goes through this handle.
	core = cdll.LoadLibrary(core_path)


	# ctypes layout of the options struct passed by value to voicevox_initialize.
	# Field order and C types define the memory layout shared with the library.
	# NOTE(review): the body indentation of this class has been lost in this copy.
	class VoicevoxInitializeOptions(Structure):
	_fields_ = [
	("acceleration_mode", c_int32),
	("cpu_num_threads", c_uint16),
	("load_all_models", c_bool),
	("open_jtalk_dict_dir", c_char_p),
	]


	# ctypes layout of the options struct passed by value to voicevox_audio_query.
	# NOTE(review): the body indentation of this class has been lost in this copy.
	class VoicevoxAudioQueryOptions(Structure):
	_fields_ = [
	("kana", c_bool),
	]


	# ctypes layout of the options struct passed by value to voicevox_synthesis.
	# NOTE(review): the body indentation of this class has been lost in this copy.
	class VoicevoxSynthesisOptions(Structure):
	_fields_ = [
	("enable_interrogative_upspeak", c_bool),
	]


	# ctypes layout of the options struct passed by value to voicevox_tts.
	# NOTE(review): the body indentation of this class has been lost in this copy.
	class VoicevoxTtsOptions(Structure):
	_fields_ = [
	("kana", c_bool),
	("enable_interrogative_upspeak", c_bool),
	]


	# Native function prototypes: argtypes/restype tell ctypes how to convert
	# arguments and results for each exported function.  Option structs are
	# passed and returned by value; c_int32 results are result codes that the
	# demo below treats as success when equal to 0.
	core.voicevox_audio_query.argtypes = (c_char_p, c_uint32, VoicevoxAudioQueryOptions, POINTER(c_char_p))
	core.voicevox_audio_query.restype = c_int32

	core.voicevox_audio_query_json_free.argtypes = (c_char_p,)
	core.voicevox_audio_query_json_free.restype = None

	core.voicevox_error_result_to_message.argtypes = (c_int32,)
	core.voicevox_error_result_to_message.restype = c_char_p

	core.voicevox_finalize.argtypes = ()
	core.voicevox_finalize.restype = None

	core.voicevox_get_metas_json.argtypes = ()
	core.voicevox_get_metas_json.restype = c_char_p

	core.voicevox_get_supported_devices_json.argtypes = ()
	core.voicevox_get_supported_devices_json.restype = c_char_p

	core.voicevox_get_version.argtypes = ()
	core.voicevox_get_version.restype = c_char_p

	core.voicevox_initialize.argtypes = (VoicevoxInitializeOptions,)
	core.voicevox_initialize.restype = c_int32

	core.voicevox_is_gpu_mode.argtypes = ()
	core.voicevox_is_gpu_mode.restype = c_bool

	core.voicevox_is_model_loaded.argtypes = (c_uint32,)
	core.voicevox_is_model_loaded.restype = c_bool

	core.voicevox_load_model.argtypes = (c_uint32,)
	core.voicevox_load_model.restype = c_int32

	core.voicevox_make_default_audio_query_options.argtypes = ()
	core.voicevox_make_default_audio_query_options.restype = VoicevoxAudioQueryOptions

	core.voicevox_make_default_initialize_options.argtypes = ()
	core.voicevox_make_default_initialize_options.restype = VoicevoxInitializeOptions

	core.voicevox_make_default_synthesis_options.argtypes = ()
	core.voicevox_make_default_synthesis_options.restype = VoicevoxSynthesisOptions

	core.voicevox_make_default_tts_options.argtypes = ()
	core.voicevox_make_default_tts_options.restype = VoicevoxTtsOptions

	# NOTE(review): the output-length argument is declared as a pointer to a
	# one-element c_uint array.  If the C header declares it as `uintptr_t *`,
	# this should be POINTER(c_size_t) -- confirm against voicevox_core.h.
	core.voicevox_synthesis.argtypes = (c_char_p, c_uint32, VoicevoxSynthesisOptions, POINTER(c_uint * 1), POINTER(POINTER(c_uint8)))
	core.voicevox_synthesis.restype = c_int32

	core.voicevox_tts.argtypes = (c_char_p, c_uint32, VoicevoxTtsOptions, POINTER(c_uint * 1), POINTER(POINTER(c_uint8)))
	core.voicevox_tts.restype = c_int32

	core.voicevox_wav_free.argtypes = (POINTER(c_uint8),)
	core.voicevox_wav_free.restype = None


	# Demo: initialize the core, run text-to-speech, build an AudioQuery, modify
	# it, synthesize from it, and finalize.
	# NOTE(review): the block indentation below the `if`/`for`/`with` lines has
	# been lost in this copy of the listing.
	if __name__ == "__main__":
	# Initialize the core
	initialize_option = core.voicevox_make_default_initialize_options()
	initialize_option.open_jtalk_dict_dir = c_char_p(dict_path.encode())
	res = core.voicevox_initialize(initialize_option)
	if res != 0:
	raise Exception(core.voicevox_error_result_to_message(res).decode("utf-8"))

	# Print assorted information
	print(f'version: {core.voicevox_get_version().decode("utf-8")}')
	print(f'is_gpu: {core.voicevox_is_gpu_mode()}')
	print(f'metas:')
	pprint(json.loads(core.voicevox_get_metas_json()))

	# Load the model
	speaker_id = 8
	print(f'is_model_loaded: {core.voicevox_is_model_loaded(speaker_id)}')
	res = core.voicevox_load_model(speaker_id)
	if res != 0:
	raise Exception(core.voicevox_error_result_to_message(res).decode("utf-8"))
	print(f'is_model_loaded: {core.voicevox_is_model_loaded(speaker_id)}')

	# Try text-to-speech
	tts_option = core.voicevox_make_default_tts_options()
	output_wav_length = pointer((c_uint * 1)())
	output_wav = pointer(pointer(c_uint8()))
	text = c_char_p("これはテストです。".encode("utf-8"))
	res = core.voicevox_tts(text, speaker_id, tts_option, output_wav_length, output_wav)
	if res != 0:
	raise Exception(core.voicevox_error_result_to_message(res).decode("utf-8"))
	print(f'output_wav_length: {output_wav_length.contents[0]}')
	# Copy the returned buffer into a bytes object one element at a time
	output_wav_bin = b""
	for i in range(output_wav_length.contents[0]):
	output_wav_bin += output_wav.contents[i].to_bytes(1, "big")
	with open("output.wav", mode="wb") as f:
	f.write(output_wav_bin)
	# Hand the buffer back to the core
	core.voicevox_wav_free(output_wav.contents)

	# Try creating an AudioQuery
	audio_query_option = core.voicevox_make_default_audio_query_options()
	output_audio_query_json = pointer(c_char_p())
	res = core.voicevox_audio_query(text, speaker_id, audio_query_option, output_audio_query_json)
	if res != 0:
	raise Exception(core.voicevox_error_result_to_message(res).decode("utf-8"))
	audio_query = json.loads(output_audio_query_json.contents.value)
	print(f'AudioQuery:')
	pprint(audio_query)
	core.voicevox_audio_query_json_free(output_audio_query_json.contents)

	# Tweak the AudioQuery a little
	audio_query["pitch_scale"] = 0.1

	# Synthesize speech from the AudioQuery
	synthesis_option = core.voicevox_make_default_synthesis_options()
	audio_query_json = c_char_p(json.dumps(audio_query).encode("utf-8"))
	output_wav_length = pointer((c_uint * 1)())
	output_wav = pointer(pointer(c_uint8()))
	res = core.voicevox_synthesis(audio_query_json, speaker_id, synthesis_option, output_wav_length, output_wav)
	if res != 0:
	raise Exception(core.voicevox_error_result_to_message(res).decode("utf-8"))
	print(f'output_wav_length: {output_wav_length.contents[0]}')
	output_wav_bin = b""
	for i in range(output_wav_length.contents[0]):
	output_wav_bin += output_wav.contents[i].to_bytes(1, "big")
	with open("output_2.wav", mode="wb") as f:
	f.write(output_wav_bin)
	core.voicevox_wav_free(output_wav.contents)

	# Clean up
	core.voicevox_finalize()