Skip to content

Instantly share code, notes, and snippets.

@tam17aki
Last active February 7, 2020 06:35
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save tam17aki/e41a9069d0df14c2b51500a16209dde1 to your computer and use it in GitHub Desktop.
PySPTKによる逐次分析合成(+ 簡易声質変換)
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (C) 2019 by Akira Tamamori
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import pyaudio
import pysptk
import librosa
from pysptk.synthesis import MLSADF, Synthesizer
# --- Audio I/O configuration ---
sample_rate = 16000 # sampling frequency [Hz]
input_buffer_size = 1024 * 10 # buffer size (input)
output_buffer_size = 1024 * 2 # buffer size (output) -- original comment said "input"; copy-paste typo
pitch_rate = 0.5 # voice pitch factor: 2.0 -> one octave lower, 0.5 -> one octave higher
sp_rate = 0.75 # timbre (formant) factor (> 0.0): < 1.0 for a female-like voice, > 1.0 for male-like
# --- Speech analysis conditions ---
frame_length = 512
frame_shift = 80
# --- Mel-cepstrum extraction conditions ---
order = 25
alpha = 0.41
mcep_floor = 0.0001
def analysis_resynthesis(signal):
    """Analyze a speech buffer and re-synthesize it with modified pitch/timbre.

    The waveform is framed and windowed, a formant-shifted one-sided
    amplitude spectrum is converted to mel-cepstral coefficients, pitch
    is extracted with SWIPE and scaled by ``pitch_rate``, and the result
    is re-synthesized through an MLSA digital filter.

    Parameters
    ----------
    signal : np.ndarray
        1-D float64 waveform sampled at ``sample_rate``.

    Returns
    -------
    np.ndarray
        Re-synthesized waveform; the trailing ``2 * frame_length`` samples
        are dropped to avoid an audible discontinuity between buffers.
    """
    # Framing: (num_frames, frame_length)
    frames = librosa.util.frame(
        signal, frame_length=frame_length,
        hop_length=frame_shift).astype(np.float64).transpose()
    # Windowing (Blackman)
    frames *= pysptk.blackman(frame_length)
    # Pitch extraction; otype="pitch" yields the pitch *period* in samples,
    # so multiplying by pitch_rate > 1 lowers the voice, < 1 raises it.
    pitch = pysptk.swipe(signal, fs=sample_rate,
                         hopsize=frame_shift, min=60, max=240,
                         otype="pitch")
    # Pitch shift
    pitch *= pitch_rate
    # One-sided amplitude spectrum. rfft directly returns the
    # frame_length // 2 + 1 non-redundant bins; the original computed the
    # full FFT and then discarded the symmetric half.
    fft_frames = np.abs(np.fft.rfft(frames))
    # Formant shift: resample the spectral envelope along the frequency axis.
    m_frames = np.zeros_like(fft_frames)
    sp_range = int(m_frames.shape[1] * sp_rate)
    for i in range(m_frames.shape[1]):
        if i < sp_range:
            if sp_rate >= 1.0:
                m_frames[:, i] = fft_frames[:, int(i / sp_rate)]
            else:
                m_frames[:, i] = fft_frames[:, int(i * sp_rate)]
        else:
            # Bins beyond the shifted range are copied unchanged
            # (only reachable when sp_rate < 1.0).
            m_frames[:, i] = fft_frames[:, i]
    # Flooring to avoid log(0) inside mel-cepstral analysis
    m_frames += mcep_floor
    # Mel-cepstrum extraction (itype=3: input is an amplitude spectrum)
    mc = pysptk.mcep(m_frames, order=order, alpha=alpha, itype=3)
    # Convert mel-cepstrum to MLSA digital filter coefficients
    b = pysptk.mc2b(mc, alpha)
    # Excitation signal generated from the (scaled) pitch sequence
    source_excitation = pysptk.excite(pitch, frame_shift)
    # Re-synthesis through the MLSA filter
    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), frame_shift)
    synthesized = synthesizer.synthesis(source_excitation, b)
    # Drop the tail to avoid a choppy, cut-off sound between buffers
    return synthesized[0:-int(2 * frame_length)]
if __name__ == "__main__":
    audio = pyaudio.PyAudio()
    # Microphone capture stream: 16-bit mono at sample_rate.
    stream_in = audio.open(format=pyaudio.paInt16,
                           channels=1,
                           rate=sample_rate,
                           frames_per_buffer=input_buffer_size,
                           input=True)
    # Playback stream: 16-bit mono at sample_rate.
    stream_out = audio.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=sample_rate,
                            frames_per_buffer=output_buffer_size,
                            output=True)
    try:
        # "Starting analysis-synthesis. Please speak."
        print("分析合成を開始します。話しかけてください。")
        while stream_in.is_active():
            # Read one chunk; overflow is ignored so dropped input frames
            # never abort the loop. Renamed from `input`, which shadowed
            # the Python builtin.
            in_bytes = stream_in.read(input_buffer_size,
                                      exception_on_overflow=False)
            signal = np.frombuffer(in_bytes, dtype='int16').astype(np.float64)
            output = analysis_resynthesis(signal)
            # NOTE(review): samples outside the int16 range wrap on astype;
            # consider np.clip(output, -32768, 32767) if distortion occurs.
            stream_out.write(output.astype(np.int16).tobytes())
    except KeyboardInterrupt:
        print("\nInterrupt.")
    finally:
        # Always release audio resources, even on error/interrupt.
        stream_in.stop_stream()
        stream_in.close()
        stream_out.stop_stream()
        stream_out.close()
        audio.terminate()
        # "Ending analysis-synthesis."
        print("分析合成を終了します。")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment