Last active
February 7, 2020 06:35
Star
You must be signed in to star a gist
PySPTKによる逐次分析合成(+ 簡易声質変換)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding:utf-8 -*- | |
# Copyright (C) 2019 by Akira Tamamori | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
import numpy as np | |
import pyaudio | |
import pysptk | |
import librosa | |
from pysptk.synthesis import MLSADF, Synthesizer | |
sample_rate = 16000  # sampling frequency [Hz]
input_buffer_size = 1024 * 10  # buffer size (input)
output_buffer_size = 1024 * 2  # buffer size (output) — original comment said "input"; copy-paste slip
pitch_rate = 0.5  # pitch adjustment: 2.0 drops the voice one octave, 0.5 raises it one octave
sp_rate = 0.75  # timbre/formant adjustment (> 0.0): < 1.0 for a more female-sounding voice, > 1.0 for male
# Speech analysis conditions
frame_length = 512
frame_shift = 80
# Mel-cepstrum extraction conditions
order = 25
alpha = 0.41
mcep_floor = 0.0001
def analysis_resynthesis(signal):
    """Analyze a speech waveform and re-synthesize it with altered voice quality.

    The signal is decomposed into a mel-cepstral spectral envelope and a
    pitch contour.  The pitch contour is scaled by the module-level
    ``pitch_rate``, the spectral envelope is warped by ``sp_rate`` (a simple
    formant shift), and the result is re-synthesized through an MLSA filter.
    Returns the synthesized waveform as a 1-D float array, with the last
    ``2 * frame_length`` samples trimmed off.
    """
    # Frame the signal: result is (num_frames, frame_length) in float64.
    frames = librosa.util.frame(
        signal, frame_length=frame_length,
        hop_length=frame_shift).astype(np.float64).transpose()
    # Apply a Blackman window to every frame.
    frames *= pysptk.blackman(frame_length)
    # Extract the pitch contour (otype="pitch": pitch period in samples).
    pitch = pysptk.swipe(signal, fs=sample_rate,
                         hopsize=frame_shift, min=60, max=240,
                         otype="pitch")
    # Pitch shift: scaling the period by 0.5 raises the voice one octave.
    pitch *= pitch_rate
    # Magnitude spectrum; symmetry lets us keep only the lower half (+1 bin).
    spectra = np.abs(np.fft.fft(frames))[:, 0:int(frame_length / 2) + 1]
    # Formant shift: bins below sp_range are remapped (truncating index
    # arithmetic, as in the original), the remaining bins are copied as-is.
    num_bins = spectra.shape[1]
    shifted = np.zeros_like(spectra)
    sp_range = int(num_bins * sp_rate)
    for bin_idx in range(num_bins):
        if bin_idx >= sp_range:
            shifted[:, bin_idx] = spectra[:, bin_idx]
        elif sp_rate >= 1.0:
            shifted[:, bin_idx] = spectra[:, int(bin_idx / sp_rate)]
        else:
            shifted[:, bin_idx] = spectra[:, int(bin_idx * sp_rate)]
    # Flooring keeps the cepstral analysis away from log(0).
    shifted += mcep_floor
    # Mel-cepstrum from the half magnitude spectrum (itype=3).
    mc = pysptk.mcep(shifted, order=order, alpha=alpha, itype=3)
    # Convert mel-cepstrum to MLSA digital filter coefficients.
    b = pysptk.mc2b(mc, alpha)
    # Build the excitation signal from the (shifted) pitch contour.
    source_excitation = pysptk.excite(pitch, frame_shift)
    # Re-synthesize speech through the MLSA filter.
    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), frame_shift)
    synthesized = synthesizer.synthesis(source_excitation, b)
    # Trim the tail to avoid an audible abrupt cut between chunks.
    return synthesized[0:-int(2 * frame_length)]
if __name__ == "__main__":
    # Real-time loop: capture microphone audio, run analysis/resynthesis,
    # and play the converted voice back.  Fix vs. original: the locals
    # `input` and `output` shadowed Python builtins; renamed.
    audio = pyaudio.PyAudio()
    # 16-bit mono capture stream at the analysis sampling rate.
    stream_in = audio.open(format=pyaudio.paInt16,
                           channels=1,
                           rate=sample_rate,
                           frames_per_buffer=input_buffer_size,
                           input=True)
    # 16-bit mono playback stream for the re-synthesized audio.
    stream_out = audio.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=sample_rate,
                            frames_per_buffer=output_buffer_size,
                            output=True)
    try:
        print("分析合成を開始します。話しかけてください。")
        while stream_in.is_active():
            # Don't raise on overflow: a slow processing step should drop
            # audio rather than crash the loop.
            in_bytes = stream_in.read(input_buffer_size,
                                      exception_on_overflow=False)
            signal = np.frombuffer(in_bytes, dtype='int16').astype(np.float64)
            synthesized = analysis_resynthesis(signal)
            stream_out.write(synthesized.astype(np.int16).tobytes())
    except KeyboardInterrupt:
        print("\nInterrupt.")
    finally:
        # Always release the audio device, even on error/interrupt.
        stream_in.stop_stream()
        stream_in.close()
        stream_out.stop_stream()
        stream_out.close()
        audio.terminate()
        print("分析合成を終了します。")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment