Skip to content

Instantly share code, notes, and snippets.

@tam17aki
Last active February 7, 2020 06:35
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save tam17aki/e41a9069d0df14c2b51500a16209dde1 to your computer and use it in GitHub Desktop.
PySPTKによる逐次分析合成(+ 簡易声質変換)
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (C) 2019 by Akira Tamamori
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import pyaudio
import pysptk
import librosa
from pysptk.synthesis import MLSADF, Synthesizer
# --- Audio I/O configuration ---
sample_rate = 16000 # sampling frequency [Hz]
input_buffer_size = 1024 * 10 # buffer size (input)
output_buffer_size = 1024 * 2 # buffer size (output) -- original comment said "input"; copy-paste typo
pitch_rate = 0.5 # voice pitch factor: 2.0 -> one octave lower, 0.5 -> one octave higher
sp_rate = 0.75 # timbre (formant) factor (> 0.0): < 1.0 for a female-like voice, > 1.0 for male-like
# --- Speech analysis conditions ---
frame_length = 512
frame_shift = 80
# --- Mel-cepstrum extraction conditions ---
order = 25
alpha = 0.41
mcep_floor = 0.0001
def analysis_resynthesis(signal):
    """Analyze a speech buffer and re-synthesize it with modified pitch/timbre.

    The waveform is framed and windowed, a formant-shifted one-sided
    amplitude spectrum is converted to mel-cepstral coefficients, pitch
    is extracted with SWIPE and scaled by ``pitch_rate``, and the result
    is re-synthesized through an MLSA digital filter.

    Parameters
    ----------
    signal : np.ndarray
        1-D float64 waveform sampled at ``sample_rate``.

    Returns
    -------
    np.ndarray
        Re-synthesized waveform; the trailing ``2 * frame_length`` samples
        are dropped to avoid an audible discontinuity between buffers.
    """
    # Framing: (num_frames, frame_length)
    frames = librosa.util.frame(
        signal, frame_length=frame_length,
        hop_length=frame_shift).astype(np.float64).transpose()
    # Windowing (Blackman)
    frames *= pysptk.blackman(frame_length)
    # Pitch extraction; otype="pitch" yields the pitch *period* in samples,
    # so multiplying by pitch_rate > 1 lowers the voice, < 1 raises it.
    pitch = pysptk.swipe(signal, fs=sample_rate,
                         hopsize=frame_shift, min=60, max=240,
                         otype="pitch")
    # Pitch shift
    pitch *= pitch_rate
    # One-sided amplitude spectrum. rfft directly returns the
    # frame_length // 2 + 1 non-redundant bins; the original computed the
    # full FFT and then discarded the symmetric half.
    fft_frames = np.abs(np.fft.rfft(frames))
    # Formant shift: resample the spectral envelope along the frequency axis.
    m_frames = np.zeros_like(fft_frames)
    sp_range = int(m_frames.shape[1] * sp_rate)
    for i in range(m_frames.shape[1]):
        if i < sp_range:
            if sp_rate >= 1.0:
                m_frames[:, i] = fft_frames[:, int(i / sp_rate)]
            else:
                m_frames[:, i] = fft_frames[:, int(i * sp_rate)]
        else:
            # Bins beyond the shifted range are copied unchanged
            # (only reachable when sp_rate < 1.0).
            m_frames[:, i] = fft_frames[:, i]
    # Flooring to avoid log(0) inside mel-cepstral analysis
    m_frames += mcep_floor
    # Mel-cepstrum extraction (itype=3: input is an amplitude spectrum)
    mc = pysptk.mcep(m_frames, order=order, alpha=alpha, itype=3)
    # Convert mel-cepstrum to MLSA digital filter coefficients
    b = pysptk.mc2b(mc, alpha)
    # Excitation signal generated from the (scaled) pitch sequence
    source_excitation = pysptk.excite(pitch, frame_shift)
    # Re-synthesis through the MLSA filter
    synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), frame_shift)
    synthesized = synthesizer.synthesis(source_excitation, b)
    # Drop the tail to avoid a choppy, cut-off sound between buffers
    return synthesized[0:-int(2 * frame_length)]
if __name__ == "__main__":
    audio = pyaudio.PyAudio()
    # Microphone capture stream: 16-bit mono at sample_rate.
    stream_in = audio.open(format=pyaudio.paInt16,
                           channels=1,
                           rate=sample_rate,
                           frames_per_buffer=input_buffer_size,
                           input=True)
    # Playback stream: 16-bit mono at sample_rate.
    stream_out = audio.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=sample_rate,
                            frames_per_buffer=output_buffer_size,
                            output=True)
    try:
        # "Starting analysis-synthesis. Please speak."
        print("分析合成を開始します。話しかけてください。")
        while stream_in.is_active():
            # Read one chunk; overflow is ignored so dropped input frames
            # never abort the loop. Renamed from `input`, which shadowed
            # the Python builtin.
            in_bytes = stream_in.read(input_buffer_size,
                                      exception_on_overflow=False)
            signal = np.frombuffer(in_bytes, dtype='int16').astype(np.float64)
            output = analysis_resynthesis(signal)
            # NOTE(review): samples outside the int16 range wrap on astype;
            # consider np.clip(output, -32768, 32767) if distortion occurs.
            stream_out.write(output.astype(np.int16).tobytes())
    except KeyboardInterrupt:
        print("\nInterrupt.")
    finally:
        # Always release audio resources, even on error/interrupt.
        stream_in.stop_stream()
        stream_in.close()
        stream_out.stop_stream()
        stream_out.close()
        audio.terminate()
        # "Ending analysis-synthesis."
        print("分析合成を終了します。")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment