tam17aki/world_encode_decode.py

## world_encode_decode.py
import pyworld as pw
import pysptk
from scipy.io import wavfile
import numpy as np

def _to_int16_pcm(signal):
    """Return *signal* converted to 16-bit PCM samples.

    Samples are clipped to the int16 range before the cast. A bare
    ``astype(np.int16)`` does not saturate, so out-of-range samples would
    otherwise be written with incorrect (typically wrapped-around) values.
    Samples already inside the range are converted exactly as before.
    """
    limits = np.iinfo(np.int16)
    return np.clip(signal, limits.min, limits.max).astype(np.int16)


# Load the example utterance and keep an untouched copy for comparison.
fs, x = wavfile.read(pysptk.util.example_audio_file())
assert fs == 16000
wavfile.write('./orig.wav', fs, x)

# Convert the samples from short (int16) to float64 before analysis.
x = x.astype(np.float64)

# Feature extraction (fundamental frequency, spectral envelope, aperiodicity).
f0, sp, ap = pw.wav2world(x, fs)
fft_size = pw.get_cheaptrick_fft_size(fs)

# Dimensionality reduction of the features -> DCT-based method
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
# Author's note: reducing to 50 dimensions leaves the audio quality the same
# as before the reduction.
sp_dim = 50
code_sp = pw.code_spectral_envelope(sp, fs, sp_dim)
code_ap = pw.code_aperiodicity(ap, fs)

# The `dim` of the coded aperiodicity is determined by `fs` as follows:
# fs = `16000` : `1`
# fs = `22050` : `2`
# fs = `44100` : `5`
# fs = `48000` : `5`
decode_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
decode_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)

# Resynthesis from the original (uncoded) features.
y = _to_int16_pcm(pw.synthesize(f0, sp, ap, fs))
outfile = 'world_resynthesis.wav'
wavfile.write(outfile, fs, y)

# Resynthesis from the features that went through coding and decoding.
y = _to_int16_pcm(pw.synthesize(f0, decode_sp, decode_ap, fs))
outfile = 'world_resynthesis_coded.wav'
wavfile.write(outfile, fs, y)
	import pyworld as pw
	import pysptk
	from scipy.io import wavfile
	import numpy as np

	# Load the example utterance and keep an untouched copy for comparison.
	fs, x = wavfile.read(pysptk.util.example_audio_file())
	assert fs == 16000
	wavfile.write('./orig.wav', fs, x)

	# Convert the samples from short (int16) to float64 before analysis.
	x = x.astype(np.float64)

	# Feature extraction (fundamental frequency, spectral envelope, aperiodicity).
	f0, sp, ap = pw.wav2world(x, fs)
	fft_size = pw.get_cheaptrick_fft_size(fs)

	# Dimensionality reduction of the features -> DCT-based method
	# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
	# Author's note: reducing to 50 dimensions leaves the audio quality the same
	# as before the reduction.
	sp_dim = 50
	code_sp = pw.code_spectral_envelope(sp, fs, sp_dim)
	code_ap = pw.code_aperiodicity(ap, fs)

	# The `dim` of the coded aperiodicity is determined by `fs` as follows:
	# fs = `16000` : `1`
	# fs = `22050` : `2`
	# fs = `44100` : `5`
	# fs = `48000` : `5`
	decode_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
	decode_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)

	# A bare astype(np.int16) does not saturate, so out-of-range samples would
	# be written with incorrect values. Clip to the int16 limits before each
	# cast; in-range samples are converted exactly as before.
	pcm_limits = np.iinfo(np.int16)

	# Resynthesis from the original (uncoded) features.
	y = pw.synthesize(f0, sp, ap, fs)
	y = np.clip(y, pcm_limits.min, pcm_limits.max).astype(np.int16)
	outfile = 'world_resynthesis.wav'
	wavfile.write(outfile, fs, y)

	# Resynthesis from the features that went through coding and decoding.
	y = pw.synthesize(f0, decode_sp, decode_ap, fs)
	y = np.clip(y, pcm_limits.min, pcm_limits.max).astype(np.int16)
	outfile = 'world_resynthesis_coded.wav'
	wavfile.write(outfile, fs, y)