Created
September 11, 2021 16:24
-
-
Save iwasakishuto/2fadb544e95351d963b89b24f804bd0e to your computer and use it in GitHub Desktop.
Convert voice not using deeplearning.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
"""Voice conversion. | |
1. Install `sptk <http://sp-tk.sourceforge.net/>`_
```sh | |
$ tar xvzf SPTK-3.11.tar.gz | |
$ cd SPTK-3.11
$ ./configure | |
$ make | |
$ sudo make install | |
``` | |
2. Install `pyaudio <https://pypi.org/project/PyAudio/>`_ | |
```sh | |
$ pip install pyaudio | |
``` | |
If the error message "fatal error: 'portaudio.h' file not found" appears, try the following commands.
```sh | |
$ brew update | |
$ brew install portaudio | |
$ brew link --overwrite portaudio | |
$ pip install pyaudio | |
``` | |
""" | |
import argparse | |
import os | |
import struct | |
import subprocess | |
import wave | |
from typing import List | |
import pyaudio | |
def recordAudio(
    wave_file: str,
    ch: int = 1,
    fmt: int = pyaudio.paInt16,
    rate: int = 16000,
    record_second: int = 10,
    chunk: int = 1024,
    device_idx: int = 0,
) -> None:
    """Record ``record_second`` seconds of audio from an input device to a file.

    Args:
        wave_file: Path of the output file, written through the ``wave`` module.
        ch: Number of channels.
        fmt: PyAudio sample format (e.g. ``pyaudio.paInt16``).
        rate: Sampling rate [Hz].
        record_second: Recording duration [s].
        chunk: Number of frames read per buffer.
        device_idx: Index of the input device to record from.
    """
    # NOTE(review): this writes a RIFF/WAV header even though the default
    # filename is ``tmp.raw`` and the downstream SPTK pipeline reads it with
    # ``x2x +sf`` as raw samples — confirm this is intended.
    print(f"Now Recording ({record_second}[s])")
    p = pyaudio.PyAudio()
    try:
        with wave.open(wave_file, mode="wb") as wf:
            wf.setnchannels(ch)
            wf.setsampwidth(p.get_sample_size(fmt))
            wf.setframerate(rate)
            stream = p.open(
                format=fmt,
                channels=ch,
                rate=rate,
                input=True,
                input_device_index=device_idx,
                frames_per_buffer=chunk,
            )
            try:
                frames: List[bytes] = []
                for _ in range(int(rate / chunk * record_second)):
                    frames.append(stream.read(chunk))
                wf.writeframes(b"".join(frames))
            finally:
                # BUG FIX: the stream was never stopped/closed before
                # terminate(), leaking the audio device handle.
                stream.stop_stream()
                stream.close()
    finally:
        p.terminate()
    print("Done.")
def play(wave_file: str, ch: int = 1, rate: int = 16000, chunk: int = 1024) -> None:
    """Play an audio file through the default output device.

    The file is read as a raw byte stream (no header parsing) and interpreted
    as 16-bit samples (``get_format_from_width(2)``).

    Args:
        wave_file: Path of the file to play.
        ch: Number of channels.
        rate: Sampling rate [Hz].
        chunk: Number of bytes written to the stream per iteration.
    """
    p = pyaudio.PyAudio()
    stream = p.open(
        format=p.get_format_from_width(2), channels=ch, rate=rate, output=True
    )
    try:
        with open(wave_file, mode="rb") as f:
            data = f.read(chunk)
            while data != b"":
                stream.write(data)
                data = f.read(chunk)
        stream.stop_stream()
    finally:
        # BUG FIX: the stream was never closed and PyAudio never terminated,
        # leaking the audio device handle.
        stream.close()
        p.terminate()
def modify_pitch(
    pitch_file: str, mcep_file: str, out_file: str, pitch: float = 0.0
) -> None:
    """Resynthesize the voice with its pitch scaled by ``pitch``.

    Runs the SPTK pipeline: scale the pitch contour (``sopr``), generate an
    excitation signal (``excite``), filter it with the mel-cepstrum (``mlsadf``),
    clip, and convert float samples to shorts (``x2x``) into ``out_file``.
    """
    pipeline = " | ".join(
        [
            f"sopr -m {pitch} {pitch_file}",
            "excite -p 80",
            f"mlsadf -m 25 -a 0.42 -p 80 {mcep_file}",
            "clip -y -32000 32000",
            f"x2x +fs > {out_file}",
        ]
    )
    subprocess.call(pipeline, shell=True)
def modify_speed(
    pitch_file: str, mcep_file: str, out_file: str, frame_shift: int = 300
) -> None:
    """Resynthesize the voice with a different speaking speed.

    A larger ``frame_shift`` stretches each frame of the excitation and of the
    MLSA filter update, changing the playback speed of the result written to
    ``out_file`` (raw 16-bit samples via ``x2x +fs``).
    """
    subprocess.call(
        f"excite -p {frame_shift} {pitch_file} | mlsadf -m 25 -a 0.42 -p {frame_shift} {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
def robot_voice(
    mcep_file: str,
    out_file: str,
    frame_period: int = 100,
    record_second: int = 10,
    rate: int = 16000,
) -> None:
    """Resynthesize a robot-like voice.

    Instead of the recorded pitch contour, a fixed-period pulse train
    (``train``) drives the MLSA filter, which produces the flat, mechanical
    intonation.
    """
    # NOTE(review): ``-l record_second * rate * frame_period`` looks like it
    # over-counts the output length (the recording itself is only
    # record_second * rate samples) — confirm against the SPTK ``train`` manual.
    subprocess.call(
        f"train -p {frame_period} -l {record_second * rate * frame_period} | mlsadf -m 25 -a 0.42 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
def child_voice(pitch_file: str, mcep_file: str, out_file: str) -> None:
    """Resynthesize a child-like voice.

    Raises the pitch (``sopr -m 0.4`` scales the pitch period down) and uses a
    smaller all-pass constant (``-a 0.1``) in the MLSA filter, which shifts the
    spectral envelope toward a shorter vocal tract.
    """
    subprocess.call(
        f"sopr -m 0.4 {pitch_file} | excite -p 80 | mlsadf -m 25 -a 0.1 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
if __name__ == "__main__":
    # Record audio, extract pitch / mel-cepstrum with SPTK, then resynthesize
    # the voice with the selected conversion method and play the result.
    parser = argparse.ArgumentParser(prog="", description="", add_help=True)
    parser.add_argument("-CH", "--channel", type=int, default=1)
    parser.add_argument("-RATE", "--rate", type=int, default=16000)
    parser.add_argument("-CHUNK", "--chunk", type=int, default=1024)
    parser.add_argument("-FMT", "--format", type=int, default=pyaudio.paInt16)
    parser.add_argument("-DEVICE", "--device", type=int, default=0)
    parser.add_argument("-SEC", "--record-second", type=int, default=3)
    parser.add_argument("--wave-file", type=str, default="tmp.raw")
    parser.add_argument("--pitch-file", type=str, default="tmp.pitch")
    parser.add_argument("--mcep-file", type=str, default="tmp.mcep")
    parser.add_argument("--out-file", type=str, default="out.raw")
    parser.add_argument("--pitch", type=float, default=None)  # e.g. 0.3
    parser.add_argument("--frame_shift", type=int, default=None)  # e.g. 300
    parser.add_argument(
        "-M",
        "--method",
        type=str,
        choices=["hoarse", "robot", "child"],
        default="child",
    )
    args = parser.parse_args()

    wave_file = args.wave_file
    pitch_file = args.pitch_file
    mcep_file = args.mcep_file
    out_file = args.out_file
    method = args.method

    recordAudio(
        wave_file=wave_file,
        ch=args.channel,
        fmt=args.format,
        rate=args.rate,
        record_second=args.record_second,
        chunk=args.chunk,
        device_idx=args.device,
    )

    # Feature extraction: pitch contour and mel-cepstral envelope (SPTK).
    print("Extracting information about pitch...")
    subprocess.call(
        f"x2x +sf {wave_file} | pitch -a 1 -s 16 -p 80 > {pitch_file}", shell=True
    )
    # BUG FIX: message said "Mec Cepstrum"; the pipeline computes a mel cepstrum.
    print("Extracting information about Mel Cepstrum...")
    subprocess.call(
        f"x2x +sf {wave_file} | frame -p 80 | window | mcep -m 25 -a 0.42 > {mcep_file}",
        shell=True,
    )

    # Optional pre-processing steps; both overwrite wave_file in place.
    if args.pitch is not None:
        print(f"Modify the pitch of input audio (pitch={args.pitch})")
        modify_pitch(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            pitch=args.pitch,
        )
    if args.frame_shift is not None:
        # BUG FIX: the message printed args.pitch instead of args.frame_shift.
        print(f"Modify the speed of input audio (speed={args.frame_shift})")
        modify_speed(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            frame_shift=args.frame_shift,
        )

    # BUG FIX: message typo "convertsion" -> "conversion".
    print(f"Perform voice conversion (method={method})")
    if method == "hoarse":
        modify_pitch(
            pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file, pitch=0.0
        )
    elif method == "robot":
        robot_voice(
            mcep_file=mcep_file,
            out_file=out_file,
            frame_period=100,
            record_second=args.record_second,
            rate=args.rate,
        )
    elif method == "child":
        child_voice(pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file)

    print("Play the output audio.")
    play(out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment