@iwasakishuto
Created September 11, 2021 16:24
Convert voice without using deep learning.
# coding: utf-8
"""Voice conversion.
1. Install `SPTK <http://sp-tk.sourceforge.net/>`_
```sh
$ tar xvzf SPTK-3.11.tar.gz
$ cd SPTK-3.11
$ ./configure
$ make
$ sudo make install
```
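If the build succeeds, the SPTK commands this script calls should be on your PATH; a quick sanity check (the list simply mirrors the pipelines used below):
```sh
$ which x2x frame window mcep pitch sopr excite mlsadf clip train
```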
2. Install `pyaudio <https://pypi.org/project/PyAudio/>`_
```sh
$ pip install pyaudio
```
If the error message "fatal error: 'portaudio.h' file not found" appears, try the following commands.
```sh
$ brew update
$ brew install portaudio
$ brew link --overwrite portaudio
$ pip install pyaudio
```
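3. Run the script. The file name `voice_conversion.py` is only an example; save this gist under any name.
```sh
$ python voice_conversion.py -M child -SEC 3   # record 3 s, convert to a child-like voice
$ python voice_conversion.py -M hoarse         # pitch removed -> whisper-like voice
$ python voice_conversion.py -M robot
```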
"""
import argparse
import os
import struct
import subprocess
import wave
from typing import List
import pyaudio


def recordAudio(
    wave_file: str,
    ch: int = 1,
    fmt: int = pyaudio.paInt16,
    rate: int = 16000,
    record_second: int = 10,
    chunk: int = 1024,
    device_idx: int = 0,
) -> None:
    """Record ``record_second`` seconds from the selected input device into a WAV file."""
    print(f"Now Recording ({record_second}[s])")
    p = pyaudio.PyAudio()
    with wave.open(wave_file, mode="wb") as wf:
        wf.setnchannels(ch)
        wf.setsampwidth(p.get_sample_size(fmt))
        wf.setframerate(rate)
        stream = p.open(
            format=fmt,
            channels=ch,
            rate=rate,
            input=True,
            input_device_index=device_idx,
            frames_per_buffer=chunk,
        )
        frames: List[bytes] = []
        for _ in range(int(rate / chunk * record_second)):
            frames.append(stream.read(chunk))
        wf.writeframes(b"".join(frames))
        stream.stop_stream()
        stream.close()
    p.terminate()
    print("Done.")


def play(wave_file: str, ch: int = 1, rate: int = 16000, chunk: int = 1024) -> None:
    """Play back a headerless 16-bit PCM (raw) file such as the SPTK output."""
    p = pyaudio.PyAudio()
    stream = p.open(
        format=p.get_format_from_width(2), channels=ch, rate=rate, output=True
    )
    with open(wave_file, mode="rb") as f:
        data = f.read(chunk)
        while data != b"":
            stream.write(data)
            data = f.read(chunk)
    stream.stop_stream()
    stream.close()
    p.terminate()


def modify_pitch(
    pitch_file: str, mcep_file: str, out_file: str, pitch: float = 0.0
) -> None:
    """Scale the pitch sequence by ``pitch`` (0.0 removes it, giving a hoarse voice) and re-synthesise with the MLSA filter."""
    subprocess.call(
        f"sopr -m {pitch} {pitch_file} | excite -p 80 | mlsadf -m 25 -a 0.42 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )


def modify_speed(
    pitch_file: str, mcep_file: str, out_file: str, frame_shift: int = 300
) -> None:
    """Re-synthesise with a different frame period (``frame_shift``) to change the playback speed."""
    subprocess.call(
        f"excite -p {frame_shift} {pitch_file} | mlsadf -m 25 -a 0.42 -p {frame_shift} {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )


def robot_voice(
    mcep_file: str,
    out_file: str,
    frame_period: int = 100,
    record_second: int = 10,
    rate: int = 16000,
) -> None:
    """Replace the excitation with a fixed-period pulse train (SPTK ``train``) to get a robot-like voice."""
    subprocess.call(
        f"train -p {frame_period} -l {record_second * rate * frame_period} | mlsadf -m 25 -a 0.42 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )


def child_voice(pitch_file: str, mcep_file: str, out_file: str) -> None:
    """Scale the pitch sequence and use a smaller all-pass constant to mimic a child-like voice."""
    subprocess.call(
        f"sopr -m 0.4 {pitch_file} | excite -p 80 | mlsadf -m 25 -a 0.1 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert voice without using deep learning.", add_help=True
    )
    parser.add_argument("-CH", "--channel", type=int, default=1)
    parser.add_argument("-RATE", "--rate", type=int, default=16000)
    parser.add_argument("-CHUNK", "--chunk", type=int, default=1024)
    parser.add_argument("-FMT", "--format", type=int, default=pyaudio.paInt16)
    parser.add_argument("-DEVICE", "--device", type=int, default=0)
    parser.add_argument("-SEC", "--record-second", type=int, default=3)
    parser.add_argument("--wave-file", type=str, default="tmp.raw")
    parser.add_argument("--pitch-file", type=str, default="tmp.pitch")
    parser.add_argument("--mcep-file", type=str, default="tmp.mcep")
    parser.add_argument("--out-file", type=str, default="out.raw")
    parser.add_argument("--pitch", type=float, default=None)  # e.g. 0.3
    parser.add_argument("--frame_shift", type=int, default=None)  # e.g. 300
    parser.add_argument(
        "-M",
        "--method",
        type=str,
        choices=["hoarse", "robot", "child"],
        default="child",
    )
    args = parser.parse_args()

    wave_file = args.wave_file
    pitch_file = args.pitch_file
    mcep_file = args.mcep_file
    out_file = args.out_file
    method = args.method

    # if os.path.exists(wave_file):
    #     print(f"Use the existing wave file ({wave_file})")
    # else:
    recordAudio(
        wave_file=wave_file,
        ch=args.channel,
        fmt=args.format,
        rate=args.rate,
        record_second=args.record_second,
        chunk=args.chunk,
        device_idx=args.device,
    )

    print("Extracting pitch information...")
    subprocess.call(
        f"x2x +sf {wave_file} | pitch -a 1 -s 16 -p 80 > {pitch_file}", shell=True
    )
    print("Extracting mel-cepstrum information...")
    subprocess.call(
        f"x2x +sf {wave_file} | frame -p 80 | window | mcep -m 25 -a 0.42 > {mcep_file}",
        shell=True,
    )

    if args.pitch is not None:
        print(f"Modify the pitch of the input audio (pitch={args.pitch})")
        modify_pitch(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            pitch=args.pitch,
        )
    if args.frame_shift is not None:
        print(f"Modify the speed of the input audio (frame_shift={args.frame_shift})")
        modify_speed(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            frame_shift=args.frame_shift,
        )

    print(f"Perform voice conversion (method={method})")
    if method == "hoarse":
        modify_pitch(
            pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file, pitch=0.0
        )
    elif method == "robot":
        robot_voice(
            mcep_file=mcep_file,
            out_file=out_file,
            frame_period=100,
            record_second=args.record_second,
            rate=args.rate,
        )
    elif method == "child":
        child_voice(pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file)

    print("Play the output audio.")
    play(out_file, ch=args.channel, rate=args.rate, chunk=args.chunk)