Created
September 11, 2021 16:24
-
-
Save iwasakishuto/2fadb544e95351d963b89b24f804bd0e to your computer and use it in GitHub Desktop.
Convert voice not using deeplearning.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
"""Voice conversion. | |
1. Install `sptk <http://sp-tk.sourceforge.net/>`_
```sh | |
$ tar xvzf SPTK-3.11.tar.gz | |
$ cd SPTK-3.11
$ ./configure | |
$ make | |
$ sudo make install | |
``` | |
2. Install `pyaudio <https://pypi.org/project/PyAudio/>`_ | |
```sh | |
$ pip install pyaudio | |
``` | |
If the error message "fatal error: 'portaudio.h' file not found" appears, try the following commands.
```sh | |
$ brew update | |
$ brew install portaudio | |
$ brew link --overwrite portaudio | |
$ pip install pyaudio | |
``` | |
""" | |
import argparse | |
import os | |
import struct | |
import subprocess | |
import wave | |
from typing import List | |
import pyaudio | |
def recordAudio(
    wave_file: str,
    ch: int = 1,
    fmt: int = pyaudio.paInt16,
    rate: int = 16000,
    record_second: int = 10,
    chunk: int = 1024,
    device_idx: int = 0,
) -> None:
    """Record ``record_second`` seconds of audio from an input device to a file.

    Args:
        wave_file: Path of the output file, written through the ``wave`` module.
        ch: Number of channels.
        fmt: PyAudio sample format (e.g. ``pyaudio.paInt16``).
        rate: Sampling rate [Hz].
        record_second: Recording duration [s].
        chunk: Number of frames read per buffer.
        device_idx: Index of the input device to record from.
    """
    # NOTE(review): this writes a RIFF/WAV header even though the default
    # filename is ``tmp.raw`` and the downstream SPTK pipeline reads it with
    # ``x2x +sf`` as raw samples — confirm this is intended.
    print(f"Now Recording ({record_second}[s])")
    p = pyaudio.PyAudio()
    try:
        with wave.open(wave_file, mode="wb") as wf:
            wf.setnchannels(ch)
            wf.setsampwidth(p.get_sample_size(fmt))
            wf.setframerate(rate)
            stream = p.open(
                format=fmt,
                channels=ch,
                rate=rate,
                input=True,
                input_device_index=device_idx,
                frames_per_buffer=chunk,
            )
            try:
                frames: List[bytes] = []
                for _ in range(int(rate / chunk * record_second)):
                    frames.append(stream.read(chunk))
                wf.writeframes(b"".join(frames))
            finally:
                # BUG FIX: the stream was never stopped/closed before
                # terminate(), leaking the audio device handle.
                stream.stop_stream()
                stream.close()
    finally:
        p.terminate()
    print("Done.")
def play(wave_file: str, ch: int = 1, rate: int = 16000, chunk: int = 1024) -> None:
    """Play an audio file through the default output device.

    The file is read as a raw byte stream (no header parsing) and interpreted
    as 16-bit samples (``get_format_from_width(2)``).

    Args:
        wave_file: Path of the file to play.
        ch: Number of channels.
        rate: Sampling rate [Hz].
        chunk: Number of bytes written to the stream per iteration.
    """
    p = pyaudio.PyAudio()
    stream = p.open(
        format=p.get_format_from_width(2), channels=ch, rate=rate, output=True
    )
    try:
        with open(wave_file, mode="rb") as f:
            data = f.read(chunk)
            while data != b"":
                stream.write(data)
                data = f.read(chunk)
        stream.stop_stream()
    finally:
        # BUG FIX: the stream was never closed and PyAudio never terminated,
        # leaking the audio device handle.
        stream.close()
        p.terminate()
def modify_pitch(
    pitch_file: str, mcep_file: str, out_file: str, pitch: float = 0.0
) -> None:
    """Resynthesize the voice with its pitch scaled by ``pitch``.

    Runs the SPTK pipeline: scale the pitch contour (``sopr``), generate an
    excitation signal (``excite``), filter it with the mel-cepstrum (``mlsadf``),
    clip, and convert float samples to shorts (``x2x``) into ``out_file``.
    """
    pipeline = " | ".join(
        [
            f"sopr -m {pitch} {pitch_file}",
            "excite -p 80",
            f"mlsadf -m 25 -a 0.42 -p 80 {mcep_file}",
            "clip -y -32000 32000",
            f"x2x +fs > {out_file}",
        ]
    )
    subprocess.call(pipeline, shell=True)
def modify_speed(
    pitch_file: str, mcep_file: str, out_file: str, frame_shift: int = 300
) -> None:
    """Resynthesize the voice with a different speaking speed.

    A larger ``frame_shift`` stretches each frame of the excitation and of the
    MLSA filter update, changing the playback speed of the result written to
    ``out_file`` (raw 16-bit samples via ``x2x +fs``).
    """
    subprocess.call(
        f"excite -p {frame_shift} {pitch_file} | mlsadf -m 25 -a 0.42 -p {frame_shift} {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
def robot_voice(
    mcep_file: str,
    out_file: str,
    frame_period: int = 100,
    record_second: int = 10,
    rate: int = 16000,
) -> None:
    """Resynthesize a robot-like voice.

    Instead of the recorded pitch contour, a fixed-period pulse train
    (``train``) drives the MLSA filter, which produces the flat, mechanical
    intonation.
    """
    # NOTE(review): ``-l record_second * rate * frame_period`` looks like it
    # over-counts the output length (the recording itself is only
    # record_second * rate samples) — confirm against the SPTK ``train`` manual.
    subprocess.call(
        f"train -p {frame_period} -l {record_second * rate * frame_period} | mlsadf -m 25 -a 0.42 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
def child_voice(pitch_file: str, mcep_file: str, out_file: str) -> None:
    """Resynthesize a child-like voice.

    Raises the pitch (``sopr -m 0.4`` scales the pitch period down) and uses a
    smaller all-pass constant (``-a 0.1``) in the MLSA filter, which shifts the
    spectral envelope toward a shorter vocal tract.
    """
    subprocess.call(
        f"sopr -m 0.4 {pitch_file} | excite -p 80 | mlsadf -m 25 -a 0.1 -p 80 {mcep_file} | clip -y -32000 32000 | x2x +fs > {out_file}",
        shell=True,
    )
if __name__ == "__main__":
    # Record audio, extract pitch / mel-cepstrum with SPTK, then resynthesize
    # the voice with the selected conversion method and play the result.
    parser = argparse.ArgumentParser(prog="", description="", add_help=True)
    parser.add_argument("-CH", "--channel", type=int, default=1)
    parser.add_argument("-RATE", "--rate", type=int, default=16000)
    parser.add_argument("-CHUNK", "--chunk", type=int, default=1024)
    parser.add_argument("-FMT", "--format", type=int, default=pyaudio.paInt16)
    parser.add_argument("-DEVICE", "--device", type=int, default=0)
    parser.add_argument("-SEC", "--record-second", type=int, default=3)
    parser.add_argument("--wave-file", type=str, default="tmp.raw")
    parser.add_argument("--pitch-file", type=str, default="tmp.pitch")
    parser.add_argument("--mcep-file", type=str, default="tmp.mcep")
    parser.add_argument("--out-file", type=str, default="out.raw")
    parser.add_argument("--pitch", type=float, default=None)  # e.g. 0.3
    parser.add_argument("--frame_shift", type=int, default=None)  # e.g. 300
    parser.add_argument(
        "-M",
        "--method",
        type=str,
        choices=["hoarse", "robot", "child"],
        default="child",
    )
    args = parser.parse_args()

    wave_file = args.wave_file
    pitch_file = args.pitch_file
    mcep_file = args.mcep_file
    out_file = args.out_file
    method = args.method

    recordAudio(
        wave_file=wave_file,
        ch=args.channel,
        fmt=args.format,
        rate=args.rate,
        record_second=args.record_second,
        chunk=args.chunk,
        device_idx=args.device,
    )

    # Feature extraction: pitch contour and mel-cepstral envelope (SPTK).
    print("Extracting information about pitch...")
    subprocess.call(
        f"x2x +sf {wave_file} | pitch -a 1 -s 16 -p 80 > {pitch_file}", shell=True
    )
    # BUG FIX: message said "Mec Cepstrum"; the pipeline computes a mel cepstrum.
    print("Extracting information about Mel Cepstrum...")
    subprocess.call(
        f"x2x +sf {wave_file} | frame -p 80 | window | mcep -m 25 -a 0.42 > {mcep_file}",
        shell=True,
    )

    # Optional pre-processing steps; both overwrite wave_file in place.
    if args.pitch is not None:
        print(f"Modify the pitch of input audio (pitch={args.pitch})")
        modify_pitch(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            pitch=args.pitch,
        )
    if args.frame_shift is not None:
        # BUG FIX: the message printed args.pitch instead of args.frame_shift.
        print(f"Modify the speed of input audio (speed={args.frame_shift})")
        modify_speed(
            pitch_file=pitch_file,
            mcep_file=mcep_file,
            out_file=wave_file,
            frame_shift=args.frame_shift,
        )

    # BUG FIX: message typo "convertsion" -> "conversion".
    print(f"Perform voice conversion (method={method})")
    if method == "hoarse":
        modify_pitch(
            pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file, pitch=0.0
        )
    elif method == "robot":
        robot_voice(
            mcep_file=mcep_file,
            out_file=out_file,
            frame_period=100,
            record_second=args.record_second,
            rate=args.rate,
        )
    elif method == "child":
        child_voice(pitch_file=pitch_file, mcep_file=mcep_file, out_file=out_file)

    print("Play the output audio.")
    play(out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment