# amiasato/pitchshift.py

## pitchshift.py
import torch
from torch.nn import functional as F

from torch_audiomentations.utils.io import Audio

import soundfile as sf

# Change this for whatever file you wish
input_path = "test_fixtures/perfect-alley1.ogg"
output_path = "shifted.wav"

sr = 16000
n_fft = sr // 64  # Rule-of-thumb for FFT size wrt the samplerate for okayish quality
hop_length = n_fft // 2
warp_factor = 1.2  # Should actually be mapped to dBs in the future


def pitch_shift(
    samples: torch.Tensor,
    n_fft: int,
    hop_length: int,
    warp_factor: float,
) -> torch.Tensor:
    """Shift the pitch of ``samples`` by ``warp_factor`` while keeping its length.

    The signal is first resampled in the time domain by ``1 / warp_factor``
    (which changes both pitch and duration), and then the duration change is
    undone by stretching the STFT along its frame axis by ``warp_factor``.

    Args:
        samples: 2-D tensor whose last dimension is time, e.g. ``(channels, time)``.
        n_fft: FFT size; also used as the Hann window length.
        hop_length: Hop between successive STFT frames.
        warp_factor: Positive resampling factor. ``1.0`` leaves the signal
            unchanged up to STFT round-trip error.

    Returns:
        Tensor with the same shape as ``samples``.

    Raises:
        ValueError: If ``warp_factor`` is not strictly positive.
    """
    if warp_factor <= 0:
        raise ValueError(f"warp_factor must be positive, got {warp_factor}")

    num_samples = samples.shape[-1]
    window = torch.hann_window(n_fft, dtype=samples.dtype, device=samples.device)

    # Doing time interpolation instead of frequency interpolation results
    # in better quality, so we first interpolate in the time domain.
    # F.interpolate(mode="linear") expects (batch, channels, time), hence the
    # temporary leading batch dimension.
    warped = F.interpolate(
        samples.unsqueeze(0),
        scale_factor=1 / warp_factor,
        mode="linear",
        align_corners=False,
    )[0]

    # Now undo the time interpolation in the time-frequency domain (after fft),
    # preserving the pitch shift.
    # A complex STFT is requested because torch.istft no longer accepts real
    # (..., 2) inputs as of PyTorch 2.0.
    spec = torch.stft(
        warped, n_fft, hop_length=hop_length, window=window, return_complex=True
    )

    # Bilinear interpolation only works on real tensors: expose real/imag as a
    # channel axis -> (batch, 2, freq, frames), and stretch the frame axis only.
    stretched = F.interpolate(
        torch.view_as_real(spec).permute(0, 3, 1, 2),
        scale_factor=(1.0, warp_factor),
        mode="bilinear",
        align_corners=False,
    )

    # view_as_complex needs the real/imag pair to be the last, contiguous axis.
    stretched = torch.view_as_complex(stretched.permute(0, 2, 3, 1).contiguous())

    # Revert back to the time domain; `length` trims or zero-pads the result
    # so that it matches the input length exactly.
    return torch.istft(
        stretched,
        n_fft,
        hop_length=hop_length,
        window=window,
        length=num_samples,
    )


def main() -> None:
    """Load ``input_path``, pitch-shift it and write the result to ``output_path``."""
    audio = Audio(sample_rate=sr)
    samples = audio(input_path)
    print(samples.shape)

    shifted = pitch_shift(samples, n_fft, hop_length, warp_factor)
    print(samples.shape, shifted.shape)

    # Write file
    with sf.SoundFile(output_path, mode="w", samplerate=sr, channels=1) as f:
        f.write(shifted.numpy()[0])


if __name__ == "__main__":
    main()