Skip to content

Instantly share code, notes, and snippets.

@kastnerkyle
Last active March 2, 2019 03:53
Show Gist options
  • Save kastnerkyle/2eec2695a48849f33b94b272355ef368 to your computer and use it in GitHub Desktop.
Save kastnerkyle/2eec2695a48849f33b94b272355ef368 to your computer and use it in GitHub Desktop.
{
"transcript": "the Roman letter was used side by side with the Gothic.",
"words": [
{
"case": "success",
"end": 0.11,
"endOffset": 3,
"phones": [
{
"duration": 0.05,
"phone": "dh_B"
},
{
"duration": 0.06,
"phone": "ah_E"
}
],
"alignedWord": "the",
"start": 0.0,
"startOffset": 0,
"word": "the"
},
{
"case": "success",
"end": 0.52,
"endOffset": 9,
"phones": [
{
"duration": 0.1,
"phone": "r_B"
},
{
"duration": 0.08,
"phone": "ow_I"
},
{
"duration": 0.08,
"phone": "m_I"
},
{
"duration": 0.05,
"phone": "ah_I"
},
{
"duration": 0.1,
"phone": "n_E"
}
],
"alignedWord": "roman",
"start": 0.11,
"startOffset": 4,
"word": "Roman"
},
{
"case": "success",
"end": 0.8300000000000001,
"endOffset": 16,
"phones": [
{
"duration": 0.05,
"phone": "l_B"
},
{
"duration": 0.08,
"phone": "eh_I"
},
{
"duration": 0.08,
"phone": "t_I"
},
{
"duration": 0.1,
"phone": "er_E"
}
],
"alignedWord": "letter",
"start": 0.52,
"startOffset": 10,
"word": "letter"
},
{
"case": "success",
"end": 1.05,
"endOffset": 20,
"phones": [
{
"duration": 0.08,
"phone": "w_B"
},
{
"duration": 0.07,
"phone": "ah_I"
},
{
"duration": 0.07,
"phone": "z_E"
}
],
"alignedWord": "was",
"start": 0.83,
"startOffset": 17,
"word": "was"
},
{
"case": "success",
"end": 1.4100000000000001,
"endOffset": 25,
"phones": [
{
"duration": 0.12,
"phone": "y_B"
},
{
"duration": 0.16,
"phone": "uw_I"
},
{
"duration": 0.07,
"phone": "z_I"
},
{
"duration": 0.01,
"phone": "d_E"
}
],
"alignedWord": "used",
"start": 1.05,
"startOffset": 21,
"word": "used"
},
{
"case": "success",
"end": 1.8599999999999999,
"endOffset": 30,
"phones": [
{
"duration": 0.17,
"phone": "s_B"
},
{
"duration": 0.12,
"phone": "ay_I"
},
{
"duration": 0.08,
"phone": "d_E"
}
],
"alignedWord": "side",
"start": 1.49,
"startOffset": 26,
"word": "side"
},
{
"case": "success",
"end": 2.0500000000000003,
"endOffset": 33,
"phones": [
{
"duration": 0.08,
"phone": "b_B"
},
{
"duration": 0.1,
"phone": "ay_E"
}
],
"alignedWord": "by",
"start": 1.87,
"startOffset": 31,
"word": "by"
},
{
"case": "success",
"end": 2.51,
"endOffset": 38,
"phones": [
{
"duration": 0.15,
"phone": "s_B"
},
{
"duration": 0.23,
"phone": "ay_I"
},
{
"duration": 0.08,
"phone": "d_E"
}
],
"alignedWord": "side",
"start": 2.05,
"startOffset": 34,
"word": "side"
},
{
"case": "success",
"end": 2.73,
"endOffset": 43,
"phones": [
{
"duration": 0.09,
"phone": "w_B"
},
{
"duration": 0.06,
"phone": "ih_I"
},
{
"duration": 0.06,
"phone": "th_E"
}
],
"alignedWord": "with",
"start": 2.52,
"startOffset": 39,
"word": "with"
},
{
"case": "success",
"end": 2.85,
"endOffset": 47,
"phones": [
{
"duration": 0.06,
"phone": "dh_B"
},
{
"duration": 0.06,
"phone": "ah_E"
}
],
"alignedWord": "the",
"start": 2.73,
"startOffset": 44,
"word": "the"
},
{
"case": "success",
"end": 3.33,
"endOffset": 54,
"phones": [
{
"duration": 0.09,
"phone": "g_B"
},
{
"duration": 0.14,
"phone": "aa_I"
},
{
"duration": 0.11,
"phone": "th_I"
},
{
"duration": 0.07,
"phone": "ih_I"
},
{
"duration": 0.07,
"phone": "k_E"
}
],
"alignedWord": "gothic",
"start": 2.85,
"startOffset": 48,
"word": "Gothic"
}
]
}
the Roman letter was used side by side with the Gothic.
from scipy.io import wavfile
import numpy as np
import json
import scipy.signal as sg
from scipy import linalg, fftpack
from numpy.lib.stride_tricks import as_strided
def _raised_cosine_window(window_length, periodic, a, b):
    """Return the raised-cosine window ``a - b * cos(2 * pi * k / n)``.

    Parameters
    ----------
    window_length : int
        Number of samples in the returned window.
    periodic : bool
        When true and ``window_length`` is even, the cosine period is
        ``window_length`` samples (periodic window). Otherwise the period is
        ``window_length - 1`` samples (symmetric window).
    a, b : float
        Constant offset and cosine coefficient; ``0.5, 0.5`` gives a Hann
        window.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(window_length,)``.
    """
    if window_length == 1:
        # The general formula would divide by n == 0 and produce NaN.
        return np.ones(1, dtype=np.float64)
    even = 1 - window_length % 2
    # Honour the caller's flag; it used to be overwritten with a constant 1.
    periodic = 1. if periodic else 0.
    n = np.float64(window_length + periodic * even - 1)
    count = np.arange(window_length).astype(np.float64)
    cos_arg = 2 * np.pi * count / n
    return a - b * np.cos(cos_arg)
def soundsc(X, gain_scale=.9, copy=True):
    """Rescale a signal to the int16 range for writing as 16-bit audio.

    The signal is mapped linearly so that its minimum becomes
    ``-gain_scale * 2**15`` and its maximum ``+gain_scale * 2**15``.

    Parameters
    ----------
    X : array_like
        Input samples of any numeric dtype.
    gain_scale : float
        Fraction of full scale used by the output peak.
    copy : bool
        When false, an existing float64 array is used without copying first.
        The input is never modified either way.

    Returns
    -------
    numpy.ndarray
        int16 array with the same shape as ``X``. Empty or constant input
        yields zeros (silence).
    """
    # Work in float64: integer input (e.g. int16 wav data) would otherwise
    # overflow in ``X - X.min()``.
    if copy:
        X = np.array(X, dtype=np.float64)
    else:
        X = np.asarray(X, dtype=np.float64)
    if X.size == 0:
        return np.zeros(X.shape, dtype='int16')
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # A constant signal has no dynamic range to normalise.
        return np.zeros(X.shape, dtype='int16')
    X = (X - lowest) / span
    X = 2 * X - 1
    X = gain_scale * X
    X = X * 2 ** 15
    # +2**15 is not representable in int16; clip instead of wrapping around.
    return np.clip(X, -2 ** 15, 2 ** 15 - 1).astype('int16')
def halfoverlap(X, window_size):
    """Frame a 1D signal into windows that overlap by half a window.

    The signal is zero-padded by ``window_size - len(X) % window_size``
    samples (a whole extra window when the length already divides evenly),
    then exposed as a strided view with one frame per row.

    Parameters
    ----------
    X : 1D array
        Input samples.
    window_size : int
        Samples per frame; must be even.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, window_size)`` whose rows share memory
        with each other, so it should be treated as read-only.

    Raises
    ------
    ValueError
        If ``window_size`` is odd.
    """
    if window_size % 2:
        raise ValueError("Window size must be even!")
    hop = window_size // 2
    n_pad = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((n_pad,))))
    n_frames = len(padded) // hop - 1
    sample_bytes = padded.itemsize
    return as_strided(padded,
                      shape=(n_frames, window_size),
                      strides=(sample_bytes * hop, sample_bytes))
def overlap(X, window_size, window_step, window=None, copy=True):
    """Frame a 1D signal into windows of ``window_size`` hopped by ``window_step``.

    The signal is zero-padded by ``window_size - len(X) % window_size``
    samples and returned as a strided view with one frame per row.

    Parameters
    ----------
    X : 1D numpy array
        Input samples.
    window_size : int
        Samples per frame; must be even.
    window_step : int
        Hop between the starts of consecutive frames, in samples.
    window : unused
        Accepted for interface compatibility; no windowing is applied here.
    copy : bool
        Copy ``X`` before framing.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_frames, window_size)`` whose rows share memory.

    Raises
    ------
    ValueError
        If ``X`` is not a 1D array or ``window_size`` is odd.
    """
    is_1d_array = hasattr(X, "shape") and len(X.shape) == 1
    if not is_1d_array:
        raise ValueError("X must be passed as 1D np array")
    if copy:
        X = np.array(X)
    if window_size % 2:
        raise ValueError("Window size must be even!")
    tail = np.zeros((window_size - len(X) % window_size))
    signal = np.hstack((X, tail))
    shared = window_size - window_step
    (sample_stride,) = signal.strides
    n_frames = (signal.shape[-1] - shared) // window_step
    return as_strided(signal,
                      shape=(n_frames, window_size),
                      strides=(window_step * sample_stride, sample_stride))
def stft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True, real=False,
         window_type="hann", periodic=True, compute_onesided=True):
    """Compute the short-time Fourier transform of a 1D signal.

    Parameters
    ----------
    X : 1D numpy array
        Input samples. The array is not modified.
    windowsize : int, optional
        Frame length in samples. Only used when ``fftsize`` is None, in which
        case the FFT length is the next power of two >= ``windowsize``.
    fftsize : int, optional
        FFT length. When given it is also used as the frame length.
    step : "half" or int
        ``"half"`` frames with 50% overlap via ``halfoverlap``; an integer is
        the hop in samples passed to ``overlap``.
    mean_normalize : bool
        Subtract the signal mean before framing.
    real : bool
        Not supported; passing True raises ValueError.
    window_type : str
        Only ``"hann"`` is supported.
    periodic : bool
        Only True is supported.
    compute_onesided : bool
        Keep only the first ``fftsize // 2 + 1`` frequency bins.

    Returns
    -------
    numpy.ndarray
        Complex array with one row per frame and one column per bin.

    Raises
    ------
    ValueError
        For ``real=True``, an unsupported window, or when neither
        ``windowsize`` nor ``fftsize`` is given.
    """
    if real:
        raise ValueError("real=True needs debug")
    if window_type != "hann" or not periodic:
        raise ValueError("No other windows currently supported")
    if fftsize is None:
        if windowsize is None:
            raise ValueError("Either windowsize or fftsize must be given")
        # Smallest power of two that encloses the analysis window.
        fftsize = int(2 ** np.ceil(np.log(windowsize) / np.log(2.0)))
    else:
        windowsize = fftsize
    cut = fftsize // 2 + 1 if compute_onesided else None
    if mean_normalize:
        # Out-of-place on purpose: ``X -= ...`` would modify the caller's
        # array and fails outright for integer input.
        X = X - X.mean()
    if step == "half":
        X = halfoverlap(X, windowsize)
    else:
        X = overlap(X, windowsize, step)
    # The window must match the frame length; the FFT zero-pads each frame up
    # to ``fftsize`` when the two differ.
    win = _raised_cosine_window(windowsize, True, 0.5, 0.5)
    X = X * win[None]
    X = fftpack.fft(X.astype(np.float64), n=fftsize)[:, :cut]
    return X
def overlap_add(X_strided, window_step):
    """Reassemble a 1D signal by summing overlapping frames.

    Frame ``i`` is added into the output starting at sample
    ``i * window_step``. No window compensation is applied.

    Parameters
    ----------
    X_strided : 2D numpy array
        Frames, one per row, shape ``(n_rows, window_size)``.
    window_step : int
        Hop between the starts of consecutive frames, in samples.

    Returns
    -------
    numpy.ndarray
        1D array of length ``(n_rows - 1) * window_step + window_size`` with
        the dtype of ``X_strided``; empty when there are no frames.
    """
    n_rows, window_size = X_strided.shape
    if n_rows == 0:
        return np.zeros((0,), dtype=X_strided.dtype)
    # Size the buffer to exactly what the frames cover, so a hop larger than
    # the window can never run past the end of the buffer.
    total_length = (n_rows - 1) * window_step + window_size
    X = np.zeros((total_length,), dtype=X_strided.dtype)
    for i in range(n_rows):
        start_index = i * window_step
        X[start_index:start_index + window_size] += X_strided[i]
    return X
def istft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True,
          real=False, compute_onesided=True):
    """Compute the inverse STFT of STFT-transformed ``X``.

    Parameters
    ----------
    X : 2D complex numpy array
        One row per frame, one column per frequency bin.
    windowsize : int, optional
        Not used for the computation; it may only be given together with
        ``fftsize``.
    fftsize : int, optional
        FFT length used by the forward transform. When None it is inferred
        from the number of bins in ``X``.
    step : "half" or int
        ``"half"`` reassembles frames with ``invert_halfoverlap``; an integer
        is the hop in samples passed to ``overlap_add``.
    mean_normalize : bool
        Subtract the mean of the reconstructed signal.
    real : bool
        Not supported; passing True raises ValueError, as in ``stft``.
    compute_onesided : bool
        ``X`` holds only the first ``fftsize // 2 + 1`` bins; the remaining
        bins are rebuilt by conjugate symmetry before inverting.

    Returns
    -------
    numpy.ndarray
        1D float64 array containing the reconstructed signal.

    Raises
    ------
    ValueError
        For ``real=True``, for ``windowsize`` given without ``fftsize``, or
        when the number of bins does not match ``fftsize``.
    """
    if real:
        # The previous real path handed complex data to fftpack.irfft, which
        # only accepts real input; fail explicitly instead.
        raise ValueError("real=True needs debug")
    if fftsize is None:
        if windowsize is not None:
            raise ValueError("windowsize cannot be given without fftsize")
        fftsize = 2 * (X.shape[1] - 1) if compute_onesided else X.shape[1]
    if compute_onesided:
        cut = fftsize // 2 + 1
        if X.shape[1] != cut:
            raise ValueError("Expected %d frequency bins for fftsize=%d, got %d"
                             % (cut, fftsize, X.shape[1]))
        # Rebuild the full spectrum with exactly ``fftsize`` bins. The bins
        # above Nyquist mirror the conjugate of bins 1 .. fftsize - cut, which
        # is what makes the inverse transform real.
        X_full = np.zeros((X.shape[0], fftsize), dtype=np.complex128)
        X_full[:, :cut] = X
        X_full[:, cut:] = np.conj(X[:, 1:fftsize - cut + 1])[:, ::-1]
        X = X_full
    X = np.real(fftpack.ifft(X)).astype("float64")
    if step == "half":
        # NOTE(review): invert_halfoverlap is not defined in the visible part
        # of this file; confirm it exists before relying on step="half".
        X = invert_halfoverlap(X)
    else:
        X = overlap_add(X, step)
    if mean_normalize:
        X -= np.mean(X)
    return X
def phase_vocoder(wav_data, rate, fftsize=512):
    """Time-stretch a signal by ``rate`` with a phase vocoder.

    The bulk of the processing is taken from librosa. The signal is
    analysed with ``stft`` using a hop of 32 samples, the spectrogram is
    resampled along time with linear magnitude interpolation and phase
    accumulation, and the result is resynthesised with ``istft``.

    Parameters
    ----------
    wav_data : array_like
        1D input samples; converted to a float32 copy, so the caller's data
        is not modified.
    rate : float
        Stretch factor; the spectrogram is sampled every ``rate`` frames, so
        values above 1 yield fewer output frames and values below 1 yield
        more.
    fftsize : int
        FFT length used for analysis and synthesis.

    Returns
    -------
    numpy.ndarray
        1D array holding the stretched signal.

    Raises
    ------
    ValueError
        If ``rate`` is not positive.
    """
    if rate <= 0:
        raise ValueError("rate must be positive, got %r" % (rate,))
    # Analysis hop, synthesis hop and expected phase advance all share this.
    hop_length = 32
    wav_data = np.array(wav_data, dtype="float32")
    D = stft(wav_data, fftsize=fftsize, step=hop_length)
    # Work as (bins, frames), as the librosa-derived code expects.
    D = D.transpose(1, 0)
    # Fractional frame positions to sample (np.float was removed from NumPy)
    time_steps = np.arange(0, D.shape[1], rate, dtype=np.float64)
    # Expected phase advance in each bin over one hop
    phi_advance = np.linspace(0, np.pi * hop_length, D.shape[0])
    # Create an empty output array
    d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F')
    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[:, 0])
    # Pad 0 columns to simplify boundary logic
    D = np.pad(D, [(0, 0), (0, 2)], mode='constant')
    for (t, step) in enumerate(time_steps):
        columns = D[:, int(step):int(step + 2)]
        # Weighting for linear magnitude interpolation
        alpha = np.mod(step, 1.0)
        mag = ((1.0 - alpha) * np.abs(columns[:, 0])
               + alpha * np.abs(columns[:, 1]))
        # Store to output array
        d_stretch[:, t] = mag * np.exp(1.j * phase_acc)
        # Compute phase advance
        dphase = (np.angle(columns[:, 1])
                  - np.angle(columns[:, 0])
                  - phi_advance)
        # Wrap to -pi:pi range
        dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))
        # Accumulate phase
        phase_acc += phi_advance + dphase
    d_stretch = d_stretch.transpose(1, 0)
    reconstructed_wav = istft(d_stretch, fftsize=fftsize, step=hop_length)
    return reconstructed_wav
if __name__ == "__main__":
    # Warp the timing of a synthesized utterance so every word and every
    # inter-word gap lasts as long as in the original recording, using the
    # word alignments stored in the two JSON files.
    file_to_modify = "synth_LJ001-0061.wav"
    file_orig = "LJ001-0061.wav"
    fs, wav_data = wavfile.read(file_to_modify)
    orig_fs, orig_wav_data = wavfile.read(file_orig)

    file_to_modify_info = "synth_LJ001-0061.json"
    with open(file_to_modify_info, "r") as f:
        modify_info = json.load(f)
    target_alignment_info = "LJ001-0061.json"
    with open(target_alignment_info, "r") as f:
        target_info = json.load(f)

    modify_words = modify_info["words"]
    target_words = target_info["words"]
    s = [w["word"] for w in modify_words]
    t = [w["word"] for w in target_words]
    fftsize = 512
    # Comparing the lists checks length and content in one step, without the
    # IndexError an element-wise loop hits when the lists differ in length.
    if s != t:
        raise ValueError("Word sequences of the two alignments do not match")

    warped_chunks = []
    if target_words:
        # Keep the original recording's lead-in before the first word. The
        # slice index must be an int and must use the original's sample rate.
        lead_end = int(target_words[0]["start"] * orig_fs)
        if lead_end != 0:
            warped_chunks.append(orig_wav_data[0:lead_end])

    last_index = len(modify_words) - 1
    for n, (modify_word, target_word) in enumerate(zip(modify_words, target_words)):
        # for now, do whole word boundaries directly; per-phone duration
        # ratios would be a finer-grained alternative
        s_ = int(modify_word["start"] * fs)
        e_ = int(modify_word["end"] * fs)
        modify_chunk = wav_data[s_:e_]
        m_dur = modify_word["end"] - modify_word["start"]
        t_dur = target_word["end"] - target_word["start"]
        # A zero-length word on either side gives no usable stretch ratio.
        if m_dur > 0 and t_dur > 0:
            rate = m_dur / float(t_dur)
            stretch_chunk = phase_vocoder(modify_chunk, rate, fftsize=fftsize)
            warped_chunks.append(stretch_chunk)

        if n == last_index:
            continue
        # Silence or transition between this word and the next one.
        s_ = int(modify_words[n]["end"] * fs)
        e_ = int(modify_words[n + 1]["start"] * fs)
        gap_chunk = wav_data[s_:e_]
        m_dur = modify_words[n + 1]["start"] - modify_words[n]["end"]
        t_dur = target_words[n + 1]["start"] - target_words[n]["end"]
        if m_dur <= 0 or t_dur <= 0:
            continue
        if len(gap_chunk) < fftsize:
            # Gaps shorter than one FFT frame are copied through unstretched.
            warped_chunks.append(gap_chunk)
            continue
        rate = m_dur / float(t_dur)
        stretch_gap_chunk = phase_vocoder(gap_chunk, rate, fftsize=fftsize)
        warped_chunks.append(stretch_gap_chunk)

    warped_wav = np.concatenate(warped_chunks)
    wavfile.write("output.wav", fs, soundsc(warped_wav))
{
"transcript": "the Roman letter was used side by side with the Gothic.",
"words": [
{
"case": "success",
"end": 0.28,
"endOffset": 3,
"phones": [
{
"duration": 0.06,
"phone": "dh_B"
},
{
"duration": 0.06,
"phone": "ah_E"
}
],
"alignedWord": "the",
"start": 0.16,
"startOffset": 0,
"word": "the"
},
{
"case": "success",
"end": 0.69,
"endOffset": 9,
"phones": [
{
"duration": 0.06,
"phone": "r_B"
},
{
"duration": 0.14,
"phone": "ow_I"
},
{
"duration": 0.04,
"phone": "m_I"
},
{
"duration": 0.07,
"phone": "ah_I"
},
{
"duration": 0.1,
"phone": "n_E"
}
],
"alignedWord": "roman",
"start": 0.28,
"startOffset": 4,
"word": "Roman"
},
{
"case": "success",
"end": 1.0699999999999998,
"endOffset": 16,
"phones": [
{
"duration": 0.07,
"phone": "l_B"
},
{
"duration": 0.11,
"phone": "eh_I"
},
{
"duration": 0.11,
"phone": "t_I"
},
{
"duration": 0.09,
"phone": "er_E"
}
],
"alignedWord": "letter",
"start": 0.69,
"startOffset": 10,
"word": "letter"
},
{
"case": "success",
"end": 1.28,
"endOffset": 20,
"phones": [
{
"duration": 0.06,
"phone": "w_B"
},
{
"duration": 0.07,
"phone": "ah_I"
},
{
"duration": 0.08,
"phone": "z_E"
}
],
"alignedWord": "was",
"start": 1.07,
"startOffset": 17,
"word": "was"
},
{
"case": "success",
"end": 1.57,
"endOffset": 25,
"phones": [
{
"duration": 0.08,
"phone": "y_B"
},
{
"duration": 0.1,
"phone": "uw_I"
},
{
"duration": 0.07,
"phone": "z_I"
},
{
"duration": 0.04,
"phone": "d_E"
}
],
"alignedWord": "used",
"start": 1.28,
"startOffset": 21,
"word": "used"
},
{
"case": "success",
"end": 1.9000000000000001,
"endOffset": 30,
"phones": [
{
"duration": 0.15,
"phone": "s_B"
},
{
"duration": 0.12,
"phone": "ay_I"
},
{
"duration": 0.06,
"phone": "d_E"
}
],
"alignedWord": "side",
"start": 1.57,
"startOffset": 26,
"word": "side"
},
{
"case": "success",
"end": 2.12,
"endOffset": 33,
"phones": [
{
"duration": 0.09,
"phone": "b_B"
},
{
"duration": 0.12,
"phone": "ay_E"
}
],
"alignedWord": "by",
"start": 1.91,
"startOffset": 31,
"word": "by"
},
{
"case": "success",
"end": 2.4699999999999998,
"endOffset": 38,
"phones": [
{
"duration": 0.13,
"phone": "s_B"
},
{
"duration": 0.11,
"phone": "ay_I"
},
{
"duration": 0.1,
"phone": "d_E"
}
],
"alignedWord": "side",
"start": 2.13,
"startOffset": 34,
"word": "side"
},
{
"case": "success",
"end": 2.6300000000000003,
"endOffset": 43,
"phones": [
{
"duration": 0.04,
"phone": "w_B"
},
{
"duration": 0.05,
"phone": "ih_I"
},
{
"duration": 0.07,
"phone": "th_E"
}
],
"alignedWord": "with",
"start": 2.47,
"startOffset": 39,
"word": "with"
},
{
"case": "success",
"end": 2.74,
"endOffset": 47,
"phones": [
{
"duration": 0.05,
"phone": "dh_B"
},
{
"duration": 0.05,
"phone": "ah_E"
}
],
"alignedWord": "the",
"start": 2.64,
"startOffset": 44,
"word": "the"
},
{
"case": "success",
"end": 3.2800000000000002,
"endOffset": 54,
"phones": [
{
"duration": 0.07,
"phone": "g_B"
},
{
"duration": 0.11,
"phone": "aa_I"
},
{
"duration": 0.13,
"phone": "th_I"
},
{
"duration": 0.13,
"phone": "ih_I"
},
{
"duration": 0.1,
"phone": "k_E"
}
],
"alignedWord": "gothic",
"start": 2.74,
"startOffset": 48,
"word": "Gothic"
}
]
}
the Roman letter was used side by side with the Gothic.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment