Last active
March 2, 2019 03:53
-
-
Save kastnerkyle/2eec2695a48849f33b94b272355ef368 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"transcript": "the Roman letter was used side by side with the Gothic.", | |
"words": [ | |
{ | |
"case": "success", | |
"end": 0.11, | |
"endOffset": 3, | |
"phones": [ | |
{ | |
"duration": 0.05, | |
"phone": "dh_B" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "ah_E" | |
} | |
], | |
"alignedWord": "the", | |
"start": 0.0, | |
"startOffset": 0, | |
"word": "the" | |
}, | |
{ | |
"case": "success", | |
"end": 0.52, | |
"endOffset": 9, | |
"phones": [ | |
{ | |
"duration": 0.1, | |
"phone": "r_B" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "ow_I" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "m_I" | |
}, | |
{ | |
"duration": 0.05, | |
"phone": "ah_I" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "n_E" | |
} | |
], | |
"alignedWord": "roman", | |
"start": 0.11, | |
"startOffset": 4, | |
"word": "Roman" | |
}, | |
{ | |
"case": "success", | |
"end": 0.8300000000000001, | |
"endOffset": 16, | |
"phones": [ | |
{ | |
"duration": 0.05, | |
"phone": "l_B" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "eh_I" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "t_I" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "er_E" | |
} | |
], | |
"alignedWord": "letter", | |
"start": 0.52, | |
"startOffset": 10, | |
"word": "letter" | |
}, | |
{ | |
"case": "success", | |
"end": 1.05, | |
"endOffset": 20, | |
"phones": [ | |
{ | |
"duration": 0.08, | |
"phone": "w_B" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "ah_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "z_E" | |
} | |
], | |
"alignedWord": "was", | |
"start": 0.83, | |
"startOffset": 17, | |
"word": "was" | |
}, | |
{ | |
"case": "success", | |
"end": 1.4100000000000001, | |
"endOffset": 25, | |
"phones": [ | |
{ | |
"duration": 0.12, | |
"phone": "y_B" | |
}, | |
{ | |
"duration": 0.16, | |
"phone": "uw_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "z_I" | |
}, | |
{ | |
"duration": 0.01, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "used", | |
"start": 1.05, | |
"startOffset": 21, | |
"word": "used" | |
}, | |
{ | |
"case": "success", | |
"end": 1.8599999999999999, | |
"endOffset": 30, | |
"phones": [ | |
{ | |
"duration": 0.17, | |
"phone": "s_B" | |
}, | |
{ | |
"duration": 0.12, | |
"phone": "ay_I" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "side", | |
"start": 1.49, | |
"startOffset": 26, | |
"word": "side" | |
}, | |
{ | |
"case": "success", | |
"end": 2.0500000000000003, | |
"endOffset": 33, | |
"phones": [ | |
{ | |
"duration": 0.08, | |
"phone": "b_B" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "ay_E" | |
} | |
], | |
"alignedWord": "by", | |
"start": 1.87, | |
"startOffset": 31, | |
"word": "by" | |
}, | |
{ | |
"case": "success", | |
"end": 2.51, | |
"endOffset": 38, | |
"phones": [ | |
{ | |
"duration": 0.15, | |
"phone": "s_B" | |
}, | |
{ | |
"duration": 0.23, | |
"phone": "ay_I" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "side", | |
"start": 2.05, | |
"startOffset": 34, | |
"word": "side" | |
}, | |
{ | |
"case": "success", | |
"end": 2.73, | |
"endOffset": 43, | |
"phones": [ | |
{ | |
"duration": 0.09, | |
"phone": "w_B" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "ih_I" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "th_E" | |
} | |
], | |
"alignedWord": "with", | |
"start": 2.52, | |
"startOffset": 39, | |
"word": "with" | |
}, | |
{ | |
"case": "success", | |
"end": 2.85, | |
"endOffset": 47, | |
"phones": [ | |
{ | |
"duration": 0.06, | |
"phone": "dh_B" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "ah_E" | |
} | |
], | |
"alignedWord": "the", | |
"start": 2.73, | |
"startOffset": 44, | |
"word": "the" | |
}, | |
{ | |
"case": "success", | |
"end": 3.33, | |
"endOffset": 54, | |
"phones": [ | |
{ | |
"duration": 0.09, | |
"phone": "g_B" | |
}, | |
{ | |
"duration": 0.14, | |
"phone": "aa_I" | |
}, | |
{ | |
"duration": 0.11, | |
"phone": "th_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "ih_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "k_E" | |
} | |
], | |
"alignedWord": "gothic", | |
"start": 2.85, | |
"startOffset": 48, | |
"word": "Gothic" | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
the Roman letter was used side by side with the Gothic. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.io import wavfile | |
import numpy as np | |
import json | |
import scipy.signal as sg | |
from scipy import linalg, fftpack | |
from numpy.lib.stride_tricks import as_strided | |
def _raised_cosine_window(window_length, periodic, a, b):
    """Return the raised-cosine window ``a - b * cos(2 * pi * k / n)``.

    Parameters
    ----------
    window_length : int
        Number of samples in the returned window.
    periodic : bool
        When true and ``window_length`` is even, the cosine period is
        ``window_length`` samples; otherwise the period is
        ``window_length - 1`` samples, which gives a symmetric window.
    a, b : float
        Offset and cosine amplitude (``a = b = 0.5`` gives a Hann window).

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(window_length,)``.
    """
    if window_length == 1:
        # The denominator below would be 0 (0 / 0 -> NaN); a one-sample
        # window is simply unity.
        return np.ones(1, dtype=np.float64)
    even = 1 - window_length % 2
    # Honour the caller's flag. The previous expression
    # (`1. if True else False`) always selected the periodic form.
    periodic = 1. if periodic else 0.
    n = np.float64(window_length + periodic * even - 1)
    count = np.arange(window_length).astype(np.float64)
    cos_arg = 2 * np.pi * count / n
    return a - b * np.cos(cos_arg)
def soundsc(X, gain_scale=.9, copy=True):
    """Rescale a signal to fill the int16 range, e.g. for 16-bit PCM output.

    The input is mapped linearly so that its minimum lands on
    ``-gain_scale * 2**15`` and its maximum on ``+gain_scale * 2**15``.

    Parameters
    ----------
    X : array_like
        Input samples of any numeric dtype.
    gain_scale : float
        Fraction of full scale to use for the peaks.
    copy : bool
        Kept for backward compatibility. The input is never modified in
        place, so the flag has no effect on the result.

    Returns
    -------
    numpy.ndarray
        int16 array with the same shape as ``X``. A constant (zero-range)
        input yields all zeros; an empty input yields an empty array.
    """
    # Work in float64 so integer PCM input cannot overflow in `X - X.min()`.
    X = np.asarray(X, dtype=np.float64)
    if X.size == 0:
        return X.astype('int16')
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # A constant signal has no range to normalise; avoid 0 / 0.
        return np.zeros(X.shape, dtype='int16')
    X = (X - lowest) / span
    X = 2 * X - 1
    X = gain_scale * X
    X = X * 2 ** 15
    # +2**15 is not representable in int16; clip instead of wrapping around.
    return np.clip(X, -2 ** 15, 2 ** 15 - 1).astype('int16')
def halfoverlap(X, window_size):
    """Cut a 1D signal into frames of ``window_size`` samples with 50% overlap.

    The signal is first extended with ``window_size - len(X) % window_size``
    zeros (a whole extra window when the length is already a multiple of
    ``window_size``). The frames are returned as a strided view onto that
    padded copy, so neighbouring rows share memory.

    Raises
    ------
    ValueError
        If ``window_size`` is odd.
    """
    if window_size % 2:
        raise ValueError("Window size must be even!")
    hop = window_size // 2
    # Zero-pad the tail so the last frame is complete.
    n_pad = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((n_pad,))))
    sample_bytes = padded.itemsize
    frame_count = len(padded) // hop - 1
    return as_strided(padded,
                      shape=(frame_count, window_size),
                      strides=(sample_bytes * hop, sample_bytes))
def overlap(X, window_size, window_step, window=None, copy=True):
    """Cut a 1D array into frames of ``window_size`` taken every ``window_step``.

    The signal is extended with ``window_size - len(X) % window_size`` zeros
    and the frames are returned as a strided view onto that padded copy.
    ``window`` is accepted but not used.

    Raises
    ------
    ValueError
        If ``X`` is not a 1D array or ``window_size`` is odd.
    """
    is_1d_array = hasattr(X, "shape") and len(X.shape) == 1
    if not is_1d_array:
        raise ValueError("X must be passed as 1D np array")
    if copy:
        X = np.array(X).copy()
    if window_size % 2:
        raise ValueError("Window size must be even!")
    # Zero-pad the tail before building the strided view.
    n_pad = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((n_pad,))))
    shared = window_size - window_step
    frame_count = (padded.shape[-1] - shared) // window_step
    sample_stride = padded.strides[-1]
    frames = as_strided(padded,
                        shape=(frame_count, window_size),
                        strides=(window_step * sample_stride, sample_stride))
    return frames
def stft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True, real=False,
         window_type="hann", periodic=True, compute_onesided=True):
    """Compute the short-time Fourier transform of a 1D signal.

    Parameters
    ----------
    X : numpy.ndarray
        1D input signal. It is not modified.
    windowsize : int, optional
        Frame length in samples. Used only when ``fftsize`` is None, in which
        case the FFT length is the next power of two >= ``windowsize``.
    fftsize : int, optional
        FFT length. When given it is also used as the frame length.
    step : "half" or int
        ``"half"`` frames with 50% overlap via ``halfoverlap``; any other
        value is passed to ``overlap`` as the hop in samples.
    mean_normalize : bool
        Subtract the signal mean before framing.
    real : bool
        Not supported; raises ``ValueError``.
    window_type : str
        Only ``"hann"`` is supported.
    periodic : bool
        Only ``True`` is supported.
    compute_onesided : bool
        Keep only the first ``fftsize // 2 + 1`` bins of each frame.

    Returns
    -------
    numpy.ndarray
        Complex array with one row per frame.

    Raises
    ------
    ValueError
        For ``real=True`` or an unsupported window configuration.
    """
    if real:
        raise ValueError("real=True needs debug")
    if fftsize is None:
        assert windowsize is not None
        fftsize = int(2 ** np.ceil(np.log(windowsize) / np.log(2.0)))
    else:
        windowsize = fftsize
    if window_type != "hann" or not periodic:
        raise ValueError("No other windows currently supported")
    cut = fftsize // 2 + 1 if compute_onesided else None

    if mean_normalize:
        # Out-of-place on purpose: `X -= ...` altered the caller's array and
        # failed outright on integer input.
        X = X - X.mean()
    if step == "half":
        frames = halfoverlap(X, windowsize)
    else:
        frames = overlap(X, windowsize, step)

    # The window must match the frame length; the FFT zero-pads each frame
    # up to fftsize when windowsize is not a power of two.
    win = _raised_cosine_window(windowsize, True, 0.5, 0.5)
    frames = frames * win[None]
    return fftpack.fft(frames.astype(np.float64), n=fftsize)[:, :cut]
def overlap_add(X_strided, window_step):
    """Reassemble a signal by summing frames placed ``window_step`` apart.

    Parameters
    ----------
    X_strided : numpy.ndarray
        2D array of shape ``(n_frames, window_size)``.
    window_step : int
        Offset in samples between the starts of consecutive frames.

    Returns
    -------
    numpy.ndarray
        1D array of length ``(n_frames - 1) * window_step + window_size``
        with the dtype of ``X_strided``; empty when there are no frames.
        No window-sum normalisation is applied.
    """
    n_rows, window_size = X_strided.shape
    if n_rows == 0:
        # Nothing to add; the loop below would never define an end index.
        return np.zeros((0,), dtype=X_strided.dtype)
    # Allocate exactly the span the frames cover, so hops larger than the
    # window size also fit.
    total_length = (n_rows - 1) * window_step + window_size
    X = np.zeros((total_length,), dtype=X_strided.dtype)
    for i in range(n_rows):
        start_index = i * window_step
        X[start_index:start_index + window_size] += X_strided[i]
    return X
def istft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True,
          real=False, compute_onesided=True):
    """
    Compute ISTFT for STFT transformed X

    Parameters
    ----------
    X : numpy.ndarray
        2D array with one row of spectrum bins per frame.
    windowsize : int, optional
        Must be None when ``fftsize`` is None; otherwise unused.
    fftsize : int, optional
        FFT length used for the forward transform. When None and
        ``compute_onesided`` is true, every column of ``X`` is treated as a
        retained bin.
    step : "half" or int
        ``"half"`` calls ``invert_halfoverlap``; any other value is passed to
        ``overlap_add`` as the hop in samples.
    mean_normalize : bool
        Subtract the mean of the reconstructed signal.
    real : bool
        Use ``fftpack.irfft`` on ``X`` padded with one extra zero column.
    compute_onesided : bool
        ``X`` holds only the leading bins of each frame; pad with zeros to
        twice the column count before inverting.

    Returns
    -------
    numpy.ndarray
        1D float64 signal.
    """
    if real:
        local_ifft = fftpack.irfft
        X_pad = np.zeros((X.shape[0], X.shape[1] + 1)) + 0j
        X_pad[:, :-1] = X
        X = X_pad
    else:
        local_ifft = fftpack.ifft
    if fftsize is None:
        assert windowsize is None
    if compute_onesided:
        # With fftsize unset the old code evaluated `None // 2` and raised
        # TypeError; fall back to the number of bins actually supplied.
        n_bins = X.shape[1] if fftsize is None else fftsize // 2 + 1
        # NOTE(review): the padded width is 2 * n_bins (fftsize + 2 for an
        # even fftsize), not fftsize, and the upper half is left at zero
        # rather than conjugate-mirrored -- confirm this is intended.
        X_pad = np.zeros((X.shape[0], 2 * X.shape[1])) + 0j
        X_pad[:, :n_bins] = X
        X = X_pad
    # Only the real part of the inverse transform is kept.
    X = np.real(local_ifft(X)).astype("float64")
    if step == "half":
        # NOTE(review): invert_halfoverlap is not defined in the part of the
        # file visible here -- confirm it exists before relying on this path.
        X = invert_halfoverlap(X)
    else:
        X = overlap_add(X, step)
    if mean_normalize:
        X -= np.mean(X)
    return X
def phase_vocoder(wav_data, rate, fftsize=512):
    """Time-stretch a 1D signal with a phase vocoder.

    Bulk of the processing taken from librosa.

    Parameters
    ----------
    wav_data : numpy.ndarray
        1D input samples. A float32 copy is processed; the input is untouched.
    rate : float
        Step, in input STFT frames, between consecutive output frames. Values
        above 1 therefore give fewer output frames than input frames, values
        below 1 give more. Must be positive.
    fftsize : int
        Frame/FFT length for the forward and inverse transforms.

    Returns
    -------
    numpy.ndarray
        The resynthesised signal as returned by ``istft``.

    Raises
    ------
    ValueError
        If ``rate`` is not positive.
    """
    if rate <= 0:
        raise ValueError("rate must be positive")
    wav_data = wav_data.copy().astype("float32")
    # fftsize is passed positionally, i.e. as stft's `windowsize`.
    D = stft(wav_data, fftsize, step=32)
    D = D.transpose(1, 0)
    n_fft = 2 * (D.shape[0] - 1)
    hop_length = int(n_fft // 4)
    # Expected time advance in each bin
    # (np.float was removed in NumPy 1.24; np.float64 is the same dtype.)
    time_steps = np.arange(0, D.shape[1], rate, dtype=np.float64)
    # Expected phase advance in each bin
    phi_advance = np.linspace(0, np.pi * hop_length, D.shape[0])
    # Create an empty output array
    d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F')
    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[:, 0])
    # Pad 0 columns to simplify boundary logic
    D = np.pad(D, [(0, 0), (0, 2)], mode='constant')
    for (t, step) in enumerate(time_steps):
        columns = D[:, int(step):int(step + 2)]
        # Weighting for linear magnitude interpolation
        alpha = np.mod(step, 1.0)
        mag = ((1.0 - alpha) * np.abs(columns[:, 0])
               + alpha * np.abs(columns[:, 1]))
        # Store to output array
        d_stretch[:, t] = mag * np.exp(1.j * phase_acc)
        # Compute phase advance
        dphase = (np.angle(columns[:, 1])
                  - np.angle(columns[:, 0])
                  - phi_advance)
        # Wrap to -pi:pi range
        dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))
        # Accumulate phase
        phase_acc += phi_advance + dphase
    d_stretch = d_stretch.transpose(1, 0)
    reconstructed_wav = istft(d_stretch, fftsize=fftsize, step=32)
    return reconstructed_wav
if __name__ == "__main__":
    # Warp the word timing of one recording ("modify") onto the word timing
    # of another ("target") using per-word alignment JSON files, then write
    # the result to output.wav.
    file_to_modify = "synth_LJ001-0061.wav"
    file_orig = "LJ001-0061.wav"
    fs, wav_data = wavfile.read(file_to_modify)
    orig_fs, orig_wav_data = wavfile.read(file_orig)

    file_to_modify_info = "synth_LJ001-0061.json"
    with open(file_to_modify_info, "r") as f:
        modify_info = json.load(f)
    target_alignment_info = "LJ001-0061.json"
    with open(target_alignment_info, "r") as f:
        target_info = json.load(f)

    modify_words = modify_info["words"]
    target_words = target_info["words"]
    s = [w["word"] for w in modify_words]
    t = [w["word"] for w in target_words]
    fftsize = 512
    # Whole-list comparison checks the lengths too, so a shorter target list
    # fails the assertion instead of raising IndexError.
    assert s == t

    warped_chunks = []
    for n, (modify_word, target_word) in enumerate(zip(modify_words, target_words)):
        # for now, do whole word boundaries directly
        if n == 0:
            # Leading audio is copied from the original recording, so its
            # length is measured with that file's sample rate. Slice bounds
            # must be integers.
            lead_end = int(target_words[0]["start"] * orig_fs)
            if lead_end != 0:
                warped_chunks.append(orig_wav_data[0:lead_end])

        s_ = int(modify_word["start"] * fs)
        e_ = int(modify_word["end"] * fs)
        modify_chunk = wav_data[s_:e_]
        # Stretch rate is the ratio of the two word durations. (Per-phone
        # ratios from the "phones" lists would be a finer-grained option.)
        m_dur = modify_word["end"] - modify_word["start"]
        t_dur = target_word["end"] - target_word["start"]
        rate = m_dur / float(t_dur)
        stretch_chunk = phase_vocoder(modify_chunk, rate, fftsize=fftsize)
        warped_chunks.append(stretch_chunk)

        if n == len(modify_words) - 1:
            # No gap follows the last word.
            continue

        # Gap between this word and the next one.
        next_modify = modify_words[n + 1]
        next_target = target_words[n + 1]
        s_ = int(modify_word["end"] * fs)
        e_ = int(next_modify["start"] * fs)
        gap_chunk = wav_data[s_:e_]
        m_dur = next_modify["start"] - modify_word["end"]
        t_dur = next_target["start"] - target_word["end"]
        # `<= 0` rather than `== 0`: float rounding in the alignment times can
        # make an empty gap come out slightly negative.
        if m_dur <= 0 or t_dur <= 0:
            continue
        if len(gap_chunk) < fftsize:
            # Shorter than one analysis frame: keep the gap unmodified.
            warped_chunks.append(gap_chunk)
            continue
        rate = m_dur / float(t_dur)
        stretch_gap_chunk = phase_vocoder(gap_chunk, rate, fftsize=fftsize)
        warped_chunks.append(stretch_gap_chunk)

    warped_wav = np.concatenate(warped_chunks)
    wavfile.write("output.wav", fs, soundsc(warped_wav))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"transcript": "the Roman letter was used side by side with the Gothic.", | |
"words": [ | |
{ | |
"case": "success", | |
"end": 0.28, | |
"endOffset": 3, | |
"phones": [ | |
{ | |
"duration": 0.06, | |
"phone": "dh_B" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "ah_E" | |
} | |
], | |
"alignedWord": "the", | |
"start": 0.16, | |
"startOffset": 0, | |
"word": "the" | |
}, | |
{ | |
"case": "success", | |
"end": 0.69, | |
"endOffset": 9, | |
"phones": [ | |
{ | |
"duration": 0.06, | |
"phone": "r_B" | |
}, | |
{ | |
"duration": 0.14, | |
"phone": "ow_I" | |
}, | |
{ | |
"duration": 0.04, | |
"phone": "m_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "ah_I" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "n_E" | |
} | |
], | |
"alignedWord": "roman", | |
"start": 0.28, | |
"startOffset": 4, | |
"word": "Roman" | |
}, | |
{ | |
"case": "success", | |
"end": 1.0699999999999998, | |
"endOffset": 16, | |
"phones": [ | |
{ | |
"duration": 0.07, | |
"phone": "l_B" | |
}, | |
{ | |
"duration": 0.11, | |
"phone": "eh_I" | |
}, | |
{ | |
"duration": 0.11, | |
"phone": "t_I" | |
}, | |
{ | |
"duration": 0.09, | |
"phone": "er_E" | |
} | |
], | |
"alignedWord": "letter", | |
"start": 0.69, | |
"startOffset": 10, | |
"word": "letter" | |
}, | |
{ | |
"case": "success", | |
"end": 1.28, | |
"endOffset": 20, | |
"phones": [ | |
{ | |
"duration": 0.06, | |
"phone": "w_B" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "ah_I" | |
}, | |
{ | |
"duration": 0.08, | |
"phone": "z_E" | |
} | |
], | |
"alignedWord": "was", | |
"start": 1.07, | |
"startOffset": 17, | |
"word": "was" | |
}, | |
{ | |
"case": "success", | |
"end": 1.57, | |
"endOffset": 25, | |
"phones": [ | |
{ | |
"duration": 0.08, | |
"phone": "y_B" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "uw_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "z_I" | |
}, | |
{ | |
"duration": 0.04, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "used", | |
"start": 1.28, | |
"startOffset": 21, | |
"word": "used" | |
}, | |
{ | |
"case": "success", | |
"end": 1.9000000000000001, | |
"endOffset": 30, | |
"phones": [ | |
{ | |
"duration": 0.15, | |
"phone": "s_B" | |
}, | |
{ | |
"duration": 0.12, | |
"phone": "ay_I" | |
}, | |
{ | |
"duration": 0.06, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "side", | |
"start": 1.57, | |
"startOffset": 26, | |
"word": "side" | |
}, | |
{ | |
"case": "success", | |
"end": 2.12, | |
"endOffset": 33, | |
"phones": [ | |
{ | |
"duration": 0.09, | |
"phone": "b_B" | |
}, | |
{ | |
"duration": 0.12, | |
"phone": "ay_E" | |
} | |
], | |
"alignedWord": "by", | |
"start": 1.91, | |
"startOffset": 31, | |
"word": "by" | |
}, | |
{ | |
"case": "success", | |
"end": 2.4699999999999998, | |
"endOffset": 38, | |
"phones": [ | |
{ | |
"duration": 0.13, | |
"phone": "s_B" | |
}, | |
{ | |
"duration": 0.11, | |
"phone": "ay_I" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "d_E" | |
} | |
], | |
"alignedWord": "side", | |
"start": 2.13, | |
"startOffset": 34, | |
"word": "side" | |
}, | |
{ | |
"case": "success", | |
"end": 2.6300000000000003, | |
"endOffset": 43, | |
"phones": [ | |
{ | |
"duration": 0.04, | |
"phone": "w_B" | |
}, | |
{ | |
"duration": 0.05, | |
"phone": "ih_I" | |
}, | |
{ | |
"duration": 0.07, | |
"phone": "th_E" | |
} | |
], | |
"alignedWord": "with", | |
"start": 2.47, | |
"startOffset": 39, | |
"word": "with" | |
}, | |
{ | |
"case": "success", | |
"end": 2.74, | |
"endOffset": 47, | |
"phones": [ | |
{ | |
"duration": 0.05, | |
"phone": "dh_B" | |
}, | |
{ | |
"duration": 0.05, | |
"phone": "ah_E" | |
} | |
], | |
"alignedWord": "the", | |
"start": 2.64, | |
"startOffset": 44, | |
"word": "the" | |
}, | |
{ | |
"case": "success", | |
"end": 3.2800000000000002, | |
"endOffset": 54, | |
"phones": [ | |
{ | |
"duration": 0.07, | |
"phone": "g_B" | |
}, | |
{ | |
"duration": 0.11, | |
"phone": "aa_I" | |
}, | |
{ | |
"duration": 0.13, | |
"phone": "th_I" | |
}, | |
{ | |
"duration": 0.13, | |
"phone": "ih_I" | |
}, | |
{ | |
"duration": 0.1, | |
"phone": "k_E" | |
} | |
], | |
"alignedWord": "gothic", | |
"start": 2.74, | |
"startOffset": 48, | |
"word": "Gothic" | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
the Roman letter was used side by side with the Gothic. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment