kastnerkyle/LJ001-0061.json

## LJ001-0061.json
{
    "transcript": "the Roman letter was used side by side with the Gothic.",
    "words": [
        {
            "case": "success",
            "end": 0.11,
            "endOffset": 3,
            "phones": [
                {
                    "duration": 0.05,
                    "phone": "dh_B"
                },
                {
                    "duration": 0.06,
                    "phone": "ah_E"
                }
            ],
            "alignedWord": "the",
            "start": 0.0,
            "startOffset": 0,
            "word": "the"
        },
        {
            "case": "success",
            "end": 0.52,
            "endOffset": 9,
            "phones": [
                {
                    "duration": 0.1,
                    "phone": "r_B"
                },
                {
                    "duration": 0.08,
                    "phone": "ow_I"
                },
                {
                    "duration": 0.08,
                    "phone": "m_I"
                },
                {
                    "duration": 0.05,
                    "phone": "ah_I"
                },
                {
                    "duration": 0.1,
                    "phone": "n_E"
                }
            ],
            "alignedWord": "roman",
            "start": 0.11,
            "startOffset": 4,
            "word": "Roman"
        },
        {
            "case": "success",
            "end": 0.8300000000000001,
            "endOffset": 16,
            "phones": [
                {
                    "duration": 0.05,
                    "phone": "l_B"
                },
                {
                    "duration": 0.08,
                    "phone": "eh_I"
                },
                {
                    "duration": 0.08,
                    "phone": "t_I"
                },
                {
                    "duration": 0.1,
                    "phone": "er_E"
                }
            ],
            "alignedWord": "letter",
            "start": 0.52,
            "startOffset": 10,
            "word": "letter"
        },
        {
            "case": "success",
            "end": 1.05,
            "endOffset": 20,
            "phones": [
                {
                    "duration": 0.08,
                    "phone": "w_B"
                },
                {
                    "duration": 0.07,
                    "phone": "ah_I"
                },
                {
                    "duration": 0.07,
                    "phone": "z_E"
                }
            ],
            "alignedWord": "was",
            "start": 0.83,
            "startOffset": 17,
            "word": "was"
        },
        {
            "case": "success",
            "end": 1.4100000000000001,
            "endOffset": 25,
            "phones": [
                {
                    "duration": 0.12,
                    "phone": "y_B"
                },
                {
                    "duration": 0.16,
                    "phone": "uw_I"
                },
                {
                    "duration": 0.07,
                    "phone": "z_I"
                },
                {
                    "duration": 0.01,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "used",
            "start": 1.05,
            "startOffset": 21,
            "word": "used"
        },
        {
            "case": "success",
            "end": 1.8599999999999999,
            "endOffset": 30,
            "phones": [
                {
                    "duration": 0.17,
                    "phone": "s_B"
                },
                {
                    "duration": 0.12,
                    "phone": "ay_I"
                },
                {
                    "duration": 0.08,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "side",
            "start": 1.49,
            "startOffset": 26,
            "word": "side"
        },
        {
            "case": "success",
            "end": 2.0500000000000003,
            "endOffset": 33,
            "phones": [
                {
                    "duration": 0.08,
                    "phone": "b_B"
                },
                {
                    "duration": 0.1,
                    "phone": "ay_E"
                }
            ],
            "alignedWord": "by",
            "start": 1.87,
            "startOffset": 31,
            "word": "by"
        },
        {
            "case": "success",
            "end": 2.51,
            "endOffset": 38,
            "phones": [
                {
                    "duration": 0.15,
                    "phone": "s_B"
                },
                {
                    "duration": 0.23,
                    "phone": "ay_I"
                },
                {
                    "duration": 0.08,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "side",
            "start": 2.05,
            "startOffset": 34,
            "word": "side"
        },
        {
            "case": "success",
            "end": 2.73,
            "endOffset": 43,
            "phones": [
                {
                    "duration": 0.09,
                    "phone": "w_B"
                },
                {
                    "duration": 0.06,
                    "phone": "ih_I"
                },
                {
                    "duration": 0.06,
                    "phone": "th_E"
                }
            ],
            "alignedWord": "with",
            "start": 2.52,
            "startOffset": 39,
            "word": "with"
        },
        {
            "case": "success",
            "end": 2.85,
            "endOffset": 47,
            "phones": [
                {
                    "duration": 0.06,
                    "phone": "dh_B"
                },
                {
                    "duration": 0.06,
                    "phone": "ah_E"
                }
            ],
            "alignedWord": "the",
            "start": 2.73,
            "startOffset": 44,
            "word": "the"
        },
        {
            "case": "success",
            "end": 3.33,
            "endOffset": 54,
            "phones": [
                {
                    "duration": 0.09,
                    "phone": "g_B"
                },
                {
                    "duration": 0.14,
                    "phone": "aa_I"
                },
                {
                    "duration": 0.11,
                    "phone": "th_I"
                },
                {
                    "duration": 0.07,
                    "phone": "ih_I"
                },
                {
                    "duration": 0.07,
                    "phone": "k_E"
                }
            ],
            "alignedWord": "gothic",
            "start": 2.85,
            "startOffset": 48,
            "word": "Gothic"
        }
    ]
}

## LJ001-0061.txt
the Roman letter was used side by side with the Gothic.

## LJ001-0061.wav

      
    Raw
  

              LJ001-0061.wav
            
          
            View raw
        
    
## phase_vocoder_matching.py
from scipy.io import wavfile
import numpy as np
import json
import scipy.signal as sg
from scipy import linalg, fftpack
from numpy.lib.stride_tricks import as_strided

def _raised_cosine_window(window_length, periodic, a, b):
    even = 1 - window_length % 2
    periodic = 1. if True else False
    n = np.float64(window_length + periodic * even - 1)
    count = np.arange(window_length).astype(np.float64)
    cos_arg = 2 * np.pi * count / n
    return a - b * np.cos(cos_arg)


def soundsc(X, gain_scale=.9, copy=True):
    """Rescale a signal to the int16 range for writing as 16-bit PCM.

    The signal is min/max normalised to ``[-1, 1]``, multiplied by
    ``gain_scale`` and by ``2 ** 15``, then cast to int16.

    Parameters
    ----------
    X : array_like
        Input samples of any numeric dtype.
    gain_scale : float
        Fraction of full scale used by the loudest sample.
    copy : bool
        Retained for backward compatibility. The input is always converted to
        a fresh float64 array and is never modified.

    Returns
    -------
    numpy.ndarray
        int16 array with the same shape as ``X``. A constant (or empty) input
        yields all zeros (or an empty array) instead of NaN-derived garbage.
    """
    # Work in float64 so integer inputs cannot overflow in ``X - X.min()``.
    X = np.array(X, dtype=np.float64)
    if X.size == 0:
        return X.astype('int16')
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # No dynamic range to normalise: emit silence rather than divide by 0.
        return np.zeros(X.shape, dtype='int16')
    X = (X - lowest) / span
    X = 2 * X - 1
    X = gain_scale * X
    X = X * 2 ** 15
    # +1.0 * 2**15 does not fit in int16; clip instead of wrapping around.
    return np.clip(X, -2 ** 15, 2 ** 15 - 1).astype('int16')


def halfoverlap(X, window_size):
    """Slice a 1D signal into frames that overlap by half a window.

    The signal is first extended with ``window_size - len(X) % window_size``
    zeros (a whole extra window when the length is already a multiple of
    ``window_size``). The result is a strided view of that padded copy with
    shape ``(n_frames, window_size)``; consecutive rows share memory.

    Raises
    ------
    ValueError
        If ``window_size`` is odd.
    """
    if window_size % 2:
        raise ValueError("Window size must be even!")
    hop = window_size // 2
    n_pad = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((n_pad,))))
    item_bytes = padded.itemsize
    frame_count = len(padded) // hop - 1
    return as_strided(padded,
                      shape=(frame_count, window_size),
                      strides=(hop * item_bytes, item_bytes))


def overlap(X, window_size, window_step, window=None, copy=True):
    """Slice a 1D signal into frames of ``window_size`` every ``window_step``.

    The signal is extended with ``window_size - len(X) % window_size`` zeros
    before framing. The result is a strided view of the padded copy with
    shape ``(n_frames, window_size)``; overlapping rows share memory.
    ``window`` is accepted but not used.

    Raises
    ------
    ValueError
        If ``X`` is not a 1D array or ``window_size`` is odd.
    """
    is_1d_array = hasattr(X, "shape") and len(X.shape) == 1
    if not is_1d_array:
        raise ValueError("X must be passed as 1D np array")
    if copy:
        X = np.array(X).copy()
    if window_size % 2 != 0:
        raise ValueError("Window size must be even!")
    n_pad = window_size - len(X) % window_size
    padded = np.hstack((X, np.zeros((n_pad,))))
    shared = window_size - window_step
    frame_count = (padded.shape[-1] - shared) // window_step
    byte_step = padded.strides[-1]
    return as_strided(padded,
                      shape=(frame_count, window_size),
                      strides=(window_step * byte_step, byte_step))


def stft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True, real=False,
         window_type="hann", periodic=True, compute_onesided=True):
    """Compute the short-time Fourier transform of a 1D signal.

    Parameters
    ----------
    X : array_like
        1D input signal. It is not modified.
    windowsize : int, optional
        Frame length in samples. Only used when ``fftsize`` is None, in which
        case the FFT size is the next power of two >= ``windowsize`` and each
        frame is zero-padded up to it.
    fftsize : int, optional
        FFT size. When given it is also used as the frame length.
    step : "half" or int
        Hop between frames: half a window for ``"half"``, otherwise the hop in
        samples.
    mean_normalize : bool
        Subtract the signal mean before framing.
    real : bool
        Not supported; raises ``ValueError`` when true.
    window_type, periodic
        Only the periodic Hann window (``"hann"``, ``True``) is supported.
    compute_onesided : bool
        Keep only the first ``fftsize // 2 + 1`` bins of every frame.

    Returns
    -------
    numpy.ndarray
        Complex array of shape ``(n_frames, n_bins)``.

    Raises
    ------
    ValueError
        For unsupported options, or when neither ``windowsize`` nor
        ``fftsize`` is given.
    """
    if real:
        raise ValueError("real=True needs debug")
    if window_type != "hann" or not periodic:
        raise ValueError("No other windows currently supported")

    if fftsize is None:
        if windowsize is None:
            raise ValueError("Either windowsize or fftsize must be given")
        fftsize = int(2 ** np.ceil(np.log(windowsize) / np.log(2.0)))
    else:
        windowsize = fftsize

    cut = fftsize // 2 + 1 if compute_onesided else None

    X = np.asarray(X)
    if mean_normalize:
        # Out-of-place on purpose: ``X -= mean`` modified the caller's array
        # and raised a casting error for integer input.
        X = X - X.mean()

    if step == "half":
        frames = halfoverlap(X, windowsize)
    else:
        frames = overlap(X, windowsize, step)

    # The window must match the frame length (not the FFT size); the FFT
    # itself zero-pads each frame up to ``fftsize``.
    win = _raised_cosine_window(windowsize, True, 0.5, 0.5)
    frames = frames * win[None]
    return fftpack.fft(frames.astype(np.float64), n=fftsize)[:, :cut]


def overlap_add(X_strided, window_step):
    """Sum overlapping frames back into a 1D signal.

    Frame ``i`` is added at sample offset ``i * window_step``. No window
    compensation is applied.

    Parameters
    ----------
    X_strided : numpy.ndarray
        Array of shape ``(n_frames, window_size)``.
    window_step : int
        Hop between consecutive frames in samples.

    Returns
    -------
    numpy.ndarray
        1D array of length ``(n_frames - 1) * window_step + window_size`` with
        the dtype of ``X_strided``; empty when there are no frames.
    """
    n_rows, window_size = X_strided.shape
    if n_rows == 0:
        return np.zeros((0,), dtype=X_strided.dtype)

    # Allocate exactly the span the frames cover, so a hop larger than the
    # window can no longer run past the end of the buffer.
    total_length = (n_rows - 1) * window_step + window_size
    X = np.zeros((total_length,), dtype=X_strided.dtype)
    for i in range(n_rows):
        start_index = i * window_step
        X[start_index:start_index + window_size] += X_strided[i]
    return X


def istft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True,
          real=False, compute_onesided=True):
    """Invert an STFT by inverse-transforming each frame and overlap-adding.

    Parameters
    ----------
    X : array_like
        Complex array of shape ``(n_frames, n_bins)``.
    windowsize : int, optional
        Currently unused; it may only be given together with ``fftsize``.
    fftsize : int, optional
        FFT size used for the forward transform. When None it is inferred
        from the number of bins.
    step : "half" or int
        Hop between frames: half a frame for ``"half"``, otherwise the hop in
        samples.
    mean_normalize : bool
        Subtract the mean of the reconstructed signal.
    real : bool
        Not supported; raises ``ValueError`` when true.
    compute_onesided : bool
        True when ``X`` holds only the ``fftsize // 2 + 1`` non-negative
        frequency bins of a real signal.

    Returns
    -------
    numpy.ndarray
        Reconstructed float64 signal (no synthesis-window compensation).

    Raises
    ------
    ValueError
        For unsupported options or a bin count inconsistent with ``fftsize``.
    """
    if real:
        # The packed-real path never worked; fail the same way stft() does.
        raise ValueError("real=True needs debug")

    X = np.asarray(X)
    n_bins = X.shape[1]
    if fftsize is None:
        if windowsize is not None:
            raise ValueError("windowsize cannot be given without fftsize")
        fftsize = 2 * (n_bins - 1) if compute_onesided else n_bins

    if compute_onesided:
        if n_bins != fftsize // 2 + 1:
            raise ValueError(
                "Expected %d one-sided bins for fftsize=%d, got %d"
                % (fftsize // 2 + 1, fftsize, n_bins))
        # irfft rebuilds the conjugate-symmetric half, giving real frames of
        # exactly ``fftsize`` samples at full amplitude.
        frames = np.fft.irfft(X, n=fftsize, axis=-1)
    else:
        frames = np.real(fftpack.ifft(X))

    # Half-overlap framing is plain overlap-add with a hop of half a frame.
    hop = frames.shape[1] // 2 if step == "half" else step
    X = overlap_add(frames, hop)
    if mean_normalize and X.size:
        X -= np.mean(X)
    return X

def phase_vocoder(wav_data, rate, fftsize=512, hop_length=32):
    """Time-stretch a signal by ``rate`` without changing its pitch.

    The bulk of the processing is taken from librosa's phase vocoder: STFT
    frames are resampled along time with linear magnitude interpolation while
    the phase is accumulated from the measured frame-to-frame phase advance.

    Parameters
    ----------
    wav_data : array_like
        1D input signal. It is copied and converted to float32.
    rate : float
        Speed factor; must be positive. ``rate > 1`` shortens the signal and
        ``rate < 1`` lengthens it.
    fftsize : int
        FFT size / frame length used for analysis and synthesis.
    hop_length : int
        Hop in samples between STFT frames, used consistently for analysis,
        expected phase advance and synthesis.

    Returns
    -------
    numpy.ndarray
        The time-stretched signal as returned by ``istft``.

    Raises
    ------
    ValueError
        If ``rate`` is not positive.
    """
    if rate <= 0:
        raise ValueError("rate must be positive, got %r" % (rate,))

    wav_data = np.array(wav_data, dtype="float32")
    # Rows are frequency bins, columns are frames.
    D = stft(wav_data, fftsize=fftsize, step=hop_length).transpose(1, 0)
    n_bins, n_frames = D.shape

    # Fractional input-frame position of every output frame.
    # (np.float was removed from NumPy; use the explicit 64-bit type.)
    time_steps = np.arange(0, n_frames, rate, dtype=np.float64)

    # Expected phase advance per hop in each bin. It only steers the phase
    # unwrapping below, so it has to describe the hop the STFT really used.
    phi_advance = np.linspace(0, np.pi * hop_length, n_bins)

    # Create an empty output array
    d_stretch = np.zeros((n_bins, len(time_steps)), D.dtype, order='F')

    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[:, 0])

    # Pad 0 columns to simplify boundary logic
    D = np.pad(D, [(0, 0), (0, 2)], mode='constant')

    for t, step in enumerate(time_steps):
        first = int(step)
        columns = D[:, first:first + 2]

        # Weighting for linear magnitude interpolation
        alpha = np.mod(step, 1.0)
        mag = ((1.0 - alpha) * np.abs(columns[:, 0])
               + alpha * np.abs(columns[:, 1]))

        # Store to output array
        d_stretch[:, t] = mag * np.exp(1.j * phase_acc)

        # Deviation of the measured phase advance from the expected one
        dphase = (np.angle(columns[:, 1])
                  - np.angle(columns[:, 0])
                  - phi_advance)

        # Wrap to -pi:pi range
        dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))

        # Accumulate phase
        phase_acc += phi_advance + dphase

    d_stretch = d_stretch.transpose(1, 0)
    return istft(d_stretch, fftsize=fftsize, step=hop_length)

if __name__ == "__main__":
    # Warp the timing of a synthesized utterance so that every word, and every
    # pause between words, lasts as long as in the reference recording. Both
    # utterances come with a forced-alignment JSON giving word start/end
    # times in seconds.
    file_to_modify = "synth_LJ001-0061.wav"
    file_orig = "LJ001-0061.wav"
    fs, wav_data = wavfile.read(file_to_modify)

    orig_fs, orig_wav_data = wavfile.read(file_orig)

    file_to_modify_info = "synth_LJ001-0061.json"
    with open(file_to_modify_info, "r") as f:
        modify_info = json.load(f)
    target_alignment_info = "LJ001-0061.json"
    with open(target_alignment_info, "r") as f:
        target_info = json.load(f)

    modify_words = modify_info["words"]
    target_words = target_info["words"]
    s = [w["word"] for w in modify_words]
    t = [w["word"] for w in target_words]
    # Comparing the lists checks length and content at once (the previous
    # element-wise check could raise IndexError before comparing lengths).
    if s != t:
        raise ValueError("Both alignments must contain the same word sequence")

    fftsize = 512
    # Durations are differences of float timestamps, so "no gap" can show up
    # as +/-1e-16 instead of exactly 0.
    min_dur = 1e-6
    warped_chunks = []

    # Leading audio before the first word is copied from the original file.
    # NOTE(review): this indexes the original recording with the synthesized
    # file's sample rate, i.e. it assumes fs == orig_fs - confirm.
    if target_words:
        lead_end = int(target_words[0]["start"] * fs)
        if lead_end > 0:
            warped_chunks.append(orig_wav_data[0:lead_end])

    last_index = len(modify_words) - 1
    for n, (modify_word, target_word) in enumerate(zip(modify_words, target_words)):
        # for now, do whole word boundaries directly
        # TODO: stretch per phone using the ratios of the "phones" durations.
        s_ = int(modify_word["start"] * fs)
        e_ = int(modify_word["end"] * fs)
        modify_chunk = wav_data[s_:e_]

        m_dur = modify_word["end"] - modify_word["start"]
        t_dur = target_word["end"] - target_word["start"]
        if m_dur < min_dur or t_dur < min_dur:
            # Degenerate alignment: keep the audio as is rather than divide
            # by zero.
            warped_chunks.append(modify_chunk)
        else:
            rate = m_dur / float(t_dur)
            warped_chunks.append(phase_vocoder(modify_chunk, rate, fftsize=fftsize))

        if n == last_index:
            continue

        # Pause between this word and the next one.
        next_modify = modify_words[n + 1]
        next_target = target_words[n + 1]
        s_ = int(modify_word["end"] * fs)
        e_ = int(next_modify["start"] * fs)
        gap_chunk = wav_data[s_:e_]
        m_dur = next_modify["start"] - modify_word["end"]
        t_dur = next_target["start"] - target_word["end"]
        if m_dur < min_dur or t_dur < min_dur:
            continue
        if len(gap_chunk) < fftsize:
            # Too short for a single analysis frame; copy it unchanged.
            warped_chunks.append(gap_chunk)
            continue
        rate = m_dur / float(t_dur)
        stretch_gap_chunk = phase_vocoder(gap_chunk, rate, fftsize=fftsize)
        warped_chunks.append(stretch_gap_chunk)

    warped_wav = np.concatenate(warped_chunks)
    wavfile.write("output.wav", fs, soundsc(warped_wav))

## synth_LJ001-0061.json
{
    "transcript": "the Roman letter was used side by side with the Gothic.",
    "words": [
        {
            "case": "success",
            "end": 0.28,
            "endOffset": 3,
            "phones": [
                {
                    "duration": 0.06,
                    "phone": "dh_B"
                },
                {
                    "duration": 0.06,
                    "phone": "ah_E"
                }
            ],
            "alignedWord": "the",
            "start": 0.16,
            "startOffset": 0,
            "word": "the"
        },
        {
            "case": "success",
            "end": 0.69,
            "endOffset": 9,
            "phones": [
                {
                    "duration": 0.06,
                    "phone": "r_B"
                },
                {
                    "duration": 0.14,
                    "phone": "ow_I"
                },
                {
                    "duration": 0.04,
                    "phone": "m_I"
                },
                {
                    "duration": 0.07,
                    "phone": "ah_I"
                },
                {
                    "duration": 0.1,
                    "phone": "n_E"
                }
            ],
            "alignedWord": "roman",
            "start": 0.28,
            "startOffset": 4,
            "word": "Roman"
        },
        {
            "case": "success",
            "end": 1.0699999999999998,
            "endOffset": 16,
            "phones": [
                {
                    "duration": 0.07,
                    "phone": "l_B"
                },
                {
                    "duration": 0.11,
                    "phone": "eh_I"
                },
                {
                    "duration": 0.11,
                    "phone": "t_I"
                },
                {
                    "duration": 0.09,
                    "phone": "er_E"
                }
            ],
            "alignedWord": "letter",
            "start": 0.69,
            "startOffset": 10,
            "word": "letter"
        },
        {
            "case": "success",
            "end": 1.28,
            "endOffset": 20,
            "phones": [
                {
                    "duration": 0.06,
                    "phone": "w_B"
                },
                {
                    "duration": 0.07,
                    "phone": "ah_I"
                },
                {
                    "duration": 0.08,
                    "phone": "z_E"
                }
            ],
            "alignedWord": "was",
            "start": 1.07,
            "startOffset": 17,
            "word": "was"
        },
        {
            "case": "success",
            "end": 1.57,
            "endOffset": 25,
            "phones": [
                {
                    "duration": 0.08,
                    "phone": "y_B"
                },
                {
                    "duration": 0.1,
                    "phone": "uw_I"
                },
                {
                    "duration": 0.07,
                    "phone": "z_I"
                },
                {
                    "duration": 0.04,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "used",
            "start": 1.28,
            "startOffset": 21,
            "word": "used"
        },
        {
            "case": "success",
            "end": 1.9000000000000001,
            "endOffset": 30,
            "phones": [
                {
                    "duration": 0.15,
                    "phone": "s_B"
                },
                {
                    "duration": 0.12,
                    "phone": "ay_I"
                },
                {
                    "duration": 0.06,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "side",
            "start": 1.57,
            "startOffset": 26,
            "word": "side"
        },
        {
            "case": "success",
            "end": 2.12,
            "endOffset": 33,
            "phones": [
                {
                    "duration": 0.09,
                    "phone": "b_B"
                },
                {
                    "duration": 0.12,
                    "phone": "ay_E"
                }
            ],
            "alignedWord": "by",
            "start": 1.91,
            "startOffset": 31,
            "word": "by"
        },
        {
            "case": "success",
            "end": 2.4699999999999998,
            "endOffset": 38,
            "phones": [
                {
                    "duration": 0.13,
                    "phone": "s_B"
                },
                {
                    "duration": 0.11,
                    "phone": "ay_I"
                },
                {
                    "duration": 0.1,
                    "phone": "d_E"
                }
            ],
            "alignedWord": "side",
            "start": 2.13,
            "startOffset": 34,
            "word": "side"
        },
        {
            "case": "success",
            "end": 2.6300000000000003,
            "endOffset": 43,
            "phones": [
                {
                    "duration": 0.04,
                    "phone": "w_B"
                },
                {
                    "duration": 0.05,
                    "phone": "ih_I"
                },
                {
                    "duration": 0.07,
                    "phone": "th_E"
                }
            ],
            "alignedWord": "with",
            "start": 2.47,
            "startOffset": 39,
            "word": "with"
        },
        {
            "case": "success",
            "end": 2.74,
            "endOffset": 47,
            "phones": [
                {
                    "duration": 0.05,
                    "phone": "dh_B"
                },
                {
                    "duration": 0.05,
                    "phone": "ah_E"
                }
            ],
            "alignedWord": "the",
            "start": 2.64,
            "startOffset": 44,
            "word": "the"
        },
        {
            "case": "success",
            "end": 3.2800000000000002,
            "endOffset": 54,
            "phones": [
                {
                    "duration": 0.07,
                    "phone": "g_B"
                },
                {
                    "duration": 0.11,
                    "phone": "aa_I"
                },
                {
                    "duration": 0.13,
                    "phone": "th_I"
                },
                {
                    "duration": 0.13,
                    "phone": "ih_I"
                },
                {
                    "duration": 0.1,
                    "phone": "k_E"
                }
            ],
            "alignedWord": "gothic",
            "start": 2.74,
            "startOffset": 48,
            "word": "Gothic"
        }
    ]
}

## synth_LJ001-0061.txt
the Roman letter was used side by side with the Gothic.

## synth_LJ001-0061.wav

      
    Raw
  

              synth_LJ001-0061.wav
            
          
            View raw
	{
	"transcript": "the Roman letter was used side by side with the Gothic.",
	"words": [
	{
	"case": "success",
	"end": 0.11,
	"endOffset": 3,
	"phones": [
	{
	"duration": 0.05,
	"phone": "dh_B"
	},
	{
	"duration": 0.06,
	"phone": "ah_E"
	}
	],
	"alignedWord": "the",
	"start": 0.0,
	"startOffset": 0,
	"word": "the"
	},
	{
	"case": "success",
	"end": 0.52,
	"endOffset": 9,
	"phones": [
	{
	"duration": 0.1,
	"phone": "r_B"
	},
	{
	"duration": 0.08,
	"phone": "ow_I"
	},
	{
	"duration": 0.08,
	"phone": "m_I"
	},
	{
	"duration": 0.05,
	"phone": "ah_I"
	},
	{
	"duration": 0.1,
	"phone": "n_E"
	}
	],
	"alignedWord": "roman",
	"start": 0.11,
	"startOffset": 4,
	"word": "Roman"
	},
	{
	"case": "success",
	"end": 0.8300000000000001,
	"endOffset": 16,
	"phones": [
	{
	"duration": 0.05,
	"phone": "l_B"
	},
	{
	"duration": 0.08,
	"phone": "eh_I"
	},
	{
	"duration": 0.08,
	"phone": "t_I"
	},
	{
	"duration": 0.1,
	"phone": "er_E"
	}
	],
	"alignedWord": "letter",
	"start": 0.52,
	"startOffset": 10,
	"word": "letter"
	},
	{
	"case": "success",
	"end": 1.05,
	"endOffset": 20,
	"phones": [
	{
	"duration": 0.08,
	"phone": "w_B"
	},
	{
	"duration": 0.07,
	"phone": "ah_I"
	},
	{
	"duration": 0.07,
	"phone": "z_E"
	}
	],
	"alignedWord": "was",
	"start": 0.83,
	"startOffset": 17,
	"word": "was"
	},
	{
	"case": "success",
	"end": 1.4100000000000001,
	"endOffset": 25,
	"phones": [
	{
	"duration": 0.12,
	"phone": "y_B"
	},
	{
	"duration": 0.16,
	"phone": "uw_I"
	},
	{
	"duration": 0.07,
	"phone": "z_I"
	},
	{
	"duration": 0.01,
	"phone": "d_E"
	}
	],
	"alignedWord": "used",
	"start": 1.05,
	"startOffset": 21,
	"word": "used"
	},
	{
	"case": "success",
	"end": 1.8599999999999999,
	"endOffset": 30,
	"phones": [
	{
	"duration": 0.17,
	"phone": "s_B"
	},
	{
	"duration": 0.12,
	"phone": "ay_I"
	},
	{
	"duration": 0.08,
	"phone": "d_E"
	}
	],
	"alignedWord": "side",
	"start": 1.49,
	"startOffset": 26,
	"word": "side"
	},
	{
	"case": "success",
	"end": 2.0500000000000003,
	"endOffset": 33,
	"phones": [
	{
	"duration": 0.08,
	"phone": "b_B"
	},
	{
	"duration": 0.1,
	"phone": "ay_E"
	}
	],
	"alignedWord": "by",
	"start": 1.87,
	"startOffset": 31,
	"word": "by"
	},
	{
	"case": "success",
	"end": 2.51,
	"endOffset": 38,
	"phones": [
	{
	"duration": 0.15,
	"phone": "s_B"
	},
	{
	"duration": 0.23,
	"phone": "ay_I"
	},
	{
	"duration": 0.08,
	"phone": "d_E"
	}
	],
	"alignedWord": "side",
	"start": 2.05,
	"startOffset": 34,
	"word": "side"
	},
	{
	"case": "success",
	"end": 2.73,
	"endOffset": 43,
	"phones": [
	{
	"duration": 0.09,
	"phone": "w_B"
	},
	{
	"duration": 0.06,
	"phone": "ih_I"
	},
	{
	"duration": 0.06,
	"phone": "th_E"
	}
	],
	"alignedWord": "with",
	"start": 2.52,
	"startOffset": 39,
	"word": "with"
	},
	{
	"case": "success",
	"end": 2.85,
	"endOffset": 47,
	"phones": [
	{
	"duration": 0.06,
	"phone": "dh_B"
	},
	{
	"duration": 0.06,
	"phone": "ah_E"
	}
	],
	"alignedWord": "the",
	"start": 2.73,
	"startOffset": 44,
	"word": "the"
	},
	{
	"case": "success",
	"end": 3.33,
	"endOffset": 54,
	"phones": [
	{
	"duration": 0.09,
	"phone": "g_B"
	},
	{
	"duration": 0.14,
	"phone": "aa_I"
	},
	{
	"duration": 0.11,
	"phone": "th_I"
	},
	{
	"duration": 0.07,
	"phone": "ih_I"
	},
	{
	"duration": 0.07,
	"phone": "k_E"
	}
	],
	"alignedWord": "gothic",
	"start": 2.85,
	"startOffset": 48,
	"word": "Gothic"
	}
	]
	}
	from scipy.io import wavfile
	import numpy as np
	import json
	import scipy.signal as sg
	from scipy import linalg, fftpack
	from numpy.lib.stride_tricks import as_strided

	def _raised_cosine_window(window_length, periodic, a, b):
	even = 1 - window_length % 2
	periodic = 1. if True else False
	n = np.float64(window_length + periodic * even - 1)
	count = np.arange(window_length).astype(np.float64)
	cos_arg = 2 * np.pi * count / n
	return a - b * np.cos(cos_arg)


	def soundsc(X, gain_scale=.9, copy=True):
	X = np.array(X, copy=copy)
	X = (X - X.min()) / (X.max() - X.min())
	X = 2 * X - 1
	X = gain_scale * X
	X = X * 2 ** 15
	return X.astype('int16')


	def halfoverlap(X, window_size):
	if window_size % 2 != 0:
	raise ValueError("Window size must be even!")
	window_step = window_size // 2
	# Make sure there are an even number of windows before stridetricks
	append = np.zeros((window_size - len(X) % window_size))
	X = np.hstack((X, append))
	num_frames = len(X) // window_step - 1
	row_stride = X.itemsize * window_step
	col_stride = X.itemsize
	X_strided = as_strided(X, shape=(num_frames, window_size),
	strides=(row_stride, col_stride))
	return X_strided


	def overlap(X, window_size, window_step, window=None, copy=True):
	if not hasattr(X, "shape") or len(X.shape) != 1:
	raise ValueError("X must be passed as 1D np array")
	if copy:
	X = np.array(X)
	X = X.copy()
	if window_size % 2 != 0:
	raise ValueError("Window size must be even!")
	# Make sure there are an even number of windows before stridetricks
	# need to window in here?
	append = np.zeros((window_size - len(X) % window_size))
	X = np.hstack((X, append))
	overlap_sz = window_size - window_step
	new_shape = X.shape[:-1] + ((X.shape[-1] - overlap_sz) // window_step, window_size)
	new_strides = X.strides[:-1] + (window_step * X.strides[-1],) + X.strides[-1:]
	X_strided = as_strided(X, shape=new_shape, strides=new_strides)
	return X_strided


	def stft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True, real=False,
	window_type="hann", periodic=True, compute_onesided=True):
	if real:
	raise ValueError("real=True needs debug")
	local_fft = fftpack.rfft
	cut = None
	else:
	local_fft = fftpack.fft
	cut = None

	if fftsize == None:
	assert windowsize is not None
	enclosing_fftsize = int(2 ** np.ceil(np.log(windowsize) / np.log(2.0)))
	fftsize = enclosing_fftsize
	else:
	windowsize = fftsize

	if compute_onesided or real:
	cut = fftsize // 2 + 1

	if mean_normalize:
	X -= X.mean()

	if step == "half":
	X = halfoverlap(X, windowsize)
	else:
	X = overlap(X, windowsize, step)

	size = fftsize
	if window_type == "hann" and periodic:
	win = _raised_cosine_window(size, True, 0.5, 0.5)
	else:
	raise ValueError("No other windows currently supported")
	#win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(size) / (size - 1))
	#win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(size) / (size - 1))
	X = X * win[None]
	X = local_fft(X.astype(np.float64))[:, :cut]
	return X


	def overlap_add(X_strided, window_step):
	n_rows, window_size = X_strided.shape

	# Start with largest size (no overlap) then truncate after we finish
	# +2 for one window on each side
	X = np.zeros(((n_rows + 2) * window_size,)).astype(X_strided.dtype)
	start_index = 0

	total_windowing_sum = np.zeros((X.shape[0]))
	win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(window_size) / (
	window_size - 1))
	for i in range(n_rows):
	end_index = start_index + window_size
	X[start_index:end_index] += X_strided[i]
	total_windowing_sum[start_index:end_index] += win
	start_index += window_step
	# Not using this right now
	#X = np.real(X) / (total_windowing_sum + 1)
	X = X[:end_index]
	return X


	def istft(X, windowsize=None, fftsize=None, step="half", mean_normalize=True,
	real=False, compute_onesided=True):
	"""
	Compute ISTFT for STFT transformed X
	"""
	if real:
	local_ifft = fftpack.irfft
	X_pad = np.zeros((X.shape[0], X.shape[1] + 1)) + 0j
	X_pad[:, :-1] = X
	X = X_pad
	else:
	local_ifft = fftpack.ifft
	if fftsize == None:
	assert windowsize == None
	if compute_onesided:
	X_pad = np.zeros((X.shape[0], 2 * X.shape[1])) + 0j
	X_pad[:, :fftsize // 2 + 1] = X
	X_pad[:, fftsize // 2 + 1:] = 0
	X = X_pad
	X = local_ifft(X).astype("float64")
	if step == "half":
	X = invert_halfoverlap(X)
	else:
	X = overlap_add(X, step)
	if mean_normalize:
	X -= np.mean(X)
	return X

	def phase_vocoder(wav_data, rate, fftsize=512):
	""" bulk of the processing taken from librosa """
	wav_data = wav_data.copy().astype("float32")
	D = stft(wav_data, fftsize, step=32)
	D = D.transpose(1, 0)
	n_fft = 2 * (D.shape[0] - 1)
	hop_length = None

	if hop_length is None:
	hop_length = int(n_fft // 4)

	# Expected time advance in each bin
	time_steps = np.arange(0, D.shape[1], rate, dtype=np.float)

	# Expected phase advance in each bin
	phi_advance = np.linspace(0, np.pi * hop_length, D.shape[0])


	# Create an empty output array
	d_stretch = np.zeros((D.shape[0], len(time_steps)), D.dtype, order='F')

	# Phase accumulator; initialize to the first sample
	phase_acc = np.angle(D[:, 0])

	# Pad 0 columns to simplify boundary logic
	D = np.pad(D, [(0, 0), (0, 2)], mode='constant')

	for (t, step) in enumerate(time_steps):

	columns = D[:, int(step):int(step + 2)]

	# Weighting for linear magnitude interpolation
	alpha = np.mod(step, 1.0)
	mag = ((1.0 - alpha) * np.abs(columns[:, 0])
	+ alpha * np.abs(columns[:, 1]))

	# Store to output array
	d_stretch[:, t] = mag * np.exp(1.j * phase_acc)

	# Compute phase advance
	dphase = (np.angle(columns[:, 1])
	- np.angle(columns[:, 0])
	- phi_advance)

	# Wrap to -pi:pi range
	dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))

	# Accumulate phase
	phase_acc += phi_advance + dphase
	d_stretch = d_stretch.transpose(1, 0)
	reconstructed_wav = istft(d_stretch, fftsize=fftsize, step=32)
	return reconstructed_wav

	if __name__ == "__main__":
	file_to_modify = "synth_LJ001-0061.wav"
	file_orig = "LJ001-0061.wav"
	fs, wav_data = wavfile.read(file_to_modify)

	orig_fs, orig_wav_data = wavfile.read(file_orig)

	file_to_modify_info = "synth_LJ001-0061.json"
	with open(file_to_modify_info, "r") as f:
	modify_info = json.load(f)
	target_alignment_info = "LJ001-0061.json"
	with open(target_alignment_info, "r") as f:
	target_info = json.load(f)

	s = [w["word"] for w in modify_info["words"]]
	t = [w["word"] for w in target_info["words"]]
	fftsize = 512
	assert all([s[i] == t[i] for i in range(len(s))]) and len(t) == len(s)
	warped_chunks = []
	for n, (modify_word, target_word) in enumerate(zip(modify_info["words"], target_info["words"])):
	# for now, do whole word boundaries directly
	if n == 0:
	s_ = target_info["words"][0]["start"] * fs
	if s_ != 0:
	warped_chunks.append(orig_wav_data[0:s_])
	s_ = int(modify_word["start"] * fs)
	e_ = int(modify_word["end"] * fs)
	modify_chunk = wav_data[s_:e_]
	# calculate the rate chunks based on the ratio
	"""
	m_phones = modify_word["phones"]
	t_phones = target_word["phones"]
	ratios = [m_phones[i]["duration"] / float(t_phones[i]["duration"]) for i in range(len(m_phones))]
	"""
	m_dur = modify_word["end"] - modify_word["start"]
	t_dur = target_word["end"] - target_word["start"]

	rate = m_dur / float(t_dur)
	stretch_chunk = phase_vocoder(modify_chunk, rate)
	warped_chunks.append(stretch_chunk)
	if n != (len(modify_info["words"]) - 1):
	s_ = int(modify_info["words"][n]["end"] * fs)
	e_ = int(modify_info["words"][n + 1]["start"] * fs)
	gap_chunk = wav_data[s_:e_]
	m_dur = modify_info["words"][n + 1]["start"] - modify_info["words"][n]["end"]
	t_dur = target_info["words"][n + 1]["start"] - target_info["words"][n]["end"]
	if m_dur == 0 or t_dur == 0:
	continue
	if len(gap_chunk) < fftsize:
	warped_chunks.append(gap_chunk)
	continue
	rate = m_dur / float(t_dur)
	stretch_gap_chunk = phase_vocoder(gap_chunk, rate, fftsize=fftsize)
	warped_chunks.append(stretch_gap_chunk)
	warped_wav = np.concatenate(warped_chunks)
	wavfile.write("output.wav", fs, soundsc(warped_wav))