Last active
March 9, 2020 01:32
-
-
Save rbrigden/d611ef4f2eccddb5581e0ac617ce38eb to your computer and use it in GitHub Desktop.
Load the WSJ speech dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import os | |
class WSJ():
    """Lazy loader for the WSJ speech dataset.

    Ensure the ``WSJ_PATH`` environment variable is the path to the
    directory containing all data files (.npy) provided on Kaggle.

    Each split is read from disk the first time its property is accessed
    and then cached on the instance, so repeated accesses return the same
    object without touching the filesystem again.

    Example usage:

        loader = WSJ()
        trainX, trainY = loader.train
        assert(trainX.shape[0] == 24590)
    """

    def __init__(self):
        # Per-split caches; ``None`` means "not loaded yet".
        self.dev_set = None
        self.train_set = None
        self.test_set = None

    @property
    def dev(self):
        """Return the cached result of ``load_raw(WSJ_PATH, 'dev')``.

        Raises:
            KeyError: if the ``WSJ_PATH`` environment variable is not set.
        """
        if self.dev_set is None:
            self.dev_set = load_raw(os.environ['WSJ_PATH'], 'dev')
        return self.dev_set

    @property
    def train(self):
        """Return the cached result of ``load_raw(WSJ_PATH, 'train')``.

        Raises:
            KeyError: if the ``WSJ_PATH`` environment variable is not set.
        """
        if self.train_set is None:
            self.train_set = load_raw(os.environ['WSJ_PATH'], 'train')
        return self.train_set

    @property
    def test(self):
        """Return ``(features, None)`` for the test split.

        The features are read from ``test.npy`` under ``WSJ_PATH``; the
        second element is always ``None`` so the tuple has the same
        two-element form as the other splits.

        Raises:
            KeyError: if the ``WSJ_PATH`` environment variable is not set.
        """
        if self.test_set is None:
            test_file = os.path.join(os.environ['WSJ_PATH'], 'test.npy')
            # ``encoding`` only affects pickled (object) arrays, and
            # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True,
            # so without it this load raises ValueError on object arrays.
            # SECURITY: unpickling can execute arbitrary code; only point
            # WSJ_PATH at data files you trust.
            features = np.load(test_file, encoding='bytes', allow_pickle=True)
            self.test_set = (features, None)
        return self.test_set
def load_raw(path, name):
    """Load one split's features and labels from ``.npy`` files.

    Reads ``<path>/<name>.npy`` and ``<path>/<name>_labels.npy``.

    Args:
        path: Directory containing the ``.npy`` files.
        name: File stem of the split (this file uses ``'dev'`` and
            ``'train'``).

    Returns:
        A ``(features, labels)`` tuple holding the two loaded arrays, in
        that order.
    """
    def _load(filename):
        # ``encoding`` only affects pickled (object) arrays, and
        # NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True,
        # so without it these loads raise ValueError on object arrays.
        # SECURITY: unpickling can execute arbitrary code; only load
        # data files you trust.
        return np.load(
            os.path.join(path, filename),
            encoding='bytes',
            allow_pickle=True,
        )

    return (
        _load('{}.npy'.format(name)),
        _load('{}_labels.npy'.format(name)),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment