Created October 2, 2017 at 15:49.
Save saurabhvyas/40bea8449fc92b6053294cdb8bff3394 to your computer and use it in GitHub Desktop.
main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# In[ ]: | |
# In[1]: | |
import torch | |
import torch.nn as nn | |
from torch.autograd import Variable | |
import numpy as np | |
# Hyper Parameters | |
input_size = 13 | |
hidden_size = 60 | |
num_layers = 2 | |
num_classes = 2 | |
batch_size = 1 | |
num_epochs = 2 | |
learning_rate = 0.01 | |
# In[2]: | |
from python_speech_features import mfcc | |
from python_speech_features import delta | |
from python_speech_features import logfbank | |
import scipy.io.wavfile as wav | |
# In[3]: | |
def audio_to_mfcc(fileurl):
    """Read a WAV file and return its MFCC feature matrix.

    Parameters
    ----------
    fileurl : str
        Path to the .wav file on disk.

    Returns
    -------
    numpy.ndarray
        MFCC features, one row per analysis frame (presumably 13
        coefficients per frame, matching input_size — confirm against
        python_speech_features defaults).
    """
    sample_rate, signal = wav.read(fileurl)
    # Delta / log-filterbank variants were tried and abandoned:
    #   delta(features, 2), logfbank(signal, sample_rate)
    features = mfcc(signal, sample_rate)
    return features
# In[21]: | |
class RNN(nn.Module):
    """Many-to-one LSTM classifier.

    Encodes an input sequence with a stacked LSTM and maps the hidden
    output of the final time step to class scores via a linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True -> x is laid out (batch, seq_len, feature)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return per-batch-item class scores of shape (batch, num_classes)."""
        batch = x.size(0)
        # Fresh zero-filled initial hidden and cell states each call.
        h0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))
        c0 = Variable(torch.zeros(self.num_layers, batch, self.hidden_size))
        sequence_out, _ = self.lstm(x, (h0, c0))
        # Many-to-one: only the final time step feeds the classifier.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)
# In[22]: | |
# create a many to one LSTM , just take last output timestep as output | |
# Instantiate the many-to-one LSTM: forward() already selects the last
# time step, so rnn() yields one score row per batch element.
rnn = RNN(input_size, hidden_size, num_layers, num_classes)
# Loss and Optimizer.
# CrossEntropyLoss applies log-softmax internally, so the network emits
# raw scores (this is why the NLLLoss variant below was dropped).
#criterion = nn.NLLLoss()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
# In[32]: | |
# Load one example clip per class.
mfcc1 = audio_to_mfcc('/home/saurabh/Documents/audio_classification/data/lizzie.wav')
mfcc2 = audio_to_mfcc('/home/saurabh/Documents/audio_classification/data/boy.wav')

# Insert a singleton axis: (frames, 13) -> (frames, 1, 13).  With
# batch_first=True the LSTM therefore treats every MFCC frame as its own
# length-1 sequence, and rnn() returns one score row per frame.
temp = mfcc1[:, np.newaxis, :]
temp2 = mfcc2[:, np.newaxis, :]

input_var = Variable(torch.Tensor(temp))
input2_var = Variable(torch.Tensor(temp2))

# Target class index for each clip (0 = lizzie, 1 = boy).
label = Variable(torch.LongTensor([0]))
label2 = Variable(torch.LongTensor([1]))

for epoch in range(num_epochs):
    # BUG FIX: the original wrapped outputs.data[999] in a brand-new
    # Variable(..., requires_grad=True).  Going through .data detaches
    # the value from the autograd graph, so loss.backward() computed
    # gradients only for that throwaway leaf and optimizer.step() never
    # trained the LSTM/Linear weights.  Slicing the live output keeps
    # the graph intact.  Index -1 also replaces the hard-coded frame
    # numbers (999 / 998), which raise IndexError on shorter clips.
    outputs = rnn(input_var)
    loss = criterion(outputs[-1].unsqueeze(0), label)  # (1, num_classes) vs (1,)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.data[0])

    # Second example is forwarded after the first update so it trains
    # against the current weights.
    outputs2 = rnn(input2_var)
    loss2 = criterion(outputs2[-1].unsqueeze(0), label2)
    optimizer.zero_grad()
    loss2.backward()
    optimizer.step()
    print(loss2.data[0])
# In[ ]: | |
# In[ ]: | |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.