"""
Deep-learning framework modified from the DanQ model (a hybrid
convolutional and recurrent deep neural network).
"""
# load libraries
import numpy as np
from keras import backend as K
from keras.models import Sequential, model_from_json
from keras.layers import (Dense, Conv1D, Flatten, MaxPool1D,
                          Dropout, LSTM, Bidirectional)
from sequencing_tools.fastq_tools import (reverse_complement,
                                          onehot_sequence_encoder)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
def f1_metric(y_true, y_pred):
    '''
    Batch-wise F1 score, usable as a keras metric.
    Adapted from: https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras
    '''
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
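
# Note: keras evaluates metrics per batch, so this F1 is a batch-wise
# average rather than an exact epoch-level score (the reason keras 2.x
# dropped precision/recall/F-score as built-in metrics); the sklearn
# metrics computed on the held-out set below give the exact values.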
# set up onehot sequence encoder
acceptable_nuc = list('ACTGN')
dna_encoder = onehot_sequence_encoder(''.join(acceptable_nuc))
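
# Quick sanity check of the encoder (assuming it maps a length-L sequence
# over 'ACTGN' to an L x 5 one-hot matrix; the exact column order is an
# implementation detail of sequencing_tools):
# >>> dna_encoder.transform('ACTGN').shape
# (5, 5)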
def generate_data(sequences, labels):
    '''
    Wrapper for generating one-hot-encoded sequences.
    Sequences containing characters outside of 'ACTGN' are skipped.
    '''
    for seq, label in zip(sequences, labels):
        if set(seq).issubset(acceptable_nuc):
            # binarize the label: 0 for "bound", 1 for everything else
            label = 0 if label == "bound" else 1
            yield dna_encoder.transform(seq), label
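
# Hypothetical usage with toy inputs (the names and label strings here are
# only illustrative of the expected upstream data):
#   toy_seqs = ['ACGTN', 'TTGCA']
#   toy_labels = ['bound', 'unbound']
#   mat, lab = next(generate_data(toy_seqs, toy_labels))  # (5 x 5 matrix, 0)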
def deep_model(frag_size):
    '''
    Adapted from DanQ: https://www.ncbi.nlm.nih.gov/pubmed/27084946
    Convolution layer -> Max pooling -> Bidirectional LSTM -> Fully connected

    frag_size: length (number of bases) of the input sequences
    '''
    model = Sequential()
    model.add(Conv1D(filters=160,
                     kernel_size=26,
                     strides=1,
                     padding='valid',
                     input_shape=(frag_size, 5),
                     activation='relu'))
    model.add(MaxPool1D(pool_size=50, strides=13))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=[f1_metric, 'binary_accuracy'])
    return model
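
# Shape walk-through for one input sequence (a sketch assuming
# frag_size = 500, purely as a worked example):
#   input:           (500, 5)    one-hot encoded sequence
#   Conv1D, k=26:    (475, 160)  'valid' padding: 500 - 26 + 1 positions
#   MaxPool1D:       (33, 160)   floor((475 - 50) / 13) + 1 pooled windows
#   Bi-LSTM(64):     (128,)      forward and backward final states concatenated
#   Dense(25) -> Dense(1, sigmoid): P(label == 1), i.e. not "bound"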
# generate data; `sequences` and `labels` are assumed to be defined upstream
data = [(seq, lab) for seq, lab in generate_data(sequences, labels)]
X, y = zip(*data)
X = np.array(X)
y = np.array(y)

# hold out 10% of the data for final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.1,
                                                    random_state=0)
# train the model for 25 epochs, keeping 10% of the training data for validation;
# the fragment size is taken from the one-hot matrices themselves
model = deep_model(frag_size=X.shape[1])
model.fit(X_train,
          y_train,
          validation_split=0.1,
          epochs=25)
# evaluate on the held-out test set
y_pred = model.predict_proba(X_test)
y_pred = y_pred.flatten()

# threshold the predicted probabilities at 0.5 for the classification metrics
print("Precision: %1.3f" % precision_score(y_test, y_pred.round()))
print("Recall: %1.3f" % recall_score(y_test, y_pred.round()))
print("F1: %1.3f" % f1_score(y_test, y_pred.round()))
print("AUROC: %1.3f" % roc_auc_score(y_test, y_pred))