Skip to content

Instantly share code, notes, and snippets.

@hussius
Created June 14, 2018 12:17
Show Gist options
  • Save hussius/e1ae850acfcbac3c70f1be373f7bb6c3 to your computer and use it in GitHub Desktop.
Save hussius/e1ae850acfcbac3c70f1be373f7bb6c3 to your computer and use it in GitHub Desktop.
Preprocess yeast DNA csv file from Genome Research paper
from pathlib import Path
import os
import sys
from fire import Fire
import numpy as np
import pandas as pd
from tqdm import tqdm
def one_hot_encoding(df, seq_column, target, pad=10):
    """
    Return a one-hot-encoded representation of DNA sequences and a
    vector of target values from a data frame.

    Every sequence is centred inside a zero-filled window whose width is
    the length of the longest sequence plus ``pad`` columns on each side.

    Args:
        df (DataFrame): Data frame where rows correspond to yeast
            DNA sequences.
        seq_column (str): Name of the column containing the DNA sequence.
            Sequences must consist only of the upper-case characters
            A, C, G and T.
        target (str): Name of the column containing the target
            variable (fluorescence).
        pad (int): Number of zero columns added on each side of the
            longest sequence. Defaults to 10.

    Returns:
        X (numpy array): One-hot encoded version of the DNA sequences,
            dtype float32, with shape (N, 4, total_width, 1). The four
            rows of axis 1 correspond to A, C, G, T in that order.
        Y (numpy array): Target values, dtype float32, shape (N, 1).
        total_width (int): Length of the sequences including padding.

    Raises:
        ValueError: If ``pad`` is negative.
        KeyError: If a sequence contains a character other than
            A, C, G or T.
    """
    if pad < 0:
        raise ValueError("pad must be non-negative")

    base_dict = {base: row for row, base in enumerate('ACGT')}
    n = len(df)

    # max() of an empty column is NaN, which cannot be used as an array
    # dimension, so an empty data frame is handled explicitly.
    longest = int(df[seq_column].str.len().max()) if n else 0
    total_width = longest + 2 * pad

    # Allocate directly as float32: creating a float64 array and casting
    # afterwards temporarily needs three times the memory of the result.
    X = np.zeros((n, 4, total_width, 1), dtype="float32")

    # an array with the sequences that we will one-hot encode
    seqs = df[seq_column].values
    for i, seq in enumerate(tqdm(seqs)):
        seq_len = len(seq)
        # Left offset that centres the sequence in the window. Python 3's
        # round() rounds halves to the nearest even integer, so with an odd
        # amount of slack the spare column may fall on either side.
        offset = int(round((total_width - seq_len) / 2.))
        rows = np.array([base_dict[base] for base in seq], dtype=np.intp)
        cols = np.arange(offset, offset + seq_len)
        # Paired integer indexing sets one (row, column) cell per base.
        X[i, rows, cols, 0] = 1.

    Y = np.asarray(df[target].values, dtype="float32")[:, np.newaxis]
    return X, Y, total_width
def convert(output_root):
    """
    Download the CSV file associated with the paper
    "Deep Learning Of The Regulatory Grammar Of Yeast 5′ Untranslated
    Regions From 500,000 Random Sequences"
    (https://genome.cshlp.org/content/27/12/2015)
    and convert the sequence information and target values to numpy arrays.

    Two files are written into ``output_root``: ``yeast_seq.npy`` (the
    one-hot encoded ``UTR`` column) and ``yeast_labels.npy`` (the
    ``growth_rate`` column).

    Args:
        output_root (str): Name of directory to which output
            (two numpy array files) will be written. It is created,
            including missing parents, if it does not exist.

    Returns:
        None

    Raises:
        SystemExit: If the CSV file cannot be downloaded or parsed.
    """
    url = ('https://github.com/animesh/2017---'
           'Deep-learning-yeast-UTRs/blob/master'
           '/Data/Random_UTRs.csv.gz?raw=true')

    output_root = Path(output_root)
    output_root.mkdir(parents=True, exist_ok=True)

    try:
        df = pd.read_csv(url, compression='gzip')
    except Exception as e:
        # This is the command-line boundary: exit with a readable message,
        # but keep the underlying cause so network, HTTP and parsing
        # failures can be told apart.
        sys.exit('Unable to download yeast file ... please check URL\n'
                 f'({type(e).__name__}: {e})')

    # The padded width returned by the encoder is not needed here.
    X, Y, _ = one_hot_encoding(df,
                               seq_column='UTR',
                               target='growth_rate')
    np.save(str(output_root / 'yeast_seq.npy'), X)
    np.save(str(output_root / 'yeast_labels.npy'), Y)
# Script entry point: hand `convert` to python-fire so that it can be
# invoked from the command line. Nothing runs when the module is imported.
if __name__ == "__main__":
    Fire(convert)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment