Created
June 14, 2018 12:17
-
-
Save hussius/e1ae850acfcbac3c70f1be373f7bb6c3 to your computer and use it in GitHub Desktop.
Preprocess yeast DNA csv file from Genome Research paper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from fire import Fire
from tqdm import tqdm
def one_hot_encoding(df, seq_column, target):
    """
    Return one-hot-encoded DNA sequences and a column vector of targets.

    Each sequence is written into a zero-filled window that is ``2 * pad``
    positions wider than the longest sequence in ``df`` and is placed
    (approximately) in the middle of that window, so shorter sequences are
    zero-padded on both sides.

    Args:
        df (DataFrame): Data frame where rows correspond to DNA sequences.
        seq_column (str): Name of the column containing the DNA sequence
            (strings over the alphabet A, C, G, T).
        target (str): Name of the column containing the numeric target
            variable.

    Returns:
        X (numpy array): float32 one-hot encoding of the sequences with
            shape (N, 4, total_width, 1); axis 1 is ordered A, C, G, T.
        Y (numpy array): float32 target values with shape (N, 1).
        total_width (int): Length of the sequences including padding.
            For an empty data frame this is just the padding (2 * pad).

    Raises:
        KeyError: If a column is missing or a sequence contains a character
            other than A, C, G or T.
    """
    bases = ['A', 'C', 'G', 'T']
    base_dict = dict(zip(bases, range(4)))
    n = len(df)
    pad = 10

    # an array with the sequences that we will one-hot encode
    seqs = df[seq_column].values
    Y = np.asarray(df[target].values,
                   dtype="float32")[:, np.newaxis]

    if n == 0:
        # The maximum length of an empty column is NaN, which cannot be used
        # as an array dimension; return empty arrays of padding width instead.
        total_width = 2 * pad
        return np.zeros((0, 4, total_width, 1), dtype="float32"), Y, total_width

    total_width = int(df[seq_column].str.len().max() + 2 * pad)

    # Allocate directly as float32 to avoid a float64 array plus a full copy.
    X = np.zeros((n, 4, total_width, 1), dtype="float32")

    for i, seq in enumerate(tqdm(seqs)):
        seq_len = len(seq)
        # Left margin that centres the sequence; it only depends on the
        # sequence length, so compute it once per sequence.
        # NOTE: Python 3's round() rounds halves to the nearest even integer,
        # so for an odd amount of padding the margin is floor or ceil of
        # padding / 2 depending on which one is even.
        offset = int(round((total_width - seq_len) / 2.))
        channel = np.array([base_dict[base] for base in seq], dtype=np.intp)
        position = np.arange(offset, offset + seq_len)
        X[i, channel, position, 0] = 1.

    return X, Y, total_width
def convert(output_root):
    """
    Download the yeast UTR data set and save it as numpy arrays.

    This function downloads a CSV file associated with the paper
    "Deep Learning Of The Regulatory Grammar Of Yeast 5′ Untranslated
    Regions From 500,000 Random Sequences"
    (https://genome.cshlp.org/content/27/12/2015)
    and converts the sequence information (column ``UTR``) and target values
    (column ``growth_rate``) to numpy arrays.

    Args:
        output_root (str): Name of directory to which output
            (two numpy array files, ``yeast_seq.npy`` and
            ``yeast_labels.npy``) will be written. It is created if it
            does not exist.

    Returns:
        None

    Raises:
        SystemExit: If the CSV file cannot be downloaded or parsed.
    """
    output_root = Path(output_root)
    os.makedirs(output_root, exist_ok=True)

    url = ('https://github.com/animesh/2017---'
           'Deep-learning-yeast-UTRs/blob/master'
           '/Data/Random_UTRs.csv.gz?raw=true')
    try:
        df = pd.read_csv(url, compression='gzip')
    except Exception as e:
        # Top-level boundary of the script: exit with a readable message, but
        # keep the underlying cause visible instead of discarding it.
        sys.exit(f'Unable to download yeast file ... please check URL ({e})')

    # The padded width is not needed here; only the arrays are written out.
    X, Y, _ = one_hot_encoding(df,
                               seq_column='UTR',
                               target='growth_rate')
    np.save(str(output_root / 'yeast_seq.npy'), X)
    np.save(str(output_root / 'yeast_labels.npy'), Y)
if __name__ == '__main__':
    # Expose convert() as the command-line entry point; Fire passes the
    # command-line arguments through as the function's arguments.
    Fire(convert)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment