Created
November 19, 2016 05:08
-
-
Save brianhill11/c372bcc1e201075761b08a6753808891 to your computer and use it in GitHub Desktop.
Script to convert all .sra files in a directory to .fastq.gz files in batches
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Convert every .sra file matched by SRA_FILES into a gzip-compressed FASTQ
# file in OUTPUT_DIR, running fastq-dump in parallel batches of at most
# MAX_NUM_PROCS background processes.
#
# Files whose <name>.fastq.gz already exists anywhere under OUTPUT_DIR are
# skipped. Per-file events are appended to LOG_FILE; batch progress and the
# final summary go to stdout. Exits non-zero if any conversion failed.
#
# NOTE(review): the skip check only tests that the output file exists, so a
# partial file left behind by an interrupted run is skipped as well - confirm
# whether that is acceptable.
#
# `set -e` is deliberately not used: a single failed conversion or log write
# must not abort the rest of the run; failures are counted instead.

# used to set time zone for logging date/time
export TZ=":America/Los_Angeles"

# sets max number of processes to launch in a batch
MAX_NUM_PROCS=24

# the .sra files to convert; nullglob makes an unmatched pattern expand to an
# empty list instead of the literal "*.sra" string
shopt -s nullglob
SRA_FILES=(/hdfs1/FlintData/*.sra)
shopt -u nullglob

# directory to save the .fastq files to
OUTPUT_DIR=/home/FlintData/

# name of the log file
LOG_FILE_NAME="fastq_conversion.log"

# keep log file in results dir (can change to anywhere you like);
# works whether or not OUTPUT_DIR ends with a slash
LOG_FILE="${OUTPUT_DIR%/}/${LOG_FILE_NAME}"

# extension for the completed fastq files (compressed)
FASTQ_EXTENSION=".fastq.gz"

# number of files that still need converting (total minus skipped)
NUM_SRA_FILES=${#SRA_FILES[@]}

# counters
COMPLETED_FILE_COUNT=0
SKIPPED_FILE_COUNT=0
FAILED_FILE_COUNT=0

# PIDs of the fastq-dump processes in the current batch, and the input file
# name each one is working on (parallel arrays, same index)
BATCH_PIDS=()
BATCH_NAMES=()

#######################################
# Append a timestamped line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: the message text
#######################################
log() {
  printf '%s | %s\n' "$(date)" "$*" >> "$LOG_FILE"
}

#######################################
# Check whether a file with the given name exists anywhere under a directory.
# Arguments: $1 - directory to search (recursively)
#            $2 - file name to look for (matched by find -name)
# Returns:   0 if at least one match exists, 1 otherwise
#######################################
fastq_exists() {
  local dir=$1
  local name=$2
  # -quit stops at the first hit; testing the quoted output for non-emptiness
  # stays correct with several matches or whitespace in the path
  [[ -n "$(find "$dir" -name "$name" -print -quit)" ]]
}

#######################################
# Wait for every process in the current batch and record its outcome.
# Globals:   BATCH_PIDS, BATCH_NAMES (read, then emptied)
#            COMPLETED_FILE_COUNT, FAILED_FILE_COUNT (updated)
# Outputs:   one ERROR line in the log per failed conversion
#######################################
wait_for_batch() {
  local i
  for ((i = 0; i < ${#BATCH_PIDS[@]}; i++)); do
    # waiting on each PID individually exposes that job's exit status
    if wait "${BATCH_PIDS[i]}"; then
      COMPLETED_FILE_COUNT=$((COMPLETED_FILE_COUNT + 1))
    else
      FAILED_FILE_COUNT=$((FAILED_FILE_COUNT + 1))
      log "ERROR: fastq-dump failed for ${BATCH_NAMES[i]}"
    fi
  done
  BATCH_PIDS=()
  BATCH_NAMES=()
}

#######################################
# Convert all input files in batches and print a summary.
# Returns:   0 if every launched conversion succeeded, 1 otherwise
#######################################
main() {
  local f base_filename fastq_name

  for f in "${SRA_FILES[@]}"; do
    base_filename=${f##*/}
    fastq_name="${base_filename%.*}${FASTQ_EXTENSION}"

    # if a fastq file already exists for that base filename, don't reconvert
    if fastq_exists "$OUTPUT_DIR" "$fastq_name"; then
      log "WARNING: ${fastq_name} already exists. Skipping..."
      NUM_SRA_FILES=$((NUM_SRA_FILES - 1))
      SKIPPED_FILE_COUNT=$((SKIPPED_FILE_COUNT + 1))
    else
      log "LOG: Processing ${base_filename}"
      # compress using gzip; save result to OUTPUT_DIR; run in background
      fastq-dump --gzip -O "$OUTPUT_DIR" "$f" &
      BATCH_PIDS+=("$!")
      BATCH_NAMES+=("$base_filename")
    fi

    # once a full batch has been launched, wait for it before continuing
    if (( ${#BATCH_PIDS[@]} >= MAX_NUM_PROCS )); then
      wait_for_batch
      echo "$(date) | $COMPLETED_FILE_COUNT / $NUM_SRA_FILES files completed"
    fi
  done

  # the last batch is usually smaller than MAX_NUM_PROCS; wait for it too so
  # the summary is only printed after every conversion has really finished
  wait_for_batch

  echo "$(date) | $COMPLETED_FILE_COUNT / $NUM_SRA_FILES DONE!"
  echo "Skipped $SKIPPED_FILE_COUNT files"
  if (( FAILED_FILE_COUNT > 0 )); then
    echo "Failed $FAILED_FILE_COUNT files (see $LOG_FILE)" >&2
    return 1
  fi
  return 0
}

main "$@" || exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment