Skip to content

Instantly share code, notes, and snippets.

@mfcovington
Last active August 13, 2016 18:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mfcovington/ece00080acd1af8b17f47fa5d82b2b41 to your computer and use it in GitHub Desktop.
Save mfcovington/ece00080acd1af8b17f47fa5d82b2b41 to your computer and use it in GitHub Desktop.
Merging and Summarizing FDA Samples
################################
# Auto-merge by sample and end #
################################
# For each sample number (1-144) and read end (R1, R2), concatenate every raw
# gzipped FASTQ whose name contains "fda_<N>_" or "fda_aj_<N>_" into
# $BASE_DIR/merged/fda_<NNN>.<END>.fq.gz. Sample/end pairs without any
# matching file are recorded in $BASE_DIR/skipped; the inputs of every merge
# are listed in $BASE_DIR/log.

# Data root; export BASE_DIR before running to work on a different location.
BASE_DIR=${BASE_DIR:-/Volumes/seagate/giriget}

for SAMPLE in {1..144}; do
  for END in R1 R2; do
    # Gather files to process. nullglob makes an unmatched pattern expand to
    # nothing instead of itself; it is switched off again immediately so code
    # outside this loop keeps the default glob behaviour.
    shopt -s nullglob
    CANDIDATES=(
      "$BASE_DIR"/raw/*/*fda_"${SAMPLE}"_*_"${END}"_*q.gz
      "$BASE_DIR"/raw/*/*fda_aj_"${SAMPLE}"_*_"${END}"_*q.gz
    )
    shopt -u nullglob

    # Skip non-existent samples
    if (( ${#CANDIDATES[@]} == 0 )); then
      echo "$SAMPLE-$END" >> "$BASE_DIR/skipped"
      continue
    fi

    # Order the union of both patterns by path and drop a path matched by
    # both patterns, so no file is concatenated twice.
    MERGE_FILES=()
    while IFS= read -r FILE; do
      MERGE_FILES+=("$FILE")
    done < <(printf '%s\n' "${CANDIDATES[@]}" | sort -u)

    # Log files to be merged
    {
      echo "$SAMPLE-$END"
      printf '%s\n' "${MERGE_FILES[@]}"
    } >> "$BASE_DIR/log"

    # Merge FASTQ files by sample and end. Plain concatenation is enough:
    # a sequence of gzip members is itself a valid gzip stream.
    printf -v SAMPLE_PADDED "%03d" "$SAMPLE"
    cat "${MERGE_FILES[@]}" > "$BASE_DIR/merged/fda_$SAMPLE_PADDED.$END.fq.gz" \
      || echo "ERROR: merge failed for $SAMPLE-$END" >&2
  done
done
#####################################################
# Manually merge Sample 78 due to inconsistent name #
#####################################################
# The raw file is labelled "78dil" instead of "78", so it is copied by hand
# to the merged name fda_078.<END>.fq.gz (one input file per end, hence cp
# rather than cat).
for END in R1 R2; do
  FILE="$BASE_DIR/raw/mix-06/Mix_6_fda_aj_78dil_cirna_S9_${END}_001.fastq.gz"

  # Log the file being used
  {
    printf '78dil-%s\tManual Merge Due to Inconsistent Name\n' "$END"
    printf '%s\n' "$FILE"
  } >> "$BASE_DIR/log"

  # Report a failed copy instead of leaving only the log entry behind.
  cp -- "$FILE" "$BASE_DIR/merged/fda_078.$END.fq.gz" \
    || echo "ERROR: could not copy $FILE" >&2
done
#########################################
# Manually merge samples named "GTCCGC" #
#########################################
# The two lane files (L001, L002) of the sample named "GTCCGC" are
# concatenated per read end into $BASE_DIR/merged/GTCCGC.<END>.fq.gz.
for END in R1 R2; do
  FILE1="$BASE_DIR/raw/mix-09/Mix9_GTCCGC_S14_L001_${END}_001.fastq.gz"
  FILE2="$BASE_DIR/raw/mix-09/Mix9_GTCCGC_S14_L002_${END}_001.fastq.gz"

  # Log files to be merged, labelled with this sample's own name.
  {
    printf 'GTCCGC-%s\tManual Merge Due to Inconsistent Name\n' "$END"
    printf '%s\n' "$FILE1" "$FILE2"
  } >> "$BASE_DIR/log"

  # Truncate (>) rather than append, so running the script again does not
  # add a second copy of the reads to an existing output file.
  cat "$FILE1" "$FILE2" > "$BASE_DIR/merged/GTCCGC.$END.fq.gz" \
    || echo "ERROR: merge failed for GTCCGC-$END" >&2
done
#########################################################
# Manually merge Samples 2 & 6 due to ambiguous pooling #
#########################################################
# Set the auto-merged outputs for samples 2 and 6 aside in merged.bad.
# -p keeps this step from failing when the directory already exists.
mkdir -p "$BASE_DIR/merged.bad"
mv "$BASE_DIR"/merged/fda_002.* "$BASE_DIR/merged.bad/"
mv "$BASE_DIR"/merged/fda_006.* "$BASE_DIR/merged.bad/"

# Gather files to process: raw files naming one of the two samples but not
# the other (a file naming both ends up in neither list). Each list is kept
# as a newline-separated string, one path per line.
FILE_LIST_002=$(ls "$BASE_DIR"/raw/*/*fda_aj_2_* | grep -v fda_aj_6_)
FILE_LIST_006=$(ls "$BASE_DIR"/raw/*/*fda_aj_6_* | grep -v fda_aj_2_)
#######################################
# Log and concatenate the files of one sample/end.
# Globals:
#   BASE_DIR       (read) directory holding the log file and merged/ output
#   SAMPLE, END    (read) used to label the log entry
#   SAMPLE_PADDED  (read) used in the output file name
#   FILE_LIST      (read) whitespace-separated list of input paths
# Outputs:
#   Appends a header line plus one line per input file to $BASE_DIR/log and
#   writes $BASE_DIR/merged/fda_<SAMPLE_PADDED>.<END>.fq.gz.
# Returns:
#   1 (touching neither log nor output) when FILE_LIST is empty, otherwise
#   the exit status of cat.
#######################################
log_and_merge() {
  local -a inputs
  # Word splitting is intentional: FILE_LIST is a whitespace-separated list.
  # shellcheck disable=SC2206
  inputs=( $FILE_LIST )

  # With no operands cat would read standard input, so bail out early.
  if (( ${#inputs[@]} == 0 )); then
    echo "WARNING: no files to merge for $SAMPLE-$END; skipping" >&2
    return 1
  fi

  # Log files to be merged
  {
    printf '%s-%s\tManual Merge Due to Ambiguous Pooling\n' "$SAMPLE" "$END"
    printf '%s\n' "${inputs[@]}"
  } >> "$BASE_DIR/log"

  # Merge FASTQ files by sample and end
  cat "${inputs[@]}" > "$BASE_DIR/merged/fda_$SAMPLE_PADDED.$END.fq.gz"
}
# Samples 2 and 6: for each sample, pick the lines of its candidate list that
# mention the read end and hand them to log_and_merge, R1 before R2.
for SAMPLE in 2 6; do
  case $SAMPLE in
    2) SAMPLE_FILES=$FILE_LIST_002 ;;
    6) SAMPLE_FILES=$FILE_LIST_006 ;;
  esac
  printf -v SAMPLE_PADDED "%03d" $SAMPLE
  for END in R1 R2; do
    FILE_LIST=$(printf -- '%s' "$SAMPLE_FILES" | grep $END)
    log_and_merge
  done
done
#####################################################
# Manually merge Sample 72 due to ambiguous pooling #
#####################################################
# Set the auto-merged output for sample 72 aside in merged.bad (created here
# if it does not exist yet, so this step does not depend on an earlier one).
mkdir -p "$BASE_DIR/merged.bad"
mv "$BASE_DIR"/merged/fda_072.* "$BASE_DIR/merged.bad/"

# Gather files to process: every raw file with "_72_" in its name, split by
# whether the path mentions "index11". Each list is kept as a
# newline-separated string, one path per line.
FILE_LIST_72_6=$(ls "$BASE_DIR"/raw/*/*_72_* | grep -v index11)
FILE_LIST_72_11=$(ls "$BASE_DIR"/raw/*/*_72_* | grep index11)
#######################################
# Log and concatenate the files of one sample/index/end.
# Globals:
#   BASE_DIR       (read) directory holding the log file and merged/ output
#   SAMPLE, INDEX, END
#                  (read) used to label the log entry; INDEX and END are
#                         also part of the output file name
#   SAMPLE_PADDED  (read) used in the output file name
#   FILE_LIST      (read) whitespace-separated list of input paths
# Outputs:
#   Appends a header line plus one line per input file to $BASE_DIR/log and
#   writes $BASE_DIR/merged/fda_<SAMPLE_PADDED>.INDEX_<INDEX>.<END>.fq.gz.
# Returns:
#   1 (touching neither log nor output) when FILE_LIST is empty, otherwise
#   the exit status of cat.
#######################################
log_and_merge_with_index() {
  local -a inputs
  # Word splitting is intentional: FILE_LIST is a whitespace-separated list.
  # shellcheck disable=SC2206
  inputs=( $FILE_LIST )

  # With no operands cat would read standard input, so bail out early.
  if (( ${#inputs[@]} == 0 )); then
    echo "WARNING: no files to merge for $SAMPLE-INDEX_$INDEX-$END; skipping" >&2
    return 1
  fi

  # Log files to be merged
  {
    printf '%s-INDEX_%s-%s\tManual Merge Due to Ambiguous Pooling\n' \
      "$SAMPLE" "$INDEX" "$END"
    printf '%s\n' "${inputs[@]}"
  } >> "$BASE_DIR/log"

  # Merge FASTQ files by sample, index and end
  cat "${inputs[@]}" \
    > "$BASE_DIR/merged/fda_$SAMPLE_PADDED.INDEX_$INDEX.$END.fq.gz"
}
# Sample 72, index 6 then index 11: for each index, pick the lines of its
# candidate list that mention the read end and hand them to
# log_and_merge_with_index, R1 before R2.
SAMPLE=72
for INDEX in 6 11; do
  case $INDEX in
    6) INDEX_FILES=$FILE_LIST_72_6 ;;
    11) INDEX_FILES=$FILE_LIST_72_11 ;;
  esac
  printf -v SAMPLE_PADDED "%03d" $SAMPLE
  for END in R1 R2; do
    FILE_LIST=$(printf -- '%s' "$INDEX_FILES" | grep $END)
    log_and_merge_with_index
  done
done
####################################################
# Log line counts and read counts for each FQ file #
####################################################

#######################################
# Print one tab-separated record for the counts table.
# Arguments:
#   $1 - file name to report
#   $2 - number of lines in the decompressed file (surrounding blanks, as
#        emitted by some wc implementations, are tolerated)
# Outputs:
#   "<file>\t<lines>\t<reads>" on stdout, where reads = lines / 4 (integer
#   division); a fourth column "ERROR: Truncated File Detected" is added
#   when the line count is not a multiple of 4.
#######################################
format_count_record() {
  local name=$1
  local -i line_total=$(( $2 ))
  local record
  printf -v record '%s\t%s\t%s' "$name" "$line_total" "$(( line_total / 4 ))"
  # Warn if file is not a multiple of 4 lines
  if (( line_total % 4 != 0 )); then
    record+=$'\tERROR: Truncated File Detected'
  fi
  printf '%s\n' "$record"
}

# Confirm correct number of files (the author's run printed 242)
ls "$BASE_DIR/merged" | wc -l

# Only count when the merged directory can be entered; otherwise the glob
# below would be evaluated in whatever directory happens to be current.
if cd "$BASE_DIR/merged"; then
  printf 'FILE\tLINES\tREADS\n' > "$BASE_DIR/counts"
  for FQ in *.fq.gz; do
    # An unmatched glob stays literal; skip it instead of reporting it.
    [[ -e $FQ ]] || continue
    # Not stored in LINES: Bash maintains that variable itself. The
    # arithmetic expansion strips any padding wc puts around the number.
    LINE_COUNT=$(( $(gunzip -c "$FQ" | wc -l) ))
    format_count_record "$FQ" "$LINE_COUNT" >> "$BASE_DIR/counts"
  done
else
  echo "ERROR: cannot cd to $BASE_DIR/merged; counts not written" >&2
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment