These notes assume you are using bash.
First set an environment variable representing the run/lane. For example:
run_lane=11869_2
kinit
mkdir $run_lane
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
/software/irods/icommands/bin/imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \
| grep : | awk '{ print $2 }' | paste - - -d/ \
| xargs -ixxx /software/irods/icommands/bin/iget -K xxx $run_lane
/software/irods/icommands/bin/iget -K /seq/$run/$run_lane#0.cram $run_lane
chmod 664 $run_lane/*
find $run_lane | grep -E '#(168|888)\.' | xargs rm
for cram in $(find $run_lane | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//')
do
/software/irods/icommands/bin/imeta ls -d /seq/$run/$cram.cram > $run_lane/$cram.imeta
done
(Should only be necessary for very old DETCT lanes, but is fine to run for all lanes.)
temp1=`find $run_lane | grep cram$ | grep -v '#'`
temp2=`find $run_lane | grep cram$ | grep '#0' | sed -e 's/#0//'`
if [[ -n $temp1 && -n $temp2 && $temp1 == $temp2 ]];
then
rm $temp1
fi
ls $run_lane/*.cram | sed -e 's/.cram$//' | xargs -ixxx \
bsub -o xxx.cram2bam.o -e xxx.cram2bam.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
"/software/team31/bin/samtools-1.2 view -b xxx.cram > xxx.bam"
Check jobs ran OK:
ls $run_lane/*.cram | wc -l && \
grep -l 'Successfully completed' $run_lane/*.cram2bam.o | wc -l && \
grep -l 'Exited' $run_lane/*.cram2bam.o | wc -l
rm $run_lane/*.cram
ls $run_lane/*.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.sort.o -e xxx.sort.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
/software/team31/bin/samtools-0.1.19 sort -n xxx.bam xxx.sorted
Check jobs ran OK:
ls $run_lane/*.bam | grep -v sorted | wc -l && \
grep -l 'Successfully completed' $run_lane/*.sort.o | wc -l && \
grep -l 'Exited' $run_lane/*.sort.o | wc -l
ls $run_lane/*.bam | grep -v sorted | xargs rm
(Uses modified SamToFastq from https://github.com/iansealy/picard-detct)
ls $run_lane/*.sorted.bam | sed -e 's/.sorted.bam$//' | xargs -ixxx mkdir xxx
ls $run_lane/*.sorted.bam | sed -e 's/.sorted.bam$//' | xargs -ixxx \
bsub -o xxx.bam2fastq.o -e xxx.bam2fastq.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
"/software/team31/bin/samtools-0.1.19 view -h -F 2816 xxx.sorted.bam -o - | \
/software/java/bin/java -jar -XX:+UseParallelOldGC -Xms1500m -Xmx2g \
/software/team31/packages/picard-tools-1.79-detct/SamToFastq.jar \
INPUT=/dev/stdin OUTPUT_PER_RG=true OUTPUT_DIR=xxx VALIDATION_STRINGENCY=SILENT"
Check jobs ran OK:
ls $run_lane/*.sorted.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.bam2fastq.o | wc -l && \
grep -l 'Exited' $run_lane/*.bam2fastq.o | wc -l
rm $run_lane/*.bam
bsub -o $run_lane/1.merge_fastq.o -e $run_lane/1.merge_fastq.e \
"cat `ls $run_lane/*/*1.fastq | sort | tr '\n' ' '` > $run_lane/$run_lane.1.fastq"
bsub -o $run_lane/2.merge_fastq.o -e $run_lane/2.merge_fastq.e \
"cat `ls $run_lane/*/*2.fastq | sort | tr '\n' ' '` > $run_lane/$run_lane.2.fastq"
Check jobs ran OK:
echo 2 && \
grep -l 'Successfully completed' $run_lane/*.merge_fastq.o | wc -l && \
grep -l 'Exited' $run_lane/*.merge_fastq.o | wc -l
rm $run_lane/*/*.fastq
bsub -o $run_lane/stats1.o -e $run_lane/stats1.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/fastx_quality_stats -Q33 -N -i $run_lane/$run_lane.1.fastq -o $run_lane/$run_lane.1.fastq.stats
bsub -o $run_lane/stats2.o -e $run_lane/stats2.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/fastx_quality_stats -Q33 -N -i $run_lane/$run_lane.2.fastq -o $run_lane/$run_lane.2.fastq.stats
Check jobs ran OK:
echo 2 && \
grep -l 'Successfully completed' $run_lane/stats?.o | wc -l && \
grep -l 'Exited' $run_lane/stats?.o | wc -l
For example:
bsub -o $run_lane/detag.o -e $run_lane/detag.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
perl \
-I/software/team31/packages/DETCT/lib \
/software/team31/packages/DETCT/script/detag_fastq.pl \
--fastq_read1_input $run_lane/$run_lane.1.fastq \
--fastq_read2_input $run_lane/$run_lane.2.fastq \
--fastq_output_prefix $run_lane/$run_lane \
--treat_n_in_polyt_as_t \
--read_tags \
NNNNBGAGGC NNNNBAGAAG NNNNBCAGAG \
NNNNBGCACG NNNNBCGCAA NNNNBCAAGA \
NNNNBGCCGA NNNNBCGGCC NNNNBAACCG \
NNNNBACGGG NNNNBCCAAC NNNNBAGCGC
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' $run_lane/detag.o | wc -l && \
grep -l 'Exited' $run_lane/detag.o | wc -l
The exact read tags will vary between experiments. The specific suffixes can be extracted using:
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
mysql -h seqw-db -P 3379 -u warehouse_ro sequencescape_warehouse -Bse \
"SELECT DISTINCT supplier_name, tag_index, tag_sequence FROM npg_plex_information, current_samples
WHERE npg_plex_information.sample_id = current_samples.internal_id
AND id_run = $run AND position = $lane AND tag_index <> 0 AND tag_index <> 168 AND tag_index <> 888
ORDER BY tag_index" \
| awk '{ print $3 }' | sort
The random base prefixes vary and the length is indicated in the sample description in Sequencescape. They are likely to be NNNNB (as in the example above) or NNNNBBBBNNNN or HBDVHBDVHB.
Also some lanes have spacer bases after the specific suffix (but before the polyT). This is indicated in the sample description in Sequencescape.
An example sample description from Sequencescape:
allele t3 clutch 1. A 8 base indexing sequence (ATCACGTT) is bases 13 to 20 of read 1 followed by CG and polyT.
The index starts at base 13, so the random prefix is 12 bases (i.e. NNNNBBBBNNNN). The spacer bases after the index are CG. So the full read tag to give to detag_fastq.pl will be NNNNBBBBNNNNATCACGTTCG.
For the above example:
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
bsub -o $run_lane/detag.o -e $run_lane/detag.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
perl \
-I/software/team31/packages/DETCT/lib \
/software/team31/packages/DETCT/script/detag_fastq.pl \
--fastq_read1_input $run_lane/$run_lane.1.fastq \
--fastq_read2_input $run_lane/$run_lane.2.fastq \
--fastq_output_prefix $run_lane/$run_lane \
--treat_n_in_polyt_as_t \
--read_tags \
`mysql -h seqw-db -P 3379 -u warehouse_ro sequencescape_warehouse -Bse \
"SELECT DISTINCT supplier_name, tag_index, tag_sequence FROM npg_plex_information, current_samples
WHERE npg_plex_information.sample_id = current_samples.internal_id
AND id_run = $run AND position = $lane AND tag_index <> 0 AND tag_index <> 168 AND tag_index <> 888
ORDER BY tag_index" \
| awk '{ print $3 }' | sort \
| awk '{ print "NNNNBBBBNNNN" $1 "CG" }' | tr '\n' ' '`
If you're unsure what the read tags should be then ask John (jec) or Ian (is1).
For more recent experiments, reads 1 and 2 have been swapped in library construction (so that the read containing polyT is sequenced last). This is indicated in Sequencescape by a study specification containing something like:
An RNA oligo comprising part of the Illumina adapter 1 was ligated to the 5’ end of the captured RNA and the RNA was eluted from the beads. Reverse transcription was primed with an anchored polyT oligo with part of Illumina adapter 2 at the 5’ end followed by 10 bases HBDVHBDVHB, then one of 96 eight base indexing tags, then CG and 14 T bases.
For older experimentsm where reads 1 and 2 have not been swapped, the study specification will contain something like:
An RNA oligo comprising part of the Illumina adapter 2 was ligated to the 5' end of the captured RNA and the RNA was eluted from the beads. Reverse transcription was primed with an anchored polyToligo with part of Illumina adapter 1 at the 5' end followed by 4 random bases, then an A, C or G base, the one of twelve 5 base indexing tags and 14T bases.
To handle the former, simply swap the arguments to --fastq_read1_input and --fastq_read2_input. For example:
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
bsub -o $run_lane/detag.o -e $run_lane/detag.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
perl \
-I/software/team31/packages/DETCT/lib \
/software/team31/packages/DETCT/script/detag_fastq.pl \
--fastq_read1_input $run_lane/$run_lane.2.fastq \
--fastq_read2_input $run_lane/$run_lane.1.fastq \
--fastq_output_prefix $run_lane/$run_lane \
--treat_n_in_polyt_as_t \
--read_tags \
`mysql -h seqw-db -P 3379 -u warehouse_ro sequencescape_warehouse -Bse \
"SELECT DISTINCT supplier_name, tag_index, tag_sequence FROM npg_plex_information, current_samples
WHERE npg_plex_information.sample_id = current_samples.internal_id
AND id_run = $run AND position = $lane AND tag_index <> 0 AND tag_index <> 168 AND tag_index <> 888
ORDER BY tag_index" \
| awk '{ print $3 }' | sort \
| awk '{ print "HBDVHBDVHB" $1 "CG" }' | tr '\n' ' '`
If you're unsure how the libraries were constructed then ask John (jec) or Ian (is1).
find $run_lane | grep fastq$ | grep -E 'XXXX|NNNN|HBDV' | sed -e 's/.fastq$//' | xargs mkdir
find $run_lane | grep fastq$ | grep -E 'XXXX|NNNN|HBDV' | sed -e 's/.fastq$//' | xargs -ixxx \
bsub -o xxx/split.o -e xxx/split.e \
split -l 30000000 xxx.fastq xxx/
Check jobs ran OK:
find $run_lane | grep fastq$ | grep -E 'XXXX|NNNN|HBDV' | wc -l && \
grep -l 'Successfully completed' $run_lane/*/split.o | wc -l && \
grep -l 'Exited' $run_lane/*/split.o | wc -l
find $run_lane | grep fastq$ | xargs rm
For zebrafish:
ls $run_lane/*/* | grep -v split | xargs -ixxx \
bsub -o xxx.bwaaln.o -e xxx.bwaaln.e \
-R'select[mem>3000] rusage[mem=3000]' -M3000 \
/software/team31/bin/bwa-0.5.10-mt_fixes aln \
-f xxx.sai \
/lustre/scratch110/sanger/rw4/genomes/Dr/Zv9/striped/zv9_toplevel_unmasked.fa \
xxx
For mouse:
ls $run_lane/*/* | grep -v split | xargs -ixxx \
bsub -o xxx.bwaaln.o -e xxx.bwaaln.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
/software/team31/bin/bwa-0.5.10-mt_fixes aln \
-f xxx.sai \
/lustre/scratch109/srpipe/references/Mus_musculus/GRCm38/all/bwa/Mus_musculus.GRCm38.68.dna.toplevel.fa \
xxx
For human:
ls $run_lane/*/* | grep -v split | xargs -ixxx \
bsub -o xxx.bwaaln.o -e xxx.bwaaln.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
/software/team31/bin/bwa-0.5.10-mt_fixes aln \
-f xxx.sai \
/lustre/scratch110/srpipe/references/Homo_sapiens/1000Genomes_hs37d5/all/bwa/hs37d5.fa \
xxx
Check jobs ran OK:
ls $run_lane/*/* | grep -v split | grep -v bwaaln | grep -v sai$ | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.bwaaln.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.bwaaln.o | wc -l
For experiments including ERCC spikes, the reference will need to be a combination of the genome and spikes. For example:
/lustre/scratch109/sanger/is1/genomes/zv9spike/striped/zv9spike.fa
or:
/lustre/scratch109/sanger/is1/genomes/hs37d5spike/striped/hs37d5spike.fa
For zebrafish:
ls $run_lane/*/* | grep -v split | grep -v bwaaln | grep -v sai$ | xargs -ixxx \
bsub -o xxx.bwaaln.tr.o -e xxx.bwaaln.tr.e \
-R'select[mem>3000] rusage[mem=3000]' -M3000 \
/software/team31/bin/bwa-0.5.10-mt_fixes aln \
-f xxx.tr.sai \
/lustre/scratch109/sanger/is1/transcriptomes/zv9e79/zv9e79.fa \
xxx
Check jobs ran OK:
ls $run_lane/*/* | grep -v split | grep -v bwaaln | grep -v sai$ | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.bwaaln.tr.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.bwaaln.tr.o | wc -l
For experiments including ERCC spikes, the reference will need to be a combination of the transcriptome and spikes. For example:
/lustre/scratch109/sanger/is1/transcriptomes/zv9e79spike/zv9e79spike.fa
For zebrafish:
for file in `ls -d $run_lane/*_1/ | sed -e 's/_1\/$//'`
do
mkdir $file
for split in `ls ${file}_1/ | grep -v '\.'`
do
bsub \
-o $file/$split.sampe.o -e $file/$split.sampe.e \
-R'select[mem>3000] rusage[mem=3000]' -M3000 \
"/software/team31/bin/bwa-0.5.10-mt_fixes sampe -T \
/lustre/scratch110/sanger/rw4/genomes/Dr/Zv9/striped/zv9_toplevel_unmasked.fa \
${file}_1/$split.sai \
${file}_2/$split.sai \
${file}_1/$split \
${file}_2/$split \
| /software/team31/bin/samtools-0.1.19 view -S -b -T \
/lustre/scratch110/sanger/rw4/genomes/Dr/Zv9/striped/zv9_toplevel_unmasked.fa \
-o $file/$split.bam - "
done
done
For mouse:
for file in `ls -d $run_lane/*_1/ | sed -e 's/_1\/$//'`
do
mkdir $file
for split in `ls ${file}_1/ | grep -v '\.'`
do
bsub \
-o $file/$split.sampe.o -e $file/$split.sampe.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
"/software/team31/bin/bwa-0.5.10-mt_fixes sampe -T \
/lustre/scratch109/srpipe/references/Mus_musculus/GRCm38/all/bwa/Mus_musculus.GRCm38.68.dna.toplevel.fa \
${file}_1/$split.sai \
${file}_2/$split.sai \
${file}_1/$split \
${file}_2/$split \
| /software/team31/bin/samtools-0.1.19 view -S -b -T \
/lustre/scratch109/srpipe/references/Mus_musculus/GRCm38/all/fasta/Mus_musculus.GRCm38.68.dna.toplevel.fa \
-o $file/$split.bam - "
done
done
For human:
for file in `ls -d $run_lane/*_1/ | sed -e 's/_1\/$//'`
do
mkdir $file
for split in `ls ${file}_1/ | grep -v '\.'`
do
bsub \
-o $file/$split.sampe.o -e $file/$split.sampe.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
"/software/team31/bin/bwa-0.5.10-mt_fixes sampe -T \
/lustre/scratch110/srpipe/references/Homo_sapiens/1000Genomes_hs37d5/all/bwa/hs37d5.fa \
${file}_1/$split.sai \
${file}_2/$split.sai \
${file}_1/$split \
${file}_2/$split \
| /software/team31/bin/samtools-0.1.19 view -S -b -T \
/lustre/scratch110/srpipe/references/Homo_sapiens/1000Genomes_hs37d5/all/fasta/hs37d5.fa \
-o $file/$split.bam - "
done
done
Check jobs ran OK:
find $run_lane/*_1 -type f | grep -v split | grep -v bwaaln | grep -v sai$ | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.sampe.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.sampe.o | wc -l
For zebrafish:
for file in `ls -d $run_lane/*_1/ | sed -e 's/_1\/$//'`
do
mkdir -p $file
for split in `ls ${file}_1/ | grep -v '\.'`
do
bsub \
-o $file/$split.sampe.tr.o -e $file/$split.sampe.tr.e \
-R'select[mem>3000] rusage[mem=3000]' -M3000 \
"/software/team31/bin/bwa-0.5.10-mt_fixes sampe -T \
/lustre/scratch109/sanger/is1/transcriptomes/zv9e79/zv9e79.fa \
${file}_1/$split.tr.sai \
${file}_2/$split.tr.sai \
${file}_1/$split \
${file}_2/$split \
| /software/team31/bin/samtools-0.1.19 view -S -b -T \
/lustre/scratch109/sanger/is1/transcriptomes/zv9e79/zv9e79.fa \
-o $file/$split.tr.bam - "
done
done
Check jobs ran OK:
find $run_lane/*_1 -type f | grep -v split | grep -v bwaaln | grep -v sai$ | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.sampe.tr.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.sampe.tr.o | wc -l
ls $run_lane/*/*.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.fixmate.o -e xxx.fixmate.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
java -XX:ParallelGCThreads=1 -Xmx2g -jar /software/team31/packages/picard-tools/FixMateInformation.jar \
INPUT=xxx.bam \
OUTPUT=xxx.fixmate.bam \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=xxx
Check jobs ran OK:
ls $run_lane/*/*.bam | grep -v fixmate | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.fixmate.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.fixmate.o | wc -l
ls $run_lane/*/*.bam | grep -v fixmate | xargs rm
for file in `ls $run_lane/*/*.bam | grep fixmate | sed -e 's/.fixmate.bam$//'`
do
tag=`echo $file | sed -e 's/.*_//' | sed -e 's/\/.*//' | sed -e 's/[^AGCTX]*//g' | sed -e 's/XX*/X/g'`
bsub -o $file.readgroup.o -e $file.readgroup.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
java -XX:ParallelGCThreads=1 -Xmx2g -jar /software/team31/packages/picard-tools/AddOrReplaceReadGroups.jar \
INPUT=$file.fixmate.bam \
OUTPUT=$file.readgroup.bam \
RGLB=$tag RGPL=ILLUMINA RGPU=$tag RGSM=$tag \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$file
done
Check jobs ran OK:
ls $run_lane/*/*.bam | grep fixmate | wc -l && \
grep -l 'Successfully completed' $run_lane/*/*.readgroup.o | wc -l && \
grep -l 'Exited' $run_lane/*/*.readgroup.o | wc -l
ls $run_lane/*/*.bam | grep -v readgroup | xargs rm
for file in `ls -d $run_lane/*_1/ | grep -E 'XXXX|NNNN|HBDV' | sed -e 's/_1\/$//'`
do
bsub -o $file/merge.o -e $file/merge.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`find $file | grep bam$ | grep -v tr.readgroup | sort | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=$file.bam \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$file
bsub -o $file/tr.merge.o -e $file/tr.merge.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`find $file | grep bam$ | grep tr.readgroup | sort | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=$file.tr.bam \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$file
done
Check jobs ran OK:
echo `ls -d $run_lane/*_1/ | grep -E 'XXXX|NNNN|HBDV' | wc -l` \* 2 | bc && \
grep -l 'Successfully completed' $run_lane/*/*merge.o | wc -l && \
grep -l 'Exited' $run_lane/*/*merge.o | wc -l
ls $run_lane/*/*.sai | sed -e 's/.sai//' | grep -v tr$ | xargs rm
rm $run_lane/*/*.bam $run_lane/*/*.sai
find $run_lane | grep libsnappyjava.so | xargs rm
This step is optional.
If the DETCT experiment was run on one sequencing lane then ignore this step. But this step should be run for HiSeq 2500 data, where two experiments are mixed over two lanes or one experiment is run over two lanes.
First set environment variables representing the pair of lanes. For example:
run_lane1=12207_1
run_lane2=12207_2
Extract all the samples and tags:
run1=`echo $run_lane1 | sed -e 's/_.*//'`
lane1=`echo $run_lane1 | sed -e 's/.*_//'`
run2=`echo $run_lane2 | sed -e 's/_.*//'`
lane2=`echo $run_lane2 | sed -e 's/.*_//'`
mysql -h seqw-db -P 3379 -u warehouse_ro sequencescape_warehouse -Bse \
"SELECT DISTINCT supplier_name, tag_index, tag_sequence FROM npg_plex_information, current_samples
WHERE npg_plex_information.sample_id = current_samples.internal_id
AND ((id_run = $run1 AND position = $lane1) OR (id_run = $run2 AND position = $lane2))
AND tag_index <> 0 AND tag_index <> 168 AND tag_index <> 888
ORDER BY tag_index" \
| awk '{ print $3, $1 }' | sort -u \
| sed -e 's/_[a-zA-Z]*[0-9]*$//' \
| awk '{ print $2 ":" $1 }' | sort -u > $run_lane1/samples.txt
cp $run_lane1/samples.txt $run_lane2/samples.txt
View $run_lane1/samples.txt and check the first column only contains one or two names (depending on the number of experiments run on the two lanes) and they look right.
Merge:
dest1=`sed -e 's/:.*//' $run_lane1/samples.txt | sort -u | head -1`
dest2=`sed -e 's/:.*//' $run_lane1/samples.txt | sort -u | tail -1`
sed -e 's/:.*//' $run_lane1/samples.txt | sort -u | xargs mkdir
for pair in `cat $run_lane1/samples.txt`
do
dest=`echo $pair | sed -e 's/:.*//'`
tag=`echo $pair | sed -e 's/.*://'`
filename=`ls $run_lane1/*NNNN${tag}*.bam $run_lane2/*NNNN${tag}*.bam $run_lane1/*HBDV*${tag}*.bam $run_lane2/*HBDV*${tag}*.bam | grep -v tr.bam$ | head -1 | sed -e 's/.*\///'`
bsub -o $dest/merge.$tag.o -e $dest/merge.$tag.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`ls $run_lane1/*NNNN${tag}*.bam $run_lane2/*NNNN${tag}*.bam $run_lane1/*HBDV*${tag}*.bam $run_lane2/*HBDV*${tag}*.bam | grep -v tr.bam$ | sort | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=$dest/$filename \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$dest
filename=`ls $run_lane1/*NNNN${tag}*.bam $run_lane2/*NNNN${tag}*.bam $run_lane1/*HBDV*${tag}*.bam $run_lane2/*HBDV*${tag}*.bam | grep tr.bam$ | head -1 | sed -e 's/.*\///'`
bsub -o $dest/tr.merge.$tag.o -e $dest/tr.merge.$tag.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`ls $run_lane1/*NNNN${tag}*.bam $run_lane2/*NNNN${tag}*.bam $run_lane1/*HBDV*${tag}*.bam $run_lane2/*HBDV*${tag}*.bam | grep tr.bam$ | sort | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=$dest/$filename \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$dest
done
Check jobs ran OK:
echo `wc -l $run_lane1/samples.txt | awk '{ print $1 }'` \* 2 | bc && \
grep -l 'Successfully completed' $dest1/*merge.*.o $dest2/*merge.*.o | sort -u | wc -l && \
grep -l 'Exited' $dest1/*merge.*.o $dest2/*merge.*.o | sort -u | wc -l
Set new environment variables (separately if two experiments) and continue:
run_lane=`sed -e 's/:.*//' $run_lane1/samples.txt | sort -u | head -1`
run_lane=`sed -e 's/:.*//' $run_lane1/samples.txt | sort -u | tail -1`
Copy useful files from original directories:
cp $run_lane1/*.imeta $run_lane2/*.imeta $run_lane1/*.stats $run_lane2/*.stats $run_lane1/samples.txt $dest1
cp $run_lane1/*.imeta $run_lane2/*.imeta $run_lane1/*.stats $run_lane2/*.stats $run_lane1/samples.txt $dest2
Delete original directories:
rm -rf $run_lane1 $run_lane2
ls $run_lane/*.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.index.o -e xxx.index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index xxx.bam
Check jobs ran OK:
ls $run_lane/*.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.index.o | wc -l && \
grep -l 'Exited' $run_lane/*.index.o | wc -l
(Uses modified tag-aware MarkDuplicates from https://github.com/iansealy/picard-detct)
find $run_lane -type f | grep -v markdup | grep bam$ | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.tag.markdup.o -e xxx.tag.markdup.e \
-n3 \
-R'span[hosts=1] select[mem>20000] rusage[mem=20000]' -M20000 \
/software/java/bin/java -jar -XX:+UseParallelOldGC -Xms21g -Xmx27g \
/software/team31/packages/picard-tools-detct/MarkDuplicates.jar \
I=xxx.bam \
O=xxx.tag.markdup.bam \
METRICS_FILE=xxx.tag.markdup.bam.stats \
REMOVE_DUPLICATES=false \
CREATE_INDEX=false \
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \
VALIDATION_STRINGENCY=SILENT \
CONSIDER_TAGS=true
Check jobs ran OK:
ls $run_lane/*.bam | grep -v markdup | wc -l && \
grep -l 'Successfully completed' $run_lane/*.tag.markdup.o | wc -l && \
grep -l 'Exited' $run_lane/*.tag.markdup.o | wc -l
ls $run_lane/*.tag.markdup.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.index.o -e xxx.index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index xxx.bam
Check jobs ran OK:
ls $run_lane/*.tag.markdup.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.tag.markdup.index.o | wc -l && \
grep -l 'Exited' $run_lane/*.tag.markdup.index.o | wc -l
find $run_lane -type f | grep -v markdup | grep bam$ | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.normal.markdup.o -e xxx.normal.markdup.e \
-n3 \
-R'span[hosts=1] select[mem>20000] rusage[mem=20000]' -M20000 \
/software/java/bin/java -jar -XX:+UseParallelOldGC -Xms21g -Xmx27g \
/software/team31/packages/picard-tools-detct/MarkDuplicates.jar \
I=xxx.bam \
O=xxx.normal.markdup.bam \
METRICS_FILE=xxx.normal.markdup.bam.stats \
REMOVE_DUPLICATES=false \
CREATE_INDEX=false \
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \
VALIDATION_STRINGENCY=SILENT
Check jobs ran OK:
ls $run_lane/*.bam | grep -v markdup | wc -l && \
grep -l 'Successfully completed' $run_lane/*.normal.markdup.o | wc -l && \
grep -l 'Exited' $run_lane/*.normal.markdup.o | wc -l
grep LIBRARY `find $run_lane | grep tag.markdup.bam.stats | head -1` \
> $run_lane/markdup-stats.txt
find $run_lane | grep bam.stats$ | grep -v '\.tr\.' | sort | xargs grep -A1 ^LIBRARY \
| sed -e 's/.*\///' | sed -e 's/.stats-Unknown Library//' | sed -e 's/.stats-[AGCTX]*//' \
| grep markdup | grep -v LIBRARY >> $run_lane/markdup-stats.txt
grep LIBRARY `find $run_lane | grep tag.markdup.bam.stats | head -1` \
> $run_lane/markdup-transcriptome-stats.txt
find $run_lane | grep bam.stats$ | grep '\.tr\.' | sort | xargs grep -A1 ^LIBRARY \
| sed -e 's/.*\///' | sed -e 's/.stats-Unknown Library//' | sed -e 's/.stats-[AGCTX]*//' \
| grep markdup | grep -v LIBRARY >> $run_lane/markdup-transcriptome-stats.txt
rm `ls $run_lane/*.bam | grep -v tag.markdup.bam$`
rm `ls $run_lane/*.bam.bai | grep -v tag.markdup.bam.bai$`
ls $run_lane/*.tag.markdup.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.flagstat.o -e xxx.flagstat.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
"/software/team31/bin/samtools flagstat xxx.bam > xxx.flagstat.txt"
Check jobs ran OK:
ls $run_lane/*.tag.markdup.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.tag.markdup.flagstat.o | wc -l && \
grep -l 'Exited' $run_lane/*.tag.markdup.flagstat.o | wc -l
ls $run_lane/*.tag.markdup.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.insertsize.o -e xxx.insertsize.e \
"/software/team31/bin/samtools view xxx.bam | getinsertsize.py - \
--span-distribution-file xxx.span-dist.txt \
--read-distribution-file xxx.read-dist.txt \
> xxx.insertsize.txt"
Check jobs ran OK:
ls $run_lane/*.tag.markdup.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.insertsize.o | wc -l && \
grep -l 'Exited' $run_lane/*.insertsize.o | wc -l
grep span $run_lane/*.insertsize.txt | grep -v '\.tr\.' | sed -e 's/^.*\///' | sed -e 's/insertsize.txt:/bam\t/' \
> $run_lane/insertsize-stats.txt
grep span $run_lane/*.insertsize.txt | grep '\.tr\.' | sed -e 's/^.*\///' | sed -e 's/insertsize.txt:/bam\t/' \
> $run_lane/insertsize-transcriptome-stats.txt
ls $run_lane/*.tr.tag.markdup.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.mapped.o -e xxx.mapped.e \
"/software/team31/bin/samtools-1.2 view -F 4 xxx.bam -b -o xxx.mapped.bam"
Check jobs ran OK:
ls $run_lane/*.tr.tag.markdup.bam | wc -l && \
grep -l 'Successfully completed' $run_lane/*.mapped.o | wc -l && \
grep -l 'Exited' $run_lane/*.mapped.o | wc -l
Delete BAM files containing unmapped reads:
rm $run_lane/*.tr.tag.markdup.bam
This step is optional.
If there are a large number of samples (>10) then the rest of the pipeline will run faster if the samples are merged into a single BAM file.
bsub -o $run_lane/merge.o -e $run_lane/merge.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`ls $run_lane/*.tag.markdup.bam | grep -v tr.tag | sort | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=$run_lane/$run_lane.bam \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=$run_lane
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' $run_lane/merge.o | wc -l && \
grep -l 'Exited' $run_lane/merge.o | wc -l
This step is optional. Only run if the previous optional step was run.
bsub -o $run_lane/index.o -e $run_lane/index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index $run_lane/$run_lane.bam
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' $run_lane/index.o | wc -l && \
grep -l 'Exited' $run_lane/index.o | wc -l