Last active
September 23, 2015 19:25
-
-
Save robsyme/3c083f84554a500cb5cb to your computer and use it in GitHub Desktop.
Example of hanging workflow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env nextflow | |
params.species = 'species.txt' | |
species_names = Channel.fromPath(params.species) | |
genomes = Channel.fromPath(params.genomes).map{ file -> [file.name.lastIndexOf('.').with {it != -1 ? file.name[0..<it] : file.name}, file] }.distinct { id, file -> id } | |
process get_dothideomycete_descriptions { | |
output: | |
file('assemblies.json') into ascomycete_assemblies_json | |
"bionode-ncbi search assembly dothideomycetes > assemblies.json" | |
} | |
process filter_ascomycete_descriptions { | |
cache 'deep' | |
input: | |
file 'in.json' from ascomycete_assemblies_json | |
output: | |
stdout representative_assemblies_text | |
""" | |
jq --slurp --raw-output '.[] | select(.propertylist | .[] | in({\"representative\":true})) | [.assemblyaccession, .speciesname] | @tsv' < in.json | |
""" | |
} | |
process download_representative_assemblies { | |
cache 'deep' | |
tag { line.trim().split("\t")[1] } | |
input: | |
val line from representative_assemblies_text.splitText() | |
output: | |
set stdout, file('cds.fasta') into assemblies | |
script: | |
def (assemblyaccession, speciesname) = line.trim().split("\t") | |
speciesname = speciesname.replaceAll(" ","_") | |
""" | |
bionode-ncbi download assembly $assemblyaccession > /dev/null | |
bionode-ncbi download gff $assemblyaccession > /dev/null | |
gt gff3 -sort -tidy `find . -name '*.gff.gz'` > genome.gff3 | |
gt extractfeat -matchdesc -type CDS -join -width 80 -seqfiles `find . -name '*.fna.gz'` -- genome.gff3 > cds.fasta | |
printf '$speciesname' | |
""" | |
} | |
process busco { | |
container 'robsyme/busco_fungi:1.0' | |
tag { speciesName } | |
maxForks 3 | |
cpus 4 | |
input: | |
set speciesName, 'cds.fasta' from assemblies.filter{ species, cds -> cds.toRealPath().toFile().length() > 0 } | |
output: | |
set speciesName, 'cds.fasta', "run_${speciesName}/full_table_${speciesName}" into busco_table | |
val speciesName into speciesNames_cds | |
""" | |
mkdir -p out | |
if [ -s \$(readlink -f cds.fasta) ] | |
then | |
ln -s /opt/busco/lineages/fungi . | |
busco \ | |
--cpu ${task.cpus} \ | |
-in cds.fasta \ | |
--lineage fungi \ | |
-f \ | |
-m trans \ | |
-o '$speciesName' | |
else | |
mkdir -p run_$speciesName/ | |
touch run_$speciesName/full_table_$speciesName | |
fi | |
""" | |
} | |
process inhouse_busco { | |
container 'robsyme/busco_fungi:latest' | |
tag { speciesName } | |
cache 'deep' | |
maxForks 3 | |
cpus 4 | |
input: | |
set speciesName, 'genome.fasta' from genomes | |
output: | |
set speciesName, "inhouse.tar" into inhouse_busco_cds_raw | |
val speciesName into speciesNames_genome | |
""" | |
ln -s /opt/busco/lineages/fungi . | |
busco \ | |
--cpu ${task.cpus} \ | |
-in genome.fasta \ | |
--lineage fungi \ | |
-f \ | |
-m genome \ | |
-o '$speciesName' | |
mv run_$speciesName/gffs . | |
tar -chvf inhouse.tar gffs genome.fasta | |
""" | |
} | |
process fix_inhouse_busco { | |
input: | |
set speciesName, file('busco_outputs.tar') from inhouse_busco_cds_raw | |
output: | |
file('out/*') into inhouse_busco_cds | |
""" | |
mkdir -p out | |
tar -xvf busco_outputs.tar | |
for file in gffs/*.out; do | |
echo \$file | |
base=`basename \$file .out` | |
convert.rb \$file | gt gff3 -tidy -sort > \$base.gff3 | |
seqnames=`awk '!/^#/ {print \$1}' \$base.gff3 | sort | uniq` | |
samtools faidx genome.fasta \$seqnames > tmp.fasta | |
gt extractfeat -type CDS -join -matchdesc -width 80 -seqfiles tmp.fasta -- \$base.gff3 > out/\$base.fasta | |
rm tmp.fasta* | |
done | |
sed -i'' 's/^>.*/>$speciesName/' out/* | |
""" | |
} | |
process classify_busco_cds { | |
tag { speciesName } | |
input: | |
set speciesName, 'cds.fasta', 'busco_table.txt' from busco_table | |
output: | |
file('out/*') into busco_cds | |
""" | |
mkdir out | |
if [ -s \$(readlink -f cds.fasta) ] | |
then | |
samtools faidx cds.fasta | |
awk '\$2 == \"Complete\" {system(\"samtools faidx cds.fasta \" \$3 \" >> out/\"\$1\".fasta\")}' busco_table.txt | |
sed -i'' 's/^>.*/>$speciesName/' out/* | |
fi | |
""" | |
} | |
busco_cds | |
.mix(inhouse_busco_cds) | |
.flatten() | |
.map{ it -> [it.getName(it.getNameCount() - 1), it]} | |
.groupTuple() | |
.set{ busco_homologs } | |
process align_busco_homologs { | |
input: | |
set busco_id, file('input*.fasta') from busco_homologs | |
output: | |
file("${busco_id}*") into alignments | |
"cat input*.fasta > unaligned.fasta && clustalw -INFILE=unaligned.fasta -TYPE=DNA -OUTPUT=fasta -OUTFILE=${busco_id}" | |
} | |
c = speciesNames_cds.mix(speciesNames_genome).count().val | |
Channel.from(c).spread(alignments).filter{ speciesCount, alignment_filename -> | |
int count = 0 | |
def matcher = ( file(alignment_filename).text =~ /(?m)^>/ ) | |
while (matcher.find()) count++; | |
return count == speciesCount; | |
} | |
.map{ speciesCount, alignment_filename -> alignment_filename } | |
.tap{ final_loci } | |
.toList() | |
.set{ fileNameList } | |
final_loci.count().subscribe{ println("Found $it loci present in all ${c} strains") } | |
process combine_alignments { | |
cache false | |
input: | |
file('alignment*.fasta') from fileNameList | |
output: | |
file('combined.fasta') into combined_untrimmed | |
"combine.rb *.fasta > combined.fasta" | |
} | |
process trim_bridges { | |
input: | |
file('combined.fasta') from combined_untrimmed | |
output: | |
file('trimmed.fasta') into combined | |
"bio-alignment --type nucleotide --edit bridges --out fasta combined.fasta > trimmed.fasta" | |
} | |
combined | |
.subscribe onNext: { println("Done - copied to 'alignments.fasta'"); it.copyTo("$baseDir/alignments.fasta") }, onComplete: { println("Workflow Done") } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment