Skip to content

Instantly share code, notes, and snippets.

@robsyme
Last active September 23, 2015 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robsyme/3c083f84554a500cb5cb to your computer and use it in GitHub Desktop.
Save robsyme/3c083f84554a500cb5cb to your computer and use it in GitHub Desktop.
Example of hanging workflow
#!/usr/bin/env nextflow
params.species = 'species.txt'
species_names = Channel.fromPath(params.species)
genomes = Channel.fromPath(params.genomes).map{ file -> [file.name.lastIndexOf('.').with {it != -1 ? file.name[0..<it] : file.name}, file] }.distinct { id, file -> id }
process get_dothideomycete_descriptions {
output:
file('assemblies.json') into ascomycete_assemblies_json
"bionode-ncbi search assembly dothideomycetes > assemblies.json"
}
process filter_ascomycete_descriptions {
cache 'deep'
input:
file 'in.json' from ascomycete_assemblies_json
output:
stdout representative_assemblies_text
"""
jq --slurp --raw-output '.[] | select(.propertylist | .[] | in({\"representative\":true})) | [.assemblyaccession, .speciesname] | @tsv' < in.json
"""
}
process download_representative_assemblies {
cache 'deep'
tag { line.trim().split("\t")[1] }
input:
val line from representative_assemblies_text.splitText()
output:
set stdout, file('cds.fasta') into assemblies
script:
def (assemblyaccession, speciesname) = line.trim().split("\t")
speciesname = speciesname.replaceAll(" ","_")
"""
bionode-ncbi download assembly $assemblyaccession > /dev/null
bionode-ncbi download gff $assemblyaccession > /dev/null
gt gff3 -sort -tidy `find . -name '*.gff.gz'` > genome.gff3
gt extractfeat -matchdesc -type CDS -join -width 80 -seqfiles `find . -name '*.fna.gz'` -- genome.gff3 > cds.fasta
printf '$speciesname'
"""
}
process busco {
container 'robsyme/busco_fungi:1.0'
tag { speciesName }
maxForks 3
cpus 4
input:
set speciesName, 'cds.fasta' from assemblies.filter{ species, cds -> cds.toRealPath().toFile().length() > 0 }
output:
set speciesName, 'cds.fasta', "run_${speciesName}/full_table_${speciesName}" into busco_table
val speciesName into speciesNames_cds
"""
mkdir -p out
if [ -s \$(readlink -f cds.fasta) ]
then
ln -s /opt/busco/lineages/fungi .
busco \
--cpu ${task.cpus} \
-in cds.fasta \
--lineage fungi \
-f \
-m trans \
-o '$speciesName'
else
mkdir -p run_$speciesName/
touch run_$speciesName/full_table_$speciesName
fi
"""
}
process inhouse_busco {
container 'robsyme/busco_fungi:latest'
tag { speciesName }
cache 'deep'
maxForks 3
cpus 4
input:
set speciesName, 'genome.fasta' from genomes
output:
set speciesName, "inhouse.tar" into inhouse_busco_cds_raw
val speciesName into speciesNames_genome
"""
ln -s /opt/busco/lineages/fungi .
busco \
--cpu ${task.cpus} \
-in genome.fasta \
--lineage fungi \
-f \
-m genome \
-o '$speciesName'
mv run_$speciesName/gffs .
tar -chvf inhouse.tar gffs genome.fasta
"""
}
process fix_inhouse_busco {
input:
set speciesName, file('busco_outputs.tar') from inhouse_busco_cds_raw
output:
file('out/*') into inhouse_busco_cds
"""
mkdir -p out
tar -xvf busco_outputs.tar
for file in gffs/*.out; do
echo \$file
base=`basename \$file .out`
convert.rb \$file | gt gff3 -tidy -sort > \$base.gff3
seqnames=`awk '!/^#/ {print \$1}' \$base.gff3 | sort | uniq`
samtools faidx genome.fasta \$seqnames > tmp.fasta
gt extractfeat -type CDS -join -matchdesc -width 80 -seqfiles tmp.fasta -- \$base.gff3 > out/\$base.fasta
rm tmp.fasta*
done
sed -i'' 's/^>.*/>$speciesName/' out/*
"""
}
process classify_busco_cds {
tag { speciesName }
input:
set speciesName, 'cds.fasta', 'busco_table.txt' from busco_table
output:
file('out/*') into busco_cds
"""
mkdir out
if [ -s \$(readlink -f cds.fasta) ]
then
samtools faidx cds.fasta
awk '\$2 == \"Complete\" {system(\"samtools faidx cds.fasta \" \$3 \" >> out/\"\$1\".fasta\")}' busco_table.txt
sed -i'' 's/^>.*/>$speciesName/' out/*
fi
"""
}
busco_cds
.mix(inhouse_busco_cds)
.flatten()
.map{ it -> [it.getName(it.getNameCount() - 1), it]}
.groupTuple()
.set{ busco_homologs }
process align_busco_homologs {
input:
set busco_id, file('input*.fasta') from busco_homologs
output:
file("${busco_id}*") into alignments
"cat input*.fasta > unaligned.fasta && clustalw -INFILE=unaligned.fasta -TYPE=DNA -OUTPUT=fasta -OUTFILE=${busco_id}"
}
c = speciesNames_cds.mix(speciesNames_genome).count().val
Channel.from(c).spread(alignments).filter{ speciesCount, alignment_filename ->
int count = 0
def matcher = ( file(alignment_filename).text =~ /(?m)^>/ )
while (matcher.find()) count++;
return count == speciesCount;
}
.map{ speciesCount, alignment_filename -> alignment_filename }
.tap{ final_loci }
.toList()
.set{ fileNameList }
final_loci.count().subscribe{ println("Found $it loci present in all ${c} strains") }
process combine_alignments {
cache false
input:
file('alignment*.fasta') from fileNameList
output:
file('combined.fasta') into combined_untrimmed
"combine.rb *.fasta > combined.fasta"
}
process trim_bridges {
input:
file('combined.fasta') from combined_untrimmed
output:
file('trimmed.fasta') into combined
"bio-alignment --type nucleotide --edit bridges --out fasta combined.fasta > trimmed.fasta"
}
combined
.subscribe onNext: { println("Done - copied to 'alignments.fasta'"); it.copyTo("$baseDir/alignments.fasta") }, onComplete: { println("Workflow Done") }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment