Skip to content

Instantly share code, notes, and snippets.

@mwalzer
Created November 27, 2019 10:01
Show Gist options
  • Save mwalzer/6bd5c25d607762d86485f7e1a1a9d5d7 to your computer and use it in GitHub Desktop.
Save mwalzer/6bd5c25d607762d86485f7e1a1a9d5d7 to your computer and use it in GitHub Desktop.
proteomics LFQ with nextflow
#!/usr/bin/env nextflow
/*
========================================================================================
Epiphany OpenMS workflow for protein inference
========================================================================================
#### Authors
Mathias Walzer <walzer@ebi.ac.uk>
----------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------
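Pipeline overview:
- 1: Download the RAW files of a PRIDE experiment from an FTP URL
- 2: Sanitise the RAW file names
- 3: Convert the RAW data into mzML files (ThermoRawFileParser)
- 4: Peak picking (PeakPickerHiRes)
- 5: Database search (CometAdapter)
- 6: Merge the identifications of all runs (IDMerger)
- 7: PeptideIndexer
- 8: PSMFeatureExtractor
- 9: PercolatorAdapter
- 10: Protein inference (Epifany)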
----------------------------------------------------------------------------------------
*/
// Protein sequence database handed to CometAdapter (-database) and PeptideIndexer (-fasta).
// Can be overridden on the command line with --fasta.
params.fasta = "/hps/nobackup2/proteomics/OPRA/Human-ReferenceProteome-Canonical-Isoform-73911-plusdecoy-trypsin.fasta"
// The parameters below are read by the processes but have no default in this script;
// they must be supplied on the command line or through a config file.
// Example values for the PXD003133 project:
//params.pxd = "PXD003133"
//params.ftp = "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2015/11/PXD003133"
//params.result_folder = "/hps/nobackup2/proteomics/OPRA/PXD003133"
/**
 * Write the command-line usage of the pipeline to the Nextflow log.
 *
 * Lists every parameter the script reads: --ftp, --pxd and --result_folder
 * have no default and are therefore mandatory; --fasta has a default.
 */
def helpMessage() {
    log.info"""
    =========================================
    Usage:
    The typical command for running the pipeline is as follows:
    nextflow run main.nf -c config.nf -profile local --ftp <url> --pxd <accession> --result_folder <dir>
    Mandatory arguments:
    --ftp            The project ftp folder in PRIDE (e.g. ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2015/11/PXD003133)
    --pxd            The project accession, used to name the merged identification file (e.g. PXD003133)
    --result_folder  The folder the final result is published to
    Optional arguments:
    --fasta          The protein sequence database (default: ${params.fasta})
    --help           Show this message and exit
    """.stripIndent()
}
/*
 * SET UP CONFIGURATION VARIABLES
 */
// Show the help message and stop when --help is given
params.help = false
if (params.help) {
    helpMessage()
    exit 0
}

// Fail fast when a parameter without default is missing. Otherwise the value
// "null" would be interpolated into the wget command (ftp), the merged file
// name (pxd) and the publish directory (result_folder).
def missingParams = ['ftp', 'pxd', 'result_folder'].findAll { !params[it] }
if (missingParams) {
    helpMessage()
    exit 1, "Missing mandatory parameter(s): " + missingParams.collect { '--' + it }.join(', ')
}
/*
 * Fetch every *.raw file found below the project FTP folder (params.ftp)
 * into the task work directory and emit them on the rawFiles channel.
 */
process downloadFiles {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 4.GB * task.attempt }
    container '/hps/nobackup2/proteomics/ubuntu-wget.simg'

    output:
    file "*.raw" into rawFiles

    script:
    """
    wget -v -r -nd -A "*.raw" --no-host-directories --cut-dirs=1 ${params.ftp}
    """
}
/*
 * Sanitise the filename of each raw file.
 *
 * Blanks are replaced by underscores and every character that is not a word
 * character, a dash or a dot is removed. One task runs per raw file; the
 * sanitised file is emitted on the rawFilesClean channel.
 */
process sanitiseFilename {
    errorStrategy 'retry'

    input:
    file rawFile from rawFiles.flatten()

    output:
    file "*.raw" into rawFilesClean

    script:
    """
    #!/usr/bin/env python
    import os
    import re

    # Adapted from https://github.com/django/django/blob/master/django/utils/text.py
    def get_valid_filename(s):
        s = str(s).strip().replace(\" \", \"_\")
        return re.sub(r\"(?u)[^-\\w.]\", \"\", s)

    ofp = \"${rawFile}\"
    fn = os.path.basename(ofp)
    dn = os.path.dirname(ofp)
    new_name = get_valid_filename(fn)
    nfp = os.path.join(dn, new_name)
    if nfp == ofp:
        # The name is already valid: expose the file under a new name
        # (<baseName>_.raw) through a symlink. Presumably needed because the
        # output glob does not pick up the staged input file itself - TODO confirm.
        os.symlink(ofp, \"${rawFile.baseName}_.raw\")
    else:
        os.rename(ofp, nfp)
    """
}
/*
 * Convert each sanitised raw file into an mzML file with ThermoRawFileParser.
 * One task runs per raw file; results are emitted on the spectraFiles channel.
 */
process generateMZML {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 4.GB * task.attempt }
    container '/hps/nobackup2/proteomics/biocontainers-thermorawfileparser-1.1.8.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file cleanRaw from rawFilesClean.flatten()

    output:
    file "*.mzML" into spectraFiles

    script:
    // -f=2 selects the output format (presumably indexed mzML - confirm against
    // the ThermoRawFileParser documentation); -o=./ writes into the work directory.
    """
    ThermoRawFileParser.sh -i=${cleanRaw} -f=2 -o=./
    """
}
/*
 * Run PeakPickerHiRes on each mzML file. The result is written next to the
 * input as <baseName>_pp.mzML and emitted on the spectraPicked channel.
 */
process PeakPicker {
    // Alternative error strategies that were tried and disabled:
    // errorStrategy { assert task.errorMessage ==~ 'Centroided data provided but profile spectra expected' ? 'ignore' : 'retry' } //errorReport instead of errorMessage???
    // errorStrategy 'ignore'
    errorStrategy 'retry'
    memory { 4.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file spectra from spectraFiles.flatten()

    output:
    file '*.mzML' into spectraPicked

    script:
    """
    PeakPickerHiRes -in ${spectra} -out ${spectra.baseName}_pp.mzML
    """
}
/*
 * Search each peak-picked mzML file against the protein database (params.fasta)
 * with CometAdapter. Writes <baseName>_comet.idXML and emits it on cometIDs.
 */
process CometAdapter {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 4.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file pickedSpectra from spectraPicked.flatten()

    output:
    file '*.idXML' into cometIDs

    script:
    """
    CometAdapter -in ${pickedSpectra} -database ${params.fasta} -allowed_missed_cleavages 1 -fixed_modifications "Carbamidomethyl (C)" -variable_modifications "Oxidation (M)" -enzyme Trypsin -precursor_mass_tolerance 20.0 -precursor_error_units ppm -out ${pickedSpectra.baseName}_comet.idXML
    """
}
/*
 * Merge the idXML files of all runs into a single file named after the
 * project accession (<params.pxd>.idXML), emitted on the mergedIDs channel.
 */
process IDMerger {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 8.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    // collect() gathers every search result so that a single task sees them all
    file searchResults from cometIDs.collect()

    output:
    file '*.idXML' into mergedIDs

    script:
    """
    IDMerger -out ${params.pxd}.idXML -annotate_file_origin -merge_proteins_add_PSMs -in ${searchResults}
    """
}
/*
 * Run PeptideIndexer on the merged identifications against the protein
 * database (params.fasta). Writes <baseName>_pi.idXML and emits it on piIDs.
 */
process PeptideIndexer {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 8.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file mergedResult from mergedIDs

    output:
    file '*.idXML' into piIDs

    script:
    """
    PeptideIndexer -out ${mergedResult.baseName}_pi.idXML -in ${mergedResult} -enzyme:name Trypsin -enzyme:specificity full -fasta ${params.fasta}
    """
}
/*
 * Run PSMFeatureExtractor on the indexed identifications.
 * Writes <baseName>_fe.idXML and emits it on the psmFs channel.
 */
process PSMFeatureExtractor {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 8.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file indexedIDs from piIDs

    output:
    file '*.idXML' into psmFs

    script:
    """
    PSMFeatureExtractor -out ${indexedIDs.baseName}_fe.idXML -in ${indexedIDs}
    """
}
/*
 * Run PercolatorAdapter on the feature-annotated identifications and emit
 * the result on the percoIDs channel.
 *
 * NOTE(review): the output suffix "_fe" is the same one PSMFeatureExtractor
 * appends, so the file ends in "_fe_fe.idXML". This looks like a copy-paste
 * leftover; it is kept because the name propagates into the published result.
 */
process PercolatorAdapter {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 8.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file featureIDs from psmFs

    output:
    file '*.idXML' into percoIDs

    script:
    """
    PercolatorAdapter -out ${featureIDs.baseName}_fe.idXML -in ${featureIDs} -enzyme trypsin -score_type pep
    """
}
/*
 * Run Epifany (protein inference) on the Percolator output.
 * Writes <baseName>_epi.idXML, emits it on the inferProt channel and copies
 * it into params.result_folder.
 */
process Epifany {
    // Failed tasks are retried; the memory request grows with the attempt number.
    errorStrategy 'retry'
    memory { 8.GB * task.attempt }
    container '/hps/nobackup2/proteomics/openms_V2.4.0_proteomic_lfq_2.simg'
    // The only process of the pipeline whose output is published.
    publishDir "${params.result_folder}", mode: 'copy', overwrite: true

    input:
    file rescoredIDs from percoIDs

    output:
    file '*.idXML' into inferProt

    script:
    """
    Epifany -out ${rescoredIDs.baseName}_epi.idXML -in ${rescoredIDs} -protein_fdr true
    """
}
// Execution profiles, selected with `-profile <name>`.
profiles {
    // Run on the local machine inside Singularity containers.
    local {
        docker {
            enabled = false
        }
        singularity {
            enabled = true
            autoMounts = true
            runOptions = '-B /hps/nobackup/proteomics:/hps/nobackup/proteomics:rw -B /hps/nobackup2/proteomics:/hps/nobackup2/proteomics:rw'
        }
        process {
            // Applies to processes labelled big_mem; none of the processes
            // shown alongside this config carries that label.
            withLabel: big_mem {
                maxForks = 1
            }
        }
    }
    // NOTE(review): because this sits inside `profiles`, it declares a profile
    // named "trace" that sets a top-level `enabled` option, not the `trace`
    // config scope. Possibly the trace scope was intended - confirm.
    trace {
        enabled = true
    }
    // Submit tasks through LSF, inside Singularity containers.
    cluster {
        docker {
            enabled = false
        }
        singularity {
            enabled = true
            autoMounts = true
            runOptions = '-B /hps/nobackup/proteomics:/hps/nobackup/proteomics:rw -B /hps/nobackup2/proteomics:/hps/nobackup2/proteomics:rw'
        }
        process {
            executor = 'lsf'
            maxRetries = 5
        }
    }
}
// Settings applied regardless of the selected profile: Singularity is enabled
// and its cacheDir points at the "image" folder below the pipeline base directory.
singularity {
    enabled = true
    cacheDir = "$baseDir/image"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment