@bsmith89
Created February 5, 2016 19:21
Lightly redacted Makefile for a project using mothur to analyze 16S data.
# ====================
# Project Makefile
# ====================
# User Configuration {{{1
# ====================
# Major targets {{{2
.PHONY: figs res
figs:
# [Partially redacted]
res: res/rrs.procd.otus.shared
res: res/rrs.procd.otus.tax
res: seq/rrs.procd.otus.reps.afn
# [Partially redacted]
# What files are generated on `make all`?
all: docs figs res
# Compute environment {{{2
# Name, and directory, of the python virtual environment:
VENV = .venv
# Directory for recipe-internal files and terminal documents:
BUILD_DIR := build
# Documentation settings {{{2
.PHONY: docs
docs: ${BUILD_DIR}/2016-02-02_pg-meeting.pdf
BIB_FILE=doc/main.bib
MAKEFILE_GRAPH_FLAGS=-d 'raw/seq/.*' -d 'seq/split/.*' -d 'meta/split/.*' -d 'res/split.*' -d 'raw/hplc/*'
# Preqs for major documentation targets {{{3
${BUILD_DIR}/README.%:
# Cleanup settings {{{2
# Use the following line to add files and directories to be deleted on `make clean`:
CLEANUP += *.docx *.html *.log *.logfile *.pbs.o* *.pbs.e* ${BUILD_DIR}/*
TESTCLEANUP = res/test* res/split/test* ${BUILD_DIR}/test* meta/test* seq/test* seq/split/test*
.PHONY: testclean
testclean:
rm -rf ${TESTCLEANUP}
# Initialization settings {{{2
# What directories to generate on `make data-dirs`.
# By default, already includes build/ fig/
DATA_DIRS += raw/ raw/ref raw/seq \
raw/hplc raw/hplc/proto \
meta/ ref/ \
res/ res/split \
seq/ seq/split
# MOTHUR settings {{{2
# Link prerequisites into ${BUILD_DIR}/
define MOTHUR_SETUP
@ln -fsrt ${BUILD_DIR}/ $^
# START: > $@
endef
define MOTHUR_TEARDOWN
# FINISH: > $@
endef
# Ordered list of prerequisite link locations (in ${BUILD_DIR}/)
LN_PREQS = $(addprefix ${BUILD_DIR}/,$(notdir $^))
MOTHUR ?= mothur -q
# To swap in the MPI build of mothur, use:
# MOTHUR ?= mpirun -np ${MAX_PROCS} mothur-mpi
# This requires that you have built `mothur-mpi` as an MPI-capable version.
# MOTHUR_STDOUT := >/dev/null
MOTHUR_STDOUT :=
MAX_PROCS ?= $(shell nproc)
# Random seed for reproducibility.
SEED ?= 1
# Samples with fewer reads than this are dropped from the community matrix.
MIN_READS_PER_SAMPLE = 6000
NUM_PRECLUST_DIFFS = 2
OTU_CUTOFF ?= 0.03
DIST_CUTOFF ?= 0.15
CLUST_METHOD ?= average
SPLIT_TAXLEVEL ?= 4
ifeq (${CLUST_METHOD}, furthest)
CLUST_METHOD_ABBRV := fn
else
ifeq (${CLUST_METHOD}, average)
CLUST_METHOD_ABBRV := an
else
ifeq (${CLUST_METHOD}, nearest)
# mothur abbreviates nearest neighbor output as nn
CLUST_METHOD_ABBRV := nn
else
CLUST_METHOD_ABBRV := an
endif
endif
endif
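# As an illustrative sketch (not a real rule), the recipes below compose the
# mothur helpers like this; the target name, mothur command, and output file
# shown here are placeholders only:
#
#   res/%.example: seq/%.fn
#       ${MOTHUR_SETUP}
#       ${MOTHUR} "#some.command(fasta=${LN_PREQS}, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
#       @mv ${BUILD_DIR}/$*.some-output res/$*.example
#       ${MOTHUR_TEARDOWN}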
# FINALLY: Include base makefile {{{2
include base.mk
# ==============
# Data {{{1
# ==============
# User defined recipes for cleaning up and initially parsing data.
# e.g. Slicing out columns, combining data sources, alignment, generating
# phylogenies, etc.
# 16S Library {{{2
# Reference sets {{{3
extract_from_tarball = tar -Oxzf ${1} ${2} > $@
raw/ref/Silva.nr_v119.tgz:
wget --directory-prefix=${@D} http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz
ref/silva.nr.afn: raw/ref/Silva.nr_v119.tgz
$(call extract_from_tarball,$^,silva.nr_v119.align)
ref/silva.nr.tax: raw/ref/Silva.nr_v119.tgz
$(call extract_from_tarball,$^,silva.nr_v119.tax)
# TODO: Is this the region I really want?
ref/silva.nr.pcr-v4.afn: ref/silva.nr.afn
${MOTHUR_SETUP}
${MOTHUR} "#pcr.seqs(fasta=${LN_PREQS}, start=11894, end=25319, \
keepdots=F, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/silva.nr.pcr.afn $@
${MOTHUR_TEARDOWN}
raw/ref/Silva.gold.bacteria.zip:
wget --directory-prefix=${@D} http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip
ref/silva.gold.afn: raw/ref/Silva.gold.bacteria.zip
unzip $^
mv silva.gold.align $@
@touch $@
raw/ref/Silva.seed_v123.tgz:
wget --directory-prefix=${@D} http://www.mothur.org/w/images/1/15/Silva.seed_v123.tgz
ref/silva.seed.afn: raw/ref/Silva.seed_v123.tgz
$(call extract_from_tarball,$^,silva.seed_v123.align)
ref/silva.seed.tax: raw/ref/Silva.seed_v123.tgz
$(call extract_from_tarball,$^,silva.seed_v123.tax)
raw/ref/Trainset14_032015.pds.tgz:
wget --directory-prefix=${@D} http://www.mothur.org/w/images/8/88/Trainset14_032015.pds.tgz
ref/rdp.nr.afn: raw/ref/Trainset14_032015.pds.tgz
$(call extract_from_tarball,$^,trainset14_032015.pds/trainset14_032015.pds.fasta)
ref/rdp.nr.tax: raw/ref/Trainset14_032015.pds.tgz
$(call extract_from_tarball,$^,trainset14_032015.pds/trainset14_032015.pds.tax)
# Metadata {{{3
# http://www.mothur.org/wiki/MiSeq_SOP#Getting_started
meta/rrs.files meta/%.rrs.files: scripts/generate_files_file.py \
etc/rrs/library.tsv etc/rrs/analysis_group.tsv
$^ $* > $@
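# For reference, mothur's .files format (see the MiSeq SOP linked above) is one
# tab-separated line per library: group name, forward fastq, reverse fastq.
# The library and file names below are invented for illustration:
#
#   lib1    lib1_R1.fastq.gz    lib1_R2.fastq.gz
#   lib2    lib2_R1.fastq.gz    lib2_R2.fastq.gz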
# Dynamic `include`ing and generation of *.contigs.mk {{{3
# For every unique library analysis group, generate a makefile in build/ which
# declares the prerequisites for that group's contig targets.
ALL_LIBRARY_ANALYSIS_GROUPS := \
$(shell sed '1,1d' etc/rrs/analysis_group.tsv | cut -f2 | sort | uniq)
ALL_CONTIGS_MAKEFILES := \
$(patsubst %,${BUILD_DIR}/%.rrs.contigs.mk,${ALL_LIBRARY_ANALYSIS_GROUPS}) \
${BUILD_DIR}/rrs.contigs.mk
ALL_LIBRARY_SERIES += all
ifeq (${INITIALIZED},True)
$(foreach makefile,${ALL_CONTIGS_MAKEFILES},$(eval include ${makefile}))
endif
${BUILD_DIR}/%.contigs.mk: scripts/generate_contigs_makefile.py meta/%.files
$^ $* > $@
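# The generated build/%.rrs.contigs.mk is expected to hold prerequisite-only
# declarations (no recipes): one line attaching each library's gzipped read
# pair to its split contig targets, plus lines listing those split files as
# prerequisites of the pooled targets. Roughly (the library name `lib1` and
# the raw read file names are hypothetical):
#
#   seq/split/lib1.rrs.contigs.fn: raw/seq/lib1.r1.fastq.gz raw/seq/lib1.r2.fastq.gz
#   seq/rrs.contigs.fn: seq/split/lib1.rrs.contigs.fn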
raw/seq/%.fastq.gz: raw/seq/fake_root.tgz
echo "This does not make $@. If you actually run this you are doing something wrong."
# Make contigs from paired-end reads
# Preqs defined in each build/%.contigs.mk
seq/split/%.rrs.contigs.fn seq/split/%.rrs.contigs.qual res/split/%.rrs.contigs.report:
gzip -dc $(word 1,$^) > ${BUILD_DIR}/$*.r1.fastq
gzip -dc $(word 2,$^) > ${BUILD_DIR}/$*.r2.fastq
${MOTHUR} "#make.contigs(ffastq=${BUILD_DIR}/$*.r1.fastq, \
rfastq=${BUILD_DIR}/$*.r2.fastq, \
trimoverlap=T)" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.r1.trim.contigs.fasta seq/split/$*.rrs.contigs.fn
@mv ${BUILD_DIR}/$*.r1.trim.contigs.qual seq/split/$*.rrs.contigs.qual
@mv ${BUILD_DIR}/$*.r1.contigs.report res/split/$*.rrs.contigs.report
${MOTHUR_TEARDOWN}
# Concatenate individual contig files
# Preqs defined in each build/%.contigs.mk
seq/%.contigs.fn:
cat $^ > $@
# Define groups based on names of individual contig files
# Preqs defined in each build/%.contigs.mk
meta/%.contigs.groups:
$^ > $@
# Align, classify, screen and cluster seqs {{{3
# Screen 1: remove sequences which are the wrong length or have too many
# ambiguous nucleotides
# TODO: Screen params?
# TODO: file a bug report about the problem with dashes in file names for some commands.
seq/%.screen1.fn meta/%.screen1.groups: seq/%.fn meta/%.groups
${MOTHUR_SETUP}
${MOTHUR} "#screen.seqs(fasta=$(word 1,${LN_PREQS}), group=$(word 2,${LN_PREQS}), \
maxambig=0, maxlength=295, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.good.fn seq/$*.screen1.fn
@mv ${BUILD_DIR}/$*.good.groups meta/$*.screen1.groups
${MOTHUR_TEARDOWN}
# Uniq-ify seqs {{{3
seq/%.uniq.fn meta/%.uniq.names: seq/%.fn
${MOTHUR_SETUP}
${MOTHUR} "#unique.seqs(fasta=${LN_PREQS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.unique.fn seq/$*.uniq.fn
@mv ${BUILD_DIR}/$*.names meta/$*.uniq.names
${MOTHUR_TEARDOWN}
seq/%.uniq.afn meta/%.uniq.names: seq/%.afn
${MOTHUR_SETUP}
${MOTHUR} "#unique.seqs(fasta=${LN_PREQS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.unique.afn seq/$*.uniq.afn
@mv ${BUILD_DIR}/$*.names meta/$*.uniq.names
${MOTHUR_TEARDOWN}
meta/%.contigs.screen1.uniq.count_table: meta/%.contigs.screen1.uniq.names meta/%.contigs.screen1.groups
${MOTHUR_SETUP}
${MOTHUR} "#count.seqs(name=$(word 1,${LN_PREQS}), \
group=$(word 2,${LN_PREQS}), processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.contigs.screen1.uniq.count_table meta/$*.contigs.screen1.uniq.count_table
${MOTHUR_TEARDOWN}
# Alignment {{{3
seq/%.align.afn res/%.align.report: seq/%.fn ref/silva.nr.pcr-v4.afn
${MOTHUR_SETUP}
${MOTHUR} "#align.seqs(fasta=$(word 1,${LN_PREQS}), \
reference=$(word 2,${LN_PREQS}), processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.align seq/$*.align.afn
@mv ${BUILD_DIR}/$*.align.report res/$*.align.report
${MOTHUR_TEARDOWN}
# Sequence cleanup and screening {{{3
seq/%.align.screen2.afn meta/%.align.screen2.count_table: seq/%.align.afn meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#screen.seqs(fasta=$(word 1,${LN_PREQS}), start=3100, \
count=$(word 2,${LN_PREQS}), end=10600, maxhomop=8, \
processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.align.good.afn seq/$*.align.screen2.afn
@mv ${BUILD_DIR}/$*.good.count_table meta/$*.align.screen2.count_table
${MOTHUR_TEARDOWN}
seq/%.trim.afn: seq/%.afn
${MOTHUR_SETUP}
${MOTHUR} "#filter.seqs(fasta=$(word 1,${LN_PREQS}), vertical=T, trump=., \
processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.filter.fasta seq/$*.trim.afn
${MOTHUR_TEARDOWN}
seq/%.trim.uniq.afn meta/%.trim.uniq.count_table: seq/%.trim.afn meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#unique.seqs(fasta=$(word 1,${LN_PREQS}), \
count=$(word 2,${LN_PREQS}))" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.trim.unique.afn seq/$*.trim.uniq.afn
@mv ${BUILD_DIR}/$*.trim.count_table meta/$*.trim.uniq.count_table
${MOTHUR_TEARDOWN}
# TODO: Why doesn't the .names and .groups file combination work?
# mothur complains that the .names file contains names that are not in the
# fasta, but those names are members of unique seqs, so isn't that expected?
seq/%.preclust.afn meta/%.preclust.count_table: \
seq/%.afn \
meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#pre.cluster(fasta=$(word 1,${LN_PREQS}), \
count=$(word 2,${LN_PREQS}), diffs=${NUM_PRECLUST_DIFFS}, \
processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.precluster.afn seq/$*.preclust.afn
@mv ${BUILD_DIR}/$*.precluster.count_table meta/$*.preclust.count_table
${MOTHUR_TEARDOWN}
# Chimera screening {{{3
seq/%.screen3.afn res/%.screen3.chimeras meta/%.screen3.count_table: seq/%.afn meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#chimera.uchime(fasta=$(word 1,${LN_PREQS}), \
count=$(word 2,${LN_PREQS}), \
dereplicate=T, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.denovo.uchime.chimeras res/$*.screen3.chimeras
${MOTHUR} "#remove.seqs(fasta=${BUILD_DIR}/$*.afn, \
accnos=${BUILD_DIR}/$*.denovo.uchime.accnos)" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.pick.afn seq/$*.screen3.afn
@mv ${BUILD_DIR}/$*.denovo.uchime.pick.count_table meta/$*.screen3.count_table
${MOTHUR_TEARDOWN}
# Taxonomic classification {{{3
res/%.tax res/%.tax-summary: seq/%.afn ref/rdp.nr.afn ref/rdp.nr.tax meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#classify.seqs(fasta=$(word 1,${LN_PREQS}), \
count=$(word 4, ${LN_PREQS}), \
reference=$(word 2,${LN_PREQS}), \
taxonomy=$(word 3,${LN_PREQS}), \
cutoff=80, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.nr.wang.taxonomy res/$*.tax
@mv ${BUILD_DIR}/$*.nr.wang.tax.summary res/$*.tax-summary
${MOTHUR_TEARDOWN}
# Taxonomic screening {{{3
# Screen out Chloroplasts, Mitochondria, Archaea, Eukaryota
seq/%.screen4.afn res/%.screen4.tax meta/%.screen4.count_table: seq/%.afn res/%.tax meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#remove.lineage(fasta=$(word 1,${LN_PREQS}), \
count=$(word 3,${LN_PREQS}), \
taxonomy=$(word 2,${LN_PREQS}), \
taxon=Chloroplast-Mitochondria-Archaea-Eukaryota-Unknown)" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.pick.afn seq/$*.screen4.afn
@mv ${BUILD_DIR}/$*.pick.tax res/$*.screen4.tax
@mv ${BUILD_DIR}/$*.pick.count_table meta/$*.screen4.count_table
${MOTHUR_TEARDOWN}
# OTU tallying {{{3
# TODO: .names dependency instead of .count_table?
# This'll increase the size of the .otus file, but it will also mean it includes
# all of the names and not just the names associated with the counts.
# Maybe useful for representative sequence pulling?
res/%.otus: seq/%.afn meta/%.count_table res/%.tax
${MOTHUR_SETUP}
${MOTHUR} "#cluster.split(fasta=$(word 1,${LN_PREQS}), \
count=$(word 2,${LN_PREQS}), taxonomy=$(word 3,${LN_PREQS}), \
splitmethod=classify, taxlevel=${SPLIT_TAXLEVEL}, cutoff=${DIST_CUTOFF}, \
processors=${MAX_PROCS}, method=${CLUST_METHOD})" ${MOTHUR_STDOUT}
@mv ${BUILD_DIR}/$*.${CLUST_METHOD_ABBRV}.unique_list.list res/$*.otus
${MOTHUR_TEARDOWN}
res/%.otus.shared: res/%.otus meta/%.count_table
${MOTHUR_SETUP}
${MOTHUR} "#make.shared(list=$(word 1,${LN_PREQS}), count=$(word 2,${LN_PREQS}), label=${OTU_CUTOFF})"
@mv ${BUILD_DIR}/$*.shared $@
${MOTHUR_TEARDOWN}
res/%.otus.tax: res/%.otus meta/%.count_table res/%.tax
${MOTHUR_SETUP}
${MOTHUR} "#classify.otu(list=$(word 1,${LN_PREQS}), count=$(word 2,${LN_PREQS}), \
taxonomy=$(word 3,${LN_PREQS}), label=${OTU_CUTOFF})"
@mv ${BUILD_DIR}/$*.${OTU_CUTOFF}.cons.taxonomy res/$*.otus.tax
${MOTHUR_TEARDOWN}
seq/%.screen3.screen4.otus.reps.afn: scripts/get_otu_reps.py res/%.screen3.screen4.otus \
meta/%.screen3.screen4.count_table seq/%.afn
$(word 1,$^) $(word 2,$^) ${OTU_CUTOFF} $(word 3,$^) $(word 4,$^) > $@
# Rename fully processed files. {{{3
res/%.procd.otus.shared: res/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.shared
cp $< $@
res/%.procd.otus.tax: res/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.tax
cp $< $@
seq/%.procd.otus.reps.afn: seq/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.reps.afn
cp $< $@
res/%.community.tsv: scripts/unstack_shared.py res/%.shared
${P1} < ${P2} > $@
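# unstack_shared.py presumably reshapes mothur's wide .shared table (one row per
# group, one column per OTU) into a long-format TSV; the column names below are
# a guess, not necessarily the script's actual output:
#
#   group     otu     tally
#   sample1   Otu001  42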
# [Partially redacted]
# =======================
# Analysis {{{1
# =======================
# User defined recipes for analyzing the data.
# e.g. Calculating means, distributions, correlations, fitting models, etc.
# Basically anything that *could* go into the paper as a table.
# [Partially redacted]
# QC {{{3
# TODO
# LEFSE {{{2
# TODO: Fix this up for the new metadata format
meta/rrs.design: scripts/generate_lefse_design.py etc/rrs/library.tsv etc/extraction.tsv etc/sample.tsv etc/mouse.tsv
$^ > $@
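# mothur's design file is a tab-delimited table whose first column holds the
# group names from the .shared file and whose remaining columns hold class
# labels. Given the lefse call below (class=treatment, subclass=site), the
# output of this script presumably looks something like this (values invented
# for illustration):
#
#   group     treatment  site
#   sample1   control    cecum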
res/%.procd.otus.lefse: res/%.procd.otus.shared meta/%.design
${MOTHUR_SETUP}
${MOTHUR} "#lefse(shared=$(word 1,${LN_PREQS}), \
design=$(word 2,${LN_PREQS}), \
class=treatment, subclass=site)"
@mv ${BUILD_DIR}/$*.procd.otus.${OTU_CUTOFF}.lefse_summary $@
${MOTHUR_TEARDOWN}
# Jupyter Notebooks {{{2
# Add pre-requisites for particular jupyter notebooks so that
# `make build/<notebook>.ipynb` will first make those pre-reqs.
# e.g.
# build/notebook.ipynb: res/results.tsv
# ==================
# Graphing {{{1
# ==================
# User defined recipes for plotting figures. These should use
# the targets of analysis recipes above as their prerequisites.
# [Partially redacted]