# claczny/compute_coverage.mk

## compute_coverage.mk
# Recipes use the bash keyword `time`, so bash (not plain sh) is required.
SHELL := /bin/bash

# Delete a target whose recipe failed, so a truncated file left behind by an
# output redirect is never mistaken for an up-to-date result.
.DELETE_ON_ERROR:

# Sample name; override on the command line, e.g. `make SAMPLE=foo`.
# Input is read from $(DDIR)/$(SAMPLE).bam and results are written to
# $(RDIR)/$(SAMPLE)/$(SAMPLE).<ext>.
SAMPLE ?= <YOUR_SAMPLE>
DOUBLED_SAMPLE = $(SAMPLE)/$(SAMPLE)

# Output (results) and input (data) directories; both can be overridden.
RDIR ?= results
DDIR ?= data

#####
# BEAUTY TARGETS
#####

# Convenience aliases. None of these names is a real file, so every one of
# them (including the statistics helpers and clean) is declared phony.
.PHONY: all extract_fastq bam2sam sort_bam index_bam genomecov \
        get_unique_seq_count get_mapq_distribution get_cigar_distribution \
        clean

# Default goal: FASTQ extraction plus the per-sequence average coverage table.
all: extract_fastq genomecov

# Each alias maps to the concrete output file for the selected SAMPLE.
extract_fastq: $(RDIR)/$(DOUBLED_SAMPLE).fq
bam2sam: $(RDIR)/$(DOUBLED_SAMPLE).sam
sort_bam: $(RDIR)/$(DOUBLED_SAMPLE).srtd.bam
index_bam: $(RDIR)/$(DOUBLED_SAMPLE).srtd.bai
genomecov: $(RDIR)/$(DOUBLED_SAMPLE).srtd.cov_avg.txt

# SOME BASIC STATISTICS
# Print the number of distinct values found in the first whitespace-separated
# column of the SAM file (the read name column in SAM output).
get_unique_seq_count: $(RDIR)/$(DOUBLED_SAMPLE).sam
	awk '{print $$1}' $< | sort -u | wc -l

# Tabulate how often each value of tab-separated SAM column 5 (MAPQ) occurs.
# The prerequisite must be a single path: a space after $(RDIR)/ would split
# it into the results directory plus a path outside of it.
get_mapq_distribution: $(RDIR)/$(DOUBLED_SAMPLE).sam
	awk -F"\t" '{print $$5}' $< | sort | uniq -c

# Tabulate how often each value of tab-separated SAM column 6 (CIGAR) occurs.
get_cigar_distribution: $(RDIR)/$(DOUBLED_SAMPLE).sam
	awk -F"\t" '{print $$6}' $< | sort | uniq -c

#####
# ACTUAL TARGETS
#####
# With no prerequisites, .SECONDARY treats every target as secondary, so
# intermediate files built along a rule chain are not deleted automatically.
.SECONDARY:

# Enable second expansion of prerequisite lists for the rules that follow;
# the $$(notdir $$*) prerequisites below rely on it to see the pattern stem.
.SECONDEXPANSION:
# BAM -> FASTQ.
# For a target $(RDIR)/<dir>/<name>.fq the stem is <dir>/<name>; the second
# expansion strips the directory part, so the input is $(DDIR)/<name>.bam.
$(RDIR)/%.fq: $(DDIR)/$$(notdir $$*).bam
	mkdir -p $(@D)
	@date
	time bedtools bamtofastq -i $< -fq $@
	@date

# BAM -> SAM text.
# Input is $(DDIR)/<name>.bam, where <name> is the file part of the stem.
$(RDIR)/%.sam: $(DDIR)/$$(notdir $$*).bam
	# The output directory may not exist yet when this is the first step run;
	# without it the shell redirect below would fail.
	mkdir -p $(@D)
	@date
	time samtools view $< > $@
	@date

# Sort the input BAM.
# Input is $(DDIR)/<name>.bam, where <name> is the file part of the stem.
# NOTE(review): `samtools sort <in.bam> <out.prefix>` is the legacy calling
# convention (samtools appends ".bam" to the prefix); newer samtools releases
# appear to expect `-o <out.bam>` instead -- confirm against the installed version.
$(RDIR)/%.srtd.bam: $(DDIR)/$$(notdir $$*).bam
	# Make sure the output directory exists before samtools writes into it.
	mkdir -p $(@D)
	@date
	time samtools sort $< $(basename $@)
	@date

# Index a BAM file; the index is written next to it with the .bai extension
# replacing .bam.
%.bai: %.bam
	@date
	time samtools index $< $@
	@date

# Coverage histogram of a sorted BAM via `bedtools genomecov`.
# The .bai index is listed as a prerequisite so that it gets built, but only
# the BAM itself (the first prerequisite) is passed to bedtools.
%.srtd.cov_hist.txt: %.srtd.bam %.srtd.bai
	@date
	bedtools genomecov -ibam $< > $@
	@date

# Collapse the tab-separated coverage histogram into one line per sequence:
#     <column 1 value> <sum over rows of column 2 * column 5>
# Rows sharing the same column-1 value are assumed to be contiguous, as the
# sum is flushed whenever column 1 changes.
# NOTE(review): per the bedtools genomecov docs, column 2 is the depth and
# column 5 the fraction of bases at that depth, making the sum the mean
# coverage -- confirm against the bedtools version in use.
# Nothing is printed before the first group, so no post-filter pipe is needed
# and a failing awk is reported to make directly.
%.cov_avg.txt: %.cov_hist.txt
	@date
	awk -F"\t" '\
	  NR == 1 || $$1 != chrom { \
	    if (NR > 1) print chrom, cov; \
	    chrom = $$1; \
	    cov = 0 \
	  } \
	  { cov += $$2 * $$5 } \
	  END { if (NR > 0) print chrom, cov }' $< > $@
	@date

#####
# CLEAN-UP
#####
# Placeholder: no clean-up is implemented yet; the target only prints a
# reminder. Declared phony so a file named "clean" cannot shadow it.
.PHONY: clean
clean:
	echo "TODO: clean"