kgori/Makefile

## README.md

      
    Raw
  

              README.md
            
          
    Simulates a reference and some reads. The reads have a 15bp insertion relative to the reference, and 1 SNV.
Uses standard bioinformatic tools to align the reads, call the variants, and produce a consensus sequence.
The consensus should differ from the reference at exactly the single SNV, and the 15bp insertion.
To run, just call make. For more control, targets can be made in order (each step will also make its prerequisites):

make simulation: just make the simulated reference and reads
make alignment: run bwa mem to align the reads to the reference
make pileup: run bcftools mpileup on the simulated reads
make calls: run bcftools call on the pileup result
make filtered: run bcftools filter on the calls to consolidate overlapping indels
make consensus (equivalent to make all): run bcftools consensus on the calls
make clean: clean up all generated files

Outputs (not including various indexing files):

ref.fa: simulated reference sequence
reads.fq: simulated reference reads
aln.bam: reads aligned to reference
pileup.vcf: pileup result
calls.vcf: calls obtained from pileup
calls.bcf: filtered calls obtained from pileup
consensus.fa: consensus sequence

Comparing consensus.fa to ref.fa shows the 15bp insertion.

  
## Makefile
# Replace these with the full path to the executables, if necessary
BWA=bwa
SAMTOOLS=samtools
BCFTOOLS=bcftools

# Makefile targets
all: ref.fa reads.fq aln.bam aln.bam.bai pileup.vcf filtered_calls.bcf consensus.fa
.PHONY: all

simulation: ref.fa reads.fq
.PHONY: simulation

alignment: aln.bam.bai
.PHONY: alignment

pileup: pileup.vcf
.PHONY: pileup

calls: calls.vcf
.PHONY: calls

filtered_calls: filtered_calls.bcf.csi
.PHONY: filtered_calls

consensus: all
.PHONY: consensus

.PHONY: clean
clean:
	rm -f ref.fa* reads.fq aln.* calls.* filtered_calls.* pileup.* consensus.fa

# Simulate reference sequence and reads
ref.fa reads.fq: simulate_ref_and_reads.py
	python simulate_ref_and_reads.py

# Index the reference
ref.fa.fai: ref.fa
	$(BWA) index ref.fa 2>/dev/null

# Align reads to reference
aln.sam: ref.fa ref.fa.fai reads.fq
	$(BWA) mem ref.fa reads.fq > aln.sam 2>/dev/null

# Sort the SAM file and write to BAM
aln.bam: aln.sam
	$(SAMTOOLS) sort aln.sam > aln.bam 2>/dev/null

# Index the BAM
aln.bam.bai: aln.bam
	$(SAMTOOLS) index aln.bam 2>/dev/null

# Pileup the reads using bcftools
pileup.vcf: aln.bam ref.fa
	$(BCFTOOLS) mpileup -f ref.fa aln.bam > pileup.vcf 2>/dev/null

# Call variants from the pileup
calls.vcf: pileup.vcf
	$(BCFTOOLS) call -Ov -mv pileup.vcf > calls.vcf 2>/dev/null

# Filter to merge overlapping indels and sort VCF and write as BCF
filtered_calls.bcf: calls.vcf
	$(BCFTOOLS) filter -G 3 calls.vcf | $(BCFTOOLS) sort -O b -o filtered_calls.bcf 2>/dev/null

# Index the calls
filtered_calls.bcf.csi: filtered_calls.bcf
	$(BCFTOOLS) index filtered_calls.bcf 2>/dev/null

# Call the consensus sequence
consensus.fa: filtered_calls.bcf filtered_calls.bcf.csi ref.fa
	$(BCFTOOLS) consensus -f ref.fa filtered_calls.bcf > consensus.fa 2>/dev/null


## simulate_ref_and_reads.py
import itertools
import random

"""
Simulates a reference sequence and short reads to align to it.

Short reads have a small number (0-10) of random uniform mutations.

The reference sequence has a SNV and a 15bp deletion introduced,
to make it look like the sequenced sample has a SNV and a 15bp
insertion
"""

RANDOM_SEED = 123 # change this to get different output
REFLEN = 1015 # length of the reference sequence (plus 15bp which will be deleted)
READLEN = 125 # length of a read
NREADS = 1200 # number of readss to simulate

def grouper(n, iterable):
    """
    >>> list(grouper(3, 'ABCDEFG'))
    [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
    """
    iterable = iter(iterable)
    return iter(lambda: list(itertools.islice(iterable, n)), [])

def break_string(string, size=60):
    """
    Insert a newline into `string` every `size` characters
    """
    return '\n'.join(''.join(grp) for grp in grouper(size, string))


random.seed(RANDOM_SEED)
refseq = ''.join(random.choice('ACGT') for _ in range(REFLEN))

# Generate read sequences
reads = []

for _ in range(NREADS):
    pos = random.randint(0, REFLEN - READLEN - 1)
    seq = list(refseq[pos:(pos + READLEN)]) # list is mutable
    nmuts = random.randint(0, 10)
    for _ in range(nmuts):
        mutpos = random.randint(0, READLEN - 1)
        seq[mutpos] = random.choice(list(set('ACGT') - set(seq[mutpos])))
    reads.append(''.join(seq))

# Write FASTQ
with open('reads.fq', 'w') as fq:
    for i, read in enumerate(reads):
        fq.write('@{name}\n{seq}\n+\n{qual}\n\n'
                 .format(name='read{}'.format(i),
                     seq=read,
                     qual='F'*READLEN))


# Write reference sequnce with 15 bp removed (so all reads have insertion)
# and change base at position 100
refseq = list(refseq)
refseq[99] = random.choice(list(set('ACGT') - set(refseq[99])))
refseq = ''.join(refseq)
refseq = refseq[:200] + refseq[215:]
with open('ref.fa', 'w') as ref:
    ref.write('>Chr1\n{}\n'.format(break_string(refseq, 60)))
	# Replace these with the full path to the executables, if necessary
	BWA=bwa
	SAMTOOLS=samtools
	BCFTOOLS=bcftools

	# Makefile targets
	all: ref.fa reads.fq aln.bam aln.bam.bai pileup.vcf filtered_calls.bcf consensus.fa
	.PHONY: all

	simulation: ref.fa reads.fq
	.PHONY: simulation

	alignment: aln.bam.bai
	.PHONY: alignment

	pileup: pileup.vcf
	.PHONY: pileup

	calls: calls.vcf
	.PHONY: calls

	filtered_calls: filtered_calls.bcf.csi
	.PHONY: filtered_calls

	consensus: all
	.PHONY: consensus

	.PHONY: clean
	clean:
	rm -f ref.fa* reads.fq aln.* calls.* filtered_calls.* pileup.* consensus.fa

	# Simulate reference sequence and reads
	ref.fa reads.fq: simulate_ref_and_reads.py
	python simulate_ref_and_reads.py

	# Index the reference
	ref.fa.fai: ref.fa
	$(BWA) index ref.fa 2>/dev/null

	# Align reads to reference
	aln.sam: ref.fa ref.fa.fai reads.fq
	$(BWA) mem ref.fa reads.fq > aln.sam 2>/dev/null

	# Sort the SAM file and write to BAM
	aln.bam: aln.sam
	$(SAMTOOLS) sort aln.sam > aln.bam 2>/dev/null

	# Index the BAM
	aln.bam.bai: aln.bam
	$(SAMTOOLS) index aln.bam 2>/dev/null

	# Pileup the reads using bcftools
	pileup.vcf: aln.bam ref.fa
	$(BCFTOOLS) mpileup -f ref.fa aln.bam > pileup.vcf 2>/dev/null

	# Call variants from the pileup
	calls.vcf: pileup.vcf
	$(BCFTOOLS) call -Ov -mv pileup.vcf > calls.vcf 2>/dev/null

	# Filter to merge overlapping indels and sort VCF and write as BCF
	filtered_calls.bcf: calls.vcf
	$(BCFTOOLS) filter -G 3 calls.vcf \| $(BCFTOOLS) sort -O b -o filtered_calls.bcf 2>/dev/null

	# Index the calls
	filtered_calls.bcf.csi: filtered_calls.bcf
	$(BCFTOOLS) index filtered_calls.bcf 2>/dev/null

	# Call the consensus sequence
	consensus.fa: filtered_calls.bcf filtered_calls.bcf.csi ref.fa
	$(BCFTOOLS) consensus -f ref.fa filtered_calls.bcf > consensus.fa 2>/dev/null
	import itertools
	import random

	"""
	Simulates a reference sequence and short reads to align to it.

	Short reads have a small number (0-10) of random uniform mutations.

	The reference sequence has a SNV and a 15bp deletion introduced,
	to make it look like the sequenced sample has a SNV and a 15bp
	insertion
	"""

	RANDOM_SEED = 123 # change this to get different output
	REFLEN = 1015 # length of the reference sequence (plus 15bp which will be deleted)
	READLEN = 125 # length of a read
	NREADS = 1200 # number of readss to simulate

	def grouper(n, iterable):
	"""
	>>> list(grouper(3, 'ABCDEFG'))
	[['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]
	"""
	iterable = iter(iterable)
	return iter(lambda: list(itertools.islice(iterable, n)), [])

	def break_string(string, size=60):
	"""
	Insert a newline into `string` every `size` characters
	"""
	return '\n'.join(''.join(grp) for grp in grouper(size, string))


	random.seed(RANDOM_SEED)
	refseq = ''.join(random.choice('ACGT') for _ in range(REFLEN))

	# Generate read sequences
	reads = []

	for _ in range(NREADS):
	pos = random.randint(0, REFLEN - READLEN - 1)
	seq = list(refseq[pos:(pos + READLEN)]) # list is mutable
	nmuts = random.randint(0, 10)
	for _ in range(nmuts):
	mutpos = random.randint(0, READLEN - 1)
	seq[mutpos] = random.choice(list(set('ACGT') - set(seq[mutpos])))
	reads.append(''.join(seq))

	# Write FASTQ
	with open('reads.fq', 'w') as fq:
	for i, read in enumerate(reads):
	fq.write('@{name}\n{seq}\n+\n{qual}\n\n'
	.format(name='read{}'.format(i),
	seq=read,
	qual='F'*READLEN))


	# Write reference sequnce with 15 bp removed (so all reads have insertion)
	# and change base at position 100
	refseq = list(refseq)
	refseq[99] = random.choice(list(set('ACGT') - set(refseq[99])))
	refseq = ''.join(refseq)
	refseq = refseq[:200] + refseq[215:]
	with open('ref.fa', 'w') as ref:
	ref.write('>Chr1\n{}\n'.format(break_string(refseq, 60)))