tgirke/PeakSeq.sh

## PeakSeq.sh
##################################################
## Create mappability file for reference genome ##
##################################################
## Thomas Girke
## Date: May 16, 2018
## PeakSeq docs:
##   https://github.com/gersteinlab/PeakSeq
##   http://gensoft.pasteur.fr/docs/PeakSeq/1.1/PeakSeq.readme

## Note: mappability step should be skipped in newer version of PeakSeq
## Download Mappability software
mkdir Mappability_Map; cd Mappability_Map
wget http://archive.gersteinlab.org/proj/PeakSeq/Mappability_Map/Code/Mappability_Map.tar.gz
tar -zxvf Mappability_Map.tar.gz
make
chmod 777 compile.py
chmod 777 *.c
export PATH=/<full_path_to_current_dir>/:$PATH

## Download reference genome (fasta file)
wget ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_chromosome_files/TAIR10_chr_all.fas -O TAIR10_chr_all.fa

## Create Mappability files (this will take some time)
python compile.py 36 # Change to proper K value for your data set

#################
## Run PeakSeq ##
#################

## (1) Read preprocessing

module load peakseq/1.31
mkdir input
samtools view -h input.bam | PeakSeq -preprocess SAM stdin input
mkdir sample
samtools view -h sample.bam | PeakSeq -preprocess SAM stdin sample

## (2) Peak calling with PeakSeq

PeakSeq -peak_select config.dat # results are written to files my_chip_exp.txt and my_chip_exp_narrowPeak.txt


## Note: the file contig.dat should be in your current working directory and contain the following content.
## The parameters in this file need to be changed as needed

## FILE START ##
# Experiment id is used as a prefix to the output file name.
Experiment_id my_chip_exp

# Chromosome ID list file, is used to generate data file paths.
chromosome_list_file chr_ids.txt

# Enrichment fragment length For tag extension, this is the value of average fragment length.
Enrichment_mapped_fragment_length 200

# Target FDR in the simulations.
target_FDR 0.05

# Number of simulations performed while estimating the putative peaks.
N_Simulations 10

# Minimum distance between consecutive peaks
Minimum_interpeak_distance 200

# Mappability file that includes the uniquely mappable number of nucleotides per window for each chromosome.
Mappability_map_file dummy

# The directory that contains the preprocessed ChIP-Seq reads, can specify multiple directories to pool reads from multiple source (e.g. replicates)
ChIP_Seq_reads_data_dirs sample

# The directory that contains the preprocessed Input (control) experiment reads. (Multiple directories allowed)
Input_reads_data_dirs input

# Seed for pseudo-random number generator. This is necessary for simulated background option (specified below).
#Simulation_seed 1234567

# Q-value threshold applied on the final set of peaks.
max_Qvalue 0.05

# There are currently two models for simulating the background for threshold selection
# Simulated background is the simulation based method that is explained in the PeakSeq paper.
# Poisson background uses a simple Poisson background with mean estimated from the read statistics. This option is still experimental but it is much faster than the simulated background option.
# Background_model Poisson
Background_model Simulated
## FILE END ##
	##################################################
	## Create mappability file for reference genome ##
	##################################################
	## Thomas Girke
	## Date: May 16, 2018
	## PeakSeq docs:
	## https://github.com/gersteinlab/PeakSeq
	## http://gensoft.pasteur.fr/docs/PeakSeq/1.1/PeakSeq.readme

	## Note: mappability step should be skipped in newer version of PeakSeq
	## Download Mappability software
	mkdir Mappability_Map; cd Mappability_Map
	wget http://archive.gersteinlab.org/proj/PeakSeq/Mappability_Map/Code/Mappability_Map.tar.gz
	tar -zxvf Mappability_Map.tar.gz
	make
	chmod 777 compile.py
	chmod 777 *.c
	export PATH=/<full_path_to_current_dir>/:$PATH

	## Download reference genome (fasta file)
	wget ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_chromosome_files/TAIR10_chr_all.fas -O TAIR10_chr_all.fa

	## Create Mappability files (this will take some time)
	python compile.py 36 # Change to proper K value for your data set

	#################
	## Run PeakSeq ##
	#################

	## (1) Read preprocessing

	module load peakseq/1.31
	mkdir input
	samtools view -h input.bam \| PeakSeq -preprocess SAM stdin input
	mkdir sample
	samtools view -h sample.bam \| PeakSeq -preprocess SAM stdin sample

	## (2) Peak calling with PeakSeq

	PeakSeq -peak_select config.dat # results are written to files my_chip_exp.txt and my_chip_exp_narrowPeak.txt


	## Note: the file contig.dat should be in your current working directory and contain the following content.
	## The parameters in this file need to be changed as needed

	## FILE START ##
	# Experiment id is used as a prefix to the output file name.
	Experiment_id my_chip_exp

	# Chromosome ID list file, is used to generate data file paths.
	chromosome_list_file chr_ids.txt

	# Enrichment fragment length For tag extension, this is the value of average fragment length.
	Enrichment_mapped_fragment_length 200

	# Target FDR in the simulations.
	target_FDR 0.05

	# Number of simulations performed while estimating the putative peaks.
	N_Simulations 10

	# Minimum distance between consecutive peaks
	Minimum_interpeak_distance 200

	# Mappability file that includes the uniquely mappable number of nucleotides per window for each chromosome.
	Mappability_map_file dummy

	# The directory that contains the preprocessed ChIP-Seq reads, can specify multiple directories to pool reads from multiple source (e.g. replicates)
	ChIP_Seq_reads_data_dirs sample

	# The directory that contains the preprocessed Input (control) experiment reads. (Multiple directories allowed)
	Input_reads_data_dirs input

	# Seed for pseudo-random number generator. This is necessary for simulated background option (specified below).
	#Simulation_seed 1234567

	# Q-value threshold applied on the final set of peaks.
	max_Qvalue 0.05

	# There are currently two models for simulating the background for threshold selection
	# Simulated background is the simulation based method that is explained in the PeakSeq paper.
	# Poisson background uses a simple Poisson background with mean estimated from the read statistics. This option is still experimental but it is much faster than the simulated background option.
	# Background_model Poisson
	Background_model Simulated
	## FILE END ##