# Phlya/uneuploidy_project.yml

## uneuploidy_project.yml
#########################################
# THIS IS A TYPICAL project.yml TEMPLATE
# most of the settings present here
# are GO for mapping production data
# but nonetheless user must consider
# carefully every presented option
#########################################

#########################################
# When commenting parameters out, make sure
# that each section still has at least one
# uncommented parameter, otherwise it
# will not get properly parsed.
#########################################


#######################################
# provide paths to your raw input data (fastq-s):
#######################################
# Fastqs can be provided as:
# -- pairs of relative/absolute paths
# -- sra:<SRA_NUMBER>, optionally followed by the indices of the first and
# the last entry in the SRA in the form of "?start=<first>&end=<last>"
input:
    raw_reads_paths:
        # Layout: <library name> -> <lane name> -> list of read sources.
        # Replace the entries below with the locations of your own data.

        # Libraries read from local files: every lane lists its R1 and R2
        # fastq files, in that order.
        3475-G:
            lane1:
                - ../SU_TRISOMY/3475-G_R1_001.fastq.gz
                - ../SU_TRISOMY/3475-G_R2_001.fastq.gz
        3475-Y:
            lane1:
                - ../SU_TRISOMY/3475-Y_R1_001.fastq.gz
                - ../SU_TRISOMY/3475-Y_R2_001.fastq.gz
        3492-G:
            lane1:
                - ../SU_TRISOMY/3492-G_R1_001.fastq.gz
                - ../SU_TRISOMY/3492-G_R2_001.fastq.gz
        3492-Y:
            lane1:
                - ../SU_TRISOMY/3492-Y_R1_001.fastq.gz
                - ../SU_TRISOMY/3492-Y_R2_001.fastq.gz
        3494-G:
            lane1:
                - ../SU_TRISOMY/3494-G_R1_001.fastq.gz
                - ../SU_TRISOMY/3494-G_R2_001.fastq.gz
        3494-Y:
            lane1:
                - ../SU_TRISOMY/3494-Y_R1_001.fastq.gz
                - ../SU_TRISOMY/3494-Y_R2_001.fastq.gz
        3518I:
            lane1:
                - ../SU_TRISOMY/3518I_R1_001.fastq.gz
                - ../SU_TRISOMY/3518I_R2_001.fastq.gz
        3518II:
            lane1:
                - ../SU_TRISOMY/3518II_R1_001.fastq.gz
                - ../SU_TRISOMY/3518II_R2_001.fastq.gz
        3524I:
            lane1:
                - ../SU_TRISOMY/3524I_R1_001.fastq.gz
                - ../SU_TRISOMY/3524I_R2_001.fastq.gz
        3525a_S9:
            lane1:
                - ../GOOGLE_DATA/3525a_S9_R1_001.fastq.gz
                - ../GOOGLE_DATA/3525a_S9_R2_001.fastq.gz
        3525b_S10:
            lane1:
                - ../GOOGLE_DATA/3525b_S10_R1_001.fastq.gz
                - ../GOOGLE_DATA/3525b_S10_R2_001.fastq.gz
        HF-18-G:
            lane1:
                - ../SU_TRISOMY/HF-18-G_R1_001.fastq.gz
                - ../SU_TRISOMY/HF-18-G_R2_001.fastq.gz
        HF-18-Y:
            lane1:
                - ../SU_TRISOMY/HF-18-Y_R1_001.fastq.gz
                - ../SU_TRISOMY/HF-18-Y_R2_001.fastq.gz
        PFCH6-G:
            lane1:
                - ../SU_TRISOMY/PFCH6-G_R1_001.fastq.gz
                - ../SU_TRISOMY/PFCH6-G_R2_001.fastq.gz
        PFCH6-Y:
            lane1:
                - ../SU_TRISOMY/PFCH6-Y_R1_001.fastq.gz
                - ../SU_TRISOMY/PFCH6-Y_R2_001.fastq.gz

        # Libraries given as SRA accessions: every lane holds a single
        # 'sra:<SRA_NUMBER>' entry.
        Iso-E_rep1:
            lane1:
                - sra:SRR16242076
        Iso-E_rep2:
            lane1:
                - sra:SRR16242077
        Iso-E_rep3:
            lane1:
                - sra:SRR16242078
        Iso-T_rep1:
            lane1:
                - sra:SRR16242079
        Iso-T_rep2:
            lane1:
                - sra:SRR16242080
        Iso-T_rep3:
            lane1:
                - sra:SRR16242081
        NPC_Iso-E_rep1:
            lane1:
                - sra:SRR16242082
        NPC_Iso-E_rep2:
            lane1:
                - sra:SRR16242083
        NPC_Iso-E_rep3:
            lane1:
                - sra:SRR16242084
        NPC_Iso-T_rep1:
            lane1:
                - sra:SRR16242085
        NPC_Iso-T_rep2:
            lane1:
                - sra:SRR16242086
        NPC_Iso-T_rep3:
            lane1:
                - sra:SRR16242087
        NPC_Ma-E_rep1:
            lane1:
                - sra:SRR16242088
        NPC_Ma-E_rep2:
            lane1:
                - sra:SRR16242089
        NPC_Ma-T_rep1:
            lane1:
                - sra:SRR16242090
        NPC_Ma-T_rep2:
            lane1:
                - sra:SRR16242091

    # independent libraries can be combined together
    # on the level of binned-data (.cool files)
    # describe your groupings of choice here:
    library_groups:
        # Layout: <group name> -> list of library names. Every member is
        # expected to be a library declared under 'raw_reads_paths'.

        # NOTE(review): the two groups below referenced 3471-G, 3471-Y and
        # 3496I, none of which is declared under 'raw_reads_paths' in this
        # file. They are disabled until the matching fastq entries are added.
        # s3471:
        #     - 3471-G
        #     - 3471-Y
        # s3496:
        #     - 3496I

        # NOTE(review): libraries 3492-G and 3492-Y are declared under
        # 'raw_reads_paths' but belong to no group -- confirm this is intended.

        s3475:
            - 3475-G
            - 3475-Y
        s3494:
            - 3494-G
            - 3494-Y
        s3518:
            - 3518I
            - 3518II
        s3524:
            - 3524I
        s3525:
            - 3525a_S9
            - 3525b_S10
        sHF18:
            - HF-18-G
            - HF-18-Y
        sPFCH6:
            - PFCH6-G
            - PFCH6-Y
        IsoE:
            - Iso-E_rep1
            - Iso-E_rep2
            - Iso-E_rep3
        IsoT:
            - Iso-T_rep1
            - Iso-T_rep2
            - Iso-T_rep3
        NPC_IsoE:
            - NPC_Iso-E_rep1
            - NPC_Iso-E_rep2
            - NPC_Iso-E_rep3
        NPC_IsoT:
            - NPC_Iso-T_rep1
            - NPC_Iso-T_rep2
            - NPC_Iso-T_rep3
        NPC_MaE:
            - NPC_Ma-E_rep1
            - NPC_Ma-E_rep2
        NPC_MaT:
            - NPC_Ma-T_rep1
            - NPC_Ma-T_rep2

    # Cap every input fastq at this many reads (e.g. 10000) to get a quick
    # semi-dry test run of the pipeline.
    # NOTE: for inputs given as an SRA number, only this many reads are
    # downloaded in the first place.
    # NOTE(review): 0 presumably means "do not truncate" -- confirm against
    # the pipeline documentation.
    truncate_fastq_reads: 0

    # Reference genome that the sequenced reads are aligned to. Three values
    # are needed:
    #   assembly_name           -- name of the genome assembly;
    #   bwa_index_wildcard_path -- wildcard path matching the BWA index files;
    #   chrom_sizes_path        -- tab-separated table of contig sizes
    #                              ("chrom.sizes"); it selects the subset and
    #                              the order of contigs in the contact map.
    genome:
        assembly_name: 'hg38'
        bwa_index_wildcard_path: '/store/razinlab/common_data/genomes/hg38/index/bwa/hg38.fa.*'
        chrom_sizes_path: '/store/razinlab/common_data/genomes/hg38/hg38.fa.sizes'

# Set to true to run FastQC on the input files.
do_fastqc: false

# Control how reads are mapped to the reference genomes.
map:
    # If 'chunksize' is non-zero, each input file gets split into multiple
    # chunks, each mapped separately. Useful for mapping on clusters with many
    # relatively weak nodes.
    # The optimal chunk size is a balance between mapping and merging:
    # smaller chunks (~30M) are better for clusters with many weak nodes,
    # however, having >~10 chunks per run slows down merging.
    # The value is written without '_' digit separators: those are a YAML 1.1
    # extension, and a YAML 1.2 loader would read such a value as a string.
    chunksize: 100000000

    # Specify extra BWA mapping options.
    mapping_options: ''

    # Specify fastp trim options,
    # e.g.: --detect_adapter_for_pe -q 15
    trim_options: ''

    # A more technical option: use a custom script to split fastq files from
    # SRA into two files, one per read side. By default it is true, which is
    # faster (because multi-threaded compression can be used), but less
    # stable. Set to false if you download files from SRA and bwa complains
    # about unpaired reads.
    use_custom_split: true

# Control how read alignments are converted ('parsed') into Hi-C pairs.
parse:
    # 'make_pairsam': when true, parsed Hi-C pairs keep the complete alignment
    # records in SAM format; this hybrid of .pairs and .sam is called
    # '.pairsam' and is handy for in-depth inspection of Hi-C data. After
    # parsing, pairsams are split into .pairs and .bam, and the .bam
    # alignments are tagged with Hi-C related information. It roughly doubles
    # the storage and I/O requirements, so enable it only when really needed.
    # NOTE: for technical reasons the initial parsing output is still named
    # '.pairsam' when this is false, even though it carries no SAM alignments.
    make_pairsam: false

    # 'drop_seq': with 'make_pairsam' enabled, erases sequences and Phred
    # scores from the SAM alignments in the .pairsam and .bam outputs, making
    # them lightweight.
    # NOTE: ignored when 'make_pairsam' is false.
    drop_seq: true

    # 'drop_readid': drops readID from .pairs files to make them lightweight.
    # NOTE: alignment records in .pairsam files, and in the .bam files
    # obtained after .pairsam splitting, are not affected.
    drop_readid: false

    # 'keep_unparsed_bams': when true, distiller keeps the _immediate_ output
    # of bwa in .bam format. A faster alternative to 'make_pairsam' when
    # alignments are needed but tagging them with Hi-C related information
    # is not.
    keep_unparsed_bams: false

    # Extra options handed to pairtools parse, in addition to those implied
    # by 'make_pairsam', 'drop_readid' and 'drop_seq'. The default stores MAPQ
    # scores in the .pairsam/.pairs output (used later for filtering/binning)
    # and uses the 'mask' walks-policy, which masks complex walks in long
    # reads.
    parsing_options: '--add-columns mapq --walks-policy mask'

# Control how PCR/optical duplicates are detected in the data.
dedup:
    # Two Hi-C pairs count as PCR/optical duplicates when their mapped
    # locations match on both sides. 'max_mismatch_bp' is the largest
    # difference in mapped location, on either side, at which two pairs are
    # still treated as duplicates.
    max_mismatch_bp: 1

# Control how Hi-C pairs are binned into contact maps, stored in .cool files.
bin:
    # Bin sizes to include in the multi-resolution .cool file.
    # The smallest (base) bin size _must_ evenly divide every other
    # resolution in the list.
    resolutions:
        - 10000000
        - 5000000
        - 2500000
        - 1000000
        - 500000
        - 250000
        - 100000
        - 50000
        - 25000
        - 10000
        - 5000
        - 2000
        - 1000
    # Whether the multi-resolution .cool output files should be balanced.
    balance: true

    # Additional parameters passed to cooler balance:
    balance_options: '--trans-only --name weight_trans'

    # Additional filters applied to pairs during binning.
    # Several filters may be listed; for each one, all pairs satisfying its
    # expression are binned into a separate cooler.
    # Syntax of an entry:
    # {filter_name}: '{a valid Python expression}'
    filters:
        no_filter: ''
        mapq_30: '(mapq1>=30) and (mapq2>=30)'

########################################
# folder structure for storing results
########################################
output:
    # One destination directory per kind of result produced by the pipeline.
    dirs:
        processed_fastqs: 'project/processed_fastqs/'
        mapped_parsed_sorted_chunks: 'project/mapped_parsed_sorted_chunks'
        fastqc: 'project/fastqc/'
        pairs_library: 'project/pairs_library'
        coolers_library: 'project/coolers_library/'
        coolers_library_group: 'project/coolers_library_group/'
        stats_library_group: 'project/stats_library_group/'