Skip to content

Instantly share code, notes, and snippets.

@Phlya
Last active July 11, 2023 14:23
Show Gist options
  • Save Phlya/b1cdceb8124d787731e654a7edaedb82 to your computer and use it in GitHub Desktop.
Save Phlya/b1cdceb8124d787731e654a7edaedb82 to your computer and use it in GitHub Desktop.
uneuploidy_project.yml
#########################################
# THIS IS A TYPICAL project.yml TEMPLATE
# most of the settings present here
# are GO for mapping production data
# but nonetheless user must consider
# carefully every presented option
#########################################
#########################################
# When commmenting parameters out, make sure
# that each section still has at least one
# uncommented parameter, otherwise it
# will not get properly parsed.
#########################################
#######################################
# provide paths to your raw input data (fastq-s):
#######################################
# Fastqs can be provided as:
# -- a pairs of relative/absolute paths
# -- sra:<SRA_NUMBER>, optionally followed by the indices of the first and
# the last entry in the SRA in the form of "?start=<first>&end=<last>
input:
  # Raw reads, keyed by library name and then by lane. Each lane lists either
  # a pair of fastq paths (read 1, then read 2) or one "sra:<SRA_NUMBER>" entry.
  # Substitute the location of your own fastq files.
  raw_reads_paths:
    3475-G:
      lane1:
        - ../SU_TRISOMY/3475-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3475-G_R2_001.fastq.gz
    3475-Y:
      lane1:
        - ../SU_TRISOMY/3475-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3475-Y_R2_001.fastq.gz
    3492-G:
      lane1:
        - ../SU_TRISOMY/3492-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3492-G_R2_001.fastq.gz
    3492-Y:
      lane1:
        - ../SU_TRISOMY/3492-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3492-Y_R2_001.fastq.gz
    3494-G:
      lane1:
        - ../SU_TRISOMY/3494-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3494-G_R2_001.fastq.gz
    3494-Y:
      lane1:
        - ../SU_TRISOMY/3494-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3494-Y_R2_001.fastq.gz
    3518I:
      lane1:
        - ../SU_TRISOMY/3518I_R1_001.fastq.gz
        - ../SU_TRISOMY/3518I_R2_001.fastq.gz
    3518II:
      lane1:
        - ../SU_TRISOMY/3518II_R1_001.fastq.gz
        - ../SU_TRISOMY/3518II_R2_001.fastq.gz
    3524I:
      lane1:
        - ../SU_TRISOMY/3524I_R1_001.fastq.gz
        - ../SU_TRISOMY/3524I_R2_001.fastq.gz
    3525a_S9:
      lane1:
        - ../GOOGLE_DATA/3525a_S9_R1_001.fastq.gz
        - ../GOOGLE_DATA/3525a_S9_R2_001.fastq.gz
    3525b_S10:
      lane1:
        - ../GOOGLE_DATA/3525b_S10_R1_001.fastq.gz
        - ../GOOGLE_DATA/3525b_S10_R2_001.fastq.gz
    HF-18-G:
      lane1:
        - ../SU_TRISOMY/HF-18-G_R1_001.fastq.gz
        - ../SU_TRISOMY/HF-18-G_R2_001.fastq.gz
    HF-18-Y:
      lane1:
        - ../SU_TRISOMY/HF-18-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/HF-18-Y_R2_001.fastq.gz
    PFCH6-G:
      lane1:
        - ../SU_TRISOMY/PFCH6-G_R1_001.fastq.gz
        - ../SU_TRISOMY/PFCH6-G_R2_001.fastq.gz
    PFCH6-Y:
      lane1:
        - ../SU_TRISOMY/PFCH6-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/PFCH6-Y_R2_001.fastq.gz
    Iso-E_rep1:
      lane1:
        - sra:SRR16242076
    Iso-E_rep2:
      lane1:
        - sra:SRR16242077
    Iso-E_rep3:
      lane1:
        - sra:SRR16242078
    Iso-T_rep1:
      lane1:
        - sra:SRR16242079
    Iso-T_rep2:
      lane1:
        - sra:SRR16242080
    Iso-T_rep3:
      lane1:
        - sra:SRR16242081
    NPC_Iso-E_rep1:
      lane1:
        - sra:SRR16242082
    NPC_Iso-E_rep2:
      lane1:
        - sra:SRR16242083
    NPC_Iso-E_rep3:
      lane1:
        - sra:SRR16242084
    NPC_Iso-T_rep1:
      lane1:
        - sra:SRR16242085
    NPC_Iso-T_rep2:
      lane1:
        - sra:SRR16242086
    NPC_Iso-T_rep3:
      lane1:
        - sra:SRR16242087
    NPC_Ma-E_rep1:
      lane1:
        - sra:SRR16242088
    NPC_Ma-E_rep2:
      lane1:
        - sra:SRR16242089
    NPC_Ma-T_rep1:
      lane1:
        - sra:SRR16242090
    NPC_Ma-T_rep2:
      lane1:
        - sra:SRR16242091

  # Independent libraries can be combined together
  # on the level of binned data (.cool files).
  # Describe your groupings of choice here; every member should be a library
  # name defined under 'raw_reads_paths'.
  # NOTE(review): libraries 3492-G and 3492-Y are defined under
  # 'raw_reads_paths' but are not part of any group below - confirm intended.
  library_groups:
    # NOTE(review): 3471-G and 3471-Y have no entry under 'raw_reads_paths'
    # in this file - add their fastq paths or drop this group.
    s3471:
      - 3471-G
      - 3471-Y
    s3475:
      - 3475-G
      - 3475-Y
    s3494:
      - 3494-G
      - 3494-Y
    # NOTE(review): 3496I has no entry under 'raw_reads_paths' in this file -
    # add its fastq paths or drop this group.
    s3496:
      - 3496I
    s3518:
      - 3518I
      - 3518II
    s3524:
      - 3524I
    s3525:
      - 3525a_S9
      - 3525b_S10
    sHF18:
      - HF-18-G
      - HF-18-Y
    sPFCH6:
      - PFCH6-G
      - PFCH6-Y
    IsoE:
      - Iso-E_rep1
      - Iso-E_rep2
      - Iso-E_rep3
    IsoT:
      - Iso-T_rep1
      - Iso-T_rep2
      - Iso-T_rep3
    NPC_IsoE:
      - NPC_Iso-E_rep1
      - NPC_Iso-E_rep2
      - NPC_Iso-E_rep3
    NPC_IsoT:
      - NPC_Iso-T_rep1
      - NPC_Iso-T_rep2
      - NPC_Iso-T_rep3
    NPC_MaE:
      - NPC_Ma-E_rep1
      - NPC_Ma-E_rep2
    NPC_MaT:
      - NPC_Ma-T_rep1
      - NPC_Ma-T_rep2

  # Truncate input fastqs to a small number of reads (e.g. 10000) for
  # semi-dry test runs.
  # NOTE: when the inputs are specified as an SRA number, only this number of
  # reads is downloaded!
  truncate_fastq_reads: 0

  # Specify a reference genome to align sequenced reads.
  # Provide the genome assembly name, a wildcard path to the BWA index files
  # of the reference genome, and a tab-separated table with contig sizes
  # (known as "chrom.sizes"). The latter is used to specify the subset and the
  # order of contigs in a resulting contact map.
  genome:
    assembly_name: 'hg38'
    bwa_index_wildcard_path: '/store/razinlab/common_data/genomes/hg38/index/bwa/hg38.fa.*'
    chrom_sizes_path: '/store/razinlab/common_data/genomes/hg38/hg38.fa.sizes'
# Choose whether to run FastQC on the input files.
# Booleans are spelled in lowercase, matching the other flags in this file.
do_fastqc: false
# Control how reads are mapped to the reference genomes.
map:
  # If 'chunksize' is non-zero, each input file gets split into multiple chunks,
  # each mapped separately. Useful for mapping on clusters with many
  # relatively weak nodes.
  # The optimal chunk size is defined by the balance between mapping and merging.
  # Smaller chunks (~30M) are better for clusters with many weak nodes,
  # however, having >~10 chunks per run slows down merging.
  # Written without digit separators: "100_000_000" is an integer only under
  # YAML 1.1, while YAML 1.2 parsers read it as a string.
  chunksize: 100000000

  # Specify extra BWA mapping options.
  mapping_options: ''

  # Specify fastp trim options, e.g.:
  #   --detect_adapter_for_pe -q 15
  trim_options: ''

  # A more technical option, use a custom script to split fastq files from SRA
  # into two files, one per read side. By default it is true, which is
  # faster (because we can use multi-threaded compression), but less
  # stable. Set to false if you download files from SRA and bwa complains
  # about unpaired reads.
  use_custom_split: true
# Control how read alignments are converted ('parsed') into Hi-C pairs.
parse:
  # If 'make_pairsam' is true, parsed Hi-C pairs will store complete
  # alignment records in the SAM format (the resulting hybrid between the
  # .pairs and .sam formats is called '.pairsam'). Such files can be useful for
  # thorough investigation of Hi-C data. Downstream of parsing, pairsams
  # are split into .pairs and .bam, and .bam alignments are tagged with
  # Hi-C related information. 'make_pairsam' roughly doubles the storage
  # and I/O requirements and should be used only when absolutely needed.
  # NOTE: when 'make_pairsam' is false, the initial output of parsing is still
  # called '.pairsam' despite missing SAM alignments, for technical reasons.
  make_pairsam: false

  # When 'make_pairsam' is true, enabling 'drop_seq' erases sequences and
  # Phred scores from the SAM alignments in .pairsam and .bam output files.
  # Enable to make lightweight .pairsam/.bam output.
  # NOTE: when 'make_pairsam' is false, 'drop_seq' is ignored.
  drop_seq: true

  # Enable 'drop_readid' to drop readID from .pairs files to create
  # lightweight .pairs files.
  # NOTE: does not affect alignment records in the .pairsam files and
  # subsequently .bam files after .pairsam splitting.
  drop_readid: false

  # When 'keep_unparsed_bams' is true, distiller preserves the _immediate_
  # output of bwa in a .bam format. Could be used as a faster alternative
  # to 'make_pairsam' when alignments are needed, but tagging them with Hi-C
  # related information is not necessary.
  keep_unparsed_bams: false

  # Pass extra options to pairtools parse, on top of the ones specified by
  # flags 'make_pairsam', 'drop_readid', 'drop_seq'. The default value
  # enables storing MAPQ scores in the .pairsam/.pairs output, which are
  # used later for filtering/binning. The default walks-policy is 'mask'
  # which masks complex walks in long reads.
  parsing_options: '--add-columns mapq --walks-policy mask'
# Control how PCR/optical duplicates are detected in the data.
dedup:
  # PCR/optical duplicates are detected as Hi-C pairs with matching locations
  # on both sides. 'max_mismatch_bp' controls the maximal allowed mismatch in
  # mapped locations on either side for two pairs to be still considered as
  # duplicates.
  max_mismatch_bp: 1
# Control how Hi-C pairs are binned into contact maps, stored in .cool files.
bin:
  # Specify which resolutions (bin sizes, in bp) should be included in the
  # multi-resolution .cool file.
  # The base resolution (the smallest bin size listed) _must_ evenly divide
  # every other resolution.
  resolutions:
    - 10000000
    - 5000000
    - 2500000
    - 1000000
    - 500000
    - 250000
    - 100000
    - 50000
    - 25000
    - 10000
    - 5000
    - 2000
    - 1000

  # Specify if the multi-resolution .cool output files should be balanced.
  balance: true

  # Pass additional parameters to cooler balance:
  balance_options: '--trans-only --name weight_trans'

  # Specify additional filters applied to pairs during binning.
  # Multiple filters are allowed; for each filter, all pairs satisfying the
  # given filter expression will be binned into a separate cooler.
  # Filters are specified using the following syntax:
  # {filter_name}: '{a valid Python expression}'
  filters:
    no_filter: ''
    mapq_30: '(mapq1>=30) and (mapq2>=30)'
########################################
# folder structure for storing results
########################################
output:
  # One directory per kind of result; the paths are used exactly as written.
  dirs:
    processed_fastqs: 'project/processed_fastqs/'
    mapped_parsed_sorted_chunks: 'project/mapped_parsed_sorted_chunks'
    fastqc: 'project/fastqc/'
    pairs_library: 'project/pairs_library'
    coolers_library: 'project/coolers_library/'
    coolers_library_group: 'project/coolers_library_group/'
    stats_library_group: 'project/stats_library_group/'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment