Last active
July 11, 2023 14:23
-
-
Save Phlya/b1cdceb8124d787731e654a7edaedb82 to your computer and use it in GitHub Desktop.
uneuploidy_project.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#########################################
# THIS IS A TYPICAL project.yml TEMPLATE
# most of the settings present here
# are GO for mapping production data
# but nonetheless user must consider
# carefully every presented option
#########################################
#########################################
# When commenting parameters out, make sure
# that each section still has at least one
# uncommented parameter, otherwise it
# will not get properly parsed.
#########################################
#######################################
# Provide paths to your raw input data (fastq-s):
#######################################
# Fastqs can be provided as:
# -- a pair of relative/absolute paths
# -- sra:<SRA_NUMBER>, optionally followed by the indices of the first and
#    the last entry in the SRA in the form of "?start=<first>&end=<last>"
#
# NOTE(review): the nesting below was reconstructed from the keys and their
# comments. 'genome' is placed under 'input', as in the distiller-nf
# project.yml template -- confirm against the pipeline version in use.
input:
  raw_reads_paths:
    # Layout: <library name> -> <lane name> -> list of fastq sources.
    # Libraries read from local fastq files (one lane, R1/R2 pair each):
    3475-G:
      lane1:
        - ../SU_TRISOMY/3475-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3475-G_R2_001.fastq.gz
    3475-Y:
      lane1:
        - ../SU_TRISOMY/3475-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3475-Y_R2_001.fastq.gz
    3492-G:
      lane1:
        - ../SU_TRISOMY/3492-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3492-G_R2_001.fastq.gz
    3492-Y:
      lane1:
        - ../SU_TRISOMY/3492-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3492-Y_R2_001.fastq.gz
    3494-G:
      lane1:
        - ../SU_TRISOMY/3494-G_R1_001.fastq.gz
        - ../SU_TRISOMY/3494-G_R2_001.fastq.gz
    3494-Y:
      lane1:
        - ../SU_TRISOMY/3494-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/3494-Y_R2_001.fastq.gz
    3518I:
      lane1:
        - ../SU_TRISOMY/3518I_R1_001.fastq.gz
        - ../SU_TRISOMY/3518I_R2_001.fastq.gz
    3518II:
      lane1:
        - ../SU_TRISOMY/3518II_R1_001.fastq.gz
        - ../SU_TRISOMY/3518II_R2_001.fastq.gz
    3524I:
      lane1:
        - ../SU_TRISOMY/3524I_R1_001.fastq.gz
        - ../SU_TRISOMY/3524I_R2_001.fastq.gz
    3525a_S9:
      lane1:
        - ../GOOGLE_DATA/3525a_S9_R1_001.fastq.gz
        - ../GOOGLE_DATA/3525a_S9_R2_001.fastq.gz
    3525b_S10:
      lane1:
        - ../GOOGLE_DATA/3525b_S10_R1_001.fastq.gz
        - ../GOOGLE_DATA/3525b_S10_R2_001.fastq.gz
    HF-18-G:
      lane1:
        - ../SU_TRISOMY/HF-18-G_R1_001.fastq.gz
        - ../SU_TRISOMY/HF-18-G_R2_001.fastq.gz
    HF-18-Y:
      lane1:
        - ../SU_TRISOMY/HF-18-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/HF-18-Y_R2_001.fastq.gz
    PFCH6-G:
      lane1:
        - ../SU_TRISOMY/PFCH6-G_R1_001.fastq.gz
        - ../SU_TRISOMY/PFCH6-G_R2_001.fastq.gz
    PFCH6-Y:
      lane1:
        - ../SU_TRISOMY/PFCH6-Y_R1_001.fastq.gz
        - ../SU_TRISOMY/PFCH6-Y_R2_001.fastq.gz
    # Libraries specified as SRA runs (sra:<SRA_NUMBER>):
    Iso-E_rep1:
      lane1:
        - sra:SRR16242076
    Iso-E_rep2:
      lane1:
        - sra:SRR16242077
    Iso-E_rep3:
      lane1:
        - sra:SRR16242078
    Iso-T_rep1:
      lane1:
        - sra:SRR16242079
    Iso-T_rep2:
      lane1:
        - sra:SRR16242080
    Iso-T_rep3:
      lane1:
        - sra:SRR16242081
    NPC_Iso-E_rep1:
      lane1:
        - sra:SRR16242082
    NPC_Iso-E_rep2:
      lane1:
        - sra:SRR16242083
    NPC_Iso-E_rep3:
      lane1:
        - sra:SRR16242084
    NPC_Iso-T_rep1:
      lane1:
        - sra:SRR16242085
    NPC_Iso-T_rep2:
      lane1:
        - sra:SRR16242086
    NPC_Iso-T_rep3:
      lane1:
        - sra:SRR16242087
    NPC_Ma-E_rep1:
      lane1:
        - sra:SRR16242088
    NPC_Ma-E_rep2:
      lane1:
        - sra:SRR16242089
    NPC_Ma-T_rep1:
      lane1:
        - sra:SRR16242090
    NPC_Ma-T_rep2:
      lane1:
        - sra:SRR16242091

  # Independent libraries can be combined together
  # on the level of binned data (.cool files).
  # Describe your groupings of choice here; every member must be a library
  # name defined under 'raw_reads_paths' above.
  library_groups:
    # TODO(review): 3471-G and 3471-Y have no entry under 'raw_reads_paths',
    # so this group cannot be assembled. Add their fastq paths above and
    # re-enable the group, or delete it.
    # s3471:
    #   - 3471-G
    #   - 3471-Y
    s3475:
      - 3475-G
      - 3475-Y
    s3494:
      - 3494-G
      - 3494-Y
    # TODO(review): 3496I has no entry under 'raw_reads_paths' either;
    # add its fastq paths above and re-enable the group, or delete it.
    # s3496:
    #   - 3496I
    s3518:
      - 3518I
      - 3518II
    s3524:
      - 3524I
    s3525:
      - 3525a_S9
      - 3525b_S10
    sHF18:
      - HF-18-G
      - HF-18-Y
    sPFCH6:
      - PFCH6-G
      - PFCH6-Y
    IsoE:
      - Iso-E_rep1
      - Iso-E_rep2
      - Iso-E_rep3
    IsoT:
      - Iso-T_rep1
      - Iso-T_rep2
      - Iso-T_rep3
    NPC_IsoE:
      - NPC_Iso-E_rep1
      - NPC_Iso-E_rep2
      - NPC_Iso-E_rep3
    NPC_IsoT:
      - NPC_Iso-T_rep1
      - NPC_Iso-T_rep2
      - NPC_Iso-T_rep3
    NPC_MaE:
      - NPC_Ma-E_rep1
      - NPC_Ma-E_rep2
    NPC_MaT:
      - NPC_Ma-T_rep1
      - NPC_Ma-T_rep2
    # NOTE(review): 3492-G and 3492-Y are defined above but belong to no
    # group -- add one here if a merged map is wanted for them.

  # Truncate input fastqs to a small number of reads (e.g. 10000) for
  # semi-dry test runs.
  # NOTE: when the inputs are specified as an SRA number, only this number of
  # reads is downloaded!
  truncate_fastq_reads: 0

  # Specify a reference genome to align sequenced reads.
  # Provide the genome assembly name, a wildcard path to the BWA index files
  # of the reference genome, and a tab-separated table with contig sizes
  # (known as "chrom.sizes"). The latter is used to specify the subset and the
  # order of contigs in a resulting contact map.
  genome:
    assembly_name: 'hg38'
    bwa_index_wildcard_path: '/store/razinlab/common_data/genomes/hg38/index/bwa/hg38.fa.*'
    chrom_sizes_path: '/store/razinlab/common_data/genomes/hg38/hg38.fa.sizes'
# Choose if you want to do FastQC of the input files.
# Spelled as a canonical lowercase YAML boolean, like the other flags
# in this file that already use 'true'.
do_fastqc: false
# Control how reads are mapped to the reference genomes.
map:
  # If 'chunksize' is non-zero, each input file gets split into multiple chunks,
  # each mapped separately. Useful for mapping on clusters with many
  # relatively weak nodes.
  # The optimal chunk size is defined by the balance between mapping and merging.
  # Smaller chunks (~30M) are better for clusters with many weak nodes,
  # however, having >~10 chunks per run slows down merging.
  # Written without '_' digit separators: '100_000_000' is an integer only
  # for YAML 1.1 loaders, while YAML 1.2 loaders read it as a string.
  chunksize: 100000000

  # Specify extra BWA mapping options.
  mapping_options: ''

  # Specify fastp trim options, e.g.:
  #   --detect_adapter_for_pe -q 15
  trim_options: ''

  # A more technical option, use a custom script to split fastq files from SRA
  # into two files, one per read side. By default it is true, which is
  # faster (because we can use multi-threaded compression), but less
  # stable. Set to false if you download files from SRA and bwa complains
  # about unpaired reads.
  use_custom_split: true
# Control how read alignments are converted ('parsed') into Hi-C pairs.
# Booleans are spelled 'true'/'false' throughout, matching the flags in
# this file that already use the lowercase form.
parse:
  # If 'make_pairsam' is true, parsed Hi-C pairs will store complete
  # alignment records in the SAM format (the resulting hybrid between the
  # .pairs and .sam formats is called '.pairsam'). Such files can be useful for
  # thorough investigation of Hi-C data. Downstream of parsing, pairsams
  # are split into .pairs and .bam, and .bam alignments are tagged with
  # Hi-C related information. 'make_pairsam' roughly doubles the storage
  # and I/O requirements and should be used only when absolutely needed.
  # NOTE: when 'make_pairsam' is false, the initial output of parsing is still
  # called '.pairsam' despite missing SAM alignments, for technical reasons.
  make_pairsam: false

  # When 'make_pairsam' is true, enabling 'drop_seq' erases sequences and
  # Phred scores from the SAM alignments in .pairsam and .bam output files.
  # Enable to make lightweight .pairsam/.bam output.
  # NOTE: when 'make_pairsam' is false, 'drop_seq' is ignored.
  drop_seq: true

  # Enable 'drop_readid' to drop readID from .pairs files to create
  # lightweight .pairs files.
  # NOTE: does not affect alignment records in the .pairsam files and
  # subsequently .bam files after .pairsam splitting.
  drop_readid: false

  # When 'keep_unparsed_bams' is true, distiller preserves the _immediate_
  # output of bwa in a .bam format. Could be used as a faster alternative
  # to 'make_pairsam' when alignments are needed, but tagging them with Hi-C
  # related information is not necessary.
  keep_unparsed_bams: false

  # Pass extra options to pairtools parse, on top of the ones specified by
  # flags 'make_pairsam', 'drop_readid', 'drop_seq'. The default value
  # enables storing MAPQ scores in the .pairsam/.pairs output, which are
  # used later for filtering/binning. The default walks-policy is 'mask'
  # which masks complex walks in long reads.
  parsing_options: '--add-columns mapq --walks-policy mask'
# Settings for the detection of PCR/optical duplicates.
dedup:
  # Two Hi-C pairs count as PCR/optical duplicates when their mapped
  # locations match on both sides. 'max_mismatch_bp' is the largest
  # difference in mapped position tolerated on either side for the two
  # pairs to still be treated as duplicates.
  max_mismatch_bp: 1
# Settings for binning Hi-C pairs into contact maps (.cool files).
bin:
  # Resolutions to include in the multi-resolution .cool file.
  # The base resolution (the smallest bin size listed) _must_ be a common
  # divisor of all other resolutions.
  resolutions:
    - 10000000
    - 5000000
    - 2500000
    - 1000000
    - 500000
    - 250000
    - 100000
    - 50000
    - 25000
    - 10000
    - 5000
    - 2000
    - 1000

  # Whether the multi-resolution .cool output files should be balanced.
  balance: true

  # Extra parameters handed to 'cooler balance'.
  balance_options: '--trans-only --name weight_trans'

  # Additional filters applied to pairs during binning.
  # Several filters may be listed; for each one, all pairs satisfying its
  # expression are binned into a separate cooler.
  # Syntax:
  #   {filter_name}: '{a valid Python expression}'
  filters:
    no_filter: ''
    mapq_30: '(mapq1>=30) and (mapq2>=30)'
########################################
# Folder layout for storing the results
########################################
output:
  # One directory per kind of output; the key names the output it holds.
  dirs:
    processed_fastqs: 'project/processed_fastqs/'
    mapped_parsed_sorted_chunks: 'project/mapped_parsed_sorted_chunks'
    fastqc: 'project/fastqc/'
    pairs_library: 'project/pairs_library'
    coolers_library: 'project/coolers_library/'
    coolers_library_group: 'project/coolers_library_group/'
    stats_library_group: 'project/stats_library_group/'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment