Skip to content

Instantly share code, notes, and snippets.

@ShaiberAlon
Created February 2, 2018 18:06
Show Gist options
  • Save ShaiberAlon/0afdeb27be3f595101f1423ef1fea4c1 to your computer and use it in GitHub Desktop.
Save ShaiberAlon/0afdeb27be3f595101f1423ef1fea4c1 to your computer and use it in GitHub Desktop.
GATA4-snakefile
import pandas as pd
import os
# Project label; used further down to name the sample-info output file.
project_name = "GATA4"

# Sample names are taken from the "samples" column of samples.txt, which is
# read as a tab-separated table whose first row is the header.
SAMPLES = pd.read_csv('samples.txt', sep='\t', header=0, index_col=False)['samples'].values

# Make sure the directory that the rules' `log:` paths point into exists.
os.makedirs("00_LOGS", exist_ok=True)

# Echo the resolved sample list whenever the workflow file is evaluated.
print(SAMPLES)

# Rules Snakemake should execute locally rather than submit as cluster jobs.
localrules: all, QC_report, merge, get_sample_info, prepare_things_for_GAST
# Default target: asking for the final marker file pulls in every upstream rule.
rule all:
    input:
        "03_MED/create_taxonomy_matrices.done"
# Collects the per-sample "<sample>_STATS" files into one tab-separated report
# with the columns: sample, number of pairs analyzed, merged total.
rule QC_report:
    input:
        expand("01_QC/{sample}_STATS", sample=SAMPLES)
    output:
        "01_QC/QC-report.txt"
    run:
        report_path = output[0]

        # Header row; '>' starts the report from scratch.
        header_cmd = 'echo -e "sample\tnumber of pairs analyzed\tmerged total" > ' + report_path
        shell(header_cmd)

        # One appended row per STATS file: the file path, then the second
        # tab-separated field of the "Number of pairs analyzed" and
        # "Merged total" lines found in that file.
        row_template = ('n=`grep "Number of pairs analyzed" {f} | cut -f 2`; '
                        'm=`grep "Merged total" {f} | cut -f 2`; '
                        'echo -e "{f}\t$n\t$m" >> {o}')
        for stats_file in input:
            shell(row_template.format(f=stats_file, o=report_path))
# Runs o-subsample-fasta-file on one sample's FASTA, passing the target count
# from params.subsample_N, and writes the result to 02_FASTA.
#
# input.report is not referenced by the command; listing the QC report as an
# input makes it a prerequisite of every subsample job.
#
# The command's stdout/stderr are captured in the per-sample log file, which
# was declared before but never written.
# NOTE(review): assumes the tool writes its FASTA to the path given as the
# third argument and not to stdout — confirm.
rule subsample:
    log:
        "00_LOGS/{sample}-subsample.log"
    input:
        fa="01_QC/{sample}.fa",
        report=rules.QC_report.output
    output:
        "02_FASTA/{sample}-SUBSAMPLE.fa"
    params:
        subsample_N=75000
    shell:
        "o-subsample-fasta-file {input.fa} {params.subsample_N} {output} > {log} 2>&1"
# Concatenates every sample's subsampled FASTA into a single file.
# The output is wrapped in temp(), so Snakemake removes it once all rules
# that consume it have finished.
rule merge:
    input:
        expand("02_FASTA/{sample}-SUBSAMPLE.fa", sample=SAMPLES)
    output:
        temp("02_FASTA/merged-RAW.fa")
    shell:
        "cat {input} > {output}"
# Captures the stdout of o-get-sample-info-from-fasta, run on the merged FASTA,
# into "<project_name>.info".
rule get_sample_info:
    input:
        rules.merge.output
    output:
        project_name + ".info"
    shell:
        "o-get-sample-info-from-fasta {input} > {output}"
# Runs o-pad-with-gaps on the merged FASTA and writes 02_FASTA/merged.fa.
#
# input.info is not referenced by the command; presumably it is listed so that
# get_sample_info runs before the temporary merged-RAW.fa is cleaned up —
# verify.
#
# The command's stdout/stderr are captured in the log file, which was declared
# before but never written.
rule pad_with_gaps:
    log:
        "00_LOGS/pad.log"
    input:
        fa=rules.merge.output,
        info=rules.get_sample_info.output
    output:
        "02_FASTA/merged.fa"
    shell:
        "o-pad-with-gaps {input.fa} -o {output} > {log} 2>&1"
# Runs `decompose` on the padded FASTA with 10 threads, writing into the
# directory given by params.output_dir. Only NODE-REPRESENTATIVES.fasta is
# tracked as a Snakemake output.
#
# The command's stdout/stderr are captured in the log file, which was declared
# before but never written.
rule MED:
    log:
        "00_LOGS/MED.log"
    input:
        rules.pad_with_gaps.output
    output:
        representatives="03_MED/NODE-REPRESENTATIVES.fasta"
    params:
        output_dir="03_MED"
    threads: 10
    shell:
        "decompose {input} -o {params.output_dir} --number-of-threads {threads} > {log} 2>&1"
# Builds a temporary copy of the node representatives for the GAST rule:
#   1. copy the representatives FASTA,
#   2. on every line, delete everything from the first '|' to the end,
#   3. delete every '-' character.
# The copy is marked temp(), so Snakemake removes it after its consumers finish.
rule prepare_things_for_GAST:
    input:
        rules.MED.output.representatives
    output:
        temp("03_MED/temp-file-for-GAST.fa")
    shell:
        """
        cp {input} {output}
        sed -i 's/|.*$//g' {output}
        sed -i 's/-//g' {output}
        """
# Runs `gast` on the cleaned node representatives, using the reference FASTA
# (params.ref) and reference taxonomy (params.rtax), and writes
# NODE-REPRESENTATIVES.gast.
#
# The command's stdout/stderr are captured in the log file, which was declared
# before but never written.
# NOTE(review): params.ref / params.rtax are absolute paths that must exist on
# the machine executing this rule.
rule GAST:
    log:
        "00_LOGS/GAST.log"
    input:
        rules.prepare_things_for_GAST.output
    output:
        "03_MED/NODE-REPRESENTATIVES.gast"
    params:
        ref="/xraid2-2/g454/blastdbs/gast_distributions/refssu.fa",
        rtax="/xraid2-2/g454/blastdbs/gast_distributions/refssu.tax"
    shell:
        "gast -in {input} -ref {params.ref} -rtax {params.rtax} -out {output} -termg > {log} 2>&1"
# Runs create_taxonomy_matrices.py on the directory in params.dir_name once
# the GAST output exists. The script's own output files are not tracked;
# Snakemake touches the ".done" marker after the command succeeds.
#
# The command's stdout/stderr are captured in the log file, which was declared
# before but never written.
# NOTE(review): assumes the script writes its results into the given directory
# and not to stdout — confirm.
rule create_taxonomy_matrices:
    log:
        "00_LOGS/create_taxonomy_matrices.log"
    input:
        rules.GAST.output
    output:
        touch("03_MED/create_taxonomy_matrices.done")
    params:
        dir_name="03_MED"
    shell:
        "python /groups/merenlab/00_RESOURCES/create_taxonomy_matrices.py {params.dir_name} > {log} 2>&1"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment