Skip to content

Instantly share code, notes, and snippets.

@sxv
Last active October 11, 2016 20:53
# define input cases via master_key.txt
# for each run, align fastq -> bam
# if GATK specified, run GATK, variant calling, and annovar
# run matricizer2 on tab output
# todo: add R clustering scripts
## usage
# python rnarunner.py --input-dir /path/to/fastqs
## NOTES: input-dir must contain master_key.txt, input fastq.gz files, matricizer2.py, UCSC_GENE_NAME3.txt
import time, os, argparse
from glob import glob
now = int(time.time())
parser = argparse.ArgumentParser(description='RNA sample pipeline runner')
parser.add_argument('--input-dir', default='./', required=False)
parser.add_argument('--master-key', default='master_key.txt', required=False)
parser.add_argument('--batch-name', required=False)
parser.add_argument('--threads', default='8', required=False)
parser.add_argument('--star', default='/media/lab/data256gb/STAR-STAR_2.4.2a/bin/Linux_x86_64', required=False)
parser.add_argument('--star-ref', default='/media/lab/data3tb_1/STAR_ref/', required=False)
parser.add_argument('--genome', default='hg19/GRCh37.primary_assembly.reordered.fa', required=False)
parser.add_argument('--picard', default='/media/lab/data512gb/bcbio/share/java/picard-1.96', required=False)
parser.add_argument('--gatk', default='/media/lab/data256gb/bcbio/GenomeAnalysisTK.jar', required=False)
parser.add_argument('--varscan', default='/media/lab/data256gb/bcbio/share/java/varscan-2.3.7/', required=False)
parser.add_argument('--annovar', default='/media/lab/data3tb_2/annovar/', required=False)
parser.add_argument('--star-genome', default='/media/lab/data256gb/STAR-STAR_2.4.2a/star_genome', required=False)
#parser.add_argument('--matrix-config', default='False', required=False)
args = vars(parser.parse_args())
Mills = 'Mills_and_1000G_gold_standard.indels.b37.sites.gencode.vcf'
upload_dir = '/mmedia/stroma-common/rna.seq/complete/'
def main():
cases = queue_cases()
for case_name in cases:
if case_name in ['skipme']:
print 'Skipping already completed case: %s' % case_name
continue
# make case dir,
case_dir = "%s/%s" % (args['input_dir'], case_name)
os.system("bash -c 'mkdir %s'" % case_dir)
bam_path = "%s/%s.bam" % (case_dir, case_name)
batch_name = "%s.%s" % (args['batch_name'], now)
samples = CASE_DICT[case_name]
if not glob(bam_path):
run_star(case_dir, samples, bam_path)
if not glob('%s/*ReadsPerGene.txt' % case_dir):
run_htseq_count(bam_path, case_name)
#vcf_path = "%s/%s.vcf" % (case_dir, case_name)
#if not glob(vcf_path):
# run_gatk(case_dir, case_name, bam_path, vcf_path)
#if glob(vcf_path):
# os.system("bash -c 'rm %s/out*ba?'" % case_dir)
# os.system("bash -c 'mv %s %s'" % (case_dir, upload_dir))
#annotate(batch_name) #vcfgenie
#matricizer(batch_name) ON READSPERGENE Files in each case_dir
#matricizer2.py needs to be changed to matricize raw read counts (d/t new format of htseq vs star)
def find_read_pair(path1):
if os.path.isfile(path1):
if 'pf.fastq' in path1:
path2 = path1.replace('1_pf.fastq', '2_pf.fastq')
else:
path2 = path1.replace('1.fastq', '2.fastq')
if os.path.isfile(path2):
print "FOUND pair for %s: %s" % (path1, path2)
return path2
else: print "No pair found for %s" % path1
else: print "%s not a valid file" % path1
return False
CASE_DICT = {}
MASTER_KEY = "%s/%s" % (args['input_dir'], args['master_key'])
def queue_cases():
cases = []
for line in open(MASTER_KEY):
if not line.startswith('#'):
if line.split():
fastq, case_name = line.split()
print 'fastq: %s ; case_name: %s' % (fastq, case_name)
cases.append(case_name)
fastq2 = False
if '1.fastq' in fastq or '1_pf.fastq' in fastq:
fastq2 = find_read_pair(fastq)
else: print '1.fastq not found'
if fastq2: CASE_DICT[case_name] = "%s %s" % (fastq, fastq2)
else: CASE_DICT[case_name] = fastq
print "%s cases queued: \n%s" % (len(cases), CASE_DICT)
return cases
def run_star(case_dir, samples, bam_path):
star_command = "cd %s && %s/STAR --genomeDir %s --readFilesIn %s --runThreadN %s --quantMode TranscriptomeSAM GeneCounts --outReadsUnmapped None --outFilterMultimapNmax 1 --outFilterScoreMin 20 --outSAMtype BAM SortedByCoordinate --sjdbGTFfile %s/hg19/gencode.v24lift37.annotation.gtf --readFilesCommand zcat" % (case_dir, args['star'], args['star_genome'], samples, args['threads'], args['star_ref'])
print star_command
os.system("bash -c '%s'" % star_command)
os.system("bash -c 'mv %s/Aligned.sortedByCoord.out.bam %s'" % (case_dir, bam_path))
def run_htseq_count(bam, sample):
#os.system("bash -c 'htseq-count -r pos -f bam %s /media/lab/data3tb_1/STAR_ref/gencode.v24lift37.annotation.gff3 > %s/%s.ReadsPerGene.txt'" % (bam, sample, sample))
os.system("bash -c 'htseq-count -f bam %s /media/lab/data3tb_1/STAR_ref/gencode.v24lift37.annotation.gff3 > %s/%s.ReadsPerGene.txt'" % (bam, sample, sample))
def matricizer(batch_name):
os.system("bash -c 'python ./matricizer2.py > %s/%s.matrix.txt'" % (args['input_dir'], batch_name))
def annotate(batch_name):
os.system("""bash -c 'ssh joannaprzybyl@171.65.84.45 "mkdir -p /media/joannaprzybyl/SSD-Data/vcfgenie/input/%s"'""" % batch_name)
os.system("bash -c 'scp %s/*/*.vcf joannaprzybyl@171.65.84.45:/media/joannaprzybyl/SSD-Data/vcfgenie/input/%s/'" % (args['input_dir'], batch_name))
def run_gatk(case_dir, case_name, bam, vcf_path):
if not glob('%s/out.bam' % case_dir):
os.system("bash -c 'java -jar %s/AddOrReplaceReadGroups.jar I=%s O=%s/out.bam SO=coordinate RGID=test RGLB=test RGPL=Illumina RGPU=machine RGSM=test/'" % (args['picard'], bam, case_dir))
if not glob('%s/out2.bam' % case_dir):
os.system("bash -c 'java -jar %s/MarkDuplicates.jar I=%s/out.bam O=%s/out2.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics AS=TRUE REMOVE_DUPLICATES=TRUE'" % (args['picard'], case_dir, case_dir))
if not glob('%s/out2a*bai' % case_dir):
os.system("bash -c 'java -jar /media/lab/data512gb/bcbio/share/java/picard-1.96/ReorderSam.jar I=%s/out2.bam O=%s/out2a.bam REFERENCE=%s/%s'" % (case_dir, case_dir, args['star_ref'], args['genome'])) # ~15minutes
os.system("bash -c 'samtools index %s/out2a.bam'" % case_dir)
#*** change -R path
if not glob('%s/out3.bam' % case_dir):
os.system("bash -c 'java -jar %s -T SplitNCigarReads -R %s/%s -I %s/out2a.bam -o %s/out3.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_dir))
#*** -R path and -known path #GATK RealignerTargetCreator #add Mills Gold Standard vcf
if not glob('%s/forIndelRealigner.intervals' % case_dir):
os.system("bash -c 'java -Xmx2g -jar %s -T RealignerTargetCreator -R %s/%s -nt %s -I %s/out3.bam -o %s/forIndelRealigner.intervals -known %s/%s'" % (args['gatk'], args['star_ref'], args['genome'], args['threads'], case_dir, case_dir, args['star_ref'], Mills))
#*** -R path and -known path #GATK IndelRealigner #add Mills Gold Standard vcf
if not glob('%s/out4.bam' % case_dir):
os.system("bash -c 'java -Xmx4g -jar %s -T IndelRealigner -R %s/%s -I %s/out3.bam -targetIntervals %s/forIndelRealigner.intervals -o %s/out4.bam -known %s/%s'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_dir, case_dir, args['star_ref'], Mills))
#*** -R path and -known path #GATK base recalibration #recommend latest version dbsnp...146.dbsnp.vcf
if not glob('%s/recalibration_report.grp' % case_dir):
os.system("bash -c 'java -Xmx4g -jar %s -T BaseRecalibrator -nct %s -I %s/out4.bam -R %s/%s -knownSites %s/%s -o %s/recalibration_report.grp'" % (args['gatk'], args['threads'], case_dir, args['star_ref'], args['genome'], args['star_ref'], Mills, case_dir))
#*** -R path
if not glob('%s/*final.bam' % case_dir):
os.system("bash -c 'java -jar %s -T PrintReads -R %s/%s -nct %s -I %s/out4.bam -BQSR %s/recalibration_report.grp -o %s/%s.final.bam'" % (args['gatk'], args['star_ref'], args['genome'], args['threads'], case_dir, case_dir, case_dir, case_name))
#1 core
#nohup bash $SCRIPTS/mpileup_varscan.sh $1 > ./nohup_mpileup_varscan.txt &
#Haplotype caller (RNA-seq mode) #*** -R path
if not glob(vcf_path):
os.system("bash -c 'java -jar %s -T HaplotypeCaller -R %s/%s -I %s/%s.final.bam -dontUseSoftClippedBases -nct %s -stand_call_conf 20.0 -stand_emit_conf 20.0 -o %s'" % (args['gatk'], args['star_ref'], args['genome'], case_dir, case_name, args['threads'], vcf_path))
if __name__ == "__main__": main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment