Last active
April 15, 2019 14:20
-
-
Save PoisonAlien/d8cd79a76aab32d80d0f to your computer and use it in GitHub Desktop.
This is a wrapper for STAR aligner with most commonly used arguments. Assumes STAR is under path and accessible.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'Anand M' | |
import argparse, datetime, os | |
############# parse arguments ######## | |
parser = argparse.ArgumentParser( | |
description="wrapper script for STAR aligner. Assumes STAR is installed under path and accessible.", | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
# Positional arguments | |
parser.add_argument('genomeDir', help='path to genome directory') | |
parser.add_argument('readFilesIn', help='path/to/read1 [path/to/read2]', nargs='+') | |
# file reading and multithreading options | |
parser.add_argument('--runThreadsN', help='Number of threads to use for mapping', default=4, type=int, dest='threads') | |
parser.add_argument('--outFileNamePrefix', | |
help='output files name prefix (including full or relative path). Can only be defined on the command line.', | |
default='myStar', type=str, dest='prefix') | |
parser.add_argument('--genomeLoad', help='mode of shared memory usage for the genome files', default='NoSharedMemory', | |
dest='mem', choices=['LoadAndKeep', 'LoadAndRemove', 'LoadAndExit', 'Remove', 'NoSharedMemory']) | |
parser.add_argument('--twopassMode', help='2-pass mapping mode.', choices=['None', 'Basic'], default='None', type=str, | |
dest='twoPass') | |
parser.add_argument('--rg', help='read group for sam of format "@RG\\tID:id\\tSM:sample\\tLB:lib" ', | |
default="@RG\\tID:id\\tSM:sample\\tLB:lib", type=str, dest="rg") | |
## Read-Parameters | |
inputReads = parser.add_argument_group('Read Parameters') | |
inputReads.add_argument('--readFilesCommand', help='uncomression method to use', default='zcat', dest='zip', | |
choices=['-', 'zcat', 'bzcat']) | |
inputReads.add_argument('--clip3pNbases', | |
help='number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.', | |
dest='clip3', type=int, default=0) | |
inputReads.add_argument('--clip5pNbases', | |
help='number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.', | |
dest='clip5', type=int, default=0) | |
inputReads.add_argument('--clip3pAdapterSeq', | |
help='adapter sequences to clip from 3p of each mate. If one value is given,it will be assumed the same for both mates.', | |
dest='clipAdapt', type=str, default='-') | |
# Splice junctions database | |
junction = parser.add_argument_group('splice junction options') | |
junction.add_argument('--sjdbGTFfile', help="string: path to the GTF file with annotations", default='-', | |
dest='sjdbGTFfile', type=str) | |
junction.add_argument('--sjdbGTFchrPrefix', | |
help="string: prefix for chromosome names in a GTF file (e.g. chr for using ENSMEBL annotations with UCSC geneomes)", | |
default='-', dest='sjdbPrefix', type=str) | |
## Quantification of Annotations | |
quant = parser.add_argument_group('Quantification of Annotation') | |
quant.add_argument('--quantMode', default='-', dest='quant', choices=['-', 'TranscriptomeSAM', 'GeneCounts'], | |
help='quantification of SJDB file') | |
## Output SAM and BAM options | |
output = parser.add_argument_group('Output: SAM and BAM') | |
output.add_argument('--outSAMtype', dest='outType', choices=['SAM', 'BAM', 'None'], default='BAM', type=str, | |
help='output type') | |
output.add_argument('--sortType', dest='sortType', choices=['Unsorted', 'SortedByCoordinate'], default='Unsorted', | |
type=str, help='sort order') | |
output.add_argument('--outSAMstrandField', dest='strandField', choices=['None', 'intronMotif'], default='intronMotif', | |
type=str, help='sam strand field type') | |
output.add_argument('--outFilterMultimapNmax', dest='maxAlign', default=10, type=int, | |
help='read alignments will be output only if the read maps fewer than this value,otherwise no alignments will be output') | |
args = parser.parse_args() | |
# input files | |
files = '' | |
if len(args.readFilesIn) <= 2: | |
for f in args.readFilesIn: | |
files = files + ' ' + f | |
else: | |
print("Please enter paired end files or interleaved single file. More than two input files entered !") | |
quit() | |
# sort type | |
samsort = args.outType + ' ' + args.sortType | |
# sam rg | |
args.rg = str.split(args.rg, '\\t') | |
dt = str(datetime.date.today().isoformat()) | |
rgattr = ' '.join(args.rg[1:]) + ' ' + 'DT:' + dt | |
#star command | |
starCommand = 'STAR --outFileNamePrefix %s --outSAMtype %s ' \ | |
"--genomeLoad %s --twopassMode %s " \ | |
"--clip3pNbases %d --clip5pNbases %d --clip3pAdapterSeq %s " \ | |
"--sjdbGTFfile %s --sjdbGTFchrPrefix %s --quantMode %s --outFilterMultimapNmax %d " \ | |
"--outSAMstrandField %s --outSAMattrRGline %s --runThreadN %d --readFilesCommand %s --genomeDir %s --readFilesIn %s" % ( | |
args.prefix, samsort, args.mem, args.twoPass, args.clip3, args.clip5, args.clipAdapt, | |
args.sjdbGTFfile, args.sjdbPrefix, args.quant, args.maxAlign, | |
args.strandField, rgattr, args.threads, args.zip, args.genomeDir, files) | |
print("Effective STAR command:\n") | |
print(starCommand) | |
print '\n[%s]\tAlignment started. \n' % datetime.datetime.now().strftime("%d-%m-%Y %H:%M") | |
os.system(starCommand) | |
print '[%s]\tAlignment finished. \n' % datetime.datetime.now().strftime("%d-%m-%Y %H:%M") | |
logFile = 'cat ' + args.prefix + 'Log.final.out' | |
os.system(logFile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment