Skip to content

Instantly share code, notes, and snippets.

@radaniba
Created November 29, 2012 17:17
Show Gist options
  • Save radaniba/4170501 to your computer and use it in GitHub Desktop.
Save radaniba/4170501 to your computer and use it in GitHub Desktop.
The python script rpkmforgenes.py is written for calculating gene expression for RNA-Seq data. It was used for a study published in PLoS Computational Biology (Ramsköld D, Wang ET, Burge CB, Sandberg R. An abundance of ubiquitously expressed genes reveale
Sequence data input:
A common input file format is bed file. In the simplest form, it has the format
chromosome -tab- startposition -tab- endposition
where each line corresponds to one sequence read
However this format can also include strand information:
chromosome -tab- startposition -tab- endposition -tab- name -tab- score -tab- +/-
for example:
chr13 54005048 54005081 . 1 -
The score field is ignored.
Files are allowed by have an optional first line starting with "track", this line is then ignored
Other input formats can be specified by -bedspace, -gff, -bowtie, -bedcompacted, -sam, -samse or -bam. -bowtie uses the default output format of Bowtie, -sam for SAM files, -bam is for BAM files. The reading of BAM files requires an index (.bai) and installation of the pysam module.
For SAM files, multimapping reads are not counted. Also, this is the only input format where the program has built-in support for paired-end reads. The option -samse forces the use of a more light-weight SAM parser, which is faster but requires uniquely mapped reads which are not paired-end.
Gzipped (.gz) and bzipped (.bz2) files can be used as input read files
READ MORE HERE
http://sandberg.cmb.ki.se/media/data/rnaseq/instructions-rpkmforgenes.html
#!/usr/bin/python
"""
Calculates gene expression from a read mapping file
"""
lastmodified = "12 Apr 2011"
author = "Daniel Ramskold"
#4 june 2010: now assumes sam and gff files use 1-based coordinates, not 0-based, -exonnorm is default
#6 june 2010: partially rewrote code for overlapping exons from different isoforms, swapped ID and symbol fields for some annotation types
#10 june 2010: corrected gff end coordinates
#17 june 2010: removed exons of other isoforms from -intronsinstead, normalise by only mRNA by default
#28 june 2010: bed12 format as annotation, -flat option for collapsing before RPKM calculation, put strand in unused gene name column if -strand
#3 july 2010: changed output header to give number of reads normalised by regardless of normalisation option
#8 july 2010: -bedann for 3 columns gives 1-based chrX:start:end but no change for -bed3ann, -bothends map end-1 position instead of end, added -n option, more stringent check (strand, length) to filter with -diffreads
#28 july 2010: added -unique option (removal of multimapping reads)
#6 aug 2010: -u option can call bigWigSummary on bigWig files
#11 aug 2010: rewrote parsing of sam files to make -bothends and -diffreads work better, removed the need for -unique option
#19 aug 2010: changed gene/ID fields with -bedann
#23 aug 2010: changed ouput of 0 rpkm to 0.0 rpkm
#10 sep 2010: indexing exons instead of reads
#17 nov 2010: reenabled old sam format input function (now with -samse), added bam file support
#9 dec 2010: allowed -u to read from files with FASTA header and fixed line wrap
#5 jan 2011: looks for .bigWig file suffix, fixed reporting for -readcount -nocollapse
#20 jan 2011: changed GTF format ID to transcript ID
#24 jan 2011: added -randomreads
#9 feb 2011: speed optimizations for -randomreads, fixed error reporting at 0 total reads
#7 mars 2011: debugging of isoform deconvolution
#12 apr 2011: added -namecollapse option
from numpy import matrix, linalg
import numpy, sys, time, os, subprocess
WINDOWSIZE = 5000
uniqueposdir = "none"
class Cexon:
def __init__(self, start, end, normalise_with):
self.start = start
self.end = end
self.reads = 0
self.transcripts = []
self.readspersample = []
self.forbidden = False # only for normalisation, hide from expression calculation and output
self.normalise_with = normalise_with # include in exon normalisation
def calclength(self, chromosome):
if ONLYUNIQUEPOS:
if USESTRANDINFO:
chromsome = chromosome[:-1]
if bigWigSummary_path:
self.uniquelengthfromBigWig(chromosome)
else:
self.setuniquelength(os.path.join(uniqueposdir, chromosome + ".fa"))
else:
self.length = self.end - self.start
def exonstring(self):
outstr = "(" + str(self.start) + "," + str(self.end)
for tx in self.transcripts:
outstr += "," + tx.ID
outstr += ") " + str(self.reads) + " " + str(self.length)
return outstr
def setuniquelength(self, chromosomefile):
try:
cfileh = openfile(chromosomefile)
except:
global warnedchromosomes
if not chromosomefile in warnedchromosomes:
warnedchromosomes.append(chromosomefile)
print "Warning: Did not find", chromosomefile
self.length = self.end - self.start
return
global chromosomefile_infodict
try: chromosomefile_infodict
except: chromosomefile_infodict = {}
try: offset, linelength, seqlength = chromosomefile_infodict[chromosomefile]
except:
line1 = cfileh.readline(1000)
if len(line1) < 1000 and line1[0] == '>': offset = len(line1)
else:
cfileh.seek(0)
offset = 0
line2 = cfileh.readline(1000)
if len(line2) < 1000:
linelength = len(line2)
seqlength = len(line2.rstrip())
else:
linelength = 0
seqlength = 0
chromosomefile_infodict[chromosomefile] = offset, linelength, seqlength
if linelength == 0:
startfilepos = self.start + offset
endfilepos = self.end + offset
else:
startfilepos = offset + (self.start // seqlength)*linelength + (self.start % seqlength)
endfilepos = offset + (self.end // seqlength)*linelength + (self.end % seqlength)
cfileh.seek(startfilepos, 0)
sequence = cfileh.read(endfilepos-startfilepos)
self.length = sequence.count('A')+sequence.count('C')+sequence.count('G')+sequence.count('T') # upper-case means unique
def uniquelengthfromBigWig(self, chromosome):
try:
proc = subprocess.Popen([bigWigSummary_path, uniqueposdir, chromosome, str(self.start), str(self.end), '1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except KeyboardInterrupt:
raise
except:
global bigwig_norun
try: bigwig_norun
except:
print 'Error: Failed running bigWigSummary, required by the -u option for bigWig files. If it was\'t aborted by the user, perhaps its permission to execute has not been set (\'chmod +x '+ bigWigSummary_path + '\' on Unix-like systems)'
sys.exit(1)
bigwig_norun = 1
self.length = self.end - self.start
return
ret, err = proc.communicate()
if proc.returncode:
if err.startswith('no data'):
self.length = 0
else:
print "Warning: bigWigSummary returned an error:"
print err
self.length = self.end - self.start
else:
self.length = float(ret)*(self.end - self.start)
class Ctranscript:
def __init__(self, ID):
self.ID = ID
self.expression = []
self.reads = []
self.overlaptx = [self]
self.genename = "?"
class Cgene:
def __init__(self, name):
self.name = name
self.transcripts = []
self.exons = []
self.chromosome = ""
self.overlapsets = []
def exonstring(self):
outstr = self.name
for exon in self.exons:
outstr += "\t" + exon.exonstring()
return outstr
class Cunit:
# after collapse of genes
def __init__(self):
self.name1 = "."
self.name2 = "."
self.rpkms = []
self.reads = []
def contractarrays(Ll, Rl):
row = 0
while row < len(Rl):
zeropattern = Ll[row]
(Ll, Rl) = contractarrays_inner(zeropattern, Ll, Rl, row)
row += 1
return (Ll, Rl)
def contractarrays_inner(zeropattern, Ll, Rl, firstmatchingrow):
currentrow = firstmatchingrow+1
while currentrow < len(Ll):
matches = 1
for place in range(len(zeropattern)):
if (zeropattern[place] and not Ll[currentrow][place]) or \
(Ll[currentrow][place] and not zeropattern[place]):
matches = 0
if matches:
for place in range(len(zeropattern)):
Ll[firstmatchingrow][place] += Ll[currentrow][place]
Rl[firstmatchingrow][0] += Rl[currentrow][0]
del Ll[currentrow]
del Rl[currentrow]
else:
currentrow += 1
return (Ll, Rl)
def rowincludes(inclusiverow, includedrow):
# checks if all elements of includedrows are found within inclusiverow
for element in range(len(inclusiverow)):
if includedrow[element] and not inclusiverow[element]:
return 0
return 1
def elementsinrow(row):
count = 0
for element in range(len(row)):
if row[element]:
count += 1
return count
def removecommonrow(Ll, Rl):
currentrow = 0
while currentrow < len(Ll):
matches = 1
for place in range(len(Ll[currentrow])):
if not Ll[currentrow][place]:
matches = 0
if matches:
del Ll[currentrow]
del Rl[currentrow]
else:
currentrow += 1
return (Ll, Rl)
def removecolumn(Ll, column):
currentrow = 0
while currentrow < len(Ll):
del Ll[currentrow][column]
currentrow += 1
return Ll
def removezeros(Ll, Rl):
currentrow = 0
while currentrow < len(Ll):
matches = 1
for place in range(len(Ll[currentrow])):
if Ll[currentrow][place]:
matches = 0
if matches:
del Ll[currentrow]
del Rl[currentrow]
else:
currentrow += 1
return (Ll, Rl)
def fromannotationline(line):
if annotationtype == 0:
# from refGene.txt
p = line.rstrip('\r\n').split("\t")
exonstarts = [int(f) for f in p[9].split(",")[:-1]] # start positions for exons for the gene
exonends = [int(f) for f in p[10].split(",")[:-1]]
ID = p[1]
chromosome = p[2]
genename = p[12]
strand = p[3]
cdsstart = min(int(p[7]), int(p[6]))
cdsend = max(int(p[7]), int(p[6]))
elif annotationtype == 1:
# from altered knownGene.txt
p = line.rstrip('\r\n').split("\t")
exonstarts = [int(f) for f in p[8].split(",")[:-1]]
exonends = [int(f) for f in p[9].split(",")[:-1]]
ID = p[11]
chromosome = p[1]
genename = p[0]
strand = p[2]
cdsstart = min(int(p[5]), int(p[6]))
cdsend = max(int(p[5]), int(p[6]))
elif annotationtype == 2:
# from ensGene.txt or sibGene.txt
p = line.rstrip('\r\n').split("\t")
ID = p[1]
strand = p[3]
chromosome = p[2]
genename = p[12]
cdsstart = min(int(p[7]), int(p[6]))
cdsend = max(int(p[7]), int(p[6]))
exonstarts = [int(f) for f in p[9].split(",")[:-1]]
exonends = [int(f) for f in p[10].split(",")[:-1]]
elif annotationtype == 3:
# 3 column bed file
p = line.rstrip('\r\n').split("\t")
exonstarts = [int(p[1])]
exonends = [int(p[2])]
cdsstart = 0
cdsend = 0
chromosome = p[0]
genename = None
ID = p[0]+":"+p[1]+"-"+p[2]
strand = "+"
elif annotationtype == 4:
# 6 column bed file
p = line.rstrip('\r\n').split("\t")
exonstarts = [int(p[1])]
exonends = [int(p[2])]
cdsstart = 0
cdsend = 0
chromosome = p[0]
genename = None
ID = p[3]
if len(p) > 5: strand = p[5]
else: strand = "+"
elif annotationtype == 5:
# gtf file format
p = line.rstrip('\r\n').split("\t")
exonstarts = [int(p[3])-1]
exonends = [int(p[4])]
cdsstart = 0
cdsend = 0
chromosome = p[0]
transInfo = p[8].split(';')
transIDs = transInfo[1].lstrip('transcript_id ').strip('"').replace(' ', '').split(',')
ID = '+'.join(transIDs)
genename = p[0]+":"+p[3]+"-"+p[4]
strand = p[6]
if strand == '.': strand = '+'
elif annotationtype == 6:
# up to 12 column bed file
p = line.rstrip('\r\n').split("\t")
chromosome = p[0]
absstart = int(p[1])
try: absend = int(p[2])
except: absend = absstart
try: strand = p[5]
except: strand = "+"
try: cdsstart = int(p[6]); cdsend = int(p[7])
except: cdsstart = 0; cdsend = 0
try: blocksizes = map(int, p[10].split(',')) ; blockstarts = map(int, p[11].split(','))
except: blocksizes = [absend - absstart]; blockstarts = [0]
exonstarts = [absstart + rs for rs in blockstarts]
exonends = [es + size for es,size in zip(exonstarts, blocksizes)]
try: genename = p[3]
except:
genename = None
ID = str(chromosome)+":"+str(min(exonstarts)+1)+"-"+str(max(exonends)+1)
if len(blockstarts) != len(blocksizes) or max(exonends) != absend:
print "Warning: Block structure for " + ID + " is malformed"
if USESTRANDINFO:
if genename is None: genename = strand
if swapstrands:
if strand == "+": chromosome += "-"
else: chromosome += "+"
else:
chromosome += strand # so e.g. "chr1+" is different from "chr1-"
else:
if genename is None: genename = '.'
return (chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID)
class Cline:
def __init__(self, line):
global allchromosomes, allchromosomes_dict
self.line = line
(chromosome, direction, cdsstart, cdsend, exonstarts, exonends, genename, ID) = fromannotationline(line)
try: self.chromosome = allchromosomes_dict[chromosome]
except:
allchromosomes.append(chromosome)
self.chromosome = allchromosomes.index(chromosome)
allchromosomes_dict[chromosome] = self.chromosome
self.start = min(exonstarts)
self.end = max(exonends)
self.strand = direction
def __cmp__(self, other):
if self.chromosome != other.chromosome:
return cmp(self.chromosome, other.chromosome)
if self.start > other.end:
return 1
if other.start > self.end:
return -1
return cmp(self.start, other.start)
def openfile(filename, mode='r'):
if filename.endswith(".gz"):
import gzip
fileh = gzip.open(filename, mode)
elif filename.endswith(".bz2"):
import bz2
fileh = bz2.BZ2File(filename, mode)
else: fileh = open(filename,mode)
return fileh
def addread(chromosome, start, end, strand):
try: chrID = allchromosomes_dict[chromosome]
except:
return -1, None
if strand == '-':
return chrID, (start, 1-end)
else:
return chrID, (start,end-1)
def addread_tuple(chromosome, postuple):
try: chrID = allchromosomes_dict[chromosome]
except:
return -1, None
return chrID, postuple
def readsfrombedtabfile(filename, justreadnum):
try:
if removemultimappers:
tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep='\t')
fileh = open(tmpfile,'r')
else:
fileh = openfile(filename)
except IOError:
print "Warning: No such file", filename
return
try:
if not fileh.readline().startswith("track"): fileh.seek(0)
while 1:
line = fileh.readline()
if not line: break
if justreadnum: yield 1; continue
p = line.rstrip('\r\n').split("\t")
chromosome = p[0]
try: strand = p[5]
except: strand = '.'
if USESTRANDINFO: chromosome += strand
yield addread(chromosome, int(p[1]), int(p[2]), strand)
finally:
fileh.close()
if removemultimappers:
os.remove(tmpfile)
def readsfrombedtabfile_rs(filename, justreadnum):
if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format"
try: fileh = openfile(filename)
except:
print "Warning: No such file", filename
return
try:
if not fileh.readline().startswith("track"): fileh.seek(0)
while 1:
line = fileh.readline()
if not line: break
p = line.rstrip('\r\n').split("\t")
chromosome = p[0]
try: strand = p[5]
except: strand = '.'
if USESTRANDINFO: chromosome += strand
for nb_reads in xrange(int(p[4])):
yield addread(chromosome, int(p[1]), int(p[2]), strand)
finally:
fileh.close()
if removemultimappers:
os.remove(tmpfile)
def readsfrombedspacefile(filename, justreadnum):
try:
if removemultimappers:
tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep=' ')
fileh = open(tmpfile,'r')
else:
fileh = openfile(filename)
except IOError:
print "Warning: No such file", filename
return
try:
if not fileh.readline().startswith("track"): fileh.seek(0)
while 1:
line = fileh.readline()
if not line: break
if justreadnum: yield 1; continue
p = line.rstrip('\r\n').split()
chromosome = p[0]
try: strand = p[5]
except: strand = '.'
if USESTRANDINFO: chromosome += strand
yield addread(chromosome, int(p[1]), int(p[2]), strand)
finally:
fileh.close()
def readsfromgtffile(filename, justreadnum):
if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format"
try: fileh = openfile(filename)
except:
print "Warning: No such file", filename
return
try:
if not fileh.readline().startswith("track"): fileh.seek(0)
while 1:
line = fileh.readline()
if not line: break
if justreadnum: yield 1; continue
p = line.rstrip('\r\n').split("\t")
chromosome = p[0]
try: strand = p[6]
except: strand = '.'
if USESTRANDINFO: chromosome += strand
yield addread(chromosome, int(p[3])-1, int(p[4]), strand)
finally:
fileh.close()
def splitcigar(string):
numarr = ['']
symbolarr = []
wasnum = 1
for letter in string:
if letter.isdigit():
if wasnum:
numarr[-1] += letter
else:
numarr.append(letter)
wasnum = 1
else:
if not wasnum:
symbolarr[-1] += letter
else:
symbolarr.append(letter)
wasnum = 0
return [int(v) for v in numarr], symbolarr
def rowparse(row):
chromosome = row[2]
start = row[1]-1
endincl = start + row[4]-1
if row[3] & 16:
if USESTRANDINFO:
chromosome += "-"
endincl = -endincl
else:
if USESTRANDINFO:
chromosome += "+"
return chromosome, start, endincl
def process_rowinfo(rowinfo, justreadnum):
lastname = ''
while 1:
try: row = rowinfo.pop()
except IndexError: break
try: nrow = rowinfo[-1]
except IndexError: pass
else:
if row[0] == nrow[0]: # same name
try:
if rowinfo[-2][0] == row[0]:
# multimapper
try:
while rowinfo[-1][0] == row[0]:
rowinfo.pop()
except IndexError: pass
continue
except IndexError: pass
if row[3]&0x40 or row[3]&0x80:
# paired end reads
if row[4]&0x80:
chromosome_1, start_1, endincl_1 = rowparse(nrow)
chromosome_2, start_2, endincl_2 = rowparse(row)
else:
chromosome_1, start_1, endincl_1 = rowparse(row)
chromosome_2, start_2, endincl_2 = rowparse(nrow)
yield addread_tuple(chromosome_1, (start_1, endincl_1, start_2, endincl_1))
continue
# multimapper
try:
while rowinfo[-1][0] == row[0]:
rowinfo.pop()
except IndexError: pass
continue
chromosome_1, start_1, endincl_1 = rowparse(row)
yield addread_tuple(chromosome_1, (start_1, endincl_1))
def readsfromsam(filename, justreadnum):
global saved_rowinfo
try:
if justreadnum: raise NameError
rowinfo, fn = saved_rowinfo
del saved_rowinfo
assert fn == filename
except NameError:
try:
fileh = openfile(filename)
except IOError:
print "Warning: No such file", filename
return
rowinfo = []
for line in fileh:
if line.startswith("@"): continue # header
p = line.rstrip('\r\n').split("\t")
bits = int(p[1])
if bits & 4: continue # unmapped
chromosome = p[2]
name = p[0]
length = sum([l for l,s in zip(*splitcigar(p[5])) if s in ["M","D","X","=","N"]])
rowinfo.append((name, int(p[3]), chromosome, bits, length))
rowinfo.sort()
import copy
if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
fileh.close()
return process_rowinfo(rowinfo, justreadnum)
def readsfrombam(filename, justreadnum):
global saved_rowinfo
try:
if justreadnum: raise NameError
rowinfo, fn = saved_rowinfo
del saved_rowinfo
assert fn == filename
except NameError:
try:
import pysam
except:
print "Error: tried to read BAM file, but missing the pysam module"
exit(1)
try:
fileh = pysam.Samfile(filename, 'rb')
except IOError:
print "Warning: No such file", filename
return
rowinfo = []
for read in fileh:
if read.flag & 4: continue # unmapped
chromosome = fileh.getrname(read.rname)
name = read.qname
length = sum(l for o,l in read.cigar if o in [0,2,3]) #0=M,1=I,2=D,3=N
rowinfo.append((name, read.pos+1, chromosome, read.flag, length))
rowinfo.sort()
import copy
if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
fileh.close()
return process_rowinfo(rowinfo, justreadnum)
def readsfromsam_se(filename, justreadnum):
try:
if removemultimappers:
tmpfile = sorttotmpfile(filename, 0, ignoredprefix='@', sep='\t')
fileh = open(tmpfile,'r')
else:
fileh = openfile(filename)
except IOError:
print "Warning: No such file", filename
return
try:
while 1:
line = fileh.readline()
if not line: break
if line.startswith("@"): continue # header
p = line.rstrip('\r\n').split("\t")
if int(p[1]) & 4: continue # unmapped
if justreadnum: yield 1; continue
chromosome = p[2]
if int(p[1]) & 16: strand = "-"
else: strand = "+"
if USESTRANDINFO:
chromosome += strand
cigarints, cigarsymbols = splitcigar(p[5])
length = sum([l for l,s in zip(cigarints, cigarsymbols) if s in ["M","D","X","=","N"]])
start = int(p[3])-1
yield addread(chromosome, start, start+length, strand)
finally:
fileh.close()
if removemultimappers:
os.remove(tmpfile)
return
def readsfrombowtieoutput(filename, justreadnum):
try:
if removemultimappers:
tmpfile = sorttotmpfile(filename, 0, sep='\t')
fileh = open(tmpfile,'r')
else:
fileh = openfile(filename)
except IOError:
print "Warning: No such file", filename
return
try:
while 1:
line = fileh.readline()
if not line: break
p = line.rstrip('\r\n').split("\t")
chromosome = p[2]
try: strand = p[1]
except: strand = '.'
if USESTRANDINFO: chromosome += strand
yield addread(chromosome, int(p[3]), int(p[3])+len(p[4]), strand)
finally:
fileh.close()
if removemultimappers:
os.remove(tmpfile)
return
def sorttotmpfile(filename, idindex, ignoredprefix='', sep='\t'):
# sort file, remove duplicates, export to temporary file
# returned file needs later removal with os.remove()
import tempfile
outf = tempfile.mkstemp(suffix='_rpkmforgenes')[1]
try:
print "Sorting to ", outf
infh = openfile(filename, 'r')
lines = []
for line in infh:
if ignoredprefix and line.startswith(ignoredprefix): continue
p = line.rstrip('\r\n').split('\t')
if ignoredprefix=='@' and int(p[2])&0x80: continue # remove 2nd in paired read
if len(p) <= idindex:
lines.append(('', line))
else:
lines.append((p[idindex],line))
infh.close()
lines.sort()
outfh = open(outf, 'w')
for i in range(len(lines)):
this = lines[i][0]
if i > 0:
neighbour = lines[i-1][0]
if neighbour and neighbour == this: continue
try:
neighbour = lines[i+1][0]
except IndexError: pass
else:
if neighbour and neighbour == this: continue
outfh.write(lines[i][1])
outfh.close()
except:
try: os.remove(outf)
except: pass
raise
return outf
def mappos(chrindex, readpos):
try: window = allwindows[chrindex][readpos//WINDOWSIZE]
except IndexError:
return []
return [exon for exon in window if exon.start <= readpos < exon.end]
def getarguments(flag):
# list of arguments that follow the flag
flagindex = sys.argv.index(flag)
ret = []
index = flagindex + 1
while index < len(sys.argv) and sys.argv[index][0] != "-":
ret.append(sys.argv[index])
index += 1
return ret
def getargument(flag):
# the argument after the flag
flagindex = sys.argv.index(flag)
return sys.argv[flagindex+1]
def testargumentflag(flag):
# if the flag is among the arguments
if flag in sys.argv: return 1
else: return 0
def main():
global MAXGENES
global MAXREADS
global ONLYUNIQUEPOS
global ONLYTXUNIQUEEXONS
global INTRONSINSTEAD
global NODECONVOLUTION
global COLLAPSEGENES
global FULLTRANSCRIPTS
global USESTRANDINFO
global allchromosomes
global allchromosomes_dict
global annotationtype
global uniqueposdir
global swapstrands
global mapends
global removemultimappers
global bigWigSummary_path
global allwindows
# interpret program arguments
if len(sys.argv) < 2 or testargumentflag("--help") or testargumentflag("-h"):
print "Non-optional arguments:"
print " -o followed by output file"
print " -i followed by list of input files (by default, guesses format from file extension)"
print " -a followed by annotation file"
print "Gene model-related options:"
print " -u followed by a bigWig file, alternatively a directory for files for non-unique positions (lower case for nonunique k-mers (where k is the read length), upper case for unique; filenames are e.g. chr1.fa"
print " -fulltranscript to not remove 3'UTRs"
print " -maxlength followed by a distance to cut each transcript from the 3' end (5' if negative)"
print " -maxgenes limit how many genes expression is calculated for"
print " -limitcollapse to not consider indirect transcript overlap"
print " -namecollapse to only consider overlap between isoform with the same gene identifier"
print " -nocollapse to get isoform expressions (shaky)"
print " -nooverlap to ignore that transcripts can overlap (will count some reads several times)"
print " -flat to flatten all isoforms to one gene model"
print " -txunique to ignore exons shared by multiple gene isoforms"
print " -onlycoding to ignore noncoding transcripts"
print " -swapstrands to make reads on + strand map to genes on - and vice versa (and sets -strand)"
print " -introns gives gene expression from introns rather than exons (also removes exons of other isoforms)"
print " -keephap to not remove haplotype chromosome (_hap) annotation"
print "Annotation file formats:"
print " -genePred if annotation file uses format of refGene.txt etc (default if cannot guess from file name suffix)"
print " -bedann tab-separated 0-based bed file, chromosome start end and 9 optional fields"
print " -gffann GFF format, but no groups, only single exon genes"
print "Input formats:"
print " -bed tab separated bed file (default if cannot guess from file name suffix)"
print " -bedcompacted bed file with number of reads in the score column"
print " -bedspace space separated bed file"
print " -bowtie the default output format of bowtie"
print " -sam SAM format"
print " -bam BAM format"
print " -samse SAM format, single-end, uniquely mapped reads (faster than -sam)"
print " -gff GFF file, no groups"
print "Normalisation options:"
print " -mRNAnorm to normalize by the number of reads matching mRNA exons (default)"
print " -exonnorm to normalize by the number of reads matching exons, including ncRNA"
print " -allmapnorm to normalize by the total number of mapped reads (default if annotation contains no mRNA)"
print " -forcetotal followed by a number of reads for each sample to set a constant to normalise by"
print "Other optional arguments:"
print " -strand to use strand information of reads"
print " -bothends to also map the end positions to genes, each end counted as 0.5 (or 0.25 for paired-end reads)"
print " -diffreads to count only one read if several have the same position, strand and length"
print " -n followed by list of sample names (input file names are otherwise used)"
print " -maxreads followed by maximum number of reads to be used"
print " -randomreads to make -maxreads pick reads at random"
print " -readcount to add the number of reads supporting expression to the output"
print " -h to print this message and quit"
print "Special output values:"
print " 0 gene has no reads, -1 gene has no exons"
print " otherwise the output is in reads per kilobase and million mappable reads (or rather FPKM for paired-end reads)"
print "Output:"
print " gene symbol -tab- ID -tab- RPKM values [-tab- read count]"
return 0
try: outfile = getargument("-o")
except:
print "No outfile (-o)"
return 1
try: infiles = getarguments("-i")
except:
print "No infile (-i)"
return 1
try: annotationfile = getargument("-a")
except:
print "No annotationfile (-a)"
return 1
try:
names = getarguments("-n")
if len(names) != len(infiles):
print "Not the same number of sample names as files with reads"
return 1
except: names = [filename.rsplit("/")[-1] for filename in infiles]
compressionsuffixes = ['gz','bz2']
if testargumentflag("-genePred") or testargumentflag("-refseq"): annotationtype = 0
elif testargumentflag("-ucsc"): annotationtype = 1
elif testargumentflag("-ensembl"): annotationtype = 2
elif testargumentflag("-bedann"): annotationtype = 6
elif testargumentflag("-bed3ann"): annotationtype = 3
elif testargumentflag("-bed6ann"): annotationtype = 4
elif testargumentflag("-gtfann") or testargumentflag("-gffann"): annotationtype = 5
else:
# if no annotation type given, guess from file suffix
anndict = {'bed':6, 'gff':5, 'gtf':5, 'txt':0}
try:
annpre, ending = annotationfile.rsplit('.',1)
if ending in compressionsuffixes: ending = annpre.rsplit('.',1)[1]
annotationtype = anndict[ending]
except:
annotationtype = 0
if testargumentflag("-bed"): infileformat = "bed"
elif testargumentflag("-bedcompacted"): infileformat = "bedcompacted"
elif testargumentflag("-bedspace"): infileformat = "bedspace"
elif testargumentflag("-gff") or testargumentflag("-gtf"): infileformat = "gtf"
elif testargumentflag("-sam"): infileformat = "sam"
elif testargumentflag("-bowtie"): infileformat = "bowtie"
elif testargumentflag("-samse"): infileformat = "samse"
elif testargumentflag("-bam"): infileformat = "bam"
else: infileformat = "guess"
if testargumentflag("-nocollapse"): COLLAPSEGENES = 0
else: COLLAPSEGENES = 1
if testargumentflag("-limitcollapse"): limitcollapse = 1
else: limitcollapse = 0
genecollapse = testargumentflag("-namecollapse")
if testargumentflag("-fulltranscript"): FULLTRANSCRIPTS = 1
else: FULLTRANSCRIPTS = 0
bigWigSummary_path = 0
if testargumentflag("-u"):
uniqueposdir = getargument("-u")
if uniqueposdir.endswith('.bw') or uniqueposdir.endswith('.bw.gz') or uniqueposdir.endswith('.bw.bz2') or uniqueposdir.endswith('.bigWig') or uniqueposdir.endswith('.bigWig.gz') or uniqueposdir.endswith('.bigWig.bz2'):
bigWigSummary_path = os.path.split(sys.argv[0])[0]
if bigWigSummary_path == '':
bigWigSummary_path = './'
bigWigSummary_path = os.path.join(bigWigSummary_path, 'bigWigSummary')
if not os.path.isfile(bigWigSummary_path):
print 'Error: ' + bigWigSummary_path + ' does not exist, needed to read bigWig file spec
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment