Created
November 29, 2012 17:17
-
-
Save radaniba/4170501 to your computer and use it in GitHub Desktop.
The python script rpkmforgenes.py is written for calculating gene expression for RNA-Seq data. It was used for a study published in PLoS Computational Biology (Ramsköld D, Wang ET, Burge CB, Sandberg R. An abundance of ubiquitously expressed genes reveale
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sequence data input: | |
A common input file format is bed file. In the simplest form, it has the format | |
chromosome -tab- startposition -tab- endposition | |
where each line corresponds to one sequence read | |
However this format can also include strand information: | |
chromosome -tab- startposition -tab- endposition -tab- name -tab- score -tab- +/- | |
for example: | |
chr13 54005048 54005081 . 1 - | |
The score field is ignored. | |
Files are allowed by have an optional first line starting with "track", this line is then ignored | |
Other input formats can be specified by -bedspace, -gff, -bowtie, -bedcompacted, -sam, -samse or -bam. -bowtie uses the default output format of Bowtie, -sam for SAM files, -bam is for BAM files. The reading of BAM files requires an index (.bai) and installation of the pysam module. | |
For SAM files, multimapping reads are not counted. Also, this is the only input format where the program has built-in support for paired-end reads. The option -samse forces the use of a more light-weight SAM parser, which is faster but requires uniquely mapped reads which are not paired-end. | |
Gzipped (.gz) and bzipped (.bz2) files can be used as input read files | |
READ MORE HERE | |
http://sandberg.cmb.ki.se/media/data/rnaseq/instructions-rpkmforgenes.html | |
#!/usr/bin/python | |
""" | |
Calculates gene expression from a read mapping file | |
""" | |
lastmodified = "12 Apr 2011" | |
author = "Daniel Ramskold" | |
#4 june 2010: now assumes sam and gff files use 1-based coordinates, not 0-based, -exonnorm is default | |
#6 june 2010: partially rewrote code for overlapping exons from different isoforms, swapped ID and symbol fields for some annotation types | |
#10 june 2010: corrected gff end coordinates | |
#17 june 2010: removed exons of other isoforms from -intronsinstead, normalise by only mRNA by default | |
#28 june 2010: bed12 format as annotation, -flat option for collapsing before RPKM calculation, put strand in unused gene name column if -strand | |
#3 july 2010: changed output header to give number of reads normalised by regardless of normalisation option | |
#8 july 2010: -bedann for 3 columns gives 1-based chrX:start:end but no change for -bed3ann, -bothends map end-1 position instead of end, added -n option, more stringent check (strand, length) to filter with -diffreads | |
#28 july 2010: added -unique option (removal of multimapping reads) | |
#6 aug 2010: -u option can call bigWigSummary on bigWig files | |
#11 aug 2010: rewrote parsing of sam files to make -bothends and -diffreads work better, removed the need for -unique option | |
#19 aug 2010: changed gene/ID fields with -bedann | |
#23 aug 2010: changed ouput of 0 rpkm to 0.0 rpkm | |
#10 sep 2010: indexing exons instead of reads | |
#17 nov 2010: reenabled old sam format input function (now with -samse), added bam file support | |
#9 dec 2010: allowed -u to read from files with FASTA header and fixed line wrap | |
#5 jan 2011: looks for .bigWig file suffix, fixed reporting for -readcount -nocollapse | |
#20 jan 2011: changed GTF format ID to transcript ID | |
#24 jan 2011: added -randomreads | |
#9 feb 2011: speed optimizations for -randomreads, fixed error reporting at 0 total reads | |
#7 mars 2011: debugging of isoform deconvolution | |
#12 apr 2011: added -namecollapse option | |
from numpy import matrix, linalg | |
import numpy, sys, time, os, subprocess | |
WINDOWSIZE = 5000 | |
uniqueposdir = "none" | |
class Cexon: | |
def __init__(self, start, end, normalise_with): | |
self.start = start | |
self.end = end | |
self.reads = 0 | |
self.transcripts = [] | |
self.readspersample = [] | |
self.forbidden = False # only for normalisation, hide from expression calculation and output | |
self.normalise_with = normalise_with # include in exon normalisation | |
def calclength(self, chromosome): | |
if ONLYUNIQUEPOS: | |
if USESTRANDINFO: | |
chromsome = chromosome[:-1] | |
if bigWigSummary_path: | |
self.uniquelengthfromBigWig(chromosome) | |
else: | |
self.setuniquelength(os.path.join(uniqueposdir, chromosome + ".fa")) | |
else: | |
self.length = self.end - self.start | |
def exonstring(self): | |
outstr = "(" + str(self.start) + "," + str(self.end) | |
for tx in self.transcripts: | |
outstr += "," + tx.ID | |
outstr += ") " + str(self.reads) + " " + str(self.length) | |
return outstr | |
def setuniquelength(self, chromosomefile): | |
try: | |
cfileh = openfile(chromosomefile) | |
except: | |
global warnedchromosomes | |
if not chromosomefile in warnedchromosomes: | |
warnedchromosomes.append(chromosomefile) | |
print "Warning: Did not find", chromosomefile | |
self.length = self.end - self.start | |
return | |
global chromosomefile_infodict | |
try: chromosomefile_infodict | |
except: chromosomefile_infodict = {} | |
try: offset, linelength, seqlength = chromosomefile_infodict[chromosomefile] | |
except: | |
line1 = cfileh.readline(1000) | |
if len(line1) < 1000 and line1[0] == '>': offset = len(line1) | |
else: | |
cfileh.seek(0) | |
offset = 0 | |
line2 = cfileh.readline(1000) | |
if len(line2) < 1000: | |
linelength = len(line2) | |
seqlength = len(line2.rstrip()) | |
else: | |
linelength = 0 | |
seqlength = 0 | |
chromosomefile_infodict[chromosomefile] = offset, linelength, seqlength | |
if linelength == 0: | |
startfilepos = self.start + offset | |
endfilepos = self.end + offset | |
else: | |
startfilepos = offset + (self.start // seqlength)*linelength + (self.start % seqlength) | |
endfilepos = offset + (self.end // seqlength)*linelength + (self.end % seqlength) | |
cfileh.seek(startfilepos, 0) | |
sequence = cfileh.read(endfilepos-startfilepos) | |
self.length = sequence.count('A')+sequence.count('C')+sequence.count('G')+sequence.count('T') # upper-case means unique | |
def uniquelengthfromBigWig(self, chromosome): | |
try: | |
proc = subprocess.Popen([bigWigSummary_path, uniqueposdir, chromosome, str(self.start), str(self.end), '1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
except KeyboardInterrupt: | |
raise | |
except: | |
global bigwig_norun | |
try: bigwig_norun | |
except: | |
print 'Error: Failed running bigWigSummary, required by the -u option for bigWig files. If it was\'t aborted by the user, perhaps its permission to execute has not been set (\'chmod +x '+ bigWigSummary_path + '\' on Unix-like systems)' | |
sys.exit(1) | |
bigwig_norun = 1 | |
self.length = self.end - self.start | |
return | |
ret, err = proc.communicate() | |
if proc.returncode: | |
if err.startswith('no data'): | |
self.length = 0 | |
else: | |
print "Warning: bigWigSummary returned an error:" | |
print err | |
self.length = self.end - self.start | |
else: | |
self.length = float(ret)*(self.end - self.start) | |
class Ctranscript: | |
def __init__(self, ID): | |
self.ID = ID | |
self.expression = [] | |
self.reads = [] | |
self.overlaptx = [self] | |
self.genename = "?" | |
class Cgene: | |
def __init__(self, name): | |
self.name = name | |
self.transcripts = [] | |
self.exons = [] | |
self.chromosome = "" | |
self.overlapsets = [] | |
def exonstring(self): | |
outstr = self.name | |
for exon in self.exons: | |
outstr += "\t" + exon.exonstring() | |
return outstr | |
class Cunit: | |
# after collapse of genes | |
def __init__(self): | |
self.name1 = "." | |
self.name2 = "." | |
self.rpkms = [] | |
self.reads = [] | |
def contractarrays(Ll, Rl): | |
row = 0 | |
while row < len(Rl): | |
zeropattern = Ll[row] | |
(Ll, Rl) = contractarrays_inner(zeropattern, Ll, Rl, row) | |
row += 1 | |
return (Ll, Rl) | |
def contractarrays_inner(zeropattern, Ll, Rl, firstmatchingrow): | |
currentrow = firstmatchingrow+1 | |
while currentrow < len(Ll): | |
matches = 1 | |
for place in range(len(zeropattern)): | |
if (zeropattern[place] and not Ll[currentrow][place]) or \ | |
(Ll[currentrow][place] and not zeropattern[place]): | |
matches = 0 | |
if matches: | |
for place in range(len(zeropattern)): | |
Ll[firstmatchingrow][place] += Ll[currentrow][place] | |
Rl[firstmatchingrow][0] += Rl[currentrow][0] | |
del Ll[currentrow] | |
del Rl[currentrow] | |
else: | |
currentrow += 1 | |
return (Ll, Rl) | |
def rowincludes(inclusiverow, includedrow): | |
# checks if all elements of includedrows are found within inclusiverow | |
for element in range(len(inclusiverow)): | |
if includedrow[element] and not inclusiverow[element]: | |
return 0 | |
return 1 | |
def elementsinrow(row): | |
count = 0 | |
for element in range(len(row)): | |
if row[element]: | |
count += 1 | |
return count | |
def removecommonrow(Ll, Rl): | |
currentrow = 0 | |
while currentrow < len(Ll): | |
matches = 1 | |
for place in range(len(Ll[currentrow])): | |
if not Ll[currentrow][place]: | |
matches = 0 | |
if matches: | |
del Ll[currentrow] | |
del Rl[currentrow] | |
else: | |
currentrow += 1 | |
return (Ll, Rl) | |
def removecolumn(Ll, column): | |
currentrow = 0 | |
while currentrow < len(Ll): | |
del Ll[currentrow][column] | |
currentrow += 1 | |
return Ll | |
def removezeros(Ll, Rl): | |
currentrow = 0 | |
while currentrow < len(Ll): | |
matches = 1 | |
for place in range(len(Ll[currentrow])): | |
if Ll[currentrow][place]: | |
matches = 0 | |
if matches: | |
del Ll[currentrow] | |
del Rl[currentrow] | |
else: | |
currentrow += 1 | |
return (Ll, Rl) | |
def fromannotationline(line): | |
if annotationtype == 0: | |
# from refGene.txt | |
p = line.rstrip('\r\n').split("\t") | |
exonstarts = [int(f) for f in p[9].split(",")[:-1]] # start positions for exons for the gene | |
exonends = [int(f) for f in p[10].split(",")[:-1]] | |
ID = p[1] | |
chromosome = p[2] | |
genename = p[12] | |
strand = p[3] | |
cdsstart = min(int(p[7]), int(p[6])) | |
cdsend = max(int(p[7]), int(p[6])) | |
elif annotationtype == 1: | |
# from altered knownGene.txt | |
p = line.rstrip('\r\n').split("\t") | |
exonstarts = [int(f) for f in p[8].split(",")[:-1]] | |
exonends = [int(f) for f in p[9].split(",")[:-1]] | |
ID = p[11] | |
chromosome = p[1] | |
genename = p[0] | |
strand = p[2] | |
cdsstart = min(int(p[5]), int(p[6])) | |
cdsend = max(int(p[5]), int(p[6])) | |
elif annotationtype == 2: | |
# from ensGene.txt or sibGene.txt | |
p = line.rstrip('\r\n').split("\t") | |
ID = p[1] | |
strand = p[3] | |
chromosome = p[2] | |
genename = p[12] | |
cdsstart = min(int(p[7]), int(p[6])) | |
cdsend = max(int(p[7]), int(p[6])) | |
exonstarts = [int(f) for f in p[9].split(",")[:-1]] | |
exonends = [int(f) for f in p[10].split(",")[:-1]] | |
elif annotationtype == 3: | |
# 3 column bed file | |
p = line.rstrip('\r\n').split("\t") | |
exonstarts = [int(p[1])] | |
exonends = [int(p[2])] | |
cdsstart = 0 | |
cdsend = 0 | |
chromosome = p[0] | |
genename = None | |
ID = p[0]+":"+p[1]+"-"+p[2] | |
strand = "+" | |
elif annotationtype == 4: | |
# 6 column bed file | |
p = line.rstrip('\r\n').split("\t") | |
exonstarts = [int(p[1])] | |
exonends = [int(p[2])] | |
cdsstart = 0 | |
cdsend = 0 | |
chromosome = p[0] | |
genename = None | |
ID = p[3] | |
if len(p) > 5: strand = p[5] | |
else: strand = "+" | |
elif annotationtype == 5: | |
# gtf file format | |
p = line.rstrip('\r\n').split("\t") | |
exonstarts = [int(p[3])-1] | |
exonends = [int(p[4])] | |
cdsstart = 0 | |
cdsend = 0 | |
chromosome = p[0] | |
transInfo = p[8].split(';') | |
transIDs = transInfo[1].lstrip('transcript_id ').strip('"').replace(' ', '').split(',') | |
ID = '+'.join(transIDs) | |
genename = p[0]+":"+p[3]+"-"+p[4] | |
strand = p[6] | |
if strand == '.': strand = '+' | |
elif annotationtype == 6: | |
# up to 12 column bed file | |
p = line.rstrip('\r\n').split("\t") | |
chromosome = p[0] | |
absstart = int(p[1]) | |
try: absend = int(p[2]) | |
except: absend = absstart | |
try: strand = p[5] | |
except: strand = "+" | |
try: cdsstart = int(p[6]); cdsend = int(p[7]) | |
except: cdsstart = 0; cdsend = 0 | |
try: blocksizes = map(int, p[10].split(',')) ; blockstarts = map(int, p[11].split(',')) | |
except: blocksizes = [absend - absstart]; blockstarts = [0] | |
exonstarts = [absstart + rs for rs in blockstarts] | |
exonends = [es + size for es,size in zip(exonstarts, blocksizes)] | |
try: genename = p[3] | |
except: | |
genename = None | |
ID = str(chromosome)+":"+str(min(exonstarts)+1)+"-"+str(max(exonends)+1) | |
if len(blockstarts) != len(blocksizes) or max(exonends) != absend: | |
print "Warning: Block structure for " + ID + " is malformed" | |
if USESTRANDINFO: | |
if genename is None: genename = strand | |
if swapstrands: | |
if strand == "+": chromosome += "-" | |
else: chromosome += "+" | |
else: | |
chromosome += strand # so e.g. "chr1+" is different from "chr1-" | |
else: | |
if genename is None: genename = '.' | |
return (chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID) | |
class Cline: | |
def __init__(self, line): | |
global allchromosomes, allchromosomes_dict | |
self.line = line | |
(chromosome, direction, cdsstart, cdsend, exonstarts, exonends, genename, ID) = fromannotationline(line) | |
try: self.chromosome = allchromosomes_dict[chromosome] | |
except: | |
allchromosomes.append(chromosome) | |
self.chromosome = allchromosomes.index(chromosome) | |
allchromosomes_dict[chromosome] = self.chromosome | |
self.start = min(exonstarts) | |
self.end = max(exonends) | |
self.strand = direction | |
def __cmp__(self, other): | |
if self.chromosome != other.chromosome: | |
return cmp(self.chromosome, other.chromosome) | |
if self.start > other.end: | |
return 1 | |
if other.start > self.end: | |
return -1 | |
return cmp(self.start, other.start) | |
def openfile(filename, mode='r'): | |
if filename.endswith(".gz"): | |
import gzip | |
fileh = gzip.open(filename, mode) | |
elif filename.endswith(".bz2"): | |
import bz2 | |
fileh = bz2.BZ2File(filename, mode) | |
else: fileh = open(filename,mode) | |
return fileh | |
def addread(chromosome, start, end, strand): | |
try: chrID = allchromosomes_dict[chromosome] | |
except: | |
return -1, None | |
if strand == '-': | |
return chrID, (start, 1-end) | |
else: | |
return chrID, (start,end-1) | |
def addread_tuple(chromosome, postuple): | |
try: chrID = allchromosomes_dict[chromosome] | |
except: | |
return -1, None | |
return chrID, postuple | |
def readsfrombedtabfile(filename, justreadnum): | |
try: | |
if removemultimappers: | |
tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep='\t') | |
fileh = open(tmpfile,'r') | |
else: | |
fileh = openfile(filename) | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
try: | |
if not fileh.readline().startswith("track"): fileh.seek(0) | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
if justreadnum: yield 1; continue | |
p = line.rstrip('\r\n').split("\t") | |
chromosome = p[0] | |
try: strand = p[5] | |
except: strand = '.' | |
if USESTRANDINFO: chromosome += strand | |
yield addread(chromosome, int(p[1]), int(p[2]), strand) | |
finally: | |
fileh.close() | |
if removemultimappers: | |
os.remove(tmpfile) | |
def readsfrombedtabfile_rs(filename, justreadnum): | |
if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format" | |
try: fileh = openfile(filename) | |
except: | |
print "Warning: No such file", filename | |
return | |
try: | |
if not fileh.readline().startswith("track"): fileh.seek(0) | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
p = line.rstrip('\r\n').split("\t") | |
chromosome = p[0] | |
try: strand = p[5] | |
except: strand = '.' | |
if USESTRANDINFO: chromosome += strand | |
for nb_reads in xrange(int(p[4])): | |
yield addread(chromosome, int(p[1]), int(p[2]), strand) | |
finally: | |
fileh.close() | |
if removemultimappers: | |
os.remove(tmpfile) | |
def readsfrombedspacefile(filename, justreadnum): | |
try: | |
if removemultimappers: | |
tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep=' ') | |
fileh = open(tmpfile,'r') | |
else: | |
fileh = openfile(filename) | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
try: | |
if not fileh.readline().startswith("track"): fileh.seek(0) | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
if justreadnum: yield 1; continue | |
p = line.rstrip('\r\n').split() | |
chromosome = p[0] | |
try: strand = p[5] | |
except: strand = '.' | |
if USESTRANDINFO: chromosome += strand | |
yield addread(chromosome, int(p[1]), int(p[2]), strand) | |
finally: | |
fileh.close() | |
def readsfromgtffile(filename, justreadnum): | |
if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format" | |
try: fileh = openfile(filename) | |
except: | |
print "Warning: No such file", filename | |
return | |
try: | |
if not fileh.readline().startswith("track"): fileh.seek(0) | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
if justreadnum: yield 1; continue | |
p = line.rstrip('\r\n').split("\t") | |
chromosome = p[0] | |
try: strand = p[6] | |
except: strand = '.' | |
if USESTRANDINFO: chromosome += strand | |
yield addread(chromosome, int(p[3])-1, int(p[4]), strand) | |
finally: | |
fileh.close() | |
def splitcigar(string): | |
numarr = [''] | |
symbolarr = [] | |
wasnum = 1 | |
for letter in string: | |
if letter.isdigit(): | |
if wasnum: | |
numarr[-1] += letter | |
else: | |
numarr.append(letter) | |
wasnum = 1 | |
else: | |
if not wasnum: | |
symbolarr[-1] += letter | |
else: | |
symbolarr.append(letter) | |
wasnum = 0 | |
return [int(v) for v in numarr], symbolarr | |
def rowparse(row): | |
chromosome = row[2] | |
start = row[1]-1 | |
endincl = start + row[4]-1 | |
if row[3] & 16: | |
if USESTRANDINFO: | |
chromosome += "-" | |
endincl = -endincl | |
else: | |
if USESTRANDINFO: | |
chromosome += "+" | |
return chromosome, start, endincl | |
def process_rowinfo(rowinfo, justreadnum): | |
lastname = '' | |
while 1: | |
try: row = rowinfo.pop() | |
except IndexError: break | |
try: nrow = rowinfo[-1] | |
except IndexError: pass | |
else: | |
if row[0] == nrow[0]: # same name | |
try: | |
if rowinfo[-2][0] == row[0]: | |
# multimapper | |
try: | |
while rowinfo[-1][0] == row[0]: | |
rowinfo.pop() | |
except IndexError: pass | |
continue | |
except IndexError: pass | |
if row[3]&0x40 or row[3]&0x80: | |
# paired end reads | |
if row[4]&0x80: | |
chromosome_1, start_1, endincl_1 = rowparse(nrow) | |
chromosome_2, start_2, endincl_2 = rowparse(row) | |
else: | |
chromosome_1, start_1, endincl_1 = rowparse(row) | |
chromosome_2, start_2, endincl_2 = rowparse(nrow) | |
yield addread_tuple(chromosome_1, (start_1, endincl_1, start_2, endincl_1)) | |
continue | |
# multimapper | |
try: | |
while rowinfo[-1][0] == row[0]: | |
rowinfo.pop() | |
except IndexError: pass | |
continue | |
chromosome_1, start_1, endincl_1 = rowparse(row) | |
yield addread_tuple(chromosome_1, (start_1, endincl_1)) | |
def readsfromsam(filename, justreadnum): | |
global saved_rowinfo | |
try: | |
if justreadnum: raise NameError | |
rowinfo, fn = saved_rowinfo | |
del saved_rowinfo | |
assert fn == filename | |
except NameError: | |
try: | |
fileh = openfile(filename) | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
rowinfo = [] | |
for line in fileh: | |
if line.startswith("@"): continue # header | |
p = line.rstrip('\r\n').split("\t") | |
bits = int(p[1]) | |
if bits & 4: continue # unmapped | |
chromosome = p[2] | |
name = p[0] | |
length = sum([l for l,s in zip(*splitcigar(p[5])) if s in ["M","D","X","=","N"]]) | |
rowinfo.append((name, int(p[3]), chromosome, bits, length)) | |
rowinfo.sort() | |
import copy | |
if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename | |
fileh.close() | |
return process_rowinfo(rowinfo, justreadnum) | |
def readsfrombam(filename, justreadnum): | |
global saved_rowinfo | |
try: | |
if justreadnum: raise NameError | |
rowinfo, fn = saved_rowinfo | |
del saved_rowinfo | |
assert fn == filename | |
except NameError: | |
try: | |
import pysam | |
except: | |
print "Error: tried to read BAM file, but missing the pysam module" | |
exit(1) | |
try: | |
fileh = pysam.Samfile(filename, 'rb') | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
rowinfo = [] | |
for read in fileh: | |
if read.flag & 4: continue # unmapped | |
chromosome = fileh.getrname(read.rname) | |
name = read.qname | |
length = sum(l for o,l in read.cigar if o in [0,2,3]) #0=M,1=I,2=D,3=N | |
rowinfo.append((name, read.pos+1, chromosome, read.flag, length)) | |
rowinfo.sort() | |
import copy | |
if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename | |
fileh.close() | |
return process_rowinfo(rowinfo, justreadnum) | |
def readsfromsam_se(filename, justreadnum): | |
try: | |
if removemultimappers: | |
tmpfile = sorttotmpfile(filename, 0, ignoredprefix='@', sep='\t') | |
fileh = open(tmpfile,'r') | |
else: | |
fileh = openfile(filename) | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
try: | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
if line.startswith("@"): continue # header | |
p = line.rstrip('\r\n').split("\t") | |
if int(p[1]) & 4: continue # unmapped | |
if justreadnum: yield 1; continue | |
chromosome = p[2] | |
if int(p[1]) & 16: strand = "-" | |
else: strand = "+" | |
if USESTRANDINFO: | |
chromosome += strand | |
cigarints, cigarsymbols = splitcigar(p[5]) | |
length = sum([l for l,s in zip(cigarints, cigarsymbols) if s in ["M","D","X","=","N"]]) | |
start = int(p[3])-1 | |
yield addread(chromosome, start, start+length, strand) | |
finally: | |
fileh.close() | |
if removemultimappers: | |
os.remove(tmpfile) | |
return | |
def readsfrombowtieoutput(filename, justreadnum): | |
try: | |
if removemultimappers: | |
tmpfile = sorttotmpfile(filename, 0, sep='\t') | |
fileh = open(tmpfile,'r') | |
else: | |
fileh = openfile(filename) | |
except IOError: | |
print "Warning: No such file", filename | |
return | |
try: | |
while 1: | |
line = fileh.readline() | |
if not line: break | |
p = line.rstrip('\r\n').split("\t") | |
chromosome = p[2] | |
try: strand = p[1] | |
except: strand = '.' | |
if USESTRANDINFO: chromosome += strand | |
yield addread(chromosome, int(p[3]), int(p[3])+len(p[4]), strand) | |
finally: | |
fileh.close() | |
if removemultimappers: | |
os.remove(tmpfile) | |
return | |
def sorttotmpfile(filename, idindex, ignoredprefix='', sep='\t'): | |
# sort file, remove duplicates, export to temporary file | |
# returned file needs later removal with os.remove() | |
import tempfile | |
outf = tempfile.mkstemp(suffix='_rpkmforgenes')[1] | |
try: | |
print "Sorting to ", outf | |
infh = openfile(filename, 'r') | |
lines = [] | |
for line in infh: | |
if ignoredprefix and line.startswith(ignoredprefix): continue | |
p = line.rstrip('\r\n').split('\t') | |
if ignoredprefix=='@' and int(p[2])&0x80: continue # remove 2nd in paired read | |
if len(p) <= idindex: | |
lines.append(('', line)) | |
else: | |
lines.append((p[idindex],line)) | |
infh.close() | |
lines.sort() | |
outfh = open(outf, 'w') | |
for i in range(len(lines)): | |
this = lines[i][0] | |
if i > 0: | |
neighbour = lines[i-1][0] | |
if neighbour and neighbour == this: continue | |
try: | |
neighbour = lines[i+1][0] | |
except IndexError: pass | |
else: | |
if neighbour and neighbour == this: continue | |
outfh.write(lines[i][1]) | |
outfh.close() | |
except: | |
try: os.remove(outf) | |
except: pass | |
raise | |
return outf | |
def mappos(chrindex, readpos): | |
try: window = allwindows[chrindex][readpos//WINDOWSIZE] | |
except IndexError: | |
return [] | |
return [exon for exon in window if exon.start <= readpos < exon.end] | |
def getarguments(flag): | |
# list of arguments that follow the flag | |
flagindex = sys.argv.index(flag) | |
ret = [] | |
index = flagindex + 1 | |
while index < len(sys.argv) and sys.argv[index][0] != "-": | |
ret.append(sys.argv[index]) | |
index += 1 | |
return ret | |
def getargument(flag): | |
# the argument after the flag | |
flagindex = sys.argv.index(flag) | |
return sys.argv[flagindex+1] | |
def testargumentflag(flag): | |
# if the flag is among the arguments | |
if flag in sys.argv: return 1 | |
else: return 0 | |
def main(): | |
global MAXGENES | |
global MAXREADS | |
global ONLYUNIQUEPOS | |
global ONLYTXUNIQUEEXONS | |
global INTRONSINSTEAD | |
global NODECONVOLUTION | |
global COLLAPSEGENES | |
global FULLTRANSCRIPTS | |
global USESTRANDINFO | |
global allchromosomes | |
global allchromosomes_dict | |
global annotationtype | |
global uniqueposdir | |
global swapstrands | |
global mapends | |
global removemultimappers | |
global bigWigSummary_path | |
global allwindows | |
# interpret program arguments | |
if len(sys.argv) < 2 or testargumentflag("--help") or testargumentflag("-h"): | |
print "Non-optional arguments:" | |
print " -o followed by output file" | |
print " -i followed by list of input files (by default, guesses format from file extension)" | |
print " -a followed by annotation file" | |
print "Gene model-related options:" | |
print " -u followed by a bigWig file, alternatively a directory for files for non-unique positions (lower case for nonunique k-mers (where k is the read length), upper case for unique; filenames are e.g. chr1.fa" | |
print " -fulltranscript to not remove 3'UTRs" | |
print " -maxlength followed by a distance to cut each transcript from the 3' end (5' if negative)" | |
print " -maxgenes limit how many genes expression is calculated for" | |
print " -limitcollapse to not consider indirect transcript overlap" | |
print " -namecollapse to only consider overlap between isoform with the same gene identifier" | |
print " -nocollapse to get isoform expressions (shaky)" | |
print " -nooverlap to ignore that transcripts can overlap (will count some reads several times)" | |
print " -flat to flatten all isoforms to one gene model" | |
print " -txunique to ignore exons shared by multiple gene isoforms" | |
print " -onlycoding to ignore noncoding transcripts" | |
print " -swapstrands to make reads on + strand map to genes on - and vice versa (and sets -strand)" | |
print " -introns gives gene expression from introns rather than exons (also removes exons of other isoforms)" | |
print " -keephap to not remove haplotype chromosome (_hap) annotation" | |
print "Annotation file formats:" | |
print " -genePred if annotation file uses format of refGene.txt etc (default if cannot guess from file name suffix)" | |
print " -bedann tab-separated 0-based bed file, chromosome start end and 9 optional fields" | |
print " -gffann GFF format, but no groups, only single exon genes" | |
print "Input formats:" | |
print " -bed tab separated bed file (default if cannot guess from file name suffix)" | |
print " -bedcompacted bed file with number of reads in the score column" | |
print " -bedspace space separated bed file" | |
print " -bowtie the default output format of bowtie" | |
print " -sam SAM format" | |
print " -bam BAM format" | |
print " -samse SAM format, single-end, uniquely mapped reads (faster than -sam)" | |
print " -gff GFF file, no groups" | |
print "Normalisation options:" | |
print " -mRNAnorm to normalize by the number of reads matching mRNA exons (default)" | |
print " -exonnorm to normalize by the number of reads matching exons, including ncRNA" | |
print " -allmapnorm to normalize by the total number of mapped reads (default if annotation contains no mRNA)" | |
print " -forcetotal followed by a number of reads for each sample to set a constant to normalise by" | |
print "Other optional arguments:" | |
print " -strand to use strand information of reads" | |
print " -bothends to also map the end positions to genes, each end counted as 0.5 (or 0.25 for paired-end reads)" | |
print " -diffreads to count only one read if several have the same position, strand and length" | |
print " -n followed by list of sample names (input file names are otherwise used)" | |
print " -maxreads followed by maximum number of reads to be used" | |
print " -randomreads to make -maxreads pick reads at random" | |
print " -readcount to add the number of reads supporting expression to the output" | |
print " -h to print this message and quit" | |
print "Special output values:" | |
print " 0 gene has no reads, -1 gene has no exons" | |
print " otherwise the output is in reads per kilobase and million mappable reads (or rather FPKM for paired-end reads)" | |
print "Output:" | |
print " gene symbol -tab- ID -tab- RPKM values [-tab- read count]" | |
return 0 | |
try: outfile = getargument("-o") | |
except: | |
print "No outfile (-o)" | |
return 1 | |
try: infiles = getarguments("-i") | |
except: | |
print "No infile (-i)" | |
return 1 | |
try: annotationfile = getargument("-a") | |
except: | |
print "No annotationfile (-a)" | |
return 1 | |
try: | |
names = getarguments("-n") | |
if len(names) != len(infiles): | |
print "Not the same number of sample names as files with reads" | |
return 1 | |
except: names = [filename.rsplit("/")[-1] for filename in infiles] | |
compressionsuffixes = ['gz','bz2'] | |
if testargumentflag("-genePred") or testargumentflag("-refseq"): annotationtype = 0 | |
elif testargumentflag("-ucsc"): annotationtype = 1 | |
elif testargumentflag("-ensembl"): annotationtype = 2 | |
elif testargumentflag("-bedann"): annotationtype = 6 | |
elif testargumentflag("-bed3ann"): annotationtype = 3 | |
elif testargumentflag("-bed6ann"): annotationtype = 4 | |
elif testargumentflag("-gtfann") or testargumentflag("-gffann"): annotationtype = 5 | |
else: | |
# if no annotation type given, guess from file suffix | |
anndict = {'bed':6, 'gff':5, 'gtf':5, 'txt':0} | |
try: | |
annpre, ending = annotationfile.rsplit('.',1) | |
if ending in compressionsuffixes: ending = annpre.rsplit('.',1)[1] | |
annotationtype = anndict[ending] | |
except: | |
annotationtype = 0 | |
if testargumentflag("-bed"): infileformat = "bed" | |
elif testargumentflag("-bedcompacted"): infileformat = "bedcompacted" | |
elif testargumentflag("-bedspace"): infileformat = "bedspace" | |
elif testargumentflag("-gff") or testargumentflag("-gtf"): infileformat = "gtf" | |
elif testargumentflag("-sam"): infileformat = "sam" | |
elif testargumentflag("-bowtie"): infileformat = "bowtie" | |
elif testargumentflag("-samse"): infileformat = "samse" | |
elif testargumentflag("-bam"): infileformat = "bam" | |
else: infileformat = "guess" | |
if testargumentflag("-nocollapse"): COLLAPSEGENES = 0 | |
else: COLLAPSEGENES = 1 | |
if testargumentflag("-limitcollapse"): limitcollapse = 1 | |
else: limitcollapse = 0 | |
genecollapse = testargumentflag("-namecollapse") | |
if testargumentflag("-fulltranscript"): FULLTRANSCRIPTS = 1 | |
else: FULLTRANSCRIPTS = 0 | |
bigWigSummary_path = 0 | |
if testargumentflag("-u"): | |
uniqueposdir = getargument("-u") | |
if uniqueposdir.endswith('.bw') or uniqueposdir.endswith('.bw.gz') or uniqueposdir.endswith('.bw.bz2') or uniqueposdir.endswith('.bigWig') or uniqueposdir.endswith('.bigWig.gz') or uniqueposdir.endswith('.bigWig.bz2'): | |
bigWigSummary_path = os.path.split(sys.argv[0])[0] | |
if bigWigSummary_path == '': | |
bigWigSummary_path = './' | |
bigWigSummary_path = os.path.join(bigWigSummary_path, 'bigWigSummary') | |
if not os.path.isfile(bigWigSummary_path): | |
print 'Error: ' + bigWigSummary_path + ' does not exist, needed to read bigWig file spec |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment