radaniba/rpkmforgenes.py

## rpkmforgenes.py
Sequence data input:
A common input file format is bed file. In the simplest form, it has the format
chromosome -tab- startposition -tab- endposition
where each line corresponds to one sequence read
However this format can also include strand information:
chromosome -tab- startposition -tab- endposition -tab- name -tab- score -tab- +/-
for example:
chr13 54005048 54005081 . 1 -
The score field is ignored.
Files are allowed by have an optional first line starting with "track", this line is then ignored
Other input formats can be specified by -bedspace, -gff, -bowtie, -bedcompacted, -sam, -samse or -bam. -bowtie uses the default output format of Bowtie, -sam for SAM files, -bam is for BAM files. The reading of BAM files requires an index (.bai) and installation of the pysam module.
For SAM files, multimapping reads are not counted. Also, this is the only input format where the program has built-in support for paired-end reads. The option -samse forces the use of a more light-weight SAM parser, which is faster but requires uniquely mapped reads which are not paired-end.

Gzipped (.gz) and bzipped (.bz2) files can be used as input read files

READ MORE HERE

http://sandberg.cmb.ki.se/media/data/rnaseq/instructions-rpkmforgenes.html

#!/usr/bin/python
"""
Calculates gene expression from a read mapping file
"""

lastmodified = "12 Apr 2011"
author = "Daniel Ramskold"
#4 june 2010: now assumes sam and gff files use 1-based coordinates, not 0-based, -exonnorm is default
#6 june 2010: partially rewrote code for overlapping exons from different isoforms, swapped ID and symbol fields for some annotation types
#10 june 2010: corrected gff end coordinates
#17 june 2010: removed exons of other isoforms from -intronsinstead, normalise by only mRNA by default
#28 june 2010: bed12 format as annotation, -flat option for collapsing before RPKM calculation, put strand in unused gene name column if -strand
#3 july 2010: changed output header to give number of reads normalised by regardless of normalisation option
#8 july 2010: -bedann for 3 columns gives 1-based chrX:start:end but no change for -bed3ann, -bothends map end-1 position instead of end, added -n option, more stringent check (strand, length) to filter with -diffreads
#28 july 2010: added -unique option (removal of multimapping reads)
#6 aug 2010: -u option can call bigWigSummary on bigWig files
#11 aug 2010: rewrote parsing of sam files to make -bothends and -diffreads work better, removed the need for -unique option
#19 aug 2010: changed gene/ID fields with -bedann
#23 aug 2010: changed ouput of 0 rpkm to 0.0 rpkm
#10 sep 2010: indexing exons instead of reads
#17 nov 2010: reenabled old sam format input function (now with -samse), added bam file support
#9 dec 2010: allowed -u to read from files with FASTA header and fixed line wrap
#5 jan 2011: looks for .bigWig file suffix, fixed reporting for -readcount -nocollapse
#20 jan 2011: changed GTF format ID to transcript ID
#24 jan 2011: added -randomreads
#9 feb 2011: speed optimizations for -randomreads, fixed error reporting at 0 total reads
#7 mars 2011: debugging of isoform deconvolution
#12 apr 2011: added -namecollapse option

from numpy import matrix, linalg
import numpy, sys, time, os, subprocess

WINDOWSIZE = 5000

uniqueposdir = "none"

class Cexon:
	def __init__(self, start, end, normalise_with):
		self.start = start
		self.end = end
		self.reads = 0
		self.transcripts = []
		self.readspersample = []
		self.forbidden = False # only for normalisation, hide from expression calculation and output
		self.normalise_with = normalise_with # include in exon normalisation

	def calclength(self, chromosome):
		if ONLYUNIQUEPOS:
			if USESTRANDINFO:
				chromsome = chromosome[:-1]
			if bigWigSummary_path:
				self.uniquelengthfromBigWig(chromosome)
			else:
				self.setuniquelength(os.path.join(uniqueposdir, chromosome + ".fa"))
		else:
			self.length = self.end - self.start

	def exonstring(self):
		outstr = "(" + str(self.start) + "," + str(self.end)
		for tx in self.transcripts:
			outstr += "," + tx.ID
		outstr += ") " + str(self.reads) + " " + str(self.length)
		return outstr

	def setuniquelength(self, chromosomefile):
		try:
			cfileh = openfile(chromosomefile)
		except:
			global warnedchromosomes
			if not chromosomefile in warnedchromosomes:
				warnedchromosomes.append(chromosomefile)
				print "Warning: Did not find", chromosomefile
			self.length = self.end - self.start
			return

		global chromosomefile_infodict
		try: chromosomefile_infodict
		except: chromosomefile_infodict = {}
		try: offset, linelength, seqlength = chromosomefile_infodict[chromosomefile]
		except:
			line1 = cfileh.readline(1000)
			if len(line1) < 1000 and line1[0] == '>': offset = len(line1)
			else:
				cfileh.seek(0)
				offset = 0
			line2 = cfileh.readline(1000)
			if len(line2) < 1000:
				linelength = len(line2)
				seqlength = len(line2.rstrip())
			else:
				linelength = 0
				seqlength = 0
			chromosomefile_infodict[chromosomefile] = offset, linelength, seqlength
		if linelength == 0:
			startfilepos = self.start + offset
			endfilepos = self.end + offset
		else:
			startfilepos = offset + (self.start // seqlength)*linelength + (self.start % seqlength)
			endfilepos = offset + (self.end // seqlength)*linelength + (self.end % seqlength)
		cfileh.seek(startfilepos, 0)
		sequence = cfileh.read(endfilepos-startfilepos)
		self.length = sequence.count('A')+sequence.count('C')+sequence.count('G')+sequence.count('T')	# upper-case means unique

	def uniquelengthfromBigWig(self, chromosome):
		try:
			proc = subprocess.Popen([bigWigSummary_path, uniqueposdir, chromosome, str(self.start), str(self.end), '1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		except KeyboardInterrupt:
			raise
		except:
			global bigwig_norun
			try: bigwig_norun
			except:
				print 'Error: Failed running bigWigSummary, required by the -u option for bigWig files. If it was\'t aborted by the user, perhaps its permission to execute has not been set (\'chmod +x '+ bigWigSummary_path + '\' on Unix-like systems)'
				sys.exit(1)
				bigwig_norun = 1
			self.length = self.end - self.start
			return
		ret, err = proc.communicate()
		if proc.returncode:
			if err.startswith('no data'):
				self.length = 0
			else:
				print "Warning: bigWigSummary returned an error:"
				print err
				self.length = self.end - self.start
		else:
			self.length = float(ret)*(self.end - self.start)


class Ctranscript:
	def __init__(self, ID):
		self.ID = ID
		self.expression = []
		self.reads = []
		self.overlaptx = [self]
		self.genename = "?"

class Cgene:
	def __init__(self, name):
		self.name = name
		self.transcripts = []
		self.exons = []
		self.chromosome = ""
		self.overlapsets = []

	def exonstring(self):
		outstr = self.name
		for exon in self.exons:
			outstr += "\t" + exon.exonstring()
		return outstr

class Cunit:
	# after collapse of genes
	def __init__(self):
		self.name1 = "."
		self.name2 = "."
		self.rpkms = []
		self.reads = []

def contractarrays(Ll, Rl):
	row = 0
	while row < len(Rl):
		zeropattern = Ll[row]
		(Ll, Rl) = contractarrays_inner(zeropattern, Ll, Rl, row)
		row += 1
	return (Ll, Rl)

def contractarrays_inner(zeropattern, Ll, Rl, firstmatchingrow):
	currentrow = firstmatchingrow+1
	while currentrow < len(Ll):
		matches = 1
		for place in range(len(zeropattern)):
			if (zeropattern[place] and not Ll[currentrow][place]) or \
			 (Ll[currentrow][place] and not zeropattern[place]):
				matches = 0
		if matches:
			for place in range(len(zeropattern)):
				Ll[firstmatchingrow][place] += Ll[currentrow][place]
			Rl[firstmatchingrow][0] += Rl[currentrow][0]
			del Ll[currentrow]
			del Rl[currentrow]
		else:
			currentrow += 1
	return (Ll, Rl)

def rowincludes(inclusiverow, includedrow):
	# checks if all elements of includedrows are found within inclusiverow
	for element in range(len(inclusiverow)):
		if includedrow[element] and not inclusiverow[element]:
			return 0
	return 1

def elementsinrow(row):
	count = 0
	for element in range(len(row)):
		if row[element]:
			count += 1
	return count

def removecommonrow(Ll, Rl):
	currentrow = 0
	while currentrow < len(Ll):
		matches = 1
		for place in range(len(Ll[currentrow])):
			if not Ll[currentrow][place]:
				matches = 0
		if matches:
			del Ll[currentrow]
			del Rl[currentrow]
		else:
			currentrow += 1
	return (Ll, Rl)

def removecolumn(Ll, column):
	currentrow = 0
	while currentrow < len(Ll):
		del Ll[currentrow][column]
		currentrow += 1
	return Ll

def removezeros(Ll, Rl):
	currentrow = 0
	while currentrow < len(Ll):
		matches = 1
		for place in range(len(Ll[currentrow])):
			if Ll[currentrow][place]:
				matches = 0
		if matches:
			del Ll[currentrow]
			del Rl[currentrow]
		else:
			currentrow += 1
	return (Ll, Rl)

def fromannotationline(line):
	if annotationtype == 0:
		# from refGene.txt
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(f) for f in p[9].split(",")[:-1]]	# start positions for exons for the gene
		exonends = [int(f) for f in p[10].split(",")[:-1]]
		ID = p[1]
		chromosome = p[2]
		genename = p[12]
		strand = p[3]
		cdsstart = min(int(p[7]), int(p[6]))
		cdsend = max(int(p[7]), int(p[6]))
	elif annotationtype == 1:
		# from altered knownGene.txt
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(f) for f in p[8].split(",")[:-1]]
		exonends = [int(f) for f in p[9].split(",")[:-1]]
		ID = p[11]
		chromosome = p[1]
		genename = p[0]
		strand = p[2]
		cdsstart = min(int(p[5]), int(p[6]))
		cdsend = max(int(p[5]), int(p[6]))
	elif annotationtype == 2:
		# from ensGene.txt or sibGene.txt
		p = line.rstrip('\r\n').split("\t")
		ID = p[1]
		strand = p[3]
		chromosome = p[2]
		genename = p[12]
		cdsstart = min(int(p[7]), int(p[6]))
		cdsend = max(int(p[7]), int(p[6]))
		exonstarts = [int(f) for f in p[9].split(",")[:-1]]
		exonends = [int(f) for f in p[10].split(",")[:-1]]
	elif annotationtype == 3:
		# 3 column bed file
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[1])]
		exonends = [int(p[2])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		genename = None
		ID = p[0]+":"+p[1]+"-"+p[2]
		strand = "+"
	elif annotationtype == 4:
		# 6 column bed file
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[1])]
		exonends = [int(p[2])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		genename = None
		ID = p[3]
		if len(p) > 5: strand = p[5]
		else: strand = "+"
	elif annotationtype == 5:
		# gtf file format
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[3])-1]
		exonends = [int(p[4])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		transInfo = p[8].split(';')
		transIDs = transInfo[1].lstrip('transcript_id ').strip('"').replace(' ', '').split(',')
		ID = '+'.join(transIDs)
		genename = p[0]+":"+p[3]+"-"+p[4]
		strand = p[6]
		if strand == '.': strand = '+'
	elif annotationtype == 6:
		# up to 12 column bed file
		p = line.rstrip('\r\n').split("\t")
		chromosome = p[0]
		absstart = int(p[1])
		try: absend = int(p[2])
		except: absend = absstart
		try: strand = p[5]
		except: strand = "+"
		try:	cdsstart = int(p[6]); cdsend = int(p[7])
		except: cdsstart = 0; cdsend = 0
		try: blocksizes = map(int, p[10].split(',')) ; blockstarts = map(int, p[11].split(','))
		except: blocksizes = [absend - absstart]; blockstarts = [0]
		exonstarts = [absstart + rs for rs in blockstarts]
		exonends = [es + size for es,size in zip(exonstarts, blocksizes)]
		try: genename = p[3]
		except:
			genename = None
		ID = str(chromosome)+":"+str(min(exonstarts)+1)+"-"+str(max(exonends)+1)
		if len(blockstarts) != len(blocksizes) or max(exonends) != absend:
			print "Warning: Block structure for " + ID + " is malformed"

	if USESTRANDINFO:
		if genename is None: genename = strand
		if swapstrands:
			if strand == "+": chromosome += "-"
			else: chromosome += "+"
		else:
			chromosome += strand	# so e.g. "chr1+" is different from "chr1-"
	else:
		if genename is None: genename = '.'
	return (chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID)

class Cline:
	def __init__(self, line):
		global allchromosomes, allchromosomes_dict

		self.line = line
		(chromosome, direction, cdsstart, cdsend, exonstarts, exonends, genename, ID) = fromannotationline(line)
		try: self.chromosome = allchromosomes_dict[chromosome]
		except:
			allchromosomes.append(chromosome)
			self.chromosome = allchromosomes.index(chromosome)
			allchromosomes_dict[chromosome] = self.chromosome
		self.start = min(exonstarts)
		self.end = max(exonends)
		self.strand = direction

	def __cmp__(self, other):
		if self.chromosome != other.chromosome:
			return cmp(self.chromosome, other.chromosome)
		if self.start > other.end:
			return 1
		if other.start > self.end:
			return -1
		return cmp(self.start, other.start)

def openfile(filename, mode='r'):
	if filename.endswith(".gz"):
		import gzip
		fileh = gzip.open(filename, mode)
	elif filename.endswith(".bz2"):
		import bz2
		fileh = bz2.BZ2File(filename, mode)
	else: fileh = open(filename,mode)
	return fileh

def addread(chromosome, start, end, strand):
	try: chrID = allchromosomes_dict[chromosome]
	except:
		return -1, None
	if strand == '-':
		return chrID, (start, 1-end)
	else:
		return chrID, (start,end-1)

def addread_tuple(chromosome, postuple):
	try: chrID = allchromosomes_dict[chromosome]
	except:
		return -1, None
	return chrID, postuple

def readsfrombedtabfile(filename, justreadnum):
	try:
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split("\t")
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)

def readsfrombedtabfile_rs(filename, justreadnum):

	if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format"
	try: fileh = openfile(filename)
	except:
		print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			p = line.rstrip('\r\n').split("\t")
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			for nb_reads in xrange(int(p[4])):
				yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)

def readsfrombedspacefile(filename, justreadnum):
	try:
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep=' ')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		print "Warning: No such file", filename
		return

	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split()
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()

def readsfromgtffile(filename, justreadnum):
	if removemultimappers: print "Warning: The -unique option was ignored, unsupported for this format"
	try: fileh = openfile(filename)
	except:
		print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split("\t")
			chromosome = p[0]
			try: strand = p[6]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[3])-1, int(p[4]), strand)
	finally:
		fileh.close()

def splitcigar(string):
	numarr = ['']
	symbolarr = []
	wasnum = 1
	for letter in string:
		if letter.isdigit():
			if wasnum:
				numarr[-1] += letter
			else:
				numarr.append(letter)
			wasnum = 1
		else:
			if not wasnum:
				symbolarr[-1] += letter
			else:
				symbolarr.append(letter)
			wasnum = 0
	return [int(v) for v in numarr], symbolarr

def rowparse(row):
	chromosome = row[2]
	start = row[1]-1
	endincl = start + row[4]-1
	if row[3] & 16:
		if USESTRANDINFO:
			chromosome +=  "-"
		endincl = -endincl
	else:
		if USESTRANDINFO:
			chromosome +=  "+"
	return chromosome, start, endincl

def process_rowinfo(rowinfo, justreadnum):
	lastname = ''
	while 1:
		try: row = rowinfo.pop()
		except IndexError: break
		try: nrow = rowinfo[-1]
		except IndexError: pass
		else:
			if row[0] == nrow[0]:	# same name
				try:
					if rowinfo[-2][0] == row[0]:
						# multimapper
						try:
							while rowinfo[-1][0] == row[0]:
								rowinfo.pop()
						except IndexError: pass
						continue
				except IndexError: pass
				if row[3]&0x40 or row[3]&0x80:
					# paired end reads
					if row[4]&0x80:
						chromosome_1, start_1, endincl_1 = rowparse(nrow)
						chromosome_2, start_2, endincl_2 = rowparse(row)
					else:
						chromosome_1, start_1, endincl_1 = rowparse(row)
						chromosome_2, start_2, endincl_2 = rowparse(nrow)
					yield addread_tuple(chromosome_1, (start_1, endincl_1, start_2, endincl_1))
					continue
				# multimapper
				try:
					while rowinfo[-1][0] == row[0]:
						rowinfo.pop()
				except IndexError: pass
				continue
		chromosome_1, start_1, endincl_1 = rowparse(row)
		yield addread_tuple(chromosome_1, (start_1, endincl_1))

def readsfromsam(filename, justreadnum):
	global saved_rowinfo
	try:
		if justreadnum: raise NameError
		rowinfo, fn = saved_rowinfo
		del saved_rowinfo
		assert fn == filename
	except NameError:
		try:
			fileh = openfile(filename)
		except IOError:
			print "Warning: No such file", filename
			return

		rowinfo = []
		for line in fileh:
			if line.startswith("@"): continue	# header
			p = line.rstrip('\r\n').split("\t")
			bits = int(p[1])
			if bits & 4: continue	# unmapped
			chromosome = p[2]
			name = p[0]
			length = sum([l for l,s in zip(*splitcigar(p[5])) if s in ["M","D","X","=","N"]])
			rowinfo.append((name, int(p[3]), chromosome, bits, length))
		rowinfo.sort()
		import copy
		if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
		fileh.close()

	return process_rowinfo(rowinfo, justreadnum)

def readsfrombam(filename, justreadnum):
	global saved_rowinfo
	try:
		if justreadnum: raise NameError
		rowinfo, fn = saved_rowinfo
		del saved_rowinfo
		assert fn == filename
	except NameError:
		try:
			import pysam
		except:
			print "Error: tried to read BAM file, but missing the pysam module"
			exit(1)

		try:
			fileh = pysam.Samfile(filename, 'rb')
		except IOError:
			print "Warning: No such file", filename
			return

		rowinfo = []
		for read in fileh:
			if read.flag & 4: continue	# unmapped
			chromosome = fileh.getrname(read.rname)
			name = read.qname
			length = sum(l for o,l in read.cigar if o in [0,2,3]) #0=M,1=I,2=D,3=N
			rowinfo.append((name, read.pos+1, chromosome, read.flag, length))
		rowinfo.sort()
		import copy
		if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
		fileh.close()

	return process_rowinfo(rowinfo, justreadnum)

def readsfromsam_se(filename, justreadnum):
	try:
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 0, ignoredprefix='@', sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		print "Warning: No such file", filename
		return

	try:
		while 1:
			line = fileh.readline()
			if not line: break
			if line.startswith("@"): continue	# header
			p = line.rstrip('\r\n').split("\t")
			if int(p[1]) & 4: continue	# unmapped
			if justreadnum: yield 1; continue
			chromosome = p[2]
			if int(p[1]) & 16: strand = "-"
			else: strand = "+"
			if USESTRANDINFO:
				chromosome += strand

			cigarints, cigarsymbols = splitcigar(p[5])
			length = sum([l for l,s in zip(cigarints, cigarsymbols) if s in ["M","D","X","=","N"]])

			start = int(p[3])-1
			yield addread(chromosome, start, start+length, strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)
	return

def readsfrombowtieoutput(filename, justreadnum):
	try:
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 0, sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		print "Warning: No such file", filename
		return
	try:
		while 1:
			line = fileh.readline()
			if not line: break
			p = line.rstrip('\r\n').split("\t")
			chromosome = p[2]
			try: strand = p[1]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[3]), int(p[3])+len(p[4]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)
	return

def sorttotmpfile(filename, idindex, ignoredprefix='', sep='\t'):
	# sort file, remove duplicates, export to temporary file
	# returned file needs later removal with os.remove()
	import tempfile
	outf = tempfile.mkstemp(suffix='_rpkmforgenes')[1]
	try:
		print "Sorting to ", outf
		infh = openfile(filename, 'r')
		lines = []
		for line in infh:
			if ignoredprefix and line.startswith(ignoredprefix): continue
			p = line.rstrip('\r\n').split('\t')
			if ignoredprefix=='@' and int(p[2])&0x80: continue	# remove 2nd in paired read
			if len(p) <= idindex:
				lines.append(('', line))
			else:
				lines.append((p[idindex],line))
		infh.close()
		lines.sort()
		outfh = open(outf, 'w')
		for i in range(len(lines)):
			this = lines[i][0]
			if i > 0:
				neighbour = lines[i-1][0]
				if neighbour and neighbour == this: continue
			try:
				neighbour = lines[i+1][0]
			except IndexError: pass
			else:
				if neighbour and neighbour == this: continue
			outfh.write(lines[i][1])
		outfh.close()
	except:
		try: os.remove(outf)
		except: pass
		raise
	return outf

def mappos(chrindex, readpos):
	try: window = allwindows[chrindex][readpos//WINDOWSIZE]
	except IndexError:
		return []
	return [exon for exon in window if exon.start <= readpos < exon.end]


def getarguments(flag):
	# list of arguments that follow the flag
	flagindex = sys.argv.index(flag)
	ret = []
	index = flagindex + 1
	while index < len(sys.argv) and sys.argv[index][0] != "-":
		ret.append(sys.argv[index])
		index += 1
	return ret

def getargument(flag):
	# the argument after the flag
	flagindex = sys.argv.index(flag)
	return sys.argv[flagindex+1]

def testargumentflag(flag):
	# if the flag is among the arguments
	if flag in sys.argv: return 1
	else: return 0

def main():
	global MAXGENES
	global MAXREADS
	global ONLYUNIQUEPOS
	global ONLYTXUNIQUEEXONS
	global INTRONSINSTEAD
	global NODECONVOLUTION
	global COLLAPSEGENES
	global FULLTRANSCRIPTS
	global USESTRANDINFO
	global allchromosomes
	global allchromosomes_dict
	global annotationtype
	global uniqueposdir
	global swapstrands
	global mapends
	global removemultimappers
	global bigWigSummary_path
	global allwindows

	# interpret program arguments
	if len(sys.argv) < 2 or testargumentflag("--help") or testargumentflag("-h"):
		print "Non-optional arguments:"
		print " -o followed by output file"
		print " -i followed by list of input files (by default, guesses format from file extension)"
		print " -a followed by annotation file"
		print "Gene model-related options:"
		print " -u followed by a bigWig file, alternatively a directory for files for non-unique positions (lower case for nonunique k-mers (where k is the read length), upper case for unique; filenames are e.g. chr1.fa"
		print " -fulltranscript to not remove 3'UTRs"
		print " -maxlength followed by a distance to cut each transcript from the 3' end (5' if negative)"
		print " -maxgenes limit how many genes expression is calculated for"
		print " -limitcollapse to not consider indirect transcript overlap"
		print " -namecollapse to only consider overlap between isoform with the same gene identifier"
		print " -nocollapse to get isoform expressions (shaky)"
		print " -nooverlap to ignore that transcripts can overlap (will count some reads several times)"
		print " -flat to flatten all isoforms to one gene model"
		print " -txunique to ignore exons shared by multiple gene isoforms"
		print " -onlycoding to ignore noncoding transcripts"
		print " -swapstrands to make reads on + strand map to genes on - and vice versa (and sets -strand)"
		print " -introns gives gene expression from introns rather than exons (also removes exons of other isoforms)"
		print " -keephap to not remove haplotype chromosome (_hap) annotation"
		print "Annotation file formats:"
		print " -genePred if annotation file uses format of refGene.txt etc (default if cannot guess from file name suffix)"
		print " -bedann tab-separated 0-based bed file, chromosome start end and 9 optional fields"
		print " -gffann GFF format, but no groups, only single exon genes"
		print "Input formats:"
		print " -bed tab separated bed file (default if cannot guess from file name suffix)"
		print " -bedcompacted bed file with number of reads in the score column"
		print " -bedspace space separated bed file"
		print " -bowtie the default output format of bowtie"
		print " -sam SAM format"
		print " -bam BAM format"
		print " -samse SAM format, single-end, uniquely mapped reads (faster than -sam)"
		print " -gff GFF file, no groups"
		print "Normalisation options:"
		print " -mRNAnorm to normalize by the number of reads matching mRNA exons (default)"
		print " -exonnorm to normalize by the number of reads matching exons, including ncRNA"
		print " -allmapnorm to normalize by the total number of mapped reads (default if annotation contains no mRNA)"
		print " -forcetotal followed by a number of reads for each sample to set a constant to normalise by"
		print "Other optional arguments:"
		print " -strand to use strand information of reads"
		print " -bothends to also map the end positions to genes, each end counted as 0.5 (or 0.25 for paired-end reads)"
		print " -diffreads to count only one read if several have the same position, strand and length"
		print " -n followed by list of sample names (input file names are otherwise used)"
		print " -maxreads followed by maximum number of reads to be used"
		print " -randomreads to make -maxreads pick reads at random"
		print " -readcount to add the number of reads supporting expression to the output"

		print " -h to print this message and quit"
		print "Special output values:"
		print " 0 gene has no reads, -1 gene has no exons"
		print " otherwise the output is in reads per kilobase and million mappable reads (or rather FPKM for paired-end reads)"
		print "Output:"
		print " gene symbol -tab- ID -tab- RPKM values [-tab- read count]"
		return 0
	try: outfile = getargument("-o")
	except:
		print "No outfile (-o)"
		return 1
	try: infiles = getarguments("-i")
	except:
		print "No infile (-i)"
		return 1
	try: annotationfile = getargument("-a")
	except:
		print "No annotationfile (-a)"
		return 1
	try:
		names = getarguments("-n")
		if len(names) != len(infiles):
			print "Not the same number of sample names as files with reads"
			return 1
	except: names = [filename.rsplit("/")[-1] for filename in infiles]
	compressionsuffixes = ['gz','bz2']

	if testargumentflag("-genePred") or testargumentflag("-refseq"): annotationtype = 0
	elif testargumentflag("-ucsc"): annotationtype = 1
	elif testargumentflag("-ensembl"): annotationtype = 2
	elif testargumentflag("-bedann"): annotationtype = 6
	elif testargumentflag("-bed3ann"): annotationtype = 3
	elif testargumentflag("-bed6ann"): annotationtype = 4
	elif testargumentflag("-gtfann") or testargumentflag("-gffann"): annotationtype = 5
	else:
		# if no annotation type given, guess from file suffix
		anndict = {'bed':6, 'gff':5, 'gtf':5, 'txt':0}
		try:
			annpre, ending = annotationfile.rsplit('.',1)
			if ending in compressionsuffixes: ending = annpre.rsplit('.',1)[1]
			annotationtype = anndict[ending]
		except:
			annotationtype = 0

	if testargumentflag("-bed"): infileformat = "bed"
	elif testargumentflag("-bedcompacted"): infileformat = "bedcompacted"
	elif testargumentflag("-bedspace"): infileformat = "bedspace"
	elif testargumentflag("-gff") or testargumentflag("-gtf"): infileformat = "gtf"
	elif testargumentflag("-sam"): infileformat = "sam"
	elif testargumentflag("-bowtie"): infileformat = "bowtie"
	elif testargumentflag("-samse"): infileformat = "samse"
	elif testargumentflag("-bam"): infileformat = "bam"
	else: infileformat = "guess"

	if testargumentflag("-nocollapse"): COLLAPSEGENES = 0
	else: COLLAPSEGENES = 1
	if testargumentflag("-limitcollapse"): limitcollapse = 1
	else: limitcollapse = 0
	genecollapse = testargumentflag("-namecollapse")
	if testargumentflag("-fulltranscript"): FULLTRANSCRIPTS = 1
	else: FULLTRANSCRIPTS = 0
	bigWigSummary_path = 0
	if testargumentflag("-u"):
		uniqueposdir = getargument("-u")
		if uniqueposdir.endswith('.bw') or uniqueposdir.endswith('.bw.gz') or uniqueposdir.endswith('.bw.bz2') or uniqueposdir.endswith('.bigWig') or uniqueposdir.endswith('.bigWig.gz') or uniqueposdir.endswith('.bigWig.bz2'):
			bigWigSummary_path = os.path.split(sys.argv[0])[0]
			if bigWigSummary_path == '':
				bigWigSummary_path = './'
			bigWigSummary_path = os.path.join(bigWigSummary_path, 'bigWigSummary')
			if not os.path.isfile(bigWigSummary_path):
				print 'Error: ' + bigWigSummary_path + ' does not exist, needed to read bigWig file spec