Skip to content

Instantly share code, notes, and snippets.

@informationsea
Created August 31, 2015 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save informationsea/07b979c1fef684f8b9a1 to your computer and use it in GitHub Desktop.
Save informationsea/07b979c1fef684f8b9a1 to your computer and use it in GitHub Desktop.
Convert refFlat to Bed File with merging isoforms
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import csv
import sqlite3
import sys
import commonlib
import collections
def _main():
parser = argparse.ArgumentParser(description="Load refflat to SQLite3")
parser.add_argument('refFlat', type=commonlib.FileType('r'))
parser.add_argument('--db', default=':memory:', nargs='?', help='default: %(default)s')
parser.add_argument('-o', '--output', default=sys.stdout, type=commonlib.FileType('w'))
parser.add_argument('--debug', action='store_true')
options = parser.parse_args()
conn = sqlite3.connect(options.db)
conn.execute('DROP TABLE IF EXISTS "refFlat"')
conn.execute('''CREATE TABLE `refFlat` (
`geneName` varchar(255) NOT NULL,
`name` varchar(255) NOT NULL,
`chrom` varchar(255) NOT NULL,
`strand` char(1) NOT NULL,
`txStart` int(10) NOT NULL,
`txEnd` int(10) NOT NULL,
`cdsStart` int(10) NOT NULL,
`cdsEnd` int(10) NOT NULL,
`exonCount` int(10) NOT NULL,
`exonStarts` longblob NOT NULL,
`exonEnds` longblob NOT NULL
)''')
reader = csv.reader(options.refFlat, delimiter='\t', quotechar=None)
for row in reader:
conn.execute('INSERT INTO refFlat VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', row)
for key in ['geneName', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount']:
conn.execute('CREATE INDEX refFlat__{0} ON refFlat({0})'.format(key))
conn.commit()
writer = csv.writer(options.output, delimiter='\t', quotechar=None)
cur = conn.cursor()
cur.execute('SELECT distinct geneName FROM refFlat')
genenames = [x[0] for x in cur.fetchall()]
print 'Genes', len(genenames)
for i in genenames:
cur.execute('SELECT geneName, chrom, txStart, txEnd, strand FROM refFlat WHERE geneName = ?', [i])
data = cur.fetchall()
if len(data) == 1:
row = data[0]
writer.writerow([row[1], row[2], row[3], row[0], '0', row[4]])
continue
bychr = collections.defaultdict(list)
for j in data:
bychr[(j[1], j[4])].append(j)
#print bychr
for dummy, v in bychr.iteritems():
txstarts = [x[2] for x in v]
txends = [x[3] for x in v]
if all([x < y for x in txstarts for y in txends]):
row = v[0]
writer.writerow([row[1], min(txstarts), max(txends), row[0], '0', row[4]])
continue
matrix = [[True for x in v] for y in v]
for j, (j_start, j_end) in enumerate(zip(txstarts, txends)):
for k, (k_start, k_end) in enumerate(zip(txstarts, txends)):
if options.debug:
print j, j_start, j_end, k, k_start, k_end
if j_end < k_start or k_end < j_start:
matrix[j][k] = False
matrix[k][j] = False
if options.debug:
for j in matrix:
print j
group = []
for j in range(len(v)):
if any([(j in x) for x in group]):
continue
member = set([k for k in range(len(v)) if matrix[j][k]])
unchecked_members = set([x for x in member if x != j])
while unchecked_members:
one = unchecked_members.pop()
member |= set([k for k in range(len(v)) if matrix[one][k]])
group.append(member)
for onegroup in group:
rows = [v[x] for x in onegroup]
writer.writerow([rows[0][1], min([x[2] for x in rows]), max([x[3] for x in rows]),
rows[0][0], '0', rows[0][4]])
if options.debug:
print
if __name__ == '__main__':
_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment