Skip to content

Instantly share code, notes, and snippets.

@bemasher
Created December 5, 2009 00:21
Show Gist options
  • Save bemasher/249481 to your computer and use it in GitHub Desktop.
Save bemasher/249481 to your computer and use it in GitHub Desktop.
import sqlite3, re, math, os
con = sqlite3.connect(':memory:')
db = con.cursor()
contigs = [390,389,166,690,6907,6993,2656,3501,6990,1530,7291,7069,7277,7290,6922,7257,7126,1500,7214]
# Set filename and open input
in_file = open("trimmed.ace", "r")
# Compile regex for parsing each line
AF_re = re.compile("AF\s(?P<name>.*?)\.(?P<range>\d+-\d+)(?P<link1>(?:\.fm|\.to)\d+)(?P<link2>(?:\.fm|\.to)\d+)?\s(?P<dir>[UC])")
# Create table
db.execute('''CREATE TABLE 'read_index' ('name' varchar(255) NOT NULL, 'contig' int(11) NOT NULL, 'range_low' int(11) NOT NULL, 'range_high' int(11) NOT NULL);''')
db.execute('''CREATE INDEX 'name' ON 'read_index' ('name')''')
db.execute('''CREATE INDEX 'contig' ON 'read_index' ('contig')''')
# Parse each line in the trimmed ace file and insert it into the database
for line in in_file:
if line.startswith("CO"):
# Current line is a contig, set current_contig value
current_contig = int(line.strip()[9:14])
else:
# Current line is a read, parse it and insert it into the database with the current contig
match = AF_re.match(line.strip())
read_name = match.group('name')
range = map(lambda x: int(x), match.group("range").split("-"))
db.execute("INSERT INTO read_index (name, contig, range_low, range_high) VALUES (?, ?, ?, ?);", (read_name, current_contig, range[0], range[1]))
# Commit changes to database
con.commit()
# SELECT a list of all reads contained in the database that have the contigs we're looking for
read_list = db.execute("SELECT DISTINCT * FROM read_index WHERE contig=%s" % (' OR contig='.join(map(lambda x: str(x), contigs[1:])))).fetchall()
# Make a dictionary of reads and their associated contigs
read_contigs = {}
for read in read_list:
read_contigs[str(read[0])] = map(lambda x: x[0], db.execute("SELECT contig FROM read_index WHERE name=?", (str(read[0]),)).fetchall())
# Get the paths that each read traces out by sorting their ranges and contigs associated with each range
read_paths = {}
for read, contig_list in read_contigs.items():
contigs = db.execute("SELECT contig FROM read_index WHERE name=? ORDER BY range_low, range_high", (read,)).fetchall()
path = '->'.join(map(lambda x: str(x[0]), contigs))
try:
# Increment path counter
read_paths[path] += 1
except:
# New path so set counter to 1
read_paths[path] = 1
# Close the database
db.close()
# Print each path and frequency
for path, frequency in read_paths.items():
print path, frequency
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment