Created
April 12, 2015 21:40
-
-
Save konstantint/faa2207ccb8d12c2cad1 to your computer and use it in GitHub Desktop.
Human Bananology (http://fouryears.eu/2015/04/12/two-bananas-make-a-human/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recreational computational human bananology. | |
# Copyright (c) 2015, Konstantin Tretyakov | |
# http://fouryears.eu/2015/04/12/two-bananas-make-a-human/ | |
# ------------------------ Imports and utility functions ---------------- # | |
import gzip | |
import csv | |
import urllib | |
from ucscgenome import Genome | |
from Bio import SeqIO | |
from clint.textui import progress | |
# ------------------------ Load data ------------------------ # | |
# Obtain the human genome | |
# (a ~800M download, unless you used the package before and have the genome cached already) | |
human = Genome('hg19') | |
# Obtain human gene annotations (4M) | |
urllib.urlretrieve("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownCanonical.txt.gz", "/tmp/knownCanonical.txt.gz") | |
with gzip.open("/tmp/knownCanonical.txt.gz") as f: | |
rows = list(csv.reader(f, delimiter='\t')) # chrom, txStart, txEnd, ... | |
h_genes = sorted({(r[0], int(r[1]), int(r[2])) for r in rows}) | |
print "Number of unique, canonical protein-coding human gene regions: ", len(h_genes) # 31806 | |
# Download the banana genome (~116M download) and parse it using BioPython | |
urllib.urlretrieve("http://banana-genome.cirad.fr/download/musa_pseudochromosome.fna.gz", "/tmp/banana.fna.gz") | |
banana = {r.id: r.seq for r in SeqIO.parse(gzip.open("/tmp/banana.fna.gz"), "fasta")} | |
# ------------------------ Compute genome sizes ------------------------ # | |
h_chromosomes = ['chr%d' % d for d in range(1, 23)] + ['chrX', 'chrY'] | |
b_chromosomes = ['chr%d' % d for d in range(1, 12)] | |
h_length = sum([len(human[chr]) for chr in h_chromosomes]) # 3'095'677'412 | |
b_length = sum([len(banana[chr]) for chr in b_chromosomes]) # 331'812'599 | |
print "Human genome length: %10d" % h_length | |
print "Banana genome length: %10d" % b_length | |
# ------------------------ Collect all 20-nucleotide reads from the banana DNA ------------------------ # | |
# (Instead of collecting reads into a set we could use a suffix tree. | |
# I tried that - although the preprocessing was faster, the search seemed slower in the end) | |
def generate_reads(s, read_size=10): | |
for i in range(0, len(s)-read_size+1): | |
yield str(s[i:(i+read_size)]) | |
# %%time | |
banana_reads = set() | |
for chr in b_chromosomes: | |
for r in progress.bar(generate_reads(banana[chr]), expected_size=len(banana[chr]), every=100000, label=chr + ': '): | |
if 'N' not in r: | |
banana_reads.add(r) | |
# Wall time: 35min 36s [5min 44s when using a suffix tree (http://www.daimi.au.dk/~mailund/suffix_tree.html) ] | |
print "Total reads in banana: ", len(banana_reads) | |
# Total reads in banana: 247429656 | |
# ------ Check human genes for "orthology" by counting how many of their reads have any banana counterparts ----------------- # | |
def gene_match(chr, start, end): | |
try: | |
seq = human[chr][start:end].upper() | |
total_reads = 0 | |
matched_reads = 0 | |
for r in generate_reads(seq): | |
if 'N' not in r: | |
if r in banana_reads: | |
matched_reads += 1 | |
total_reads += 1 | |
return matched_reads, total_reads | |
except KeyError: | |
return 0, 0 | |
matches = [gene_match(*g) for g in progress.bar(h_genes)] | |
## Time: 00:32:49 | |
# Save the results, just in case | |
import pickle | |
with open("/tmp/matches-20.pickle", "wb") as f: | |
pickle.dump(matches, f) | |
# ------ Compute the answer to the question ----------------- # | |
match_pct = [float(x)/y for (x,y),(g,s,e) in zip(matches,h_genes) if y != 0 and e-s >= 1000] | |
print "Number of genes in human genome with at least 5% match to banana genome:", len([m for m in match_pct if m >= 0.05]), "out of", len(match_pct) | |
# Number of genes in human genome with at least 5% match to banana genome: 33 out of 24624 | |
print sorted(match_pct)[-10:] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment