Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Recreational computational human bananology.
# Copyright (c) 2015, Konstantin Tretyakov
# http://fouryears.eu/2015/04/12/two-bananas-make-a-human/
# ------------------------ Imports and utility functions ---------------- #
import gzip
import csv
import urllib
from ucscgenome import Genome
from Bio import SeqIO
from clint.textui import progress
# ------------------------ Load data ------------------------ #
# Obtain the human genome
# (a ~800M download, unless you used the package before and have the genome cached already)
human = Genome('hg19')
# Obtain human gene annotations (4M)
urllib.urlretrieve("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownCanonical.txt.gz", "/tmp/knownCanonical.txt.gz")
with gzip.open("/tmp/knownCanonical.txt.gz") as f:
rows = list(csv.reader(f, delimiter='\t')) # chrom, txStart, txEnd, ...
h_genes = sorted({(r[0], int(r[1]), int(r[2])) for r in rows})
print "Number of unique, canonical protein-coding human gene regions: ", len(h_genes) # 31806
# Download the banana genome (~116M download) and parse it using BioPython
urllib.urlretrieve("http://banana-genome.cirad.fr/download/musa_pseudochromosome.fna.gz", "/tmp/banana.fna.gz")
banana = {r.id: r.seq for r in SeqIO.parse(gzip.open("/tmp/banana.fna.gz"), "fasta")}
# ------------------------ Compute genome sizes ------------------------ #
h_chromosomes = ['chr%d' % d for d in range(1, 23)] + ['chrX', 'chrY']
b_chromosomes = ['chr%d' % d for d in range(1, 12)]
h_length = sum([len(human[chr]) for chr in h_chromosomes]) # 3'095'677'412
b_length = sum([len(banana[chr]) for chr in b_chromosomes]) # 331'812'599
print "Human genome length: %10d" % h_length
print "Banana genome length: %10d" % b_length
# ------------------------ Collect all 20-nucleotide reads from the banana DNA ------------------------ #
# (Instead of collecting reads into a set we could use a suffix tree.
# I tried that - although the preprocessing was faster, the search seemed slower in the end)
def generate_reads(s, read_size=10):
for i in range(0, len(s)-read_size+1):
yield str(s[i:(i+read_size)])
# %%time
banana_reads = set()
for chr in b_chromosomes:
for r in progress.bar(generate_reads(banana[chr]), expected_size=len(banana[chr]), every=100000, label=chr + ': '):
if 'N' not in r:
banana_reads.add(r)
# Wall time: 35min 36s [5min 44s when using a suffix tree (http://www.daimi.au.dk/~mailund/suffix_tree.html) ]
print "Total reads in banana: ", len(banana_reads)
# Total reads in banana: 247429656
# ------ Check human genes for "orthology" by counting how many of their reads have any banana counterparts ----------------- #
def gene_match(chr, start, end):
try:
seq = human[chr][start:end].upper()
total_reads = 0
matched_reads = 0
for r in generate_reads(seq):
if 'N' not in r:
if r in banana_reads:
matched_reads += 1
total_reads += 1
return matched_reads, total_reads
except KeyError:
return 0, 0
matches = [gene_match(*g) for g in progress.bar(h_genes)]
## Time: 00:32:49
# Save the results, just in case
import pickle
with open("/tmp/matches-20.pickle", "wb") as f:
pickle.dump(matches, f)
# ------ Compute the answer to the question ----------------- #
match_pct = [float(x)/y for (x,y),(g,s,e) in zip(matches,h_genes) if y != 0 and e-s >= 1000]
print "Number of genes in human genome with at least 5% match to banana genome:", len([m for m in match_pct if m >= 0.05]), "out of", len(match_pct)
# Number of genes in human genome with at least 5% match to banana genome: 33 out of 24624
print sorted(match_pct)[-10:]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session.