Skip to content

Instantly share code, notes, and snippets.

@konstantint
Created April 12, 2015 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save konstantint/faa2207ccb8d12c2cad1 to your computer and use it in GitHub Desktop.
Save konstantint/faa2207ccb8d12c2cad1 to your computer and use it in GitHub Desktop.
# Recreational computational human bananology.
# Copyright (c) 2015, Konstantin Tretyakov
# http://fouryears.eu/2015/04/12/two-bananas-make-a-human/
# ------------------------ Imports and utility functions ---------------- #
import gzip
import csv
import urllib
from ucscgenome import Genome
from Bio import SeqIO
from clint.textui import progress
# ------------------------ Load data ------------------------ #
# Obtain the human genome
# (a ~800M download, unless you used the package before and have the genome cached already)
human = Genome('hg19')
# Obtain human gene annotations (4M)
urllib.urlretrieve("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownCanonical.txt.gz", "/tmp/knownCanonical.txt.gz")
with gzip.open("/tmp/knownCanonical.txt.gz") as f:
rows = list(csv.reader(f, delimiter='\t')) # chrom, txStart, txEnd, ...
h_genes = sorted({(r[0], int(r[1]), int(r[2])) for r in rows})
print "Number of unique, canonical protein-coding human gene regions: ", len(h_genes) # 31806
# Download the banana genome (~116M download) and parse it using BioPython
urllib.urlretrieve("http://banana-genome.cirad.fr/download/musa_pseudochromosome.fna.gz", "/tmp/banana.fna.gz")
banana = {r.id: r.seq for r in SeqIO.parse(gzip.open("/tmp/banana.fna.gz"), "fasta")}
# ------------------------ Compute genome sizes ------------------------ #
h_chromosomes = ['chr%d' % d for d in range(1, 23)] + ['chrX', 'chrY']
b_chromosomes = ['chr%d' % d for d in range(1, 12)]
h_length = sum([len(human[chr]) for chr in h_chromosomes]) # 3'095'677'412
b_length = sum([len(banana[chr]) for chr in b_chromosomes]) # 331'812'599
print "Human genome length: %10d" % h_length
print "Banana genome length: %10d" % b_length
# ------------------------ Collect all 20-nucleotide reads from the banana DNA ------------------------ #
# (Instead of collecting reads into a set we could use a suffix tree.
# I tried that - although the preprocessing was faster, the search seemed slower in the end)
def generate_reads(s, read_size=10):
for i in range(0, len(s)-read_size+1):
yield str(s[i:(i+read_size)])
# %%time
banana_reads = set()
for chr in b_chromosomes:
for r in progress.bar(generate_reads(banana[chr]), expected_size=len(banana[chr]), every=100000, label=chr + ': '):
if 'N' not in r:
banana_reads.add(r)
# Wall time: 35min 36s [5min 44s when using a suffix tree (http://www.daimi.au.dk/~mailund/suffix_tree.html) ]
print "Total reads in banana: ", len(banana_reads)
# Total reads in banana: 247429656
# ------ Check human genes for "orthology" by counting how many of their reads have any banana counterparts ----------------- #
def gene_match(chr, start, end):
try:
seq = human[chr][start:end].upper()
total_reads = 0
matched_reads = 0
for r in generate_reads(seq):
if 'N' not in r:
if r in banana_reads:
matched_reads += 1
total_reads += 1
return matched_reads, total_reads
except KeyError:
return 0, 0
matches = [gene_match(*g) for g in progress.bar(h_genes)]
## Time: 00:32:49
# Save the results, just in case
import pickle
with open("/tmp/matches-20.pickle", "wb") as f:
pickle.dump(matches, f)
# ------ Compute the answer to the question ----------------- #
match_pct = [float(x)/y for (x,y),(g,s,e) in zip(matches,h_genes) if y != 0 and e-s >= 1000]
print "Number of genes in human genome with at least 5% match to banana genome:", len([m for m in match_pct if m >= 0.05]), "out of", len(match_pct)
# Number of genes in human genome with at least 5% match to banana genome: 33 out of 24624
print sorted(match_pct)[-10:]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment