konstantint/gist:faa2207ccb8d12c2cad1

## gistfile1.py
# Recreational computational human bananology.
# Copyright (c) 2015, Konstantin Tretyakov
# http://fouryears.eu/2015/04/12/two-bananas-make-a-human/

# ------------------------ Imports and utility functions ---------------- #
import gzip
import csv
import urllib
from ucscgenome import Genome
from Bio import SeqIO
from clint.textui import progress

# ------------------------ Load data ------------------------ #
# Obtain the human genome
# (a ~800M download, unless you used the package before and have the genome cached already)
human = Genome('hg19')

# Obtain human gene annotations (4M)
urllib.urlretrieve("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownCanonical.txt.gz", "/tmp/knownCanonical.txt.gz")
with gzip.open("/tmp/knownCanonical.txt.gz") as f:
    rows = list(csv.reader(f, delimiter='\t'))  # chrom, txStart, txEnd, ...
    h_genes = sorted({(r[0], int(r[1]), int(r[2])) for r in rows})
    print "Number of unique, canonical protein-coding human gene regions: ", len(h_genes)   # 31806

# Download the banana genome (~116M download) and parse it using BioPython
urllib.urlretrieve("http://banana-genome.cirad.fr/download/musa_pseudochromosome.fna.gz", "/tmp/banana.fna.gz")
banana = {r.id: r.seq for r in SeqIO.parse(gzip.open("/tmp/banana.fna.gz"), "fasta")}

# ------------------------ Compute genome sizes ------------------------ #
h_chromosomes = ['chr%d' % d for d in range(1, 23)] + ['chrX', 'chrY']
b_chromosomes = ['chr%d' % d for d in range(1, 12)]
h_length = sum([len(human[chr]) for chr in h_chromosomes])   # 3'095'677'412
b_length = sum([len(banana[chr]) for chr in b_chromosomes])  #   331'812'599
print "Human genome length:  %10d" % h_length
print "Banana genome length: %10d" % b_length

# ------------------------ Collect all 20-nucleotide reads from the banana DNA ------------------------ #
# (Instead of collecting reads into a set we could use a suffix tree.
#  I tried that - although the preprocessing was faster, the search seemed slower in the end)

def generate_reads(s, read_size=10):
    for i in range(0, len(s)-read_size+1):
        yield str(s[i:(i+read_size)])

# %%time
banana_reads = set()
for chr in b_chromosomes:
    for r in progress.bar(generate_reads(banana[chr]), expected_size=len(banana[chr]), every=100000, label=chr + ': '):
        if 'N' not in r:
            banana_reads.add(r)
# Wall time: 35min 36s [5min 44s when using a suffix tree (http://www.daimi.au.dk/~mailund/suffix_tree.html) ]

print "Total reads in banana: ", len(banana_reads)
# Total reads in banana:  247429656


# ------ Check human genes for "orthology" by counting how many of their reads have any banana counterparts ----------------- #
def gene_match(chr, start, end):
    try:
        seq = human[chr][start:end].upper()
        total_reads = 0
        matched_reads = 0
        for r in generate_reads(seq):
            if 'N' not in r:
                if r in banana_reads:
                    matched_reads += 1
                total_reads += 1
        return matched_reads, total_reads
    except KeyError:
        return 0, 0

matches = [gene_match(*g) for g in progress.bar(h_genes)]
## Time: 00:32:49

# Save the results, just in case
import pickle
with open("/tmp/matches-20.pickle", "wb") as f:
    pickle.dump(matches, f)

# ------ Compute the answer to the question ----------------- #
match_pct = [float(x)/y for (x,y),(g,s,e) in zip(matches,h_genes) if y != 0 and e-s >= 1000]
print "Number of genes in human genome with at least 5% match to banana genome:", len([m for m in match_pct if m >= 0.05]), "out of", len(match_pct)
# Number of genes in human genome with at least 5% match to banana genome: 33 out of 24624
print sorted(match_pct)[-10:]
	# Recreational computational human bananology.
	# Copyright (c) 2015, Konstantin Tretyakov
	# http://fouryears.eu/2015/04/12/two-bananas-make-a-human/

	# ------------------------ Imports and utility functions ---------------- #
	import gzip
	import csv
	import urllib
	from ucscgenome import Genome
	from Bio import SeqIO
	from clint.textui import progress

	# ------------------------ Load data ------------------------ #
	# Obtain the human genome
	# (a ~800M download, unless you used the package before and have the genome cached already)
	human = Genome('hg19')

	# Obtain human gene annotations (4M)
	urllib.urlretrieve("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/knownCanonical.txt.gz", "/tmp/knownCanonical.txt.gz")
	with gzip.open("/tmp/knownCanonical.txt.gz") as f:
	rows = list(csv.reader(f, delimiter='\t')) # chrom, txStart, txEnd, ...
	h_genes = sorted({(r[0], int(r[1]), int(r[2])) for r in rows})
	print "Number of unique, canonical protein-coding human gene regions: ", len(h_genes) # 31806

	# Download the banana genome (~116M download) and parse it using BioPython
	urllib.urlretrieve("http://banana-genome.cirad.fr/download/musa_pseudochromosome.fna.gz", "/tmp/banana.fna.gz")
	banana = {r.id: r.seq for r in SeqIO.parse(gzip.open("/tmp/banana.fna.gz"), "fasta")}

	# ------------------------ Compute genome sizes ------------------------ #
	h_chromosomes = ['chr%d' % d for d in range(1, 23)] + ['chrX', 'chrY']
	b_chromosomes = ['chr%d' % d for d in range(1, 12)]
	h_length = sum([len(human[chr]) for chr in h_chromosomes]) # 3'095'677'412
	b_length = sum([len(banana[chr]) for chr in b_chromosomes]) # 331'812'599
	print "Human genome length: %10d" % h_length
	print "Banana genome length: %10d" % b_length

	# ------------------------ Collect all 20-nucleotide reads from the banana DNA ------------------------ #
	# (Instead of collecting reads into a set we could use a suffix tree.
	# I tried that - although the preprocessing was faster, the search seemed slower in the end)

	def generate_reads(s, read_size=10):
	for i in range(0, len(s)-read_size+1):
	yield str(s[i:(i+read_size)])

	# %%time
	banana_reads = set()
	for chr in b_chromosomes:
	for r in progress.bar(generate_reads(banana[chr]), expected_size=len(banana[chr]), every=100000, label=chr + ': '):
	if 'N' not in r:
	banana_reads.add(r)
	# Wall time: 35min 36s [5min 44s when using a suffix tree (http://www.daimi.au.dk/~mailund/suffix_tree.html) ]

	print "Total reads in banana: ", len(banana_reads)
	# Total reads in banana: 247429656


	# ------ Check human genes for "orthology" by counting how many of their reads have any banana counterparts ----------------- #
	def gene_match(chr, start, end):
	try:
	seq = human[chr][start:end].upper()
	total_reads = 0
	matched_reads = 0
	for r in generate_reads(seq):
	if 'N' not in r:
	if r in banana_reads:
	matched_reads += 1
	total_reads += 1
	return matched_reads, total_reads
	except KeyError:
	return 0, 0

	matches = [gene_match(*g) for g in progress.bar(h_genes)]
	## Time: 00:32:49

	# Save the results, just in case
	import pickle
	with open("/tmp/matches-20.pickle", "wb") as f:
	pickle.dump(matches, f)

	# ------ Compute the answer to the question ----------------- #
	match_pct = [float(x)/y for (x,y),(g,s,e) in zip(matches,h_genes) if y != 0 and e-s >= 1000]
	print "Number of genes in human genome with at least 5% match to banana genome:", len([m for m in match_pct if m >= 0.05]), "out of", len(match_pct)
	# Number of genes in human genome with at least 5% match to banana genome: 33 out of 24624
	print sorted(match_pct)[-10:]