Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Synthesise genes for a potential child from two 23andme profiles. This is a work in progress and my still have issues.
import argparse
import os
import zipfile
from collections import Counter
# gene_set = set(['rs2015343', 'rs12203592', 'rs4778136', 'rs9782955',
# 'rs4778138', 'rs3739070', 'rs11074314', 'rs11631797',
# 'rs12914687', 'rs1667394', 'rs3794604', 'rs3947367',
# 'rs4778241', 'rs7495174', 'rs7170869',
# 'rs12593929', 'rs4900109', 'rs2240203', 'rs7174027',
# 'rs8028689', 'rs916977', 'rs3940272', 'rs3935591',
# 'rs2703969', 'rs1533995', 'rs12906280', 'rs12896399',
# 'rs1800414', 'rs10235789', 'rs12913832', 'rs746861',
# 'rs7170852', 'rs16891982', 'rs7494942', 'rs11634406',
# 'rs7183877', 'rs989869', 'rs2238289', 'rs1129038',
# 'rs895828', 'rs1800407', 'rs1800404', 'rs9834312',
# 'rs2179922', 'rs1867138', 'rs1867137', 'rs939882',
# 'rs1042725', 'rs363050', 'rs174575', 'rs324640',
# 'rs363039', 'rs363043', 'rs353016', ])
_23andme_header = """
# This data file generated by 23andMe at: Thu Dec 17 08:08:10 2015
#
# This file contains raw genotype data, including data that is not used in 23andMe reports.
# This data has undergone a general quality review however only a subset of markers have been
# individually validated for accuracy. As such, this data is suitable only for research,
# educational, and informational use and not for medical or other use.
#
# Below is a text version of your data. Fields are TAB-separated
# Each line corresponds to a single SNP. For each SNP, we provide its identifier
# (an rsid or an internal id), its location on the reference human genome, and the
# genotype call oriented with respect to the plus strand on the human reference sequence.
# We are using reference human assembly build 37 (also known as Annotation Release 104).
# Note that it is possible that data downloaded at different times may be different due to ongoing
# improvements in our ability to call genotypes. More information about these changes can be found at:
# https://www.23andme.com/you/download/revisions/
#
# More information on reference human assembly build 37 (aka Annotation Release 104):
# http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606
#
# rsid chromosome position genotype
"""
descriptors = ["eye_colour", "melanin_intensity", "hair_colour",]
gene_discriptors = {"rs9782955": {"eye_colour": "Blocks some melanin. Often gives light colored eyes",
"melanin_intensity": "Blocks some melanin. Often gives light colored eyes."
},
"rs12203592": {"hair_colour": "associated with hair color, eye color, and tanning response to sunlight"},
"rs4778138": {"eye_colour":"",
"melanin_intensity": "freckles more likely",},
"rs7495174": {"eye_colour": "AA - blue/gray eyes more likely associated with blue or green eye color in Caucasians"},
"rs1667394": {"eye_colour": "AA - increases susceptibility to Blond rather than brown hair 4.94 times for carriers of the A allele "},
"rs3794604": {"melanin_intensity": "CC - Blocks some melanin. Often gives light colored eyes."},
"rs3947367": {"hair_colour": "associated with red hair color"},
"rs4778241": {"eye_colour": "AC - usually brown eye color"},
"rs7174027": {"melanin_intensity": "Blocks some melanin. Often gives light colored eyes."},
"rs12896399": {"hair_colour": "GG - Light hair color"},
"rs1800414": {"melanin_intensity": "Blocks some melanine"},
"rs12913832": {"eye_colour": "GG - blue eye color, 99% of the time; AG - brown eye color; AA - brown eye color, 80% of the time"},
"rs16891982": {"hair_colour": "more likely to have black hair, Light skin; "},
"rs7183877": {"eye_colour": "CC - blue eye color if part of blue eye color haplotype; AA/AC - usually brown eye color"},
"rs989869": {"eye_colour": "CT - Contrasting sphincter around pupil."},
}
def write_gene_rslist(output_file):
with open(output_file, mode='w') as f:
while True:
item = yield
#print item
f.write("{0} \n".format(item))
def get_rslist_from_zip(zip_file):
new_gene_set = set([])
with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
with z.open(z.namelist()[0]) as f:
for line in f:
if not line.startswith("#"):
line_list = line.split()
new_gene_set.add(line_list[0])
return new_gene_set
def get_text_from_zip(zip_file, gene_set):
gene_dict = {}
with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
with z.open(z.namelist()[0]) as f:
for line in f:
if not line.startswith("#"):
line_list = line.split()
if line_list[0] in gene_set:
gene_dict[line_list[0]] = {
"chromosome" : line_list[1],
"position" : line_list[2],
"genotype" : line_list[3],
}
return gene_dict
def get_recessive_phenotypes(x_genotype, y_genotype):
if x_genotype == '--':
return y_genotype
if y_genotype == '--':
return x_genotype
alleles = []
for allele in x_genotype:
alleles.append(allele)
for allele in y_genotype:
alleles.append(allele)
count = Counter(alleles)
if count[count.keys()[0]] == count[count.keys()[1]]:
progressive_allele = "".join([count.keys()[0], count.keys()[1]])
return progressive_allele
else:
recessive_allele = max(count, key=count.get)
return recessive_allele * 2
def get_gene_matches(x, y, gene_set, output_file):
recv = write_gene_rslist(output_file)
recv.next()
recv.send(_23andme_header)
#print "{:<15} {:>10} {:<10} {:>10} {:>10} {:>10}".format("rsid", "chromosome", "position", "x_genotype", "y_genotype", "Progeny")
for rs in gene_set:
try:
if x[rs] and y[rs]:
if x[rs]["genotype"] == y[rs]["genotype"]:
print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]]))
recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]])))
else:
print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"]))
recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"])))
except:
#print "{:<15} {:^10} {:<10} {:<10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"])
#print "{:<15} {:^10} {:<10} {:<10}".format(rs, y[rs]["chromosome"], y[rs]["position"], y[rs]["genotype"])
pass
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='subparser_name', help='commands')
#synthesis command
gene_parser = subparsers.add_parser('synthesis', help='specify operation')
gene_parser.add_argument('-x', '--female', required=True, help='female 23andme zip file, do not extract')
gene_parser.add_argument('-y', '--male', required=True, help='male 23andme zip file, do not extract')
gene_parser.add_argument('-o', '--output', required=True, help='outputfile')
def runner():
if parser.parse_args().subparser_name == 'synthesis':
gene_set_x = get_rslist_from_zip(parser.parse_args().female)
gene_set_y = get_rslist_from_zip(parser.parse_args().male)
gene_set = gene_set_x | gene_set_y
get_gene_matches(get_text_from_zip(parser.parse_args().female, gene_set),
get_text_from_zip(parser.parse_args().male, gene_set),
gene_set,
parser.parse_args().output)
if __name__ == "__main__":
runner()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment