casibbald/23andme_gene_synthesis.py

## 23andme_gene_synthesis.py
import argparse
import os
import zipfile
from collections import Counter


# gene_set = set(['rs2015343', 'rs12203592', 'rs4778136', 'rs9782955',
#                 'rs4778138', 'rs3739070', 'rs11074314', 'rs11631797',
#                 'rs12914687', 'rs1667394', 'rs3794604', 'rs3947367',
#                 'rs4778241', 'rs7495174',  'rs7170869',
#                 'rs12593929', 'rs4900109', 'rs2240203', 'rs7174027',
#                 'rs8028689', 'rs916977', 'rs3940272', 'rs3935591',
#                 'rs2703969', 'rs1533995', 'rs12906280', 'rs12896399',
#                 'rs1800414', 'rs10235789', 'rs12913832', 'rs746861',
#                 'rs7170852', 'rs16891982', 'rs7494942', 'rs11634406',
#                 'rs7183877', 'rs989869', 'rs2238289', 'rs1129038',
#                 'rs895828', 'rs1800407', 'rs1800404', 'rs9834312',
#                 'rs2179922', 'rs1867138', 'rs1867137', 'rs939882',
#                 'rs1042725', 'rs363050', 'rs174575', 'rs324640',
#                 'rs363039', 'rs363043', 'rs353016', ])

_23andme_header = """
# This data file generated by 23andMe at: Thu Dec 17 08:08:10 2015
#
# This file contains raw genotype data, including data that is not used in 23andMe reports.
# This data has undergone a general quality review however only a subset of markers have been
# individually validated for accuracy. As such, this data is suitable only for research,
# educational, and informational use and not for medical or other use.
#
# Below is a text version of your data.  Fields are TAB-separated
# Each line corresponds to a single SNP.  For each SNP, we provide its identifier
# (an rsid or an internal id), its location on the reference human genome, and the
# genotype call oriented with respect to the plus strand on the human reference sequence.
# We are using reference human assembly build 37 (also known as Annotation Release 104).
# Note that it is possible that data downloaded at different times may be different due to ongoing
# improvements in our ability to call genotypes. More information about these changes can be found at:
# https://www.23andme.com/you/download/revisions/
#
# More information on reference human assembly build 37 (aka Annotation Release 104):
# http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606
#
# rsid	chromosome	position	genotype
"""

descriptors = ["eye_colour", "melanin_intensity", "hair_colour",]

gene_discriptors = {"rs9782955": {"eye_colour": "Blocks some melanin. Often gives light colored eyes",
                                  "melanin_intensity": "Blocks some melanin. Often gives light colored eyes."
                                  },
                    "rs12203592": {"hair_colour": "associated with hair color, eye color, and tanning response to sunlight"},
                    "rs4778138": {"eye_colour":"",
                                  "melanin_intensity": "freckles more likely",},
                    "rs7495174": {"eye_colour": "AA - blue/gray eyes more likely associated with blue or green eye color in Caucasians"},
                    "rs1667394": {"eye_colour": "AA - increases susceptibility to Blond rather than brown hair 4.94 times for carriers of the A allele "},
                    "rs3794604": {"melanin_intensity": "CC - Blocks some melanin. Often gives light colored eyes."},
                    "rs3947367": {"hair_colour": "associated with red hair color"},
                    "rs4778241": {"eye_colour": "AC - usually brown eye color"},
                    "rs7174027": {"melanin_intensity": "Blocks some melanin. Often gives light colored eyes."},
                    "rs12896399": {"hair_colour": "GG - Light hair color"},
                    "rs1800414": {"melanin_intensity": "Blocks some melanine"},
                    "rs12913832": {"eye_colour": "GG - blue eye color, 99% of the time; AG - brown eye color; AA - brown eye color, 80% of the time"},
                    "rs16891982": {"hair_colour": "more likely to have black hair, Light skin; "},
                    "rs7183877": {"eye_colour": "CC - blue eye color if part of blue eye color haplotype; AA/AC - usually brown eye color"},
                    "rs989869": {"eye_colour": "CT - Contrasting sphincter around pupil."},
                    }


def write_gene_rslist(output_file):
    with open(output_file, mode='w') as f:

        while True:
            item = yield
            #print item
            f.write("{0} \n".format(item))


def get_rslist_from_zip(zip_file):
    new_gene_set = set([])
    with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
        with z.open(z.namelist()[0]) as f:
            for line in f:
                if not line.startswith("#"):
                    line_list = line.split()
                    new_gene_set.add(line_list[0])
    return new_gene_set


def get_text_from_zip(zip_file, gene_set):
    gene_dict = {}
    with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
        with z.open(z.namelist()[0]) as f:
            for line in f:
                if not line.startswith("#"):
                    line_list = line.split()
                    if line_list[0] in gene_set:
                         gene_dict[line_list[0]] = {
                           "chromosome"     : line_list[1],
                           "position"       : line_list[2],
                           "genotype"       : line_list[3],
                    }
    return gene_dict


def get_recessive_phenotypes(x_genotype, y_genotype):
    if x_genotype == '--':
        return y_genotype

    if y_genotype == '--':
        return x_genotype

    alleles = []
    for allele in x_genotype:
        alleles.append(allele)
    for allele in y_genotype:
        alleles.append(allele)
    count = Counter(alleles)

    if count[count.keys()[0]] == count[count.keys()[1]]:
        progressive_allele = "".join([count.keys()[0], count.keys()[1]])
        return progressive_allele
    else:
        recessive_allele = max(count, key=count.get)
        return recessive_allele * 2


def get_gene_matches(x, y, gene_set, output_file):
    recv = write_gene_rslist(output_file)
    recv.next()
    recv.send(_23andme_header)
    #print "{:<15} {:>10} {:<10} {:>10} {:>10} {:>10}".format("rsid", "chromosome", "position", "x_genotype", "y_genotype", "Progeny")

    for rs in gene_set:
        try:
            if x[rs] and y[rs]:
                if x[rs]["genotype"] == y[rs]["genotype"]:
                    print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]]))
                    recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]])))
                else:
                    print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"]))
                    recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"])))

        except:
            #print "{:<15} {:^10} {:<10} {:<10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"])
            #print "{:<15} {:^10} {:<10} {:<10}".format(rs, y[rs]["chromosome"], y[rs]["position"], y[rs]["genotype"])
            pass


parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='subparser_name', help='commands')

#synthesis command
gene_parser = subparsers.add_parser('synthesis', help='specify operation')
gene_parser.add_argument('-x', '--female', required=True, help='female 23andme zip file, do not extract')
gene_parser.add_argument('-y', '--male', required=True, help='male 23andme zip file, do not extract')
gene_parser.add_argument('-o', '--output', required=True, help='outputfile')


def runner():
    if parser.parse_args().subparser_name == 'synthesis':

        gene_set_x = get_rslist_from_zip(parser.parse_args().female)
        gene_set_y = get_rslist_from_zip(parser.parse_args().male)
        gene_set = gene_set_x | gene_set_y
        get_gene_matches(get_text_from_zip(parser.parse_args().female, gene_set),
                         get_text_from_zip(parser.parse_args().male, gene_set),
                         gene_set,
                         parser.parse_args().output)

if __name__ == "__main__":

    runner()
	import argparse
	import os
	import zipfile
	from collections import Counter


	# gene_set = set(['rs2015343', 'rs12203592', 'rs4778136', 'rs9782955',
	# 'rs4778138', 'rs3739070', 'rs11074314', 'rs11631797',
	# 'rs12914687', 'rs1667394', 'rs3794604', 'rs3947367',
	# 'rs4778241', 'rs7495174', 'rs7170869',
	# 'rs12593929', 'rs4900109', 'rs2240203', 'rs7174027',
	# 'rs8028689', 'rs916977', 'rs3940272', 'rs3935591',
	# 'rs2703969', 'rs1533995', 'rs12906280', 'rs12896399',
	# 'rs1800414', 'rs10235789', 'rs12913832', 'rs746861',
	# 'rs7170852', 'rs16891982', 'rs7494942', 'rs11634406',
	# 'rs7183877', 'rs989869', 'rs2238289', 'rs1129038',
	# 'rs895828', 'rs1800407', 'rs1800404', 'rs9834312',
	# 'rs2179922', 'rs1867138', 'rs1867137', 'rs939882',
	# 'rs1042725', 'rs363050', 'rs174575', 'rs324640',
	# 'rs363039', 'rs363043', 'rs353016', ])

	_23andme_header = """
	# This data file generated by 23andMe at: Thu Dec 17 08:08:10 2015
	#
	# This file contains raw genotype data, including data that is not used in 23andMe reports.
	# This data has undergone a general quality review however only a subset of markers have been
	# individually validated for accuracy. As such, this data is suitable only for research,
	# educational, and informational use and not for medical or other use.
	#
	# Below is a text version of your data. Fields are TAB-separated
	# Each line corresponds to a single SNP. For each SNP, we provide its identifier
	# (an rsid or an internal id), its location on the reference human genome, and the
	# genotype call oriented with respect to the plus strand on the human reference sequence.
	# We are using reference human assembly build 37 (also known as Annotation Release 104).
	# Note that it is possible that data downloaded at different times may be different due to ongoing
	# improvements in our ability to call genotypes. More information about these changes can be found at:
	# https://www.23andme.com/you/download/revisions/
	#
	# More information on reference human assembly build 37 (aka Annotation Release 104):
	# http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606
	#
	# rsid chromosome position genotype
	"""

	descriptors = ["eye_colour", "melanin_intensity", "hair_colour",]

	gene_discriptors = {"rs9782955": {"eye_colour": "Blocks some melanin. Often gives light colored eyes",
	"melanin_intensity": "Blocks some melanin. Often gives light colored eyes."
	},
	"rs12203592": {"hair_colour": "associated with hair color, eye color, and tanning response to sunlight"},
	"rs4778138": {"eye_colour":"",
	"melanin_intensity": "freckles more likely",},
	"rs7495174": {"eye_colour": "AA - blue/gray eyes more likely associated with blue or green eye color in Caucasians"},
	"rs1667394": {"eye_colour": "AA - increases susceptibility to Blond rather than brown hair 4.94 times for carriers of the A allele "},
	"rs3794604": {"melanin_intensity": "CC - Blocks some melanin. Often gives light colored eyes."},
	"rs3947367": {"hair_colour": "associated with red hair color"},
	"rs4778241": {"eye_colour": "AC - usually brown eye color"},
	"rs7174027": {"melanin_intensity": "Blocks some melanin. Often gives light colored eyes."},
	"rs12896399": {"hair_colour": "GG - Light hair color"},
	"rs1800414": {"melanin_intensity": "Blocks some melanine"},
	"rs12913832": {"eye_colour": "GG - blue eye color, 99% of the time; AG - brown eye color; AA - brown eye color, 80% of the time"},
	"rs16891982": {"hair_colour": "more likely to have black hair, Light skin; "},
	"rs7183877": {"eye_colour": "CC - blue eye color if part of blue eye color haplotype; AA/AC - usually brown eye color"},
	"rs989869": {"eye_colour": "CT - Contrasting sphincter around pupil."},
	}


	def write_gene_rslist(output_file):
	with open(output_file, mode='w') as f:

	while True:
	item = yield
	#print item
	f.write("{0} \n".format(item))


	def get_rslist_from_zip(zip_file):
	new_gene_set = set([])
	with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
	with z.open(z.namelist()[0]) as f:
	for line in f:
	if not line.startswith("#"):
	line_list = line.split()
	new_gene_set.add(line_list[0])
	return new_gene_set


	def get_text_from_zip(zip_file, gene_set):
	gene_dict = {}
	with zipfile.ZipFile(os.path.expanduser(zip_file)) as z:
	with z.open(z.namelist()[0]) as f:
	for line in f:
	if not line.startswith("#"):
	line_list = line.split()
	if line_list[0] in gene_set:
	gene_dict[line_list[0]] = {
	"chromosome" : line_list[1],
	"position" : line_list[2],
	"genotype" : line_list[3],
	}
	return gene_dict


	def get_recessive_phenotypes(x_genotype, y_genotype):
	if x_genotype == '--':
	return y_genotype

	if y_genotype == '--':
	return x_genotype

	alleles = []
	for allele in x_genotype:
	alleles.append(allele)
	for allele in y_genotype:
	alleles.append(allele)
	count = Counter(alleles)

	if count[count.keys()[0]] == count[count.keys()[1]]:
	progressive_allele = "".join([count.keys()[0], count.keys()[1]])
	return progressive_allele
	else:
	recessive_allele = max(count, key=count.get)
	return recessive_allele * 2


	def get_gene_matches(x, y, gene_set, output_file):
	recv = write_gene_rslist(output_file)
	recv.next()
	recv.send(_23andme_header)
	#print "{:<15} {:>10} {:<10} {:>10} {:>10} {:>10}".format("rsid", "chromosome", "position", "x_genotype", "y_genotype", "Progeny")

	for rs in gene_set:
	try:
	if x[rs] and y[rs]:
	if x[rs]["genotype"] == y[rs]["genotype"]:
	print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]]))
	recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], "".join([x[rs]["genotype"][:1], y[rs]["genotype"][1:]])))
	else:
	print "{:<15} {:^10} {:<10} {:>10} {:>10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"], y[rs]["genotype"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"]))
	recv.send("{:<15} {:^10} {:<10} {:>10}".format(rs, x[rs]["chromosome"], x[rs]["position"], get_recessive_phenotypes(x[rs]["genotype"], y[rs]["genotype"])))

	except:
	#print "{:<15} {:^10} {:<10} {:<10}".format(rs, x[rs]["chromosome"], x[rs]["position"], x[rs]["genotype"])
	#print "{:<15} {:^10} {:<10} {:<10}".format(rs, y[rs]["chromosome"], y[rs]["position"], y[rs]["genotype"])
	pass


	parser = argparse.ArgumentParser()
	subparsers = parser.add_subparsers(dest='subparser_name', help='commands')

	#synthesis command
	gene_parser = subparsers.add_parser('synthesis', help='specify operation')
	gene_parser.add_argument('-x', '--female', required=True, help='female 23andme zip file, do not extract')
	gene_parser.add_argument('-y', '--male', required=True, help='male 23andme zip file, do not extract')
	gene_parser.add_argument('-o', '--output', required=True, help='outputfile')



	def runner():
	if parser.parse_args().subparser_name == 'synthesis':

	gene_set_x = get_rslist_from_zip(parser.parse_args().female)
	gene_set_y = get_rslist_from_zip(parser.parse_args().male)
	gene_set = gene_set_x \| gene_set_y
	get_gene_matches(get_text_from_zip(parser.parse_args().female, gene_set),
	get_text_from_zip(parser.parse_args().male, gene_set),
	gene_set,
	parser.parse_args().output)

	if __name__ == "__main__":

	runner()