slowkow/fix_alleles.py

## fix_alleles.py
#!/usr/bin/env python
"""
Kamil Slowikowski

fix_alleles.py
==============

Change the alleles in a dosage file to match the alleles specified in
a separate reference file. If a variant is not present in the reference file,
then it will not be changed.

Usage
-----

    python fix_alleles.py alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > output.dosage

Example
-------

Suppose that we want our variants to have these alleles:

    cat alleles.txt | column -t
    SNP   A1  A2
    snp7  A   T
    snp8  G   C
    snp9  A   T

However, our dosage file looks like this:

    cat dosage.txt | column -t
    SNP   A1  A2  FIID1  IID1
    snp7  A   T   0.97   0.03
    snp9  C   G   0.96   0.04

Notice that snp9 has genotype A/T in one file and C/G in the other file.

We can use this script to output a new dosage file with fixed alleles:

    python fix_alleles.py alleles.txt dosage.txt | column -t
    SNP   A1  A2  FIID1  IID1
    snp7  A   T   0.97   0.03
    snp9  A   T   0.96   0.04

This new dosage file has genotype A/T for snp9.

LICENSE

This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
"""

import sys

def main():
    # File names of two dosage files.
    in_f1 = sys.argv[1]
    in_f2 = sys.argv[2]

    # Generators that produce a record for each line in the file.
    in1 = allele_records(open_file(in_f1))
    in2 = dosage_records(open_file(in_f2))
    d1 = {}
    d2 = {}
    done1 = False
    done2 = False

    while not done1 or not done2:
        # Try to read a record from the alleles file.
        try:
            n1, a11, a21 = next(in1)
            if n1:
                d1[n1] = (a11, a21)
            else:
                done1 = True
        except:
            done1 = True

        # Try to read a record from the dosage file.
        try:
            n2, a12, a22, r2 = next(in2)
            if n2:
                d2[n2] = (a12, a22, r2)
            else:
                done2 = True
        except:
            done2 = True

        # If the record is present in both files, print it.
        if not done1 and n1 in d2:
            sys.stdout.write(
                n1 + ' ' + d1[n1][0]  + ' ' + d1[n1][1] + d2[n1][2] + '\n')
            del d1[n1]
            del d2[n1]
        if not done2 and n2 in d1:
            # sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
            sys.stdout.write(
                n2 + ' ' + d1[n2][0]  + ' ' + d1[n2][1] + d2[n2][2] + '\n')
            del d1[n2]
            del d2[n2]

    # Check the remaining variants left over after we finish reading files.
    for n2 in d2.keys():
        if n2 in d1:
            sys.stdout.write(
                n2 + ' ' + d1[n2][0]  + ' ' + d1[n2][1] + d2[n2][2] + '\n')
        else:
            sys.stdout.write(
                n2 + ' ' + d2[n2][0]  + ' ' + d2[n2][1] + d2[n2][2] + '\n')

def allele_records(file_handle):
    while True:
        try:
            # Strip the newline character at the end of the line.
            line = file_handle.readline().rstrip()
        except:
            break
        # Break a line into three parts:
        #
        #                [a1] [a2]
        #                   \ /
        #   [       name  ]
        #   chr6_24020741_D D I3
        name, a1, a2 = line.split()
        yield (name, a1, a2)
    # Close the file after we're done reading from it.
    file_handle.close()

def dosage_records(file_handle):
    while True:
        try:
            # Strip the newline character at the end of the line.
            line = file_handle.readline().rstrip()
        except:
            break
        # Break a dosage line into four parts:
        #
        #                [a1] [a2]
        #                   \ /
        #   [       name  ]     [      record      ]
        #   chr6_24020741_D D I3 0 0.003 0 0.025 0 0
        space = find_nth(line, ' ', 3)
        name, a1, a2 = line[:space].split()
        record = line[space:]
        yield (name, a1, a2, record)
    # Close the file after we're done reading from it.
    file_handle.close()

def find_nth(haystack, needle, n):
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start

def open_file(filename):
    # Magic strings for gzip and bzip2.
    magic_dict = {
        "\x1f\x8b\x08": "gz",
        "\x42\x5a\x68": "bz2"
    }
    max_len = max(len(x) for x in magic_dict)

    # Read the start of the file.
    with open(filename) as f:
        file_start = f.read(max_len)

    # Look for a match.
    for magic, filetype in magic_dict.items():
        if file_start.startswith(magic):
            if filetype == "gz":
                import gzip
                return gzip.open(filename)
            if filetype == "bz2":
                import bz2
                return bz2.BZ2File(filename)

    return open(filename)

if __name__ == '__main__':
    # Check for valid arguments.
    usage = """
    Usage: python {} alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > out.dosage
    """
    if not len(sys.argv) == 3:
        sys.stderr.write(usage.format(__file__))
        sys.exit(1)

    # Run!
    main()

## join_dosage_files.py
#!/usr/bin/env python
"""
Kamil Slowikowski
https://slowkow.com/notes/join-plink-dosage-files

join_dosage_files.py
====================

Join multiple dosage files into one file. The resulting joined file will
contain all of the dosage data from two dosage files.

When dosage file A contains variants that are absent from dosage file B, the
joined file will contain NA values 'NA'. In other words, this script does an
"outer join" using the union of variants from both files.

Usage
-----

    python join_dosage_files.py file.dosage[.gz|.bz2] file.dosage[.gz|.bz2] > output.txt

Dosage file format
------------------

A dosage file is a space-delimited table with a header.

Here's an example (with increased spacing for legibility):

       SNP  A1  A2     F1   I1     F2   I2      F3   I3
    rs0001   A   C   0.98 0.02   1.00 0.00    0.00 0.01
    rs0002   G   A   0.00 1.00   0.00 0.00    0.99 0.01

-   We have data for two SNPs (rs0001, rs0002) on three individuals (1, 2, 3).
-   Each genotype is represented by two numbers.
-   The two numbers for the first SNP represent the probabilities
    of A/A (P = 0.98) and A/C (P = 0.02) genotypes in individual 1.

Example
-------

    zcat dos1.txt.gz | column -t
    SNP   A1  A2  F1    I1     F2   I2
    snp1  A   T   0     0      0    0.001
    snp2  G   C   0.98  0.02   0.5  0.5
    snp3  C   T   0     0.089  0    0.167
    snp5  T   G   0.1   0.9    0.2  0.8

    bzcat dos2.txt.bz2 | column -t
    SNP   A1  A2  F1    I1    F2    I2
    snp1  A   T   0.1   0.9   0.01  0.99
    snp2  G   C   0.89  0.11  0.15  0.85
    snp3  C   T   1     0     1     0
    snp4  A   C   0.9   0.1   0.8   0.2

    python join_dosage_files.py dos1.txt.gz dos2.txt.bz2 | column -t
    SNP   A1  A2  F1    I1     F2   I2     F1    I1    F2    I2
    snp1  A   T   0     0      0    0.001  0.1   0.9   0.01  0.99
    snp2  G   C   0.98  0.02   0.5  0.5    0.89  0.11  0.15  0.85
    snp3  C   T   0     0.089  0    0.167  1     0     1     0
    snp5  T   G   0.1   0.9    0.2  0.8    NA    NA    NA    NA
    snp4  A   C   NA    NA     NA   NA     0.9   0.1   0.8   0.2

LICENSE

This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
"""

import sys

def main():
    # File names of two dosage files.
    in_f1 = sys.argv[1]
    in_f2 = sys.argv[2]

    # Generators that produce a record for each line in the file.
    in1 = dosage_records(open_file(in_f1))
    in2 = dosage_records(open_file(in_f2))
    d1 = {}
    d2 = {}
    done1 = False
    done2 = False

    # Number of columns in each file.
    num1 = None
    num2 = None

    # Print records that appear in both file1 and file2.
    while not done1 or not done2:
        # Try to read a record from the first file.
        try:
            n1, r1 = next(in1)
            if n1:
                d1[n1] = r1
            else:
                done1 = True
            if not num1:
                num1 = len(r1.split())
        except:
            done1 = True

        # Try to read a record from the second file.
        try:
            n2, r2 = next(in2)
            if n2:
                d2[n2] = r2
            else:
                done2 = True
            if not num2:
                num2 = len(r2.split())
        except:
            done2 = True

        # If the record is present in both files, print it.
        if not done1 and n1 in d2:
            sys.stdout.write(n1 + d1[n1] + d2[n1] + '\n')
            del d1[n1]
            del d2[n1]
        if not done2 and n2 in d1:
            sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
            del d1[n2]
            del d2[n2]

    # Print records that appear only in one file.
    for n1 in d1.keys():
        sys.stdout.write(n1 + d1[n1] + ' NA' * num2 + '\n')
    for n2 in d2.keys():
        sys.stdout.write(n2 + ' NA' * num1 + d2[n2] + '\n')

def dosage_records(file_handle):
    while True:
        try:
            # Strip the newline character at the end of the line.
            line = file_handle.readline().rstrip()
        except:
            break
        # Break a dosage line into two parts at the 3rd space:
        #
        #   [       name       ][      record      ]
        #   chr6_24020741_D D I3 0 0.003 0 0.025 0 0
        #                       ^
        #                       third space
        space = find_nth(line, ' ', 3)
        name = line[:space]
        record = line[space:]
        yield (name, record)
    # Close the file after we're done reading from it.
    file_handle.close()

def find_nth(haystack, needle, n):
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start

def open_file(filename):
    # Magic strings for gzip and bzip2.
    magic_dict = {
        "\x1f\x8b\x08": "gz",
        "\x42\x5a\x68": "bz2"
    }
    max_len = max(len(x) for x in magic_dict)

    # Read the start of the file.
    with open(filename) as f:
        file_start = f.read(max_len)

    # Look for a match.
    for magic, filetype in magic_dict.items():
        if file_start.startswith(magic):
            if filetype == "gz":
                import gzip
                return gzip.open(filename)
            if filetype == "bz2":
                import bz2
                return bz2.BZ2File(filename)

    return open(filename)

if __name__ == '__main__':
    # Check for valid arguments.
    usage = """
    Usage: python {} file1.dosage[.gz|.bz2] file2.dosage[.gz|.bz2] > out.txt
    """
    if not len(sys.argv) == 3:
        sys.stderr.write(usage.format(__file__))
        sys.exit(1)

    # Run!
    main()
	#!/usr/bin/env python
	"""
	Kamil Slowikowski

	fix_alleles.py
	==============

	Change the alleles in a dosage file to match the alleles specified in
	a separate reference file. If a variant is not present in the reference file,
	then it will not be changed.

	Usage
	-----

	python fix_alleles.py alleles.txt[.gz\|.bz2] file.dosage[.gz\|.bz2] > output.dosage

	Example
	-------

	Suppose that we want our variants to have these alleles:

	cat alleles.txt \| column -t
	SNP A1 A2
	snp7 A T
	snp8 G C
	snp9 A T

	However, our dosage file looks like this:

	cat dosage.txt \| column -t
	SNP A1 A2 FIID1 IID1
	snp7 A T 0.97 0.03
	snp9 C G 0.96 0.04

	Notice that snp9 has genotype A/T in one file and C/G in the other file.

	We can use this script to output a new dosage file with fixed alleles:

	python fix_alleles.py alleles.txt dosage.txt \| column -t
	SNP A1 A2 FIID1 IID1
	snp7 A T 0.97 0.03
	snp9 A T 0.96 0.04

	This new dosage file has genotype A/T for snp9.

	LICENSE

	This is free and unencumbered software released into the public domain.
	Anyone is free to copy, modify, publish, use, compile, sell, or
	distribute this software, either in source code form or as a compiled
	binary, for any purpose, commercial or non-commercial, and by any
	means.

	In jurisdictions that recognize copyright laws, the author or authors
	of this software dedicate any and all copyright interest in the
	software to the public domain. We make this dedication for the benefit
	of the public at large and to the detriment of our heirs and
	successors. We intend this dedication to be an overt act of
	relinquishment in perpetuity of all present and future rights to this
	software under copyright law.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
	OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
	ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	OTHER DEALINGS IN THE SOFTWARE.

	For more information, please refer to <http://unlicense.org/>
	"""

	import sys

	def main():
	# File names of two dosage files.
	in_f1 = sys.argv[1]
	in_f2 = sys.argv[2]

	# Generators that produce a record for each line in the file.
	in1 = allele_records(open_file(in_f1))
	in2 = dosage_records(open_file(in_f2))
	d1 = {}
	d2 = {}
	done1 = False
	done2 = False

	while not done1 or not done2:
	# Try to read a record from the alleles file.
	try:
	n1, a11, a21 = next(in1)
	if n1:
	d1[n1] = (a11, a21)
	else:
	done1 = True
	except:
	done1 = True

	# Try to read a record from the dosage file.
	try:
	n2, a12, a22, r2 = next(in2)
	if n2:
	d2[n2] = (a12, a22, r2)
	else:
	done2 = True
	except:
	done2 = True

	# If the record is present in both files, print it.
	if not done1 and n1 in d2:
	sys.stdout.write(
	n1 + ' ' + d1[n1][0] + ' ' + d1[n1][1] + d2[n1][2] + '\n')
	del d1[n1]
	del d2[n1]
	if not done2 and n2 in d1:
	# sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
	sys.stdout.write(
	n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n')
	del d1[n2]
	del d2[n2]

	# Check the remaining variants left over after we finish reading files.
	for n2 in d2.keys():
	if n2 in d1:
	sys.stdout.write(
	n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n')
	else:
	sys.stdout.write(
	n2 + ' ' + d2[n2][0] + ' ' + d2[n2][1] + d2[n2][2] + '\n')

	def allele_records(file_handle):
	while True:
	try:
	# Strip the newline character at the end of the line.
	line = file_handle.readline().rstrip()
	except:
	break
	# Break a line into three parts:
	#
	# [a1] [a2]
	# \ /
	# [ name ]
	# chr6_24020741_D D I3
	name, a1, a2 = line.split()
	yield (name, a1, a2)
	# Close the file after we're done reading from it.
	file_handle.close()

	def dosage_records(file_handle):
	while True:
	try:
	# Strip the newline character at the end of the line.
	line = file_handle.readline().rstrip()
	except:
	break
	# Break a dosage line into four parts:
	#
	# [a1] [a2]
	# \ /
	# [ name ] [ record ]
	# chr6_24020741_D D I3 0 0.003 0 0.025 0 0
	space = find_nth(line, ' ', 3)
	name, a1, a2 = line[:space].split()
	record = line[space:]
	yield (name, a1, a2, record)
	# Close the file after we're done reading from it.
	file_handle.close()

	def find_nth(haystack, needle, n):
	start = haystack.find(needle)
	while start >= 0 and n > 1:
	start = haystack.find(needle, start + len(needle))
	n -= 1
	return start

	def open_file(filename):
	# Magic strings for gzip and bzip2.
	magic_dict = {
	"\x1f\x8b\x08": "gz",
	"\x42\x5a\x68": "bz2"
	}
	max_len = max(len(x) for x in magic_dict)

	# Read the start of the file.
	with open(filename) as f:
	file_start = f.read(max_len)

	# Look for a match.
	for magic, filetype in magic_dict.items():
	if file_start.startswith(magic):
	if filetype == "gz":
	import gzip
	return gzip.open(filename)
	if filetype == "bz2":
	import bz2
	return bz2.BZ2File(filename)

	return open(filename)

	if __name__ == '__main__':
	# Check for valid arguments.
	usage = """
	Usage: python {} alleles.txt[.gz\|.bz2] file.dosage[.gz\|.bz2] > out.dosage
	"""
	if not len(sys.argv) == 3:
	sys.stderr.write(usage.format(__file__))
	sys.exit(1)

	# Run!
	main()
	#!/usr/bin/env python
	"""
	Kamil Slowikowski
	https://slowkow.com/notes/join-plink-dosage-files

	join_dosage_files.py
	====================

	Join multiple dosage files into one file. The resulting joined file will
	contain all of the dosage data from two dosage files.

	When dosage file A contains variants that are absent from dosage file B, the
	joined file will contain NA values 'NA'. In other words, this script does an
	"outer join" using the union of variants from both files.

	Usage
	-----

	python join_dosage_files.py file.dosage[.gz\|.bz2] file.dosage[.gz\|.bz2] > output.txt

	Dosage file format
	------------------

	A dosage file is a space-delimited table with a header.

	Here's an example (with increased spacing for legibility):

	SNP A1 A2 F1 I1 F2 I2 F3 I3
	rs0001 A C 0.98 0.02 1.00 0.00 0.00 0.01
	rs0002 G A 0.00 1.00 0.00 0.00 0.99 0.01

	- We have data for two SNPs (rs0001, rs0002) on three individuals (1, 2, 3).
	- Each genotype is represented by two numbers.
	- The two numbers for the first SNP represent the probabilities
	of A/A (P = 0.98) and A/C (P = 0.02) genotypes in individual 1.

	Example
	-------

	zcat dos1.txt.gz \| column -t
	SNP A1 A2 F1 I1 F2 I2
	snp1 A T 0 0 0 0.001
	snp2 G C 0.98 0.02 0.5 0.5
	snp3 C T 0 0.089 0 0.167
	snp5 T G 0.1 0.9 0.2 0.8

	bzcat dos2.txt.bz2 \| column -t
	SNP A1 A2 F1 I1 F2 I2
	snp1 A T 0.1 0.9 0.01 0.99
	snp2 G C 0.89 0.11 0.15 0.85
	snp3 C T 1 0 1 0
	snp4 A C 0.9 0.1 0.8 0.2

	python join_dosage_files.py dos1.txt.gz dos2.txt.bz2 \| column -t
	SNP A1 A2 F1 I1 F2 I2 F1 I1 F2 I2
	snp1 A T 0 0 0 0.001 0.1 0.9 0.01 0.99
	snp2 G C 0.98 0.02 0.5 0.5 0.89 0.11 0.15 0.85
	snp3 C T 0 0.089 0 0.167 1 0 1 0
	snp5 T G 0.1 0.9 0.2 0.8 NA NA NA NA
	snp4 A C NA NA NA NA 0.9 0.1 0.8 0.2

	LICENSE

	This is free and unencumbered software released into the public domain.
	Anyone is free to copy, modify, publish, use, compile, sell, or
	distribute this software, either in source code form or as a compiled
	binary, for any purpose, commercial or non-commercial, and by any
	means.

	In jurisdictions that recognize copyright laws, the author or authors
	of this software dedicate any and all copyright interest in the
	software to the public domain. We make this dedication for the benefit
	of the public at large and to the detriment of our heirs and
	successors. We intend this dedication to be an overt act of
	relinquishment in perpetuity of all present and future rights to this
	software under copyright law.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
	OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
	ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	OTHER DEALINGS IN THE SOFTWARE.

	For more information, please refer to <http://unlicense.org/>
	"""

	import sys

	def main():
	# File names of two dosage files.
	in_f1 = sys.argv[1]
	in_f2 = sys.argv[2]

	# Generators that produce a record for each line in the file.
	in1 = dosage_records(open_file(in_f1))
	in2 = dosage_records(open_file(in_f2))
	d1 = {}
	d2 = {}
	done1 = False
	done2 = False

	# Number of columns in each file.
	num1 = None
	num2 = None

	# Print records that appear in both file1 and file2.
	while not done1 or not done2:
	# Try to read a record from the first file.
	try:
	n1, r1 = next(in1)
	if n1:
	d1[n1] = r1
	else:
	done1 = True
	if not num1:
	num1 = len(r1.split())
	except:
	done1 = True

	# Try to read a record from the second file.
	try:
	n2, r2 = next(in2)
	if n2:
	d2[n2] = r2
	else:
	done2 = True
	if not num2:
	num2 = len(r2.split())
	except:
	done2 = True

	# If the record is present in both files, print it.
	if not done1 and n1 in d2:
	sys.stdout.write(n1 + d1[n1] + d2[n1] + '\n')
	del d1[n1]
	del d2[n1]
	if not done2 and n2 in d1:
	sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
	del d1[n2]
	del d2[n2]

	# Print records that appear only in one file.
	for n1 in d1.keys():
	sys.stdout.write(n1 + d1[n1] + ' NA' * num2 + '\n')
	for n2 in d2.keys():
	sys.stdout.write(n2 + ' NA' * num1 + d2[n2] + '\n')

	def dosage_records(file_handle):
	while True:
	try:
	# Strip the newline character at the end of the line.
	line = file_handle.readline().rstrip()
	except:
	break
	# Break a dosage line into two parts at the 3rd space:
	#
	# [ name ][ record ]
	# chr6_24020741_D D I3 0 0.003 0 0.025 0 0
	# ^
	# third space
	space = find_nth(line, ' ', 3)
	name = line[:space]
	record = line[space:]
	yield (name, record)
	# Close the file after we're done reading from it.
	file_handle.close()

	def find_nth(haystack, needle, n):
	start = haystack.find(needle)
	while start >= 0 and n > 1:
	start = haystack.find(needle, start + len(needle))
	n -= 1
	return start

	def open_file(filename):
	# Magic strings for gzip and bzip2.
	magic_dict = {
	"\x1f\x8b\x08": "gz",
	"\x42\x5a\x68": "bz2"
	}
	max_len = max(len(x) for x in magic_dict)

	# Read the start of the file.
	with open(filename) as f:
	file_start = f.read(max_len)

	# Look for a match.
	for magic, filetype in magic_dict.items():
	if file_start.startswith(magic):
	if filetype == "gz":
	import gzip
	return gzip.open(filename)
	if filetype == "bz2":
	import bz2
	return bz2.BZ2File(filename)

	return open(filename)

	if __name__ == '__main__':
	# Check for valid arguments.
	usage = """
	Usage: python {} file1.dosage[.gz\|.bz2] file2.dosage[.gz\|.bz2] > out.txt
	"""
	if not len(sys.argv) == 3:
	sys.stderr.write(usage.format(__file__))
	sys.exit(1)

	# Run!
	main()