Last active
July 27, 2016 13:27
-
-
Save slowkow/8f36828cd2f10071288e to your computer and use it in GitHub Desktop.
Join multiple PLINK dosage files into one file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Kamil Slowikowski | |
fix_alleles.py | |
============== | |
Change the alleles in a dosage file to match the alleles specified in | |
a separate reference file. If a variant is not present in the reference file, | |
then it will not be changed. | |
Usage | |
----- | |
python fix_alleles.py alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > output.dosage | |
Example | |
------- | |
Suppose that we want our variants to have these alleles: | |
cat alleles.txt | column -t | |
SNP A1 A2 | |
snp7 A T | |
snp8 G C | |
snp9 A T | |
However, our dosage file looks like this: | |
cat dosage.txt | column -t | |
SNP A1 A2 FIID1 IID1 | |
snp7 A T 0.97 0.03 | |
snp9 C G 0.96 0.04 | |
Notice that snp9 has genotype A/T in one file and C/G in the other file. | |
We can use this script to output a new dosage file with fixed alleles: | |
python fix_alleles.py alleles.txt dosage.txt | column -t | |
SNP A1 A2 FIID1 IID1 | |
snp7 A T 0.97 0.03 | |
snp9 A T 0.96 0.04 | |
This new dosage file has genotype A/T for snp9. | |
LICENSE | |
This is free and unencumbered software released into the public domain. | |
Anyone is free to copy, modify, publish, use, compile, sell, or | |
distribute this software, either in source code form or as a compiled | |
binary, for any purpose, commercial or non-commercial, and by any | |
means. | |
In jurisdictions that recognize copyright laws, the author or authors | |
of this software dedicate any and all copyright interest in the | |
software to the public domain. We make this dedication for the benefit | |
of the public at large and to the detriment of our heirs and | |
successors. We intend this dedication to be an overt act of | |
relinquishment in perpetuity of all present and future rights to this | |
software under copyright law. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
OTHER DEALINGS IN THE SOFTWARE. | |
For more information, please refer to <http://unlicense.org/> | |
""" | |
import sys | |
def main(): | |
# File names of two dosage files. | |
in_f1 = sys.argv[1] | |
in_f2 = sys.argv[2] | |
# Generators that produce a record for each line in the file. | |
in1 = allele_records(open_file(in_f1)) | |
in2 = dosage_records(open_file(in_f2)) | |
d1 = {} | |
d2 = {} | |
done1 = False | |
done2 = False | |
while not done1 or not done2: | |
# Try to read a record from the alleles file. | |
try: | |
n1, a11, a21 = next(in1) | |
if n1: | |
d1[n1] = (a11, a21) | |
else: | |
done1 = True | |
except: | |
done1 = True | |
# Try to read a record from the dosage file. | |
try: | |
n2, a12, a22, r2 = next(in2) | |
if n2: | |
d2[n2] = (a12, a22, r2) | |
else: | |
done2 = True | |
except: | |
done2 = True | |
# If the record is present in both files, print it. | |
if not done1 and n1 in d2: | |
sys.stdout.write( | |
n1 + ' ' + d1[n1][0] + ' ' + d1[n1][1] + d2[n1][2] + '\n') | |
del d1[n1] | |
del d2[n1] | |
if not done2 and n2 in d1: | |
# sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n') | |
sys.stdout.write( | |
n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n') | |
del d1[n2] | |
del d2[n2] | |
# Check the remaining variants left over after we finish reading files. | |
for n2 in d2.keys(): | |
if n2 in d1: | |
sys.stdout.write( | |
n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n') | |
else: | |
sys.stdout.write( | |
n2 + ' ' + d2[n2][0] + ' ' + d2[n2][1] + d2[n2][2] + '\n') | |
def allele_records(file_handle): | |
while True: | |
try: | |
# Strip the newline character at the end of the line. | |
line = file_handle.readline().rstrip() | |
except: | |
break | |
# Break a line into three parts: | |
# | |
# [a1] [a2] | |
# \ / | |
# [ name ] | |
# chr6_24020741_D D I3 | |
name, a1, a2 = line.split() | |
yield (name, a1, a2) | |
# Close the file after we're done reading from it. | |
file_handle.close() | |
def dosage_records(file_handle): | |
while True: | |
try: | |
# Strip the newline character at the end of the line. | |
line = file_handle.readline().rstrip() | |
except: | |
break | |
# Break a dosage line into four parts: | |
# | |
# [a1] [a2] | |
# \ / | |
# [ name ] [ record ] | |
# chr6_24020741_D D I3 0 0.003 0 0.025 0 0 | |
space = find_nth(line, ' ', 3) | |
name, a1, a2 = line[:space].split() | |
record = line[space:] | |
yield (name, a1, a2, record) | |
# Close the file after we're done reading from it. | |
file_handle.close() | |
def find_nth(haystack, needle, n): | |
start = haystack.find(needle) | |
while start >= 0 and n > 1: | |
start = haystack.find(needle, start + len(needle)) | |
n -= 1 | |
return start | |
def open_file(filename): | |
# Magic strings for gzip and bzip2. | |
magic_dict = { | |
"\x1f\x8b\x08": "gz", | |
"\x42\x5a\x68": "bz2" | |
} | |
max_len = max(len(x) for x in magic_dict) | |
# Read the start of the file. | |
with open(filename) as f: | |
file_start = f.read(max_len) | |
# Look for a match. | |
for magic, filetype in magic_dict.items(): | |
if file_start.startswith(magic): | |
if filetype == "gz": | |
import gzip | |
return gzip.open(filename) | |
if filetype == "bz2": | |
import bz2 | |
return bz2.BZ2File(filename) | |
return open(filename) | |
if __name__ == '__main__': | |
# Check for valid arguments. | |
usage = """ | |
Usage: python {} alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > out.dosage | |
""" | |
if not len(sys.argv) == 3: | |
sys.stderr.write(usage.format(__file__)) | |
sys.exit(1) | |
# Run! | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Kamil Slowikowski | |
https://slowkow.com/notes/join-plink-dosage-files | |
join_dosage_files.py | |
==================== | |
Join multiple dosage files into one file. The resulting joined file will | |
contain all of the dosage data from two dosage files. | |
When dosage file A contains variants that are absent from dosage file B, the | |
joined file will contain NA values 'NA'. In other words, this script does an | |
"outer join" using the union of variants from both files. | |
Usage | |
----- | |
python join_dosage_files.py file.dosage[.gz|.bz2] file.dosage[.gz|.bz2] > output.txt | |
Dosage file format | |
------------------ | |
A dosage file is a space-delimited table with a header. | |
Here's an example (with increased spacing for legibility): | |
SNP A1 A2 F1 I1 F2 I2 F3 I3 | |
rs0001 A C 0.98 0.02 1.00 0.00 0.00 0.01 | |
rs0002 G A 0.00 1.00 0.00 0.00 0.99 0.01 | |
- We have data for two SNPs (rs0001, rs0002) on three individuals (1, 2, 3). | |
- Each genotype is represented by two numbers. | |
- The two numbers for the first SNP represent the probabilities | |
of A/A (P = 0.98) and A/C (P = 0.02) genotypes in individual 1. | |
Example | |
------- | |
zcat dos1.txt.gz | column -t | |
SNP A1 A2 F1 I1 F2 I2 | |
snp1 A T 0 0 0 0.001 | |
snp2 G C 0.98 0.02 0.5 0.5 | |
snp3 C T 0 0.089 0 0.167 | |
snp5 T G 0.1 0.9 0.2 0.8 | |
bzcat dos2.txt.bz2 | column -t | |
SNP A1 A2 F1 I1 F2 I2 | |
snp1 A T 0.1 0.9 0.01 0.99 | |
snp2 G C 0.89 0.11 0.15 0.85 | |
snp3 C T 1 0 1 0 | |
snp4 A C 0.9 0.1 0.8 0.2 | |
python join_dosage_files.py dos1.txt.gz dos2.txt.bz2 | column -t | |
SNP A1 A2 F1 I1 F2 I2 F1 I1 F2 I2 | |
snp1 A T 0 0 0 0.001 0.1 0.9 0.01 0.99 | |
snp2 G C 0.98 0.02 0.5 0.5 0.89 0.11 0.15 0.85 | |
snp3 C T 0 0.089 0 0.167 1 0 1 0 | |
snp5 T G 0.1 0.9 0.2 0.8 NA NA NA NA | |
snp4 A C NA NA NA NA 0.9 0.1 0.8 0.2 | |
LICENSE | |
This is free and unencumbered software released into the public domain. | |
Anyone is free to copy, modify, publish, use, compile, sell, or | |
distribute this software, either in source code form or as a compiled | |
binary, for any purpose, commercial or non-commercial, and by any | |
means. | |
In jurisdictions that recognize copyright laws, the author or authors | |
of this software dedicate any and all copyright interest in the | |
software to the public domain. We make this dedication for the benefit | |
of the public at large and to the detriment of our heirs and | |
successors. We intend this dedication to be an overt act of | |
relinquishment in perpetuity of all present and future rights to this | |
software under copyright law. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | |
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | |
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
OTHER DEALINGS IN THE SOFTWARE. | |
For more information, please refer to <http://unlicense.org/> | |
""" | |
import sys | |
def main(): | |
# File names of two dosage files. | |
in_f1 = sys.argv[1] | |
in_f2 = sys.argv[2] | |
# Generators that produce a record for each line in the file. | |
in1 = dosage_records(open_file(in_f1)) | |
in2 = dosage_records(open_file(in_f2)) | |
d1 = {} | |
d2 = {} | |
done1 = False | |
done2 = False | |
# Number of columns in each file. | |
num1 = None | |
num2 = None | |
# Print records that appear in both file1 and file2. | |
while not done1 or not done2: | |
# Try to read a record from the first file. | |
try: | |
n1, r1 = next(in1) | |
if n1: | |
d1[n1] = r1 | |
else: | |
done1 = True | |
if not num1: | |
num1 = len(r1.split()) | |
except: | |
done1 = True | |
# Try to read a record from the second file. | |
try: | |
n2, r2 = next(in2) | |
if n2: | |
d2[n2] = r2 | |
else: | |
done2 = True | |
if not num2: | |
num2 = len(r2.split()) | |
except: | |
done2 = True | |
# If the record is present in both files, print it. | |
if not done1 and n1 in d2: | |
sys.stdout.write(n1 + d1[n1] + d2[n1] + '\n') | |
del d1[n1] | |
del d2[n1] | |
if not done2 and n2 in d1: | |
sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n') | |
del d1[n2] | |
del d2[n2] | |
# Print records that appear only in one file. | |
for n1 in d1.keys(): | |
sys.stdout.write(n1 + d1[n1] + ' NA' * num2 + '\n') | |
for n2 in d2.keys(): | |
sys.stdout.write(n2 + ' NA' * num1 + d2[n2] + '\n') | |
def dosage_records(file_handle): | |
while True: | |
try: | |
# Strip the newline character at the end of the line. | |
line = file_handle.readline().rstrip() | |
except: | |
break | |
# Break a dosage line into two parts at the 3rd space: | |
# | |
# [ name ][ record ] | |
# chr6_24020741_D D I3 0 0.003 0 0.025 0 0 | |
# ^ | |
# third space | |
space = find_nth(line, ' ', 3) | |
name = line[:space] | |
record = line[space:] | |
yield (name, record) | |
# Close the file after we're done reading from it. | |
file_handle.close() | |
def find_nth(haystack, needle, n): | |
start = haystack.find(needle) | |
while start >= 0 and n > 1: | |
start = haystack.find(needle, start + len(needle)) | |
n -= 1 | |
return start | |
def open_file(filename): | |
# Magic strings for gzip and bzip2. | |
magic_dict = { | |
"\x1f\x8b\x08": "gz", | |
"\x42\x5a\x68": "bz2" | |
} | |
max_len = max(len(x) for x in magic_dict) | |
# Read the start of the file. | |
with open(filename) as f: | |
file_start = f.read(max_len) | |
# Look for a match. | |
for magic, filetype in magic_dict.items(): | |
if file_start.startswith(magic): | |
if filetype == "gz": | |
import gzip | |
return gzip.open(filename) | |
if filetype == "bz2": | |
import bz2 | |
return bz2.BZ2File(filename) | |
return open(filename) | |
if __name__ == '__main__': | |
# Check for valid arguments. | |
usage = """ | |
Usage: python {} file1.dosage[.gz|.bz2] file2.dosage[.gz|.bz2] > out.txt | |
""" | |
if not len(sys.argv) == 3: | |
sys.stderr.write(usage.format(__file__)) | |
sys.exit(1) | |
# Run! | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment