Skip to content

Instantly share code, notes, and snippets.

@slowkow
Last active July 27, 2016 13:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save slowkow/8f36828cd2f10071288e to your computer and use it in GitHub Desktop.
Save slowkow/8f36828cd2f10071288e to your computer and use it in GitHub Desktop.
Join multiple PLINK dosage files into one file.
#!/usr/bin/env python
"""
Kamil Slowikowski
fix_alleles.py
==============
Change the alleles in a dosage file to match the alleles specified in
a separate reference file. If a variant is not present in the reference file,
then it will not be changed.
Usage
-----
python fix_alleles.py alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > output.dosage
Example
-------
Suppose that we want our variants to have these alleles:
cat alleles.txt | column -t
SNP A1 A2
snp7 A T
snp8 G C
snp9 A T
However, our dosage file looks like this:
cat dosage.txt | column -t
SNP A1 A2 FIID1 IID1
snp7 A T 0.97 0.03
snp9 C G 0.96 0.04
Notice that snp9 has genotype A/T in one file and C/G in the other file.
We can use this script to output a new dosage file with fixed alleles:
python fix_alleles.py alleles.txt dosage.txt | column -t
SNP A1 A2 FIID1 IID1
snp7 A T 0.97 0.03
snp9 A T 0.96 0.04
This new dosage file has genotype A/T for snp9.
LICENSE
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
"""
import sys
def main():
# File names of two dosage files.
in_f1 = sys.argv[1]
in_f2 = sys.argv[2]
# Generators that produce a record for each line in the file.
in1 = allele_records(open_file(in_f1))
in2 = dosage_records(open_file(in_f2))
d1 = {}
d2 = {}
done1 = False
done2 = False
while not done1 or not done2:
# Try to read a record from the alleles file.
try:
n1, a11, a21 = next(in1)
if n1:
d1[n1] = (a11, a21)
else:
done1 = True
except:
done1 = True
# Try to read a record from the dosage file.
try:
n2, a12, a22, r2 = next(in2)
if n2:
d2[n2] = (a12, a22, r2)
else:
done2 = True
except:
done2 = True
# If the record is present in both files, print it.
if not done1 and n1 in d2:
sys.stdout.write(
n1 + ' ' + d1[n1][0] + ' ' + d1[n1][1] + d2[n1][2] + '\n')
del d1[n1]
del d2[n1]
if not done2 and n2 in d1:
# sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
sys.stdout.write(
n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n')
del d1[n2]
del d2[n2]
# Check the remaining variants left over after we finish reading files.
for n2 in d2.keys():
if n2 in d1:
sys.stdout.write(
n2 + ' ' + d1[n2][0] + ' ' + d1[n2][1] + d2[n2][2] + '\n')
else:
sys.stdout.write(
n2 + ' ' + d2[n2][0] + ' ' + d2[n2][1] + d2[n2][2] + '\n')
def allele_records(file_handle):
while True:
try:
# Strip the newline character at the end of the line.
line = file_handle.readline().rstrip()
except:
break
# Break a line into three parts:
#
# [a1] [a2]
# \ /
# [ name ]
# chr6_24020741_D D I3
name, a1, a2 = line.split()
yield (name, a1, a2)
# Close the file after we're done reading from it.
file_handle.close()
def dosage_records(file_handle):
while True:
try:
# Strip the newline character at the end of the line.
line = file_handle.readline().rstrip()
except:
break
# Break a dosage line into four parts:
#
# [a1] [a2]
# \ /
# [ name ] [ record ]
# chr6_24020741_D D I3 0 0.003 0 0.025 0 0
space = find_nth(line, ' ', 3)
name, a1, a2 = line[:space].split()
record = line[space:]
yield (name, a1, a2, record)
# Close the file after we're done reading from it.
file_handle.close()
def find_nth(haystack, needle, n):
start = haystack.find(needle)
while start >= 0 and n > 1:
start = haystack.find(needle, start + len(needle))
n -= 1
return start
def open_file(filename):
# Magic strings for gzip and bzip2.
magic_dict = {
"\x1f\x8b\x08": "gz",
"\x42\x5a\x68": "bz2"
}
max_len = max(len(x) for x in magic_dict)
# Read the start of the file.
with open(filename) as f:
file_start = f.read(max_len)
# Look for a match.
for magic, filetype in magic_dict.items():
if file_start.startswith(magic):
if filetype == "gz":
import gzip
return gzip.open(filename)
if filetype == "bz2":
import bz2
return bz2.BZ2File(filename)
return open(filename)
if __name__ == '__main__':
# Check for valid arguments.
usage = """
Usage: python {} alleles.txt[.gz|.bz2] file.dosage[.gz|.bz2] > out.dosage
"""
if not len(sys.argv) == 3:
sys.stderr.write(usage.format(__file__))
sys.exit(1)
# Run!
main()
#!/usr/bin/env python
"""
Kamil Slowikowski
https://slowkow.com/notes/join-plink-dosage-files
join_dosage_files.py
====================
Join multiple dosage files into one file. The resulting joined file will
contain all of the dosage data from two dosage files.
When dosage file A contains variants that are absent from dosage file B, the
joined file will contain NA values 'NA'. In other words, this script does an
"outer join" using the union of variants from both files.
Usage
-----
python join_dosage_files.py file.dosage[.gz|.bz2] file.dosage[.gz|.bz2] > output.txt
Dosage file format
------------------
A dosage file is a space-delimited table with a header.
Here's an example (with increased spacing for legibility):
SNP A1 A2 F1 I1 F2 I2 F3 I3
rs0001 A C 0.98 0.02 1.00 0.00 0.00 0.01
rs0002 G A 0.00 1.00 0.00 0.00 0.99 0.01
- We have data for two SNPs (rs0001, rs0002) on three individuals (1, 2, 3).
- Each genotype is represented by two numbers.
- The two numbers for the first SNP represent the probabilities
of A/A (P = 0.98) and A/C (P = 0.02) genotypes in individual 1.
Example
-------
zcat dos1.txt.gz | column -t
SNP A1 A2 F1 I1 F2 I2
snp1 A T 0 0 0 0.001
snp2 G C 0.98 0.02 0.5 0.5
snp3 C T 0 0.089 0 0.167
snp5 T G 0.1 0.9 0.2 0.8
bzcat dos2.txt.bz2 | column -t
SNP A1 A2 F1 I1 F2 I2
snp1 A T 0.1 0.9 0.01 0.99
snp2 G C 0.89 0.11 0.15 0.85
snp3 C T 1 0 1 0
snp4 A C 0.9 0.1 0.8 0.2
python join_dosage_files.py dos1.txt.gz dos2.txt.bz2 | column -t
SNP A1 A2 F1 I1 F2 I2 F1 I1 F2 I2
snp1 A T 0 0 0 0.001 0.1 0.9 0.01 0.99
snp2 G C 0.98 0.02 0.5 0.5 0.89 0.11 0.15 0.85
snp3 C T 0 0.089 0 0.167 1 0 1 0
snp5 T G 0.1 0.9 0.2 0.8 NA NA NA NA
snp4 A C NA NA NA NA 0.9 0.1 0.8 0.2
LICENSE
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
"""
import sys
def main():
# File names of two dosage files.
in_f1 = sys.argv[1]
in_f2 = sys.argv[2]
# Generators that produce a record for each line in the file.
in1 = dosage_records(open_file(in_f1))
in2 = dosage_records(open_file(in_f2))
d1 = {}
d2 = {}
done1 = False
done2 = False
# Number of columns in each file.
num1 = None
num2 = None
# Print records that appear in both file1 and file2.
while not done1 or not done2:
# Try to read a record from the first file.
try:
n1, r1 = next(in1)
if n1:
d1[n1] = r1
else:
done1 = True
if not num1:
num1 = len(r1.split())
except:
done1 = True
# Try to read a record from the second file.
try:
n2, r2 = next(in2)
if n2:
d2[n2] = r2
else:
done2 = True
if not num2:
num2 = len(r2.split())
except:
done2 = True
# If the record is present in both files, print it.
if not done1 and n1 in d2:
sys.stdout.write(n1 + d1[n1] + d2[n1] + '\n')
del d1[n1]
del d2[n1]
if not done2 and n2 in d1:
sys.stdout.write(n2 + d1[n2] + d2[n2] + '\n')
del d1[n2]
del d2[n2]
# Print records that appear only in one file.
for n1 in d1.keys():
sys.stdout.write(n1 + d1[n1] + ' NA' * num2 + '\n')
for n2 in d2.keys():
sys.stdout.write(n2 + ' NA' * num1 + d2[n2] + '\n')
def dosage_records(file_handle):
while True:
try:
# Strip the newline character at the end of the line.
line = file_handle.readline().rstrip()
except:
break
# Break a dosage line into two parts at the 3rd space:
#
# [ name ][ record ]
# chr6_24020741_D D I3 0 0.003 0 0.025 0 0
# ^
# third space
space = find_nth(line, ' ', 3)
name = line[:space]
record = line[space:]
yield (name, record)
# Close the file after we're done reading from it.
file_handle.close()
def find_nth(haystack, needle, n):
start = haystack.find(needle)
while start >= 0 and n > 1:
start = haystack.find(needle, start + len(needle))
n -= 1
return start
def open_file(filename):
# Magic strings for gzip and bzip2.
magic_dict = {
"\x1f\x8b\x08": "gz",
"\x42\x5a\x68": "bz2"
}
max_len = max(len(x) for x in magic_dict)
# Read the start of the file.
with open(filename) as f:
file_start = f.read(max_len)
# Look for a match.
for magic, filetype in magic_dict.items():
if file_start.startswith(magic):
if filetype == "gz":
import gzip
return gzip.open(filename)
if filetype == "bz2":
import bz2
return bz2.BZ2File(filename)
return open(filename)
if __name__ == '__main__':
# Check for valid arguments.
usage = """
Usage: python {} file1.dosage[.gz|.bz2] file2.dosage[.gz|.bz2] > out.txt
"""
if not len(sys.argv) == 3:
sys.stderr.write(usage.format(__file__))
sys.exit(1)
# Run!
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment