Skip to content

Instantly share code, notes, and snippets.

@dpwrussell
Last active August 29, 2015 13:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dpwrussell/10582216 to your computer and use it in GitHub Desktop.
Save dpwrussell/10582216 to your computer and use it in GitHub Desktop.
Calculate intersection of variants
# Copyright 2014 Douglas Russell
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import sys
def _open_csv(path, mode):
    """Open *path* the way the csv module requires on the running Python.

    Python 2's csv module expects binary file objects, while Python 3's
    expects text files opened with ``newline=''`` so that the module itself
    controls line endings.

    Args:
        path: File system path to open.
        mode: ``'r'`` or ``'w'`` (without a binary/text suffix).

    Returns:
        An open file object usable with ``csv.reader`` / ``csv.writer``.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + 'b')
    return open(path, mode, newline='')


def main(argv):
    """Write the union-file rows whose variant also appears in the cohort file.

    Both input files are read as tab-delimited text and a variant is
    identified by the pair of its first two columns. Every row of the union
    file is indexed in memory by that pair (a later row with the same pair
    replaces an earlier one). The cohort file is then streamed: for each of
    its variants the matching full union row is written to the intersection
    file, in cohort-file order; variants without a match are reported on
    standard output. Blank lines in either input are skipped.

    Args:
        argv: Sequence of exactly three paths: the union file, the cohort
            variants file and the intersection (output) file.

    Raises:
        SystemExit: With status 1, after printing a usage message, when
            ``argv`` does not contain exactly three items.
        IndexError: If a non-blank input row has fewer than two columns.
    """
    if len(argv) != 3:
        print('Usage: python cohort_intersection.py <unionfile> <cohortvariantsfile> <intersectionfile>')
        sys.exit(1)

    union_file, cohort_variants_file, intersection_file = argv

    # Index every union row by its (first column, second column) key.
    all_variants = {}
    with _open_csv(union_file, 'r') as union_handle:
        line = 0
        for row in csv.reader(union_handle, delimiter='\t'):
            if not row:
                # csv yields [] for a blank line; there is no key to index.
                continue
            all_variants[(row[0], row[1])] = row
            if not line % 1000000:
                # Floor division keeps the integer progress figure on
                # Python 3, where "/" would produce a float.
                print('%s million completed' % (line // 1000000))
            line += 1

    # Stream the cohort file and emit the full union row for each hit.
    with _open_csv(intersection_file, 'w') as out_handle:
        writer = csv.writer(out_handle, delimiter='\t')
        with _open_csv(cohort_variants_file, 'r') as cohort_handle:
            for row in csv.reader(cohort_handle, delimiter='\t'):
                if not row:
                    continue
                found_variant = all_variants.get((row[0], row[1]))
                if found_variant is not None:
                    writer.writerow(found_variant)
                else:
                    print('Variant not found: %s %s' % (row[0], row[1]))
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    # Drop the program name; main() expects just the three file paths.
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment