Skip to content

Instantly share code, notes, and snippets.

@108krohan
Created July 18, 2019 15:02
Show Gist options
  • Save 108krohan/89b3f1114aeeeab533f0b0d4035be788 to your computer and use it in GitHub Desktop.
Save 108krohan/89b3f1114aeeeab533f0b0d4035be788 to your computer and use it in GitHub Desktop.
Execution format: `python check_assumptions.py -i refsnp-chr-XYZ.json.bz2`
import argparse
import json
import bz2
import time
def is_pltp_uniq(info):
    '''
    Check that exactly one placement is flagged as PTLP.

    ``info`` is an iterable of placement dicts (the caller passes the
    ``placements_with_allele`` list of a RefSNP record).  Each dict must
    carry an ``'is_ptlp'`` key; a missing key raises ``KeyError``.

    Returns True when exactly one entry has a truthy ``'is_ptlp'``,
    False otherwise (including when ``info`` is empty).
    '''
    return sum(1 for alleleinfo in info if alleleinfo['is_ptlp']) == 1
def is_start_and_pos_const(info):
    '''
    Check that every allele of the qualifying placements shares one
    SPDI ``seq_id`` and one SPDI ``position``.

    ``info`` is an iterable of placement dicts (the caller passes the
    ``placements_with_allele`` list of a RefSNP record).  Only placements
    whose ``'is_ptlp'`` is truthy and whose
    ``placement_annot.seq_id_traits_by_assembly`` is non-empty are
    inspected; all other placements are ignored.

    Returns True when no inspected allele differs from the first one
    seen (also when nothing qualifies), False as soon as a differing
    ``seq_id`` or ``position`` is found.  Missing keys on an inspected
    placement or allele raise ``KeyError``.
    '''
    # (seq_id, position) of the first allele seen; None until one is found.
    reference = None
    for alleleinfo in info:
        # has top level placement (ptlp) and assembly info
        if not alleleinfo['is_ptlp']:
            continue
        if not alleleinfo['placement_annot']['seq_id_traits_by_assembly']:
            continue
        for a in alleleinfo['alleles']:
            spdi = a['allele']['spdi']
            current = (spdi['seq_id'], spdi['position'])
            if reference is None:
                reference = current
            elif current != reference:
                # One mismatch is enough to decide; no need to keep counting.
                return False
    return True
def _percent(count, total):
    '''Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0.'''
    if not total:
        # An empty input would otherwise raise ZeroDivisionError in the report.
        return 0.0
    return (count * 100) / total


def main():
    '''
    Scan a bz2-compressed file with one JSON RefSNP record per line and
    report how many records violate the two assumptions checked by
    ``is_pltp_uniq`` and ``is_start_and_pos_const``.

    The input file name is taken from the required ``-i`` command-line
    option.  Results and the elapsed time are printed to stdout.
    '''
    parser = argparse.ArgumentParser(
        description='Example of parsing JSON RefSNP Data')
    parser.add_argument(
        '-i', dest='input_fn', required=True,
        help='The name of the input file to parse')
    args = parser.parse_args()

    start_time = time.time()
    cnt = 0
    pltp_multiple = 0
    start_and_pos_diff = 0

    with bz2.BZ2File(args.input_fn, 'rb') as f_in:
        for line in f_in:
            rs_obj = json.loads(line.decode('utf-8'))
            # Records without primary snapshot data have nothing to check.
            if 'primary_snapshot_data' in rs_obj:
                placements_with_allele = (
                    rs_obj['primary_snapshot_data']['placements_with_allele'])
                if not is_pltp_uniq(placements_with_allele):
                    pltp_multiple += 1
                if not is_start_and_pos_const(placements_with_allele):
                    start_and_pos_diff += 1
            # Every line read is counted, matching the "Lines scanned" label.
            cnt += 1

    print("Lines scanned = " + str(cnt))
    print("PLTP were found to be non-unique in "
          + str(pltp_multiple)
          + " instances... \n\tPercentage samples affected = "
          + str(_percent(pltp_multiple, cnt)))
    print("CONTIG and POS differed in "
          + str(start_and_pos_diff)
          + " instances... \n\tPercentage samples affected = "
          + str(_percent(start_and_pos_diff, cnt)))
    print("Execution time: "
          + str(round((time.time() - start_time) / 60, 2)) + " mins.")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment