Skip to content

Instantly share code, notes, and snippets.

@bmamlin
Created April 1, 2021 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bmamlin/121bf199b7875f5d9aa1dfc83bace640 to your computer and use it in GitHub Desktop.
Save bmamlin/121bf199b7875f5d9aa1dfc83bace640 to your computer and use it in GitHub Desktop.
Create diff between CIEL import file and an OCL export file
import json
import ndjson
import logging
# Input and output locations. To run against a different environment, swap
# which trio of assignments below is commented out.
LOCAL_FILE = '2021-03-12/staging/ciel-v20210312.ndjson'
OCL_FILE = '2021-03-12/staging/ocl-staging-ciel-v20210312.json'
DIFF_FILE = '2021-03-12/staging/diff.json'
# LOCAL_FILE = '2021-03-12/prod/ciel-v20210312.ndjson'
# OCL_FILE = '2021-03-12/prod/ocl-prod-ciel-v20210312.json'
# DIFF_FILE = '2021-03-12/prod/diff.json'

# Every discrepancy found by the script is reported through this logger.
logger = logging.getLogger('diff')
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)

# JSON text is UTF-8 by specification. Decode explicitly instead of relying
# on the platform's default encoding, which is not UTF-8 everywhere and would
# garble or reject non-ASCII content in the files.
with open(LOCAL_FILE, 'r', encoding='utf-8') as f:
    # Local import file: newline-delimited JSON, one record per line. The rest
    # of the script selects records by their 'type' field.
    local_data = ndjson.load(f)
with open(OCL_FILE, 'r', encoding='utf-8') as f:
    # OCL export: a single JSON document. The rest of the script reads its
    # 'concepts' and 'mappings' members.
    ocl_data = json.load(f)
def type_equals(t):
    """Return a predicate that is true for records whose 'type' equals *t*.

    The returned callable takes one record (a mapping with a 'type' key) and
    is suitable as the first argument to filter().
    """
    def has_type(record):
        return record['type'] == t

    return has_type
def get_mapping_key(mapping):
    """Build the string used to match a mapping across the two data sets.

    The key has the form "<from_concept_url>--<map_type>--<target>". The
    target is the mapping's 'to_concept_url' when that field is present and
    non-empty; otherwise it is 'to_source_url' followed by 'to_concept_code'
    and a trailing slash.
    """
    origin = mapping['from_concept_url']
    relation = mapping['map_type']
    target = mapping.get('to_concept_url')
    if not target:
        # No usable target URL: compose one from the target source and code.
        target = '%s%s/' % (mapping['to_source_url'], mapping['to_concept_code'])
    return '%s--%s--%s' % (origin, relation, target)
def compare_concepts(a, b):
    """Log a warning for each compared field on which two concepts differ.

    The fields checked are 'external_id', 'concept_class', 'datatype' and
    'retired'. Both arguments must contain all of them; *a* must also have
    an 'id', which identifies the concept in the warning. Nothing is
    returned; differences are only reported through the module logger.
    """
    for field in ('external_id', 'concept_class', 'datatype', 'retired'):
        left, right = a[field], b[field]
        if left != right:
            message = 'Concept "%s" mismatch on %s: "%s" <> "%s"' % (
                a['id'], field, left, right)
            logger.warning(message)
def compare_mappings(a, b):
    """Placeholder for a field-level comparison of two matched mappings.

    No comparison is implemented: the arguments are ignored and None is
    returned.
    """
    return None
def _write_json_array(out, header, records, footer):
    """Write one array-valued member of the diff document to *out*.

    *header* opens the member (its name and the opening bracket), every entry
    of the list *records* is serialized with json.dumps on its own line, and
    *footer* closes the array.
    """
    out.write(header)
    for position, record in enumerate(records):
        if position:
            out.write(',')
        out.write('\n %s' % json.dumps(record))
    if records:
        out.write('\n ')
    out.write(footer)


# --- Concepts ---------------------------------------------------------------

# Index the OCL concepts by id so each local concept can be looked up directly.
ocl_concepts_cache = {}
ocl_concepts = ocl_data['concepts']
for c in ocl_concepts:
    if c['id'] in ocl_concepts_cache:
        # This shouldn't happen
        logger.error('OCL has more than one concept with id "%s"' % c['id'])
    else:
        ocl_concepts_cache[c['id']] = c

# Walk the local concepts. A match is compared and then removed from the
# cache, so whatever is left in the cache afterwards exists only in OCL.
missing_concepts = []
removed_concepts = {}  # id -> number of local copies seen for a matched id
for c in filter(type_equals('Concept'), local_data):
    concept_id = c['id']
    if concept_id in ocl_concepts_cache:
        compare_concepts(c, ocl_concepts_cache[concept_id])
        del ocl_concepts_cache[concept_id]
        removed_concepts[concept_id] = 1
    elif concept_id in removed_concepts:
        # Already matched and removed earlier: the local file repeats this id.
        removed_concepts[concept_id] += 1
        logger.error('Multiple copies of local concept "%s" (n=%i)\n%s' %
                     (concept_id, removed_concepts[concept_id], json.dumps(c)))
    else:
        logger.warning('OCL does not have concept with id "%s"' % concept_id)
        missing_concepts.append(c)

# Any concepts remaining in the cache represent extra OCL entries. Iterate the
# cached concept dicts, not the dict's keys: the keys are plain id strings and
# cannot be subscripted with 'id'.
extra_concepts = list(ocl_concepts_cache.values())
for c in extra_concepts:
    logger.warning('OCL has extra concept with id "%s"' % c['id'])

# --- Mappings ---------------------------------------------------------------

# Same approach as for concepts, keyed by the composite mapping key.
ocl_mappings_cache = {}
ocl_mappings = ocl_data['mappings']
for m in ocl_mappings:
    key = get_mapping_key(m)
    if key in ocl_mappings_cache:
        logger.error('OCL has more than one mapping for "%s"' % key)
    else:
        ocl_mappings_cache[key] = m

missing_mappings = []
removed_mappings = {}  # key -> number of local copies seen for a matched key
for m in filter(type_equals('Mapping'), local_data):
    key = get_mapping_key(m)
    if key in ocl_mappings_cache:
        compare_mappings(m, ocl_mappings_cache[key])
        del ocl_mappings_cache[key]
        removed_mappings[key] = 1
    elif key in removed_mappings:
        removed_mappings[key] += 1
        logger.error('Multiple copies of local mapping "%s" (n=%i)\n%s' %
                     (key, removed_mappings[key], json.dumps(m)))
    else:
        logger.warning('OCL does not have mapping "%s"' % key)
        missing_mappings.append(m)

# Any mappings remaining in the cache represent extra OCL entries; those whose
# 'retired' field equals True are left out of the report.
extra_mappings = [m for m in ocl_mappings_cache.values()
                  if m['retired'] != True]

# --- Report -----------------------------------------------------------------

# Write the report only after every section has been computed, and through a
# context manager, so the file is always closed and a failure above cannot
# leave a truncated document behind.
with open(DIFF_FILE, 'w') as out:
    out.write('{\n')
    _write_json_array(out, ' "missing_concepts":[', missing_concepts, '],\n')
    _write_json_array(out, ' "extra_concepts": [', extra_concepts, '],\n')
    _write_json_array(out, ' "missing_mappings": [', missing_mappings, '],\n')
    _write_json_array(out, ' "extra_mappings": [', extra_mappings, ']\n')
    out.write('}')
print('done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment