Skip to content

Instantly share code, notes, and snippets.

@callahantiff
Last active August 15, 2022 07:53
Show Gist options
  • Save callahantiff/597ed54a684da2e7ac832c5bb57c162a to your computer and use it in GitHub Desktop.
Save callahantiff/597ed54a684da2e7ac832c5bb57c162a to your computer and use it in GitHub Desktop.
Composite Patient Similarity Algorithm for Semi-Supervised Rare Disease Phenotyping. Additional details can be found here: https://mor.nlm.nih.gov/pubs/alum/2017-callahan.pdf
#########################################################################################################
# 2017 NLM Summer Medical Informatics Internship
# Purpose: queries a Google BigQuery Database and returns a vector of values for a set of patients
# version 1.1.0
# date: 08.15.2017
#########################################################################################################
# import and load needed scripts
import dawg
import math
import matplotlib as mpl
import matplotlib.pyplot as pylab
import numpy as np
import os
import pandas as pd
import pickle
from progressbar import ProgressBar, FormatLabel, Percentage, Bar
import pydendroheatmap as pdh
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
def GBQData(query):
"""
Function takes a string containing a user query to be run against the de-identified OMOP database and returns the
query results as a list. Each row of the list corresponds to a row of the results. Function is designed to
re-establish client connection for each query run against the database.
:param query: a string containing a user query
:return: list of query results
"""
print("Started running query from " + str() + "against the OMOP de-id database: " + str(datetime.datetime.now()))
# Authenticate and set client
client = bigquery.Client('sandbox-tc')
# get project datasets
# datasets = [dataset.name for dataset in client.list_datasets()]
# run query
user_query = client.run_sync_query(query)
# Use standard SQL syntax.
user_query.use_legacy_sql = False
# Google API request
user_query.run()
# convert query results to list
results = []
while len(results) == 0:
results = [x for x in user_query.fetch_data()]
if len(results) == 0:
print('Query returned no results, trying again')
else:
break
# save results to working directory
# pickle.dump(results, open('omop_concepts.txt', 'wb'))
print("Finished processing query: " + str(datetime.datetime.now()))
print('Query returned: %d' % len(results) + " results")
return results
def AncestorQuery(vocab, code):
'''
Function takes a string containing the vocabulary type to filter on, and a code to search for ancestors of. The
function updates generic keywords using the input strings and returns a string containing the updated query.
:param vocab: a string containing the vocabulary type to filter on
:param code: a code to search for ancestors of
:return: a string containing the updated query
'''
input_file = 'Queries/concept_ancestor_query'
# CHECK - file has data
if os.stat(input_file).st_size == 0:
raise ValueError('input file: {} is empty'.format(input_file))
else:
query = open(input_file).read()
# update query text
if vocab == 'cond':
query_update = query.replace('VOCAB', "'" + 'SNOMED' + "'")
query_update = query_update.replace('CODE', code)
return query_update
elif vocab == 'meas':
query_update = query.replace('VOCAB', "'" + 'LOINC' + "'")
query_update = query_update.replace('CODE', code)
return query_update
else:
query_update = query.replace('VOCAB', "'" + 'VA Class' + "'")
query_update = query_update.replace('CODE', code)
return query_update
def AncestorFinder(query):
'''
Function takes a string storing a SQL query as input and runs this string against the Google BQ API. Withe the
returned results the function creates a nested dictionary where the outer key is the OMOP descendant code and the
inner keys are the vocabulary, source ancestor code and label and the source descendant code and label.
:param query: a string storing a SQL query
:return: a nested dictionary where the outer key is the OMOP descendant code and the inner keys are the vocabulary,
source ancestor code and label and the source descendant code and label.
'''
ancestor_dict = {}
# return data from Google API
# ancestors = Google_API.GBQData(query)
# initialize progress bar progress bar
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
pbar = ProgressBar(widgets=widgets, maxval=len(query))
# create ancestor dictionary - keyed by omop descendant concept
for row in pbar(query):
key = row[1]
if key in ancestor_dict.keys():
# ancestor_dict[key]['vocab']= str(row[2])
ancestor_dict[key]['descendant_code'].add(str(row[5]))
ancestor_dict[key]['descendant_label'].add(str(row[6]))
ancestor_dict[key]['ancestor_code'].add(str(row[3]))
ancestor_dict[key]['ancestor_label'].add(str(row[4]))
else:
ancestor_dict[key] = {}
# ancestor_dict[key]['vocab'] = str(row[2])
ancestor_dict[key]['descendant_code'] = set([str(row[5])])
ancestor_dict[key]['descendant_label'] = set([str(row[6])])
ancestor_dict[key]['ancestor_code'] = set([str(row[3])])
ancestor_dict[key]['ancestor_label'] = set([str(row[4])])
pbar.finish()
return ancestor_dict
def AncestorUpdate(ancestor_dict, patient_concepts):
'''
Function takes a dictionary of concept ancestor information and a list of lists representing patient concepts.
With this input the function checks that all patient concepts are included in the ancestor dictionary. Concepts
in the patient data that are not included in the ancestor dictionary (most likely those concepts without any
ancestors are added to the ancestor dictionary using themselves as their ancestor (this works because of the
definition of the current semantic similarity measure and would need to be re-evaluated should the measure
change). The function returns an updated ancestor dictionary.
:param ancestor_dict: a dictionary of concept ancestor information; keys are OMOP concepts and values are sets
of ancestor codes and labels as well as the concept code's mapped code
:param patient_concepts: a list of lists representing patient concepts resulting from running from GBQ
:return: updated ancestor dictionary
'''
for code in patient_concepts:
if code[1] not in ancestor_dict.keys() and code[1] != 0:
ancestor_dict.update({code[1]: {'ancestor_code':set([str(code[2])]),
'ancestor_label':set([str(code[3])]) ,
'descendant_code':set([str(code[2])]),
'descendant_label':set([str(code[3])])}})
return ancestor_dict
def SemanticSimilarity(data):
'''
Function takes a lists of lists as input and calculates the pairwise semantic similarity of the ancestors for all
descendant concept identifiers. The similarity scores are normalized to be between 0 and 1 and only if a concept
compared directly to itself is given a score of 1. The function returns a dictionary where the keys are pairs of
concepts and the values are the semantic similarity score for the pair of concepts.
:param data: a lists of lists storing the output of a google big query
:return: a dictionary where the keys are pairs of concepts and the values are the semantic similarity score for the
pair of concepts.
'''
semantic_sim = {}
# initialize progress bar progress bar
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
pbar = ProgressBar(widgets=widgets, maxval=len(data.keys()))
# pairwise compare all elements in the list to each other
for i in pbar(range(0, len(data.keys()))):
for j in range(i, len(data.keys())):
# get ancestor sets
id1 = data[data.keys()[i]]['descendant_code']
id1_anc = data[data.keys()[i]]['ancestor_code'].union(id1)
id2 = data[data.keys()[j]]['descendant_code']
id2_anc = data[data.keys()[j]]['ancestor_code'].union(id2)
# semantic similarity
if id1 != id2:
numerator = len((id1_anc.union(id2_anc) - id1_anc.intersection(id2_anc)))
denominator = len(id1_anc.union(id2_anc))
sim = abs(-math.log(numerator/float(denominator), 2))
# append to dictionary
semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = sim
semantic_sim[tuple([list(id2)[0], list(id1)[0]])] = sim
else:
# append to dictionary
semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = 0.0
# normalize scores by the max semantic similarity score
max_score = max(semantic_sim.values()) + 0.05
semantic_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 1.0 for comp in semantic_sim.items()}
# CHECK - make sure that the range is 1.0
if max(semantic_sim.values()) != 1.0:
raise ValueError('Problem with semantic similarity score normalization ')
else:
pbar.finish()
return semantic_sim
def PatientDict(input_files):
'''
Function takes a list of labeled input files and creates a dictionary where the keys are person_ids and the value is
a list (0-age; 1-gender; 2-race; 3-condition concepts set; 4-measurements set; 5-medications set). Concept codes of
0 or "No matching concept" represent concepts that were unable to be mapped by OMOP to a standardized terminology,
are excluded.
:param input_files: a list of labeled input files
:return: a dictionary where the keys are person_ids and the value is a list (0-age; 1-gender; 2-race; 3-condition
concepts set; 4-measurements set; 5-medications set)
'''
patient_dict = {}
cond_count = set()
meas_count = set()
med_count = set()
for query in input_files:
# return data from Google API
data = Google_API.GBQData(open(query[1]).read())
# initialize progress bar progress bar
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
pbar = ProgressBar(widgets=widgets, maxval=len(data))
# create patient dictionary - keyed by OMOP descendant concept
if query[0] == 'person':
for row in pbar(data):
patient_dict[str(row[0])] = [[] for _ in range(6)]
patient_dict[str(row[0])][0].append(str(row[4]))
patient_dict[str(row[0])][1].append(str(row[2]))
patient_dict[str(row[0])][2].append(str(row[3]))
if query[0] == 'cond':
for row in pbar(data):
if str(row[2]) == 'No matching concept':
cond_count.add(row[4])
if str(row[2]) != 'No matching concept':
patient_dict[str(row[0])][3].append(str(row[2]))
if query[0] == 'meas':
for row in pbar(data):
if str(row[2]) == 'No matching concept':
meas_count.add(row[4])
if str(row[2]) != 'No matching concept':
patient_dict[str(row[0])][4].append(str(row[2]))
if query[0] == 'meds':
for row in pbar(data):
if str(row[2]) == 'No matching concept':
med_count.add(row[4])
if str(row[2]) != 'No matching concept':
patient_dict[str(row[0])][5].append(str(row[2]))
# print counts by table of un-mappable codes
print 'From the condition data there were ' + str(len(cond_count)) + ' conditions with no SNOMED Code'
print 'From the measurement data there were ' + str(len(meas_count)) + ' measurements with no LOINC Code'
print 'From the medication data there were ' + str(len(med_count)) + ' drugs with no RxNorm Code'
pbar.finish()
return patient_dict
def SetSim(sim_dict, list1, list2):
'''
Function takes a dictionary of pre-computed semantic similarity scores and two lists of concepts. The function
computes all pairwise comparisons of the scores find the max score for each concept in the list (from comparing it
to all other concepts in the other set). The function then takes the sum of each set's concept max and divides it
by the total number of items in each set.
:param sim_dict: pre-computed semantic similarity scores (keys: pairs of concepts; values: scores)
:param list1: list of concepts
:param list2: list of concepts
:return: integer representing the similarity score
'''
#https://mor.nlm.nih.gov/pubs/pdf/2005-ismb_bioontologies-fja.pdf
list1_max = []
list2_max = []
for i in list1:
scores = []
for j in list2:
scores.append(sim_dict[(i, j)])
list1_max.append(max(scores))
for i in list2:
scores = []
for j in list1:
scores.append(sim_dict[(i, j)])
list2_max.append(max(scores))
return (sum(list1_max) + sum(list2_max)) / float((len(list1) + len(list2)))
def DiseaseSetSim(sim_dicts, list1, list2):
'''
Function takes a list of dictionaries that have pre-computed pairwise similarity scores and two list of lists
representing sets of concepts for each individual. The function then calculates similarity for the set via the
SetSim function and returns a list of scores.
:param sim_dicts: a list of dictionaries that have pre-computed pairwise similarity scores
:param list1: list of lists representing a set of concepts for an individual
:param list2: list of lists representing a set of concepts for an individual
:return: a list of scores
'''
# conditions - calculate disease set sim for each person
cond_sim = SetSim(sim_dicts[0], list1[3], list2[3])
# measurements - calculate disease set sim for each person
meas_sim = SetSim(sim_dicts[1], list1[4], list2[4])
# medications - calculate disease set sim for each person
med_sim = SetSim(sim_dicts[2], list1[5], list2[5])
return cond_sim, meas_sim, med_sim
def AgeSim(ages, age1, age2):
'''
Function takes a lists of lists and two numbers representing two ages and using the range of ages in the data set
calculates the difference between the two ages and returns a proportion representing the difference between the two
ages.
:param: list of lists representing person-level data
:param age1: number representing an age
:param age2: number representing an age
:return: a proportion representing the difference between the two ages
'''
age_range = max(ages) - min(ages)
# convert set to int
age1 = float(list(age1)[0])
age2 = float(list(age2)[0])
# calculate similarity
sim = 1 - (abs(age1 - age2) / float(age_range))
return sim
def PatientSimilarity(patient_data, ages, sim_dicts, weight):
patient_sim = {}
# initialize progress bar progress bar
widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
pbar = ProgressBar(widgets=widgets, maxval=len(patient_data.keys()))
# pairwise compare all patients
for i in pbar(range(0, len(patient_data.items()))):
for j in range(i, len(patient_data.items())):
p1 = patient_data.keys()[i]
p2 = patient_data.keys()[j]
p1_data = patient_data[p1]
p2_data = patient_data[p2]
# age
age = AgeSim(ages, p1_data[0], p2_data[0])
# gender
gender = [1 if p1_data[1] == p2_data[1] else 0][0]
# race
race = [1 if p1_data[2] == p2_data[2] else 0][0]
# conditions, measurements, medications
DiseaseSetSim(sim_dicts, p1_data, p2_data)
# calculate patient similarity score
sim = [age, gender, race] + list(DiseaseSetSim(sim_dicts, p1_data, p2_data))
# apply weight
sim_weight = [(1+float(y))*x for x,y in zip(sim, weight)]
# apply weighting to similarity
if p1 != p2:
patient_sim[tuple([p1, p2])] = sum(sim_weight)
else:
patient_sim[tuple([p1, p2])] = 0.0
# normalize scores by the max semantic similarity score
max_score = max(patient_sim.values()) + 0.05
sem_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 0.5 for comp in patient_sim.items()}
# CHECK - make sure that the range is 1.0
if max(sem_sim.values()) > 1.0:
raise ValueError('Problem with semantic similarity score normalization ')
pbar.finish()
return sem_sim
def DendroHeat(grps, data, method, title, filename):
'''
Function takes a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity) and
and converts the dictionary to a matrix of distances. With this matrix a dendrogram and heatmap are produced and
the output the plot is written to the working directory.
:param grps: dictionary of group type (key) and patient_id (value)
:param data: a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity)
:param method: linkage method to use ('average', 'single', 'centroid', 'complete')
:param title: a plot title
:param filename: string with location for where to write file
:return: a dendrogram and heatmap are produced and written to the working directory
'''
# code: https://github.com/maximilianh/crisporPaper/blob/master/heatmap.py;
# https://stackoverflow.com/questions/38705359/how-to-give-sns-clustermap-a-precomputed-distance-matrix
# convert data to matrix
unq_keys, key_idx = np.unique(np.array(data.keys()), return_inverse=True)
key_idx = key_idx.reshape(-1, 2)
matrix = np.zeros((len(unq_keys), len(unq_keys)), dtype=np.array(data.values()).dtype)
matrix[key_idx[:, 0], key_idx[:, 1]] = np.array(data.values())
matrix += matrix.T
# subtract from 1 to convert similarity to distance
dist_matrix = 1 - matrix
# calculate linkage and produce a dendrogram
linkage = sch.linkage(dist.squareform(dist_matrix), method=str(method)) #metric = 'euclidean'
dendro = sch.dendrogram(linkage)
idx1 = dendro['leaves']
# re-order columns grouped by linkage to be together
dist_matrix = dist_matrix[idx1, :]; dist_matrix = dist_matrix[:, idx1]
# create heat map
heatmap = pdh.DendroHeatMap(heat_map_data = dist_matrix, top_dendrogram = linkage)
heatmap.title = str(title)
heatmap.colormap = pylab.cm.YlGnBu
# heatmap.cluster_cb_colors = mpl.colors.LinearSegmentedColormap.from_list(name = "custom",
# colors = ['red','lime'], N=2)
# color dendrogram
sch.set_link_color_palette(['blue'])
# add labels to heat map
heatmap.col_labels = [str(grps[unq_keys[dendro['leaves'][x]]]) + '-' + str(unq_keys[dendro['leaves'][x]]) for x in range(dist_matrix.shape[1])]
# show plot and write to file
heatmap.show()
pylab.savefig(str(filename) + '.jpg', dpi=600)
def main():
#### PATIENT DATA ####
# cystic fibrosis patients
cf_person = 'Queries/CF_person'
cf_cond_query = 'Queries/CF_conditions_query'
cf_ancestor_cond_query = 'Queries/CF_conds_ancest'
cf_lab_query = 'Queries/CF_measurement_query'
cf_ancestor_lab_query = 'Queries/CF_meas_ancest'
cf_med_query = 'Queries/CF_medications_query'
cf_ancestor_med_query = 'Queries/CF_meds_ancest'
# huntingtons chorea patients
hc_person = 'Queries/HC_person'
hc_cond_query = 'Queries/HC_conditions_query'
hc_ancestor_cond_query = 'Queries/HC_conds_ancest'
hc_lab_query = 'Queries/HC_measurement_query'
hc_ancestor_lab_query = 'Queries/HC_meas_ancest'
hc_med_query = 'Queries/HC_medications_query'
hc_ancestor_med_query = 'Queries/HC_meds_ancest'
# person - for calculating age range
ages = [x[4] for x in GBQData(open(cf_person).read()) + GBQData(open(hc_person).read())]
# set group labels by id
grp1 = {str(x[0]):'CF' for x in GBQData(open(cf_person).read())}
grp2 = {str(x[0]):'HC' for x in GBQData(open(hc_person).read())}
grps = dict(grp1.items() + grp2.items())
## Patient Vectors ##
CF_patients = PatientDict([['person',cf_person], ['cond', cf_cond_query], ['meas', cf_lab_query], ['meds', cf_med_query]])
# unmappable codes = conds(0); meas(0); meds(151)
HC_patients = PatientDict([['person', hc_person], ['cond', hc_cond_query'], ['meas', hc_lab_query], ['meds', hc_med_query]])
# unmappable codes = conds(0); meas(0); meds(45)
CF_red = {k: CF_patients.get(k, None) for k in ('350177','582315')}
HC_red = {k: HC_patients.get(k, None) for k in ('151130', '434388')}
patient_data = dict(CF_patients.items() + HC_patients.items())
#### ANCESTORS ####
# conditions
# conds + ancestors
conds = GBQData(open(cf_ancestor_cond_query).read()) + GBQData(open(hc_ancestor_cond_query).read())
cond_ancestor = AncestorFinder(conds)
len(cond_ancestor) # 637
# patient conds
patient_conds = GBQData(open(cf_cond_query).read()) + GBQData(open(hc_cond_query).read())
# make sure all patient concepts are included in semantic similarity calculations
cond_ancestor = AncestorUpdate(cond_ancestor, patient_conds)
len(cond_ancestor) # 637
# calculate semantic similarity
cond_sim = SemanticSimilarity(cond_ancestor)
## measurements
meas = GBQData(open(cf_ancestor_lab_query).read()) + GBQData(open(hc_ancestor_lab_query).read())
meas_ancestor = AncestorFinder(meas)
len(meas_ancestor) #137
# patient meas
patient_meas = GBQData(open(cf_lab_query).read()) + GBQData(open(hc_lab_query).read())
# make sure all patient concepts are included in semantic similarity calculations
meas_ancestor = AncestorUpdate(meas_ancestor, patient_meas)
len(meas_ancestor) #137
# calculate semantic similarity
meas_sim = SemanticSimilarity(meas_ancestor)
## medications
meds = GBQData(open(cf_ancestor_med_query).read()) + GBQData(open(HC_ancestor_med_query).read())
meds_ancestor = AncestorFinder(meds)
len(meds_ancestor) #432
# patient meds
patient_meds = GBQData(open(cf_med_query).read()) + GBQData(open(hc_med_query).read())
# make sure all patient concepts are included in semantic similarity calculations
meds_ancestor = AncestorUpdate(meds_ancestor, patient_meds)
len(meds_ancestor) #460
# calculate semantic similarity
meds_sim = SemanticSimilarity(meds_ancestor)
# put all similarity dictionaries in a list
sim_dicts = [cond_sim, meas_sim, meds_sim]
#### PATIENT SIMILARITY ####
weight = [-1.0, -1.0, -1.0, 0.0, 0.0, 0.0]
patient_similarity = PatientSimilarity(patient_data, ages, sim_dicts, weight)
# export dictionary
output_loc = 'Data/HC_CF_patient_similarity_full'
with open(output_loc, 'wb') as handle:
pickle.dump(patient_similarity, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(output_loc, 'rb') as handle:
patient_similarity = pickle.load(handle)
DendroHeat(grps,
patient_similarity,
'complete',"Patient Similarity: Huntington's Chorea (333.4) vs. Cystic Fibrosis (277.0) - Clinical",
"HCCF_heatMap_clinical")
@callahantiff
Copy link
Author

CF Queries

File: Queries/CF_person.sql

SELECT
p.person_id AS pat_id,
c1.concept_name AS ethnicity, --OMOP normalized concept code name
c2.concept_name AS gender, --OMOP normalized concept code name
c3.concept_name AS race, --OMOP normalized concept code name
ROUND(DATEDIFF(CURRENT_DATE(), p.time_of_birth)/365.25) AS age

FROM db.person p
INNER JOIN db.concept c1
ON c1.concept_id = p.ethnicity_concept_id
INNER JOIN db.concept c2
ON c2.concept_id = p.gender_concept_id
INNER JOIN db.concept c3
ON c3.concept_id = p.race_concept_id
INNER JOIN db.CF_patients pi
ON pi.person_id = p.person_id
;

File: Queries/CF_conditions_query.sql

SELECT
co.person_id AS pat_id,
co.condition_concept_id AS omop_cond_id, --OMOP concept id
c1.concept_code AS source_cond_id,
c1.concept_name,
co.condition_source_value

FROM db.condition_occurrence co
INNER JOIN db.concept c1
ON c1.concept_id = co.condition_concept_id
INNER JOIN db.CF_patients p
ON p.person_id = co.person_id
-- GROUP BY pat_id
;

File: CF_conds_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.vocabulary_id = 'SNOMED' AND ca.descendant_concept_id IN
(SELECT
  co.condition_concept_id AS omop_cond_id, --OMOP concept id
  FROM db.condition_occurrence co
  INNER JOIN db.CF_patients p
  ON p.person_id = co.person_id)
;

File: Queries/CF_measurement_query.sql

SELECT
m.person_id AS pat_id,
m.measurement_concept_id, --OMOP concept id
c1.concept_code, --source mapped code,
c1.concept_name,
m.measurement_source_value

FROM db.measurement m
INNER JOIN db.concept c1
ON c1.concept_id = m.measurement_concept_id
INNER JOIN db.CF_patients pi
ON pi.person_id = m.person_id
-- GROUP BY pat_id
;

File: Queries/CF_meas_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.concept_class_id = 'LOINC Class' AND ca.descendant_concept_id IN
(SELECT
  m.measurement_concept_id, --OMOP concept id
  FROM db.measurement m
  INNER JOIN db.CF_patients pi
  ON pi.person_id = m.person_id)
;

File: Queries/CF_medications_query.sql

SELECT
d.person_id AS pat_id,
d.drug_concept_id, --OMOP concept id
c1.concept_code,
c1.concept_name,
d.drug_source_value

FROM db.drug_exposure d
INNER JOIN db.concept c1
ON c1.concept_id = d.drug_concept_id
INNER JOIN db.CF_patients pi
ON pi.person_id = d.person_id
;

File: Queries/CF_meds_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.concept_class_id = 'VA Class' AND ca.descendant_concept_id IN
(SELECT
  d.drug_concept_id, --OMOP concept id
  FROM db.drug_exposure d
  INNER JOIN db.CF_patients pi
  ON pi.person_id = d.person_id)
;

@callahantiff
Copy link
Author

HC Queries

File: Queries/HC_person.sql

SELECT
p.person_id AS pat_id,
c1.concept_name AS ethnicity, --OMOP normalized concept code name
c2.concept_name AS gender, --OMOP normalized concept code name
c3.concept_name AS race, --OMOP normalized concept code name
ROUND(DATEDIFF(CURRENT_DATE(), p.time_of_birth)/365.25) AS age

FROM db.person p
INNER JOIN db.concept c1
ON c1.concept_id = p.ethnicity_concept_id
INNER JOIN db.concept c2
ON c2.concept_id = p.gender_concept_id
INNER JOIN db.concept c3
ON c3.concept_id = p.race_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = p.person_id
;

File: Queries/HC_conditions_query.sql

SELECT
co.person_id AS pat_id,
co.condition_concept_id AS omop_cond_id, --OMOP concept id
c1.concept_code AS source_cond_id,
c1.concept_name,
co.condition_source_value

FROM db.condition_occurrence co
INNER JOIN db.concept c1
ON c1.concept_id = co.condition_concept_id
INNER JOIN db.HC_patients p
ON p.person_id = co.person_id
-- GROUP BY pat_id
;

File: HC_conds_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.vocabulary_id = 'SNOMED' AND ca.descendant_concept_id IN
(SELECT
  co.condition_concept_id AS omop_cond_id, --OMOP concept id
  FROM db.condition_occurrence co
I. NNER JOIN db.HC_patients p
  ON p.person_id = co.person_id)
;

File: Queries/HC_measurement_query.sql

SELECT
m.person_id AS pat_id,
m.measurement_concept_id, --OMOP concept id
c1.concept_code, --source mapped code
c1.concept_name,
m.measurement_source_value

FROM db.measurement m
INNER JOIN db.concept c1
ON c1.concept_id = m.measurement_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = m.person_id
;

File: Queries/HC_meas_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE
c2.concept_class_id = 'LOINC Class' AND ca.descendant_concept_id IN
(SELECT
  m.measurement_concept_id, --OMOP concept id
  FROM db.measurement m
  INNER JOIN db.HC_patients pi
  ON pi.person_id = m.person_id)
;

File: Queries/HC_medications_query.sql

SELECT
d.person_id AS pat_id,
d.drug_concept_id, --OMOP concept id
c1.concept_code,
c1.concept_name,
d.drug_source_value

FROM db.drug_exposure d
INNER JOIN db.concept c1
ON c1.concept_id = d.drug_concept_id
INNER JOIN db.HC_patients pi
ON pi.person_id = d.person_id
;

File: Queries/HC_meds_ancest.sql

SELECT
ca.ancestor_concept_id AS omop_ancestor_id,
ca.descendant_concept_id AS omop_descendant_id,
c2.concept_class_id AS ancestor_concept_class_id,
c2.concept_code AS ancestor_source_code,
c2.concept_name AS ancestor_code_name,
c1.concept_code AS descendant_source_code,
c1.concept_name AS descendant_code_name

FROM db.concept_ancestor_tc ca
INNER JOIN db.concept c2
ON ca.ancestor_concept_id = c2.concept_id
INNER JOIN db.concept c1
ON ca.descendant_concept_id = c1.concept_id

-- get concept ancestors (get the parents of the OMOP concept id)
WHERE c2.concept_class_id = 'VA Class' AND ca.descendant_concept_id IN
(SELECT
  d.drug_concept_id, --OMOP concept id
  FROM db.drug_exposure d
  INNER JOIN db.HC_patients pi
  ON pi.person_id = d.person_id)
;

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment