callahantiff/composite_patient_similarity.py

## composite_patient_similarity.py
#########################################################################################################
# 2017 NLM Summer Medical Informatics Internship
# Purpose: queries a Google BigQuery Database and returns a vector of values for a set of patients
# version 1.1.0
# date: 08.15.2017
#########################################################################################################


# import and load needed scripts
import dawg
import math
import matplotlib as mpl
import matplotlib.pyplot as pylab
import numpy as np
import os
import pandas as pd
import pickle
from progressbar import ProgressBar, FormatLabel, Percentage, Bar
import pydendroheatmap as pdh
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist


def GBQData(query):
    """
    Function takes a string containing a user query to be run against the de-identified OMOP database and returns the
    query results as a list. Each row of the list corresponds to a row of the results. Function is designed to
    re-establish client connection for each query run against the database.
    :param query: a string containing a user query
    :return: list of query results
    """

    print("Started running query from " + str() + "against the OMOP de-id database: " + str(datetime.datetime.now()))

    # Authenticate and set client
    client = bigquery.Client('sandbox-tc')

    # get project datasets
    # datasets = [dataset.name for dataset in client.list_datasets()]

    # run query
    user_query = client.run_sync_query(query)

    # Use standard SQL syntax.
    user_query.use_legacy_sql = False

    # Google API request
    user_query.run()

    # convert query results to list
    results = []

    while len(results) == 0:

        results = [x for x in user_query.fetch_data()]

        if len(results) == 0:
            print('Query returned no results, trying again')

        else:
            break

    # save results to working directory
    # pickle.dump(results, open('omop_concepts.txt', 'wb'))

    print("Finished processing query: " + str(datetime.datetime.now()))
    print('Query returned: %d' % len(results) + " results")

    return results


def AncestorQuery(vocab, code):
    '''
    Function takes a string containing the vocabulary type to filter on, and a code to search for ancestors of. The
    function updates generic keywords using the input strings and returns a string containing the updated query.
    :param vocab: a string containing the vocabulary type to filter on
    :param code: a code to search for ancestors of
    :return: a string containing the updated query
    '''

    input_file = 'Queries/concept_ancestor_query'

    # CHECK - file has data
    if os.stat(input_file).st_size == 0:
        raise ValueError('input file: {} is empty'.format(input_file))

    else:
        query = open(input_file).read()

    # update query text
    if vocab == 'cond':
        query_update = query.replace('VOCAB', "'" + 'SNOMED' + "'")
        query_update = query_update.replace('CODE', code)

        return query_update

    elif vocab == 'meas':
        query_update = query.replace('VOCAB', "'" + 'LOINC' + "'")
        query_update = query_update.replace('CODE', code)

        return query_update

    else:
        query_update = query.replace('VOCAB', "'" + 'VA Class' + "'")
        query_update = query_update.replace('CODE', code)

        return query_update


def AncestorFinder(query):
    '''
    Function takes a string storing a SQL query as input and runs this string against the Google BQ API. Withe the
    returned results the function creates a nested dictionary where the outer key is the OMOP descendant code and the
    inner keys are the vocabulary, source ancestor code and label and the source descendant code and label.
    :param query: a string storing a SQL query
    :return: a nested dictionary where the outer key is the OMOP descendant code and the inner keys are the vocabulary,
    source ancestor code and label and the source descendant code and label.
    '''
    ancestor_dict = {}

    # return data from Google API
    # ancestors = Google_API.GBQData(query)

    # initialize progress bar progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(query))

    # create ancestor dictionary - keyed by omop descendant concept
    for row in pbar(query):
        key = row[1]

        if key in ancestor_dict.keys():
            # ancestor_dict[key]['vocab']= str(row[2])
            ancestor_dict[key]['descendant_code'].add(str(row[5]))
            ancestor_dict[key]['descendant_label'].add(str(row[6]))
            ancestor_dict[key]['ancestor_code'].add(str(row[3]))
            ancestor_dict[key]['ancestor_label'].add(str(row[4]))

        else:
            ancestor_dict[key] = {}
            # ancestor_dict[key]['vocab'] = str(row[2])
            ancestor_dict[key]['descendant_code'] = set([str(row[5])])
            ancestor_dict[key]['descendant_label'] = set([str(row[6])])
            ancestor_dict[key]['ancestor_code'] = set([str(row[3])])
            ancestor_dict[key]['ancestor_label'] = set([str(row[4])])

    pbar.finish()

    return ancestor_dict


def AncestorUpdate(ancestor_dict, patient_concepts):
    '''
    Function takes a dictionary of concept ancestor information and a list of lists representing patient concepts.
    With this input the function checks that all patient concepts are included in the ancestor dictionary. Concepts
    in the patient data that are not included in the ancestor dictionary (most likely those concepts without any
    ancestors are added to the ancestor dictionary using themselves as their ancestor (this works because of the
    definition of the current semantic similarity measure and would need to be re-evaluated should the measure
    change). The function returns an updated ancestor dictionary.
    :param ancestor_dict: a dictionary of concept ancestor information; keys are OMOP concepts and values are sets
    of ancestor codes and labels as well as the concept code's mapped code
    :param patient_concepts: a list of lists representing patient concepts resulting from running from GBQ
    :return: updated ancestor dictionary
    '''

    for code in patient_concepts:
        if code[1] not in ancestor_dict.keys() and code[1] != 0:

            ancestor_dict.update({code[1]: {'ancestor_code':set([str(code[2])]),
                                            'ancestor_label':set([str(code[3])]) ,
                                            'descendant_code':set([str(code[2])]),
                                            'descendant_label':set([str(code[3])])}})

    return ancestor_dict


def SemanticSimilarity(data):
    '''
    Function takes a lists of lists as input and calculates the pairwise semantic similarity of the ancestors for all
    descendant concept identifiers. The similarity scores are normalized to be between 0 and 1 and only if a concept
    compared directly to itself is given a score of 1. The function returns a dictionary where the keys are pairs of
    concepts and the values are the semantic similarity score for the pair of concepts.
    :param data: a lists of lists storing the output of a google big query
    :return: a dictionary where the keys are pairs of concepts and the values are the semantic similarity score for the
    pair of concepts.
    '''
    semantic_sim = {}

    # initialize progress bar progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(data.keys()))

    # pairwise compare all elements in the list to each other
    for i in pbar(range(0, len(data.keys()))):
        for j in range(i, len(data.keys())):

            # get ancestor sets
            id1 = data[data.keys()[i]]['descendant_code']
            id1_anc = data[data.keys()[i]]['ancestor_code'].union(id1)
            id2 = data[data.keys()[j]]['descendant_code']
            id2_anc = data[data.keys()[j]]['ancestor_code'].union(id2)

            # semantic similarity
            if id1 != id2:
                numerator = len((id1_anc.union(id2_anc) - id1_anc.intersection(id2_anc)))
                denominator = len(id1_anc.union(id2_anc))
                sim = abs(-math.log(numerator/float(denominator), 2))

                # append to dictionary
                semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = sim
                semantic_sim[tuple([list(id2)[0], list(id1)[0]])] = sim

            else:
                # append to dictionary
                semantic_sim[tuple([list(id1)[0], list(id2)[0]])] = 0.0

    # normalize scores by the max semantic similarity score
    max_score = max(semantic_sim.values()) + 0.05

    semantic_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 1.0 for comp in semantic_sim.items()}

    # CHECK - make sure that the range is 1.0
    if max(semantic_sim.values()) != 1.0:
        raise ValueError('Problem with semantic similarity score normalization ')

    else:
        pbar.finish()
        return semantic_sim


def PatientDict(input_files):
    '''
    Function takes a list of labeled input files and creates a dictionary where the keys are person_ids and the value is
    a list (0-age; 1-gender; 2-race; 3-condition concepts set; 4-measurements set; 5-medications set). Concept codes of
    0 or "No matching concept" represent concepts that were unable to be mapped by OMOP to a standardized terminology,
    are excluded.
    :param input_files: a list of labeled input files
    :return: a dictionary where the keys are person_ids and the value is a list (0-age; 1-gender; 2-race; 3-condition
    concepts set; 4-measurements set; 5-medications set)
    '''
    patient_dict = {}
    cond_count = set()
    meas_count = set()
    med_count = set()

    for query in input_files:
        # return data from Google API
        data = Google_API.GBQData(open(query[1]).read())

        # initialize progress bar progress bar
        widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
        pbar = ProgressBar(widgets=widgets, maxval=len(data))

        # create patient dictionary - keyed by OMOP descendant concept
        if query[0] == 'person':
            for row in pbar(data):
                patient_dict[str(row[0])] = [[] for _ in range(6)]
                patient_dict[str(row[0])][0].append(str(row[4]))
                patient_dict[str(row[0])][1].append(str(row[2]))
                patient_dict[str(row[0])][2].append(str(row[3]))

        if query[0] == 'cond':
            for row in pbar(data):
                if str(row[2]) == 'No matching concept':
                    cond_count.add(row[4])

                if str(row[2]) != 'No matching concept':
                    patient_dict[str(row[0])][3].append(str(row[2]))

        if query[0] == 'meas':
            for row in pbar(data):
                if str(row[2]) == 'No matching concept':
                    meas_count.add(row[4])

                if str(row[2]) != 'No matching concept':
                    patient_dict[str(row[0])][4].append(str(row[2]))

        if query[0] == 'meds':
            for row in pbar(data):
                if str(row[2]) == 'No matching concept':
                    med_count.add(row[4])

                if str(row[2]) != 'No matching concept':
                    patient_dict[str(row[0])][5].append(str(row[2]))

    # print counts by table of un-mappable codes
    print 'From the condition data there were ' + str(len(cond_count)) + ' conditions with no SNOMED Code'
    print 'From the measurement data there were ' + str(len(meas_count)) + ' measurements with no LOINC Code'
    print 'From the medication data there were ' + str(len(med_count)) + ' drugs with no RxNorm Code'

    pbar.finish()
    return patient_dict


def SetSim(sim_dict, list1, list2):
    '''
    Function takes a dictionary of pre-computed semantic similarity scores and two lists of concepts. The function
    computes all pairwise comparisons of the scores find the max score for each concept in the list (from comparing it
    to all other concepts in the other set). The function then takes the sum of each set's concept max and divides it
    by the total number of items in each set.
    :param sim_dict: pre-computed semantic similarity scores (keys: pairs of concepts; values: scores)
    :param list1: list of concepts
    :param list2: list of concepts
    :return: integer representing the similarity score
    '''
    #https://mor.nlm.nih.gov/pubs/pdf/2005-ismb_bioontologies-fja.pdf

    list1_max = []
    list2_max = []

    for i in list1:
        scores = []
        for j in list2:
            scores.append(sim_dict[(i, j)])
        list1_max.append(max(scores))

    for i in list2:
        scores = []
        for j in list1:
            scores.append(sim_dict[(i, j)])

        list2_max.append(max(scores))

    return (sum(list1_max) + sum(list2_max)) / float((len(list1) + len(list2)))


def DiseaseSetSim(sim_dicts, list1, list2):
    '''
    Function takes a list of dictionaries that have pre-computed pairwise similarity scores and two list of lists
    representing sets of concepts for each individual. The function then calculates similarity for the set via the
    SetSim function and returns a list of scores.
    :param sim_dicts: a list of dictionaries that have pre-computed pairwise similarity scores
    :param list1: list of lists representing a set of concepts for an individual
    :param list2: list of lists representing a set of concepts for an individual
    :return: a list of scores
    '''

    # conditions - calculate disease set sim for each person
    cond_sim = SetSim(sim_dicts[0], list1[3], list2[3])

    # measurements - calculate disease set sim for each person
    meas_sim = SetSim(sim_dicts[1], list1[4], list2[4])

    # medications - calculate disease set sim for each person
    med_sim = SetSim(sim_dicts[2], list1[5], list2[5])

    return cond_sim, meas_sim, med_sim


def AgeSim(ages, age1, age2):
    '''
    Function takes a lists of lists and two numbers representing two ages and using the range of ages in the data set
    calculates the difference between the two ages and returns a proportion representing the difference between the two
    ages.
    :param: list of lists representing person-level data
    :param age1: number representing an age
    :param age2: number representing an age
    :return: a proportion representing the difference between the two ages
    '''
    age_range = max(ages) - min(ages)

    # convert set to int
    age1 = float(list(age1)[0])
    age2 = float(list(age2)[0])

    # calculate similarity
    sim = 1 - (abs(age1 - age2) / float(age_range))

    return sim


def PatientSimilarity(patient_data, ages, sim_dicts, weight):

    patient_sim = {}

    # initialize progress bar progress bar
    widgets = [Percentage(), Bar(), FormatLabel('(elapsed: %(elapsed)s)')]
    pbar = ProgressBar(widgets=widgets, maxval=len(patient_data.keys()))

    # pairwise compare all patients
    for i in pbar(range(0, len(patient_data.items()))):
        for j in range(i, len(patient_data.items())):

            p1 = patient_data.keys()[i]
            p2 = patient_data.keys()[j]
            p1_data = patient_data[p1]
            p2_data = patient_data[p2]

            # age
            age = AgeSim(ages, p1_data[0], p2_data[0])

            # gender
            gender = [1 if p1_data[1] == p2_data[1] else 0][0]

            # race
            race = [1 if p1_data[2] == p2_data[2] else 0][0]

            # conditions, measurements, medications
            DiseaseSetSim(sim_dicts, p1_data, p2_data)

            # calculate patient similarity score
            sim = [age, gender, race] + list(DiseaseSetSim(sim_dicts, p1_data, p2_data))

            # apply weight
            sim_weight = [(1+float(y))*x for x,y in zip(sim, weight)]

            # apply weighting to similarity
            if p1 != p2:
                patient_sim[tuple([p1, p2])] = sum(sim_weight)
            else:
                patient_sim[tuple([p1, p2])] = 0.0

            # normalize scores by the max semantic similarity score
            max_score = max(patient_sim.values()) + 0.05

            sem_sim = {comp[0]: comp[1]/max_score if comp[0][0] != comp[0][1] else 0.5 for comp in patient_sim.items()}

            # CHECK - make sure that the range is 1.0
            if max(sem_sim.values()) > 1.0:
                raise ValueError('Problem with semantic similarity score normalization ')

    pbar.finish()
    return sem_sim


def DendroHeat(grps, data, method, title, filename):
    '''
    Function takes a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity) and
    and converts the dictionary to a matrix of distances. With this matrix a dendrogram and heatmap are produced and
    the output the plot is written to the working directory.
    :param grps: dictionary of group type (key) and patient_id (value)
    :param data: a dictionary of pairwise patient similarities (keys: tuples(p1, p2); values: similarity)
    :param method: linkage method to use ('average', 'single', 'centroid', 'complete')
    :param title: a plot title
    :param filename: string with location for where to write file
    :return: a dendrogram and heatmap are produced and written to the working directory
    '''
    # code: https://github.com/maximilianh/crisporPaper/blob/master/heatmap.py;
    # https://stackoverflow.com/questions/38705359/how-to-give-sns-clustermap-a-precomputed-distance-matrix

    # convert data to matrix
    unq_keys, key_idx = np.unique(np.array(data.keys()), return_inverse=True)
    key_idx = key_idx.reshape(-1, 2)
    matrix = np.zeros((len(unq_keys), len(unq_keys)), dtype=np.array(data.values()).dtype)
    matrix[key_idx[:, 0], key_idx[:, 1]] = np.array(data.values())
    matrix += matrix.T

    # subtract from 1 to convert similarity to distance
    dist_matrix = 1 - matrix

    # calculate linkage and produce a dendrogram
    linkage = sch.linkage(dist.squareform(dist_matrix), method=str(method)) #metric = 'euclidean'
    dendro = sch.dendrogram(linkage)
    idx1 = dendro['leaves']

    # re-order columns grouped by linkage to be together
    dist_matrix = dist_matrix[idx1, :]; dist_matrix = dist_matrix[:, idx1]

    # create heat map
    heatmap = pdh.DendroHeatMap(heat_map_data = dist_matrix, top_dendrogram = linkage)
    heatmap.title = str(title)
    heatmap.colormap = pylab.cm.YlGnBu
    # heatmap.cluster_cb_colors = mpl.colors.LinearSegmentedColormap.from_list(name = "custom",
    #                                              colors = ['red','lime'], N=2)
    # color dendrogram
    sch.set_link_color_palette(['blue'])

    # add labels to heat map
    heatmap.col_labels = [str(grps[unq_keys[dendro['leaves'][x]]]) + '-' + str(unq_keys[dendro['leaves'][x]]) for x in range(dist_matrix.shape[1])]

    # show plot and write to file
    heatmap.show()
    pylab.savefig(str(filename) + '.jpg', dpi=600)


def main():

    #### PATIENT DATA ####
    # cystic fibrosis patients
    cf_person = 'Queries/CF_person'
    cf_cond_query = 'Queries/CF_conditions_query'
    cf_ancestor_cond_query = 'Queries/CF_conds_ancest'
    cf_lab_query = 'Queries/CF_measurement_query'
    cf_ancestor_lab_query = 'Queries/CF_meas_ancest'
    cf_med_query = 'Queries/CF_medications_query'
    cf_ancestor_med_query = 'Queries/CF_meds_ancest'
    # huntingtons chorea patients
    hc_person = 'Queries/HC_person'
    hc_cond_query = 'Queries/HC_conditions_query'
    hc_ancestor_cond_query = 'Queries/HC_conds_ancest'
    hc_lab_query = 'Queries/HC_measurement_query'
    hc_ancestor_lab_query = 'Queries/HC_meas_ancest'
    hc_med_query = 'Queries/HC_medications_query'
    hc_ancestor_med_query = 'Queries/HC_meds_ancest'

    # person - for calculating age range
    ages = [x[4] for x in GBQData(open(cf_person).read()) + GBQData(open(hc_person).read())]

    # set group labels by id
    grp1 = {str(x[0]):'CF' for x in GBQData(open(cf_person).read())}
    grp2 = {str(x[0]):'HC' for x in GBQData(open(hc_person).read())}
    grps = dict(grp1.items() + grp2.items())

    ## Patient Vectors ##
    CF_patients = PatientDict([['person',cf_person], ['cond', cf_cond_query], ['meas', cf_lab_query], ['meds', cf_med_query]])
    # unmappable codes = conds(0); meas(0); meds(151)

    HC_patients = PatientDict([['person', hc_person], ['cond', hc_cond_query'], ['meas', hc_lab_query], ['meds', hc_med_query]])
    # unmappable codes = conds(0); meas(0); meds(45)

    CF_red = {k: CF_patients.get(k, None) for k in ('350177','582315')}
    HC_red = {k: HC_patients.get(k, None) for k in ('151130', '434388')}
    patient_data = dict(CF_patients.items() + HC_patients.items())

    #### ANCESTORS ####
    # conditions
    # conds + ancestors
    conds = GBQData(open(cf_ancestor_cond_query).read()) + GBQData(open(hc_ancestor_cond_query).read())
    cond_ancestor = AncestorFinder(conds)
    len(cond_ancestor) # 637
    # patient conds
    patient_conds = GBQData(open(cf_cond_query).read()) + GBQData(open(hc_cond_query).read())
    # make sure all patient concepts are included in semantic similarity calculations
    cond_ancestor = AncestorUpdate(cond_ancestor, patient_conds)
    len(cond_ancestor) # 637
    # calculate semantic similarity
    cond_sim = SemanticSimilarity(cond_ancestor)

    ## measurements
    meas = GBQData(open(cf_ancestor_lab_query).read()) + GBQData(open(hc_ancestor_lab_query).read())
    meas_ancestor = AncestorFinder(meas)
    len(meas_ancestor) #137
    # patient meas
    patient_meas = GBQData(open(cf_lab_query).read()) + GBQData(open(hc_lab_query).read())
    # make sure all patient concepts are included in semantic similarity calculations
    meas_ancestor = AncestorUpdate(meas_ancestor, patient_meas)
    len(meas_ancestor) #137
    # calculate semantic similarity
    meas_sim = SemanticSimilarity(meas_ancestor)

    ## medications
    meds = GBQData(open(cf_ancestor_med_query).read()) + GBQData(open(HC_ancestor_med_query).read())
    meds_ancestor = AncestorFinder(meds)
    len(meds_ancestor) #432
    # patient meds
    patient_meds = GBQData(open(cf_med_query).read()) + GBQData(open(hc_med_query).read())
    # make sure all patient concepts are included in semantic similarity calculations
    meds_ancestor = AncestorUpdate(meds_ancestor, patient_meds)
    len(meds_ancestor) #460
    # calculate semantic similarity
    meds_sim = SemanticSimilarity(meds_ancestor)

    # put all similarity dictionaries in a list
    sim_dicts = [cond_sim, meas_sim, meds_sim]

    #### PATIENT SIMILARITY ####
    weight = [-1.0, -1.0, -1.0, 0.0, 0.0, 0.0]
    patient_similarity = PatientSimilarity(patient_data, ages, sim_dicts, weight)

    # export dictionary
    output_loc = 'Data/HC_CF_patient_similarity_full'
    with open(output_loc, 'wb') as handle:
        pickle.dump(patient_similarity, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(output_loc, 'rb') as handle:
        patient_similarity = pickle.load(handle)

    DendroHeat(grps,
               patient_similarity,
               'complete',"Patient Similarity: Huntington's Chorea (333.4) vs. Cystic Fibrosis (277.0) - Clinical",
               "HCCF_heatMap_clinical")