Skip to content

Instantly share code, notes, and snippets.

@linuskohl
Created June 26, 2020 19:49
Show Gist options
  • Save linuskohl/892ed937432e8b5497e860d4cf61c029 to your computer and use it in GitHub Desktop.
Save linuskohl/892ed937432e8b5497e860d4cf61c029 to your computer and use it in GitHub Desktop.
import io
import os
import string
import csv
import xml
import re
import unicodedata
import itertools
import requests
from functools import partial
import multiprocessing as mp
from statistics import mean
from sklearn.preprocessing import minmax_scale
import pandas as pd
import numpy as np
from quickumls import QuickUMLS
# Set up the shared QuickUMLS matcher used by the helper functions below.
# The UMLS data is read from the local ./quickumls directory.
# overlapping_criteria="length" gives precedence to the longest match.
umls_matcher = QuickUMLS(
    "./quickumls",
    overlapping_criteria="length",
    threshold=0.7,
)
# Helper functions
def load_csv_from_url(url, index_col, timeout=30):
    """Loads CSV file from url

    Args:
        url (str): The URL of the CSV file
        index_col (str): Name of the index column
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame: DataFrame containing the table

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status
        requests.Timeout: If the server does not respond within `timeout`
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as CSV
    response.raise_for_status()
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text), index_col=index_col)
def get_umls_terms(text):
    """Extracts UMLS terms from text

    Runs the module-level QuickUMLS matcher over the text and flattens the
    grouped matches into a single list.

    Args:
        text (str): Text to extract terms from

    Returns:
        list: List of dictionaries, each with the keys 'term' and 'cui'
    """
    match_groups = umls_matcher.match(text, best_match=True, ignore_syntax=False)
    # The matcher returns groups of candidates; keep only term and CUI of each
    return [
        {'term': candidate['term'], 'cui': candidate['cui']}
        for candidate in itertools.chain.from_iterable(match_groups)
    ]
# Load BIOSSES dataset
biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
biosses_meta = load_csv_from_url(biosses_meta_url, "Id")

# Min-max scale the average rating to [0,1].
# Note: minmax_scale maps the observed minimum/maximum of the column to 0/1.
biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

# Extract terms from texts.
# The matcher runs once per row and the result columns are assigned as a whole.
# This avoids pre-filling the columns with np.NaN (removed in NumPy 2.0) and
# avoids writing list values cell by cell with .loc into float columns.
extracted_terms = biosses_texts['Text'].apply(get_umls_terms)
biosses_texts['UMLS_Terms'] = extracted_terms.apply(
    lambda terms: [term['term'] for term in terms]
)
biosses_texts['UMLS_CUIs'] = extracted_terms.apply(
    lambda terms: [term['cui'] for term in terms]
)

# Generate unique list of all CUIs that occur in the texts.
# explode() yields NaN for texts without any match, hence the dropna().
# Sorting keeps the exported pairings in the same order on every run.
biosses_cuis = sorted(biosses_texts['UMLS_CUIs'].explode().dropna().unique())

# Create a dataframe of pairings that we need to calculate the distance for.
# Every ordered pair of two distinct CUIs is kept, i.e. both (a, b) and (b, a).
cui_pairings = pd.DataFrame(list(itertools.permutations(biosses_cuis, 2)))

# Export to CSV
cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment