linuskohl/umls_similarity_article_p1.py

## umls_similarity_article_p1.py
import io
import os
import string
import csv
import xml
import re
import unicodedata
import itertools
import requests
from functools import partial
import multiprocessing as mp
from statistics import mean
from sklearn.preprocessing import minmax_scale
import pandas as pd
import numpy as np
from quickumls import QuickUMLS

# Set up the QuickUMLS matcher shared by the helpers below.
# The UMLS data is expected in the ./quickumls directory.
# overlapping_criteria="length" gives the longest match precedence.
umls_matcher = QuickUMLS(
    "./quickumls",
    overlapping_criteria="length",
    threshold=0.7,
)

# Helper functions
def load_csv_from_url(url, index_col):
  """Downloads a CSV file from a URL and parses it into a DataFrame
     Args:
        url (str): The URL of the CSV file
        index_col (str): Name of the index column
     Returns:
        pandas.DataFrame: DataFrame containing the table
     Raises:
        requests.HTTPError: If the server answers with an error status
  """
  response = requests.get(url)
  # Fail loudly on 4xx/5xx instead of feeding an error page to the CSV parser
  response.raise_for_status()
  # The payload is decoded explicitly as UTF-8 before parsing
  csv_text = response.content.decode('utf-8')
  return pd.read_csv(io.StringIO(csv_text), index_col=index_col)

def get_umls_terms(text):
  """Runs the QuickUMLS matcher over a text and collects the matched concepts
     Args:
        text (str): Text to extract terms from
     Returns:
        list: One dictionary per candidate, holding its 'term' and 'cui'
  """
  matches = umls_matcher.match(text, best_match=True, ignore_syntax=False)
  # Every match is itself iterable; flatten the candidates in encounter order
  return [
    {'term': candidate['term'], 'cui': candidate['cui']}
    for match in matches
    for candidate in match
  ]

# Load BIOSSES dataset
biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
biosses_meta = load_csv_from_url(biosses_meta_url, "Id")
# Scale the average rating from [0,4] to [0,1]
# NOTE(review): minmax_scale presumably rescales by the observed min/max of the
# column, so this equals a [0,4] -> [0,1] mapping only if both extremes occur - confirm
biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

# Extract terms from texts: one list of terms and one list of CUIs per row.
# The columns are built as whole Series instead of pre-filling them with np.NaN
# (alias removed in NumPy 2.0) and writing lists into single cells of a float
# column through .loc, which pandas may reject as a length mismatch.
extracted_terms = biosses_texts['Text'].apply(get_umls_terms)
biosses_texts['UMLS_Terms'] = extracted_terms.apply(lambda terms: [term['term'] for term in terms])
biosses_texts['UMLS_CUIs'] = extracted_terms.apply(lambda terms: [term['cui'] for term in terms])

# Generate unique list of all CUIs that occur in the texts.
# Rows without any match explode to NaN and are dropped; sorting keeps the
# exported file identical between runs (set/hash order is not stable).
biosses_cuis = sorted(biosses_texts['UMLS_CUIs'].explode().dropna().unique())

# Create a dataframe of pairings that we need to calculate the distance for
# (every ordered pair of two different CUIs, so both (a, b) and (b, a) are present)
cui_pairings = pd.DataFrame([[cui_0, cui_1] for cui_0 in biosses_cuis for cui_1 in biosses_cuis if cui_0 != cui_1])

# Export to CSV
cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)
	import io
	import os
	import string
	import csv
	import xml
	import re
	import unicodedata
	import itertools
	import requests
	from functools import partial
	import multiprocessing as mp
	from statistics import mean
	from sklearn.preprocessing import minmax_scale
	import pandas as pd
	import numpy as np
	from quickumls import QuickUMLS

	# Set up the QuickUMLS matcher shared by the helpers below.
	# The UMLS data is expected in the ./quickumls directory.
	# overlapping_criteria="length" gives the longest match precedence.
	umls_matcher = QuickUMLS(
		"./quickumls",
		overlapping_criteria="length",
		threshold=0.7,
	)

	# Helper functions
	def load_csv_from_url(url, index_col):
		"""Downloads a CSV file from a URL and parses it into a DataFrame
		Args:
			url (str): The URL of the CSV file
			index_col (str): Name of the index column
		Returns:
			pandas.DataFrame: DataFrame containing the table
		Raises:
			requests.HTTPError: If the server answers with an error status
		"""
		response = requests.get(url)
		# Fail loudly on 4xx/5xx instead of feeding an error page to the CSV parser
		response.raise_for_status()
		# The payload is decoded explicitly as UTF-8 before parsing
		csv_text = response.content.decode('utf-8')
		return pd.read_csv(io.StringIO(csv_text), index_col=index_col)

	def get_umls_terms(text):
		"""Runs the QuickUMLS matcher over a text and collects the matched concepts
		Args:
			text (str): Text to extract terms from
		Returns:
			list: One dictionary per candidate, holding its 'term' and 'cui'
		"""
		terms = []
		results = umls_matcher.match(text, best_match=True, ignore_syntax=False)
		# Every result is itself iterable; collect its candidates in encounter order
		for result in results:
			for x in result:
				terms.append({'term': x['term'], 'cui': x['cui']})
		return terms

	# Load BIOSSES dataset
	biosses_texts_url = "https://gist.githubusercontent.com/linuskohl/5b6f82e9cd0b1ad50e5a57fa48210371/raw/46cab550499c8ffb7cc9e49f61639ae707028c13/biosses_texts.csv"
	biosses_meta_url = "https://gist.githubusercontent.com/linuskohl/a037ea921af159f1f95a55ae82a21d43/raw/92b20bf4f898ca19d947d6f552098868f87f0b12/biosses_meta.csv"
	biosses_texts = load_csv_from_url(biosses_texts_url, "Id")
	biosses_meta = load_csv_from_url(biosses_meta_url, "Id")
	# Scale the average rating from [0,4] to [0,1]
	# NOTE(review): minmax_scale presumably rescales by the observed min/max of the
	# column, so this equals a [0,4] -> [0,1] mapping only if both extremes occur - confirm
	biosses_meta['Avg'] = minmax_scale(biosses_meta['Avg'])

	# Extract terms from texts: one list of terms and one list of CUIs per row.
	# The columns are built as whole Series instead of pre-filling them with np.NaN
	# (alias removed in NumPy 2.0) and writing lists into single cells of a float
	# column through .loc, which pandas may reject as a length mismatch.
	extracted_terms = biosses_texts['Text'].apply(get_umls_terms)
	biosses_texts['UMLS_Terms'] = extracted_terms.apply(lambda terms: [term['term'] for term in terms])
	biosses_texts['UMLS_CUIs'] = extracted_terms.apply(lambda terms: [term['cui'] for term in terms])

	# Generate unique list of all CUIs that occur in the texts.
	# Rows without any match explode to NaN and are dropped; sorting keeps the
	# exported file identical between runs (set/hash order is not stable).
	biosses_cuis = sorted(biosses_texts['UMLS_CUIs'].explode().dropna().unique())

	# Create a dataframe of pairings that we need to calculate the distance for
	# (every ordered pair of two different CUIs, so both (a, b) and (b, a) are present)
	cui_pairings = pd.DataFrame([[cui_0, cui_1] for cui_0 in biosses_cuis for cui_1 in biosses_cuis if cui_0 != cui_1])

	# Export to CSV
	cui_pairings.to_csv('./cui_pairings.csv', index=False, header=False)