Skip to content

Instantly share code, notes, and snippets.

@jamesqo
Created October 12, 2023 16:17
Show Gist options
  • Save jamesqo/edfbcec5f47118b0571b7f4386bfd70c to your computer and use it in GitHub Desktop.
Save jamesqo/edfbcec5f47118b0571b7f4386bfd70c to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import requests
def main():
    """Report spreadsheet attributes that are absent from, or differ in, the CDD.

    Reads ``clinical_data_mappings.tsv`` (tab-separated) and, for every row
    with a usable value in the "cBioPortal CDD Attribute" column, queries the
    CDD search endpoint for PATIENT attributes matching that name.  A row is
    recorded as missing when:

    * the attribute cell is empty, NaN or ``'-'``;
    * the service answers 404 (no attributes match the search term);
    * no returned entry has a ``column_header`` equal to the attribute name;
    * the matching entry's ``display_name`` or ``description`` differs from
      the sheet's "Display Name" / "Description" cell.

    The missing rows are written to ``attribs_missing_from_cdd.tsv``.

    Raises:
        Exception: on HTTP 400 or 503, on any other unexpected status code,
            or when more than one entry matches an attribute name exactly.
        requests.exceptions.RequestException: on connection failures or when
            a request exceeds the timeout.
    """
    # Loop-invariant request settings, built once instead of per row.
    request_url = 'https://cdd.cbioportal.mskcc.org:443/api/search?attributeType=PATIENT&inclusiveSearch=true'
    request_headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }
    # Without a timeout a stalled server would block the script forever.
    timeout_seconds = 30

    df = pd.read_csv('clinical_data_mappings.tsv', sep='\t')
    rows = zip(
        df["cBioPortal CDD Attribute"],
        df["Display Name"],
        df["Description"],
    )

    # Positional indices of the rows to export at the end.
    missing_rows = []

    # One session for the whole run so the connection to the host is reused.
    with requests.Session() as session:
        for i, (attrib_name, sheet_display_name, sheet_desc) in enumerate(rows):
            if not attrib_name or pd.isna(attrib_name) or attrib_name == '-':
                missing_rows.append(i)
                continue

            response = session.post(
                request_url,
                headers=request_headers,
                # Let requests serialise the payload: hand-built JSON breaks
                # on names containing quotes or backslashes.
                json=[attrib_name],
                timeout=timeout_seconds,
            )

            status = response.status_code
            if status == 400:
                raise Exception('bad request')
            if status == 503:
                raise Exception('server error')
            if status == 404:  # no attributes matching search term
                missing_rows.append(i)
                continue
            if status != 200:
                raise Exception(f'unrecognized status: {status}')

            # The search is inclusive, so keep only exact header matches.
            matches = [
                m for m in response.json()
                if m['column_header'] == attrib_name
            ]
            if not matches:
                missing_rows.append(i)
                continue
            if len(matches) != 1:
                # Explicit raise rather than assert: asserts vanish under -O.
                raise Exception(
                    f'expected exactly one match for {attrib_name}, got: {matches}'
                )

            match = matches[0]
            if match['display_name'] != sheet_display_name or \
                    match['description'] != sheet_desc:
                print(f"{attrib_name} was found but display name or desc doesn't match, adding...")
                missing_rows.append(i)

    # Indices are positional (from enumerate), so select by position.
    missing_df = df.iloc[missing_rows, :]
    missing_df.to_csv('attribs_missing_from_cdd.tsv', sep='\t', index=False)
# Script entry point: run the CDD comparison only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment