Clinical_Trial_Scrape
import os

import numpy as np
import pandas as pd
import bs4 as bs


def clinical_trial_xml_reader(file):
    """Use BeautifulSoup to open and parse an XML file from a clinical trial.

    The path and file name together are the only argument.
    Returns the parsed soup.
    """
    # html.parser lowercases tag names, which matches the lowercase tag
    # searches used throughout this script.
    with open(file, "r") as xml_file:
        xml_soup = bs.BeautifulSoup(xml_file, "html.parser")
    return xml_soup
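
# Example usage (hypothetical file name; any ClinicalTrials.gov study-record
# XML export should work):
# soup = clinical_trial_xml_reader("Trials/NCT00000000.xml")
# print(soup.find("brief_title").get_text())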

def parse_clinical_trial_xml(soup, trial_data_categories):
    """Parse a multiple myeloma clinical trial from its XML soup.

    Scrapes several fields of interest that describe the study generally.
    Takes as arguments the soup from clinical_trial_xml_reader() and a list
    of labels used as the Series index.
    Returns a Series, clinical_trial_row, that can be appended as a row to
    a DataFrame.
    """

    def get_tag_text(soup, tag):
        """Return the text of the first matching tag, or NaN if absent.

        Takes a soup and the tag name (as a string) as arguments.
        """
        try:
            return soup.find(tag).get_text()
        except AttributeError:
            return np.nan

    nct_id = get_tag_text(soup, "nct_id")
    acronym = get_tag_text(soup, "acronym")
    brief_title = get_tag_text(soup, "brief_title")
    phase = get_tag_text(soup, "phase")
    agency = get_tag_text(soup, "agency")
    url = get_tag_text(soup, "url")
    overall_status = get_tag_text(soup, "overall_status")
    start_date = get_tag_text(soup, "start_date")
    completion_date = get_tag_text(soup, "completion_date")
    enrollment = get_tag_text(soup, "enrollment")
    number_of_arms = get_tag_text(soup, "number_of_arms")
    # Create a Series from the scraped fields; it becomes one DataFrame row.
    row = pd.Series([nct_id, acronym, brief_title, phase, agency, url,
                     overall_status, start_date, completion_date,
                     enrollment, number_of_arms],
                    index=trial_data_categories)
    # Since the number of treatment arms is variable, append these values
    # separately. First build a dictionary numbering each <arm_group>.
    arm_groups = soup.find_all("arm_group")
    arm_labels = [arm.arm_group_label.get_text() for arm in arm_groups]
    arm_dict = {"Arm_" + str(n + 1): label
                for n, label in enumerate(arm_labels)}
    arm_series = pd.Series(arm_dict)
    clinical_trial_row = pd.concat([row, arm_series])
    return clinical_trial_row
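
# The returned Series carries the base categories plus one "Arm_<n>" entry
# per <arm_group> found, e.g. (sketch, hypothetical trial):
# row = parse_clinical_trial_xml(soup, trial_data_categories)
# row["Arm_1"]  # -> label of the first treatment arm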

def clinical_trial_scrape(folder_path):
    """Scrape every .xml file in a folder.

    Takes the folder's path as the only argument; the folder should contain
    .xml files from ClinicalTrials.gov.
    Returns a DataFrame, clinical_trial_df, containing basic information
    about each clinical trial as a row.
    """
    # List the categories to scrape and use them as the DataFrame's columns.
    trial_data_categories = ["NCT_ID", "Acronym", "Brief_Title", "Phase",
                             "Agency", "URL", "Overall_Status",
                             "Start_Date", "Completion_Date", "Enrollment",
                             "Number_of_Arms"]
    clinical_trial_df = pd.DataFrame(columns=trial_data_categories)
    # List all .xml files in the folder, then parse each file in turn.
    files = [file for file in os.listdir(folder_path) if file.endswith(".xml")]
    for file in files:
        soup = clinical_trial_xml_reader(os.path.join(folder_path, file))
        clinical_trial_row = parse_clinical_trial_xml(soup, trial_data_categories)
        # The number of treatment arms is variable and unknown before
        # scraping, so each iteration checks whether clinical_trial_df has
        # the same column heads as clinical_trial_row's index; any missing
        # column heads are added to clinical_trial_df.
        if len(clinical_trial_df.columns) < len(clinical_trial_row):
            for n in range(len(clinical_trial_df.columns), len(clinical_trial_row)):
                clinical_trial_df[clinical_trial_row.index[n]] = np.nan
        # DataFrame.append was removed in pandas 2.0, so concatenate the
        # row as a one-row frame instead.
        clinical_trial_df = pd.concat([clinical_trial_df,
                                       clinical_trial_row.to_frame().T],
                                      ignore_index=True)
    clinical_trial_df.Start_Date = pd.to_datetime(clinical_trial_df.Start_Date)
    clinical_trial_df.Completion_Date = pd.to_datetime(clinical_trial_df.Completion_Date)
    return clinical_trial_df

def parse_adverse_events(soup):
    """Parse all reported adverse events, the number of participants
    affected, and the total number at risk.

    Takes a clinical trial soup as the only argument.
    Returns a DataFrame with the adverse events as the index and the
    treatment-arm group_ids as column names. Values are the percentage of
    affected participants out of the total per treatment arm.
    """
    # Groupings for treatment arms and adverse events are nested under
    # <reported_events>. Build a dictionary mapping each group_id to its
    # treatment-arm description.
    group_id_dict = {group.get("group_id"): group.description.get_text()
                     for group in soup.reported_events.find_all("group")}
    # Serious and non-serious adverse events are nested under
    # <default_assessment>; collect the element that follows each one.
    adverse_events = [assessment.find_next()
                      for assessment in soup.find_all("default_assessment")]
    counts = []
    sub_titles = []
    group_id = []
    # Iterate over the adverse-event sections (there should be only two:
    # serious and non-serious).
    for section in adverse_events:
        # Each <event> has a <sub_title> and one <counts> per treatment arm.
        for event in section.find_all("event"):
            counts_per_event = event.find_all("counts")
            # Record <subjects_affected> and <subjects_at_risk> per count.
            for count in counts_per_event:
                counts.append([count.get("subjects_affected"),
                               count.get("subjects_at_risk")])
            # Record the <sub_title> text of each event.
            for sub_title in event.find_all("sub_title"):
                sub_titles.append(sub_title.get_text())
            # Record the <group_id> values that distinguish treatment arms.
            group_id = [count.get("group_id") for count in counts_per_event]
    # counts holds <subjects_affected> and <subjects_at_risk> for each
    # <group_id> per event. Convert each pair into a percentage.
    counts_percent = [round(int(affected) / int(at_risk) * 100, 2)
                      for affected, at_risk in counts]
    # Split the counts_percent values into sub-lists by group_id. The counts
    # are interleaved by arm, so stride by the number of distinct group_ids.
    n_groups = len(set(group_id))
    counts_percent_groups = [counts_percent[n::n_groups] for n in range(n_groups)]
    # Build the DataFrame, one column per treatment arm. group_id_dict
    # (built above) can map these ids to arm descriptions if needed.
    adverse_events_df = pd.DataFrame()
    for n, column in enumerate(counts_percent_groups):
        adverse_events_df[group_id[n]] = column
    adverse_events_df.index = sub_titles
    # Note that the non-serious adverse events are appended after the
    # serious adverse events. If the index is sorted differently, it will
    # be difficult to separate them again.
    return adverse_events_df
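
# A minimal sketch of one way to keep the two event classes separable,
# assuming the legacy ClinicalTrials.gov tags <serious_events> and
# <other_events> (an assumption to verify against your own XML files):
def list_adverse_event_sections(soup):
    """Return the <sub_title> texts of each event section by class."""
    sections = {}
    for key, tag in (("serious", "serious_events"), ("other", "other_events")):
        section = soup.find(tag)
        sections[key] = ([t.get_text() for t in section.find_all("sub_title")]
                         if section else [])
    return sections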

# Running the code
path = "Trials/"
clinical_trial_df = clinical_trial_scrape(path)
# clinical_trial_df.to_csv(path + "Trials.csv", index=False)
xml_soup = clinical_trial_xml_reader(path + "NCT00153920.xml")
adverse_events_df = parse_adverse_events(xml_soup)
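# The adverse-events table can be exported the same way (left commented,
# like the trial-level export above):
# adverse_events_df.to_csv(path + "NCT00153920_adverse_events.csv")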