Clinical_Trial_Scrape
import os

import numpy as np
import pandas as pd
import bs4 as bs


def clinical_trial_xml_reader(file):
    """Use BeautifulSoup to open and parse an XML file from a clinical trial.

    The path and file name together are the only argument.
    Returns the parsed soup.
    """
    # html.parser lowercases tag names, which matches the lowercase tag
    # searches used throughout this script.
    with open(file, "r") as xml_file:
        xml_soup = bs.BeautifulSoup(xml_file, "html.parser")
    return xml_soup
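
# Example usage (hypothetical file name; any ClinicalTrials.gov study-record
# XML export should work):
# soup = clinical_trial_xml_reader("Trials/NCT00000000.xml")
# print(soup.find("brief_title").get_text())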

def parse_clinical_trial_xml(soup, trial_data_categories):
    """Parse a multiple myeloma clinical trial from its XML soup.

    Scrapes several fields of interest that describe the study generally.
    Takes as arguments the soup from clinical_trial_xml_reader() and a list
    of labels used as the Series index.
    Returns a Series, clinical_trial_row, that can be appended as a row to
    a DataFrame.
    """

    def get_tag_text(soup, tag):
        """Return the text of the first matching tag, or NaN if absent.

        Takes a soup and the tag name (as a string) as arguments.
        """
        try:
            return soup.find(tag).get_text()
        except AttributeError:
            return np.nan

    nct_id = get_tag_text(soup, "nct_id")
    acronym = get_tag_text(soup, "acronym")
    brief_title = get_tag_text(soup, "brief_title")
    phase = get_tag_text(soup, "phase")
    agency = get_tag_text(soup, "agency")
    url = get_tag_text(soup, "url")
    overall_status = get_tag_text(soup, "overall_status")
    start_date = get_tag_text(soup, "start_date")
    completion_date = get_tag_text(soup, "completion_date")
    enrollment = get_tag_text(soup, "enrollment")
    number_of_arms = get_tag_text(soup, "number_of_arms")
    # Create a Series from the scraped fields; it becomes one DataFrame row.
    row = pd.Series([nct_id, acronym, brief_title, phase, agency, url,
                     overall_status, start_date, completion_date,
                     enrollment, number_of_arms],
                    index=trial_data_categories)
    # Since the number of treatment arms is variable, append these values
    # separately. First build a dictionary numbering each <arm_group>.
    arm_groups = soup.find_all("arm_group")
    arm_labels = [arm.arm_group_label.get_text() for arm in arm_groups]
    arm_dict = {"Arm_" + str(n + 1): label
                for n, label in enumerate(arm_labels)}
    arm_series = pd.Series(arm_dict)
    clinical_trial_row = pd.concat([row, arm_series])
    return clinical_trial_row
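
# The returned Series carries the base categories plus one "Arm_<n>" entry
# per <arm_group> found, e.g. (sketch, hypothetical trial):
# row = parse_clinical_trial_xml(soup, trial_data_categories)
# row["Arm_1"]  # -> label of the first treatment arm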

def clinical_trial_scrape(folder_path):
    """Scrape every .xml file in a folder.

    Takes the folder's path as the only argument; the folder should contain
    .xml files from ClinicalTrials.gov.
    Returns a DataFrame, clinical_trial_df, containing basic information
    about each clinical trial as a row.
    """
    # List the categories to scrape and use them as the DataFrame's columns.
    trial_data_categories = ["NCT_ID", "Acronym", "Brief_Title", "Phase",
                             "Agency", "URL", "Overall_Status",
                             "Start_Date", "Completion_Date", "Enrollment",
                             "Number_of_Arms"]
    clinical_trial_df = pd.DataFrame(columns=trial_data_categories)
    # List all .xml files in the folder, then parse each file in turn.
    files = [file for file in os.listdir(folder_path) if file.endswith(".xml")]
    for file in files:
        soup = clinical_trial_xml_reader(os.path.join(folder_path, file))
        clinical_trial_row = parse_clinical_trial_xml(soup, trial_data_categories)
        # The number of treatment arms is variable and unknown before
        # scraping, so each iteration checks whether clinical_trial_df has
        # the same column heads as clinical_trial_row's index; any missing
        # column heads are added to clinical_trial_df.
        if len(clinical_trial_df.columns) < len(clinical_trial_row):
            for n in range(len(clinical_trial_df.columns), len(clinical_trial_row)):
                clinical_trial_df[clinical_trial_row.index[n]] = np.nan
        # DataFrame.append was removed in pandas 2.0, so concatenate the
        # row as a one-row frame instead.
        clinical_trial_df = pd.concat([clinical_trial_df,
                                       clinical_trial_row.to_frame().T],
                                      ignore_index=True)
    clinical_trial_df.Start_Date = pd.to_datetime(clinical_trial_df.Start_Date)
    clinical_trial_df.Completion_Date = pd.to_datetime(clinical_trial_df.Completion_Date)
    return clinical_trial_df

def parse_adverse_events(soup):
    """Parse all reported adverse events, the number of participants
    affected, and the total number at risk.

    Takes a clinical trial soup as the only argument.
    Returns a DataFrame with the adverse events as the index and the
    treatment-arm group_ids as column names. Values are the percentage of
    affected participants out of the total per treatment arm.
    """
    # Groupings for treatment arms and adverse events are nested under
    # <reported_events>. Build a dictionary mapping each group_id to its
    # treatment-arm description.
    group_id_dict = {group.get("group_id"): group.description.get_text()
                     for group in soup.reported_events.find_all("group")}
    # Serious and non-serious adverse events are nested under
    # <default_assessment>; collect the element that follows each one.
    adverse_events = [assessment.find_next()
                      for assessment in soup.find_all("default_assessment")]
    counts = []
    sub_titles = []
    group_id = []
    # Iterate over the adverse-event sections (there should be only two:
    # serious and non-serious).
    for section in adverse_events:
        # Each <event> has a <sub_title> and one <counts> per treatment arm.
        for event in section.find_all("event"):
            counts_per_event = event.find_all("counts")
            # Record <subjects_affected> and <subjects_at_risk> per count.
            for count in counts_per_event:
                counts.append([count.get("subjects_affected"),
                               count.get("subjects_at_risk")])
            # Record the <sub_title> text of each event.
            for sub_title in event.find_all("sub_title"):
                sub_titles.append(sub_title.get_text())
            # Record the <group_id> values that distinguish treatment arms.
            group_id = [count.get("group_id") for count in counts_per_event]
    # counts holds <subjects_affected> and <subjects_at_risk> for each
    # <group_id> per event. Convert each pair into a percentage.
    counts_percent = [round(int(affected) / int(at_risk) * 100, 2)
                      for affected, at_risk in counts]
    # Split the counts_percent values into sub-lists by group_id. The counts
    # are interleaved by arm, so stride by the number of distinct group_ids.
    n_groups = len(set(group_id))
    counts_percent_groups = [counts_percent[n::n_groups] for n in range(n_groups)]
    # Build the DataFrame, one column per treatment arm. group_id_dict
    # (built above) can map these ids to arm descriptions if needed.
    adverse_events_df = pd.DataFrame()
    for n, column in enumerate(counts_percent_groups):
        adverse_events_df[group_id[n]] = column
    adverse_events_df.index = sub_titles
    # Note that the non-serious adverse events are appended after the
    # serious adverse events. If the index is sorted differently, it will
    # be difficult to separate them again.
    return adverse_events_df
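
# A minimal sketch of one way to keep the two event classes separable,
# assuming the legacy ClinicalTrials.gov tags <serious_events> and
# <other_events> (an assumption to verify against your own XML files):
def list_adverse_event_sections(soup):
    """Return the <sub_title> texts of each event section by class."""
    sections = {}
    for key, tag in (("serious", "serious_events"), ("other", "other_events")):
        section = soup.find(tag)
        sections[key] = ([t.get_text() for t in section.find_all("sub_title")]
                         if section else [])
    return sections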

# Running the code
path = "Trials/"
clinical_trial_df = clinical_trial_scrape(path)
# clinical_trial_df.to_csv(path + "Trials.csv", index=False)
xml_soup = clinical_trial_xml_reader(path + "NCT00153920.xml")
adverse_events_df = parse_adverse_events(xml_soup)
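# The adverse-events table can be exported the same way (left commented,
# like the trial-level export above):
# adverse_events_df.to_csv(path + "NCT00153920_adverse_events.csv")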