# ShenZhouHong/file-download.py

## file-download.py
import os # To handle directory paths in a platform independent way
import requests.exceptions # To handle exceptions on download
import xml.etree.ElementTree as ET # To get additional information from request error

# Short view code for each radiograph upload field. The code becomes part of
# the filename under which the downloaded image is saved.
column_to_view: dict[str, str] = {
    # Anteroposterior view
    "cfu_xrayupload": "antpr",
    # Lateral view
    "cfu_xrayuploadlat": "later",
    # Oblique view
    "cfu_xrayuploadobl": "obliq",
    # Axial view
    "cfu_xrayuploadaxi": "axial",
}

# For every (study_id, event_id) row of df_labels and every radiograph upload
# field that holds a value, download the uploaded file through PyCap and save
# it to disk as dataset/{study_id}-{event_id}-{view}{extension}.
index: tuple[int, str]  # indexed below as (study_id, event_id)
label: pd.Series

column: str
value_is_null: bool

# Create the output directory once, up front, so that the first write cannot
# fail merely because the directory does not exist yet.
dataset_directory: str = "dataset"
os.makedirs(dataset_directory, exist_ok=True)

# NOTE(review): the comment that used to sit here said that iloc 46, 47 and 48
# are corrupt and that iteration therefore starts at iloc 49, yet the slice
# starts at position 45. The start position is left unchanged -- confirm which
# of the two is intended.
for index, label in df_labels.iloc[45:].iterrows():
    # Since PAIN is a longitudinal study with multiple indices, radiographs
    # are downloaded for every record_id as well as for every event.
    study_id: int = index[0]
    event_id: str = index[1]

    # Determine which file upload fields are null; those are ignored.
    is_null: pd.Series = pd.isnull(label[radiograph_columns])

    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0.
    for column, value_is_null in is_null.items():
        if value_is_null:
            continue

        # Try to download the file using PyCap's Project.export_file method.
        try:
            downloaded_file: tuple[bytes, dict[str, str]] = project.export_file(
                record=str(study_id),
                field=column,
                event=event_id,
            )
        except requests.exceptions.RequestException as e:
            # E.g. the file does not exist on the REDCap server. The message
            # is presumably the repr of a bytes object (b'...') wrapping an
            # XML document with an <error> element; strip the b'...' wrapper
            # only when it is actually present.
            exception_string: str = str(e)
            if exception_string[:2] in ("b'", 'b"'):
                exception_string = exception_string[2:-1]

            # Fall back to the raw message whenever it is not the expected
            # XML, so that reporting one failure can never abort the run.
            error: str = exception_string
            try:
                error_node = ET.fromstring(exception_string).find("error")
            except ET.ParseError:
                error_node = None
            if error_node is not None and error_node.text:
                error = error_node.text

            print(f"RequestException for: study_id: {str(study_id)}, event {event_id}")
            print(error)

            # Nothing was downloaded for this field. Skip it instead of
            # falling through and writing an undefined (or stale, from a
            # previous iteration) download under this field's filename.
            continue

        # Build the new filename as {study_id}-{event_id}-{view}{extension},
        # e.g.: 1001-<event>-later.jpg
        orig_filename: str = downloaded_file[1]["name"]
        # splitext yields ".jpg" (leading dot included), or "" when the
        # uploaded name has no extension at all.
        orig_extension: str = os.path.splitext(orig_filename)[1]
        # Unmapped columns keep their own name as the view code, so the
        # filename never contains the literal text "None".
        view: str = column_to_view.get(column, column)

        new_filename: str = f"{study_id}-{event_id}-{view}{orig_extension}"

        # downloaded_file is a tuple; only its first item is the binary
        # content of the image.
        with open(os.path.join(dataset_directory, new_filename), "wb") as binary_file:
            binary_file.write(downloaded_file[0])

        # Print filename on successful download
        print(new_filename)
	# NOTE(review): this section repeats the download script found earlier in
# this file; it had lost its indentation, so the nesting below is reconstructed
# from that earlier copy. Running both copies performs every download twice --
# confirm whether this second copy should be kept.
import os # To handle directory paths in a platform independent way
import requests.exceptions # To handle exceptions on download
import xml.etree.ElementTree as ET # To get additional information from request error

# Short view code for each radiograph upload field. The code becomes part of
# the filename under which the downloaded image is saved.
column_to_view: dict[str, str] = {
    "cfu_xrayupload"   : "antpr", # Anteroposterior
    "cfu_xrayuploadlat": "later", # Lateral
    "cfu_xrayuploadobl": "obliq", # Oblique
    "cfu_xrayuploadaxi": "axial", # Axial
}

# For every (study_id, event_id) row of df_labels and every radiograph upload
# field that holds a value, download the uploaded file through PyCap and save
# it to disk as dataset/{study_id}-{event_id}-{view}{extension}.
index: tuple[int, str]  # indexed below as (study_id, event_id)
label: pd.Series

column: str
value_is_null: bool

# Create the output directory once, up front, so that the first write cannot
# fail merely because the directory does not exist yet.
dataset_directory: str = "dataset"
os.makedirs(dataset_directory, exist_ok=True)

# NOTE(review): the comment that used to sit here said that iloc 46, 47 and 48
# are corrupt and that iteration therefore starts at iloc 49, yet the slice
# starts at position 45. The start position is left unchanged -- confirm which
# of the two is intended.
for index, label in df_labels.iloc[45:].iterrows():
    # Since PAIN is a longitudinal study with multiple indices, radiographs
    # are downloaded for every record_id as well as for every event.
    study_id: int = index[0]
    event_id: str = index[1]

    # Determine which file upload fields are null; those are ignored.
    is_null: pd.Series = pd.isnull(label[radiograph_columns])

    # Series.items() is used because Series.iteritems() was removed in
    # pandas 2.0.
    for column, value_is_null in is_null.items():
        if value_is_null:
            continue

        # Try to download the file using PyCap's Project.export_file method.
        try:
            downloaded_file: tuple[bytes, dict[str, str]] = project.export_file(
                record=str(study_id),
                field=column,
                event=event_id,
            )
        except requests.exceptions.RequestException as e:
            # E.g. the file does not exist on the REDCap server. The message
            # is presumably the repr of a bytes object (b'...') wrapping an
            # XML document with an <error> element; strip the b'...' wrapper
            # only when it is actually present.
            exception_string: str = str(e)
            if exception_string[:2] in ("b'", 'b"'):
                exception_string = exception_string[2:-1]

            # Fall back to the raw message whenever it is not the expected
            # XML, so that reporting one failure can never abort the run.
            error: str = exception_string
            try:
                error_node = ET.fromstring(exception_string).find("error")
            except ET.ParseError:
                error_node = None
            if error_node is not None and error_node.text:
                error = error_node.text

            print(f"RequestException for: study_id: {str(study_id)}, event {event_id}")
            print(error)

            # Nothing was downloaded for this field. Skip it instead of
            # falling through and writing an undefined (or stale, from a
            # previous iteration) download under this field's filename.
            continue

        # Build the new filename as {study_id}-{event_id}-{view}{extension},
        # e.g.: 1001-<event>-later.jpg
        orig_filename: str = downloaded_file[1]["name"]
        # splitext yields ".jpg" (leading dot included), or "" when the
        # uploaded name has no extension at all.
        orig_extension: str = os.path.splitext(orig_filename)[1]
        # Unmapped columns keep their own name as the view code, so the
        # filename never contains the literal text "None".
        view: str = column_to_view.get(column, column)

        new_filename: str = f"{study_id}-{event_id}-{view}{orig_extension}"

        # downloaded_file is a tuple; only its first item is the binary
        # content of the image.
        with open(os.path.join(dataset_directory, new_filename), "wb") as binary_file:
            binary_file.write(downloaded_file[0])

        # Print filename on successful download
        print(new_filename)