Skip to content

Instantly share code, notes, and snippets.

@DonRichards
Last active September 12, 2023 18:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DonRichards/34c7092a620ec2d74ad3207aed417f48 to your computer and use it in GitHub Desktop.
Save DonRichards/34c7092a620ec2d74ad3207aed417f48 to your computer and use it in GitHub Desktop.
Use IDC's Metadata export CSV file to find and pull the Original File's URL path into the CSV.
#!/usr/bin/env python3
# Drupal account that main() types into the login form.
# NOTE(review): these look like placeholder credentials — fill them in locally
# and avoid committing real values.
USERNAME = 'admin'
PASSWORD = 'PASSWORD'
import asyncio
import os
from pyppeteer import launch
import csv
# Login page and site root of the Drupal instance.
# NOTE(review): both names are assigned again further down this file with the
# stage.digital.library.jhu.edu host; the later assignment replaces these values.
DRUPAL_LOGIN_URL = 'https://stage.library.jhu.edu/user/login'
BASE_URL = 'https://stage.library.jhu.edu'
def load_input_csv(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dicts.

    The first line of the file supplies the column names; every following
    record becomes one dict keyed by those names.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row (empty when the file has only a
        header or no content).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded inside quoted fields are kept exactly as they were written.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        return list(csv.DictReader(file))
def mark_as_completed(url):
    """Append *url* as a new line to ``completed.txt`` in the working directory."""
    entry = url + "\n"
    with open("completed.txt", "a") as progress_log:
        progress_log.writelines((entry,))
def is_completed(url):
    """Return True if *url* is already listed in ``completed.txt``.

    Args:
        url: The URL to look for; it must match a whole line of the file.

    Returns:
        True when a line of ``completed.txt`` equals ``url``, False when no
        line matches or the file does not exist yet.
    """
    try:
        with open("completed.txt", "r") as f:
            # Compare without the line terminator so that a final line lacking
            # a trailing newline (e.g. after a manual edit) is still recognised.
            return any(line.rstrip("\n") == url for line in f)
    except FileNotFoundError:
        # No progress file yet means nothing has been processed.
        return False
# NOTE(review): this first copy of main() is cut off. It stops at the
# page.goto(edit_link_href) call below with its try block never closed, and
# the file then starts over with the import lines. A complete main() is
# defined later in this file; this fragment looks like a leftover partial
# paste and should probably be removed — confirm with the author.
async def main():
browser = await launch(headless=True, args=['--no-sandbox'])
print("Browser launched...")
page = await browser.newPage()
await page.setViewport({'width': 1280, 'height': 800})
await page.goto(DRUPAL_LOGIN_URL)
print(f"Opened tab for {DRUPAL_LOGIN_URL}...")
await page.type('input[name=name]', USERNAME)
await page.type('input[name=pass]', PASSWORD)
await page.click('input[value="Log in"]')
await page.waitForSelector('#toolbar-link-workbench-content')
input_data = load_input_csv('input.csv')
total_urls = len(input_data)
if not os.path.isfile('output.csv'):
with open('output.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=input_data[0].keys())
writer.writeheader()
MAX_RETRIES = 3
for row_data in input_data:
url = row_data['url'].replace("https://digital.library.jhu.edu", "https://stage.digital.library.jhu.edu")
total_urls -= 1
if is_completed(url):
print(f"{url} has already been processed. Skipping...")
continue
for attempt in range(MAX_RETRIES):
try:
media_url = f"{url}/media"
await page.goto(media_url)
rows_length = len(await page.querySelectorAll("table.views-table tbody tr"))
found_intermediate_file = False
# First loop to check for any Intermediate Files
for i in range(rows_length):
row = await page.querySelector(f"table.views-table tbody tr:nth-child({i+1})")
if not row:
continue
media_use = await row.querySelector("td.views-field.views-field-field-media-use")
if not media_use:
continue
media_use_text = await page.evaluate('(element) => element.textContent', media_use)
if "Intermediate File" in media_use_text:
found_intermediate_file = True
break # Once we know there's at least one Intermediate File, break out
# Second loop based on whether we found an Intermediate File or not
file_urls = []
for i in range(rows_length):
row = await page.querySelector(f"table.views-table tbody tr:nth-child({i+1})")
if not row:
continue
media_use = await row.querySelector("td.views-field.views-field-field-media-use")
if not media_use:
continue
media_use_text = await page.evaluate('(element) => element.textContent', media_use)
if found_intermediate_file and "Intermediate File" in media_use_text:
print("Found intermediate file. Getting its URL...")
edit_link = await row.querySelector("li.edit.dropbutton-action a")
if edit_link:
edit_link_href = await page.evaluate('(element) => element.href', edit_link)
# The fragment ends with the next call; nothing after it was kept.
await page.goto(edit_link_href)
import asyncio
import os
from pyppeteer import launch
import csv
# Login page of the staging Drupal site; main() opens it first to sign in.
DRUPAL_LOGIN_URL = 'https://stage.digital.library.jhu.edu/user/login'
# Root of the staging site.
# NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
BASE_URL = 'https://stage.digital.library.jhu.edu'
def load_input_csv(filename):
    """Read a UTF-8 CSV file and return its rows as a list of dicts.

    The first line of the file supplies the column names; every following
    record becomes one dict keyed by those names.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row (empty when the file has only a
        header or no content).

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded inside quoted fields are kept exactly as they were written.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        return list(csv.DictReader(file))
def mark_as_completed(url):
    """Append *url* as a new line to ``completed.txt`` in the working directory."""
    entry = url + "\n"
    with open("completed.txt", "a") as progress_log:
        progress_log.writelines((entry,))
def is_completed(url):
    """Return True if *url* is already listed in ``completed.txt``.

    Args:
        url: The URL to look for; it must match a whole line of the file.

    Returns:
        True when a line of ``completed.txt`` equals ``url``, False when no
        line matches or the file does not exist yet.
    """
    try:
        with open("completed.txt", "r") as f:
            # Compare without the line terminator so that a final line lacking
            # a trailing newline (e.g. after a manual edit) is still recognised.
            return any(line.rstrip("\n") == url for line in f)
    except FileNotFoundError:
        # No progress file yet means nothing has been processed.
        return False
async def _read_media_row(page, position):
    """Fetch row *position* (1-based) of the media table and its "media use" text.

    Returns:
        ``(row_handle, media_use_text)``, or ``(None, None)`` when the row or
        its media-use cell is not present on the current page.
    """
    row = await page.querySelector(f"table.views-table tbody tr:nth-child({position})")
    if not row:
        return None, None
    media_use = await row.querySelector("td.views-field.views-field-field-media-use")
    if not media_use:
        return None, None
    media_use_text = await page.evaluate('(element) => element.textContent', media_use)
    return row, media_use_text


async def _collect_file_url(page, row, media_url):
    """Open the edit form linked from *row*, read the file link, and go back.

    Returns:
        The href of the first matching file link, or ``None`` when the row has
        no edit link, the form has no "Remove" button, or no file link is found.
    """
    edit_link = await row.querySelector("li.edit.dropbutton-action a")
    if not edit_link:
        return None
    edit_link_href = await page.evaluate('(element) => element.href', edit_link)
    # Set timeout to 1000000 ms (1000 seconds) to allow for jp2 problems.
    print(f"Opening {edit_link_href}...")
    await page.goto(edit_link_href, timeout=1000000)
    await page.waitForSelector('#edit-submit')

    file_url = None
    remove_btn = await page.querySelector('input[type="submit"][value="Remove"]')
    # Without a "Remove" button there is nothing to query; calling .Jx on None
    # would raise AttributeError and trigger pointless retries.
    if remove_btn:
        # NOTE(review): the expression starts with '//', which in XPath selects
        # from the document root rather than relative to the button — confirm
        # that matching the first file link on the page is what is intended.
        file_url_elements = await remove_btn.Jx("//span[contains(@class, 'file')]//a[@href]")
        if file_url_elements:
            file_url = await page.evaluate('(element) => element.href', file_url_elements[0])

    # Always return to the media listing so the caller can look up the next row.
    await page.goto(media_url)
    return file_url


async def main():
    """Log in to Drupal and record each item's media file URL in ``output.csv``.

    For every row of ``input.csv`` the row's ``url`` is rewritten from the
    production host to the staging host and its ``/media`` page is opened.
    If any media row is marked "Intermediate File", the file URLs of those
    rows are collected; otherwise the file URLs of the "Original File" rows
    are collected. The URLs are joined with ``|`` into a ``file`` column and
    the row is appended to ``output.csv``.

    URLs already listed in ``completed.txt`` are skipped, which lets an
    interrupted run be resumed. Each URL is attempted up to ``MAX_RETRIES``
    times with a 10-second pause between attempts; a URL that still fails is
    written with whatever was collected and is not marked as completed.
    """
    MAX_RETRIES = 3

    input_data = load_input_csv('input.csv')
    if not input_data:
        # input_data[0] below would raise IndexError on an empty file.
        print("input.csv contains no rows. Nothing to do.")
        return
    total_urls = len(input_data)

    if not os.path.isfile('output.csv'):
        # Every row is written with a trailing 'file' column, so the header
        # must carry it as well or the columns will not line up.
        header = list(input_data[0].keys())
        if 'file' not in header:
            header.append('file')
        with open('output.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()

    browser = await launch(headless=True, args=['--no-sandbox'])
    try:
        print("Browser launched...")
        page = await browser.newPage()
        await page.setViewport({'width': 1280, 'height': 800})
        await page.goto(DRUPAL_LOGIN_URL)
        print(f"Opened tab for {DRUPAL_LOGIN_URL}...")
        await page.type('input[name=name]', USERNAME)
        await page.type('input[name=pass]', PASSWORD)
        await page.click('input[value="Log in"]')
        # The admin toolbar link only appears once the login has succeeded.
        await page.waitForSelector('#toolbar-link-workbench-content')

        for row_data in input_data:
            url = row_data['url'].replace("https://digital.library.jhu.edu", "https://stage.digital.library.jhu.edu")
            total_urls -= 1
            if is_completed(url):
                print(f"{url} has already been processed. Skipping...")
                continue
            print(f"Processing {url}...{total_urls} left to complete.")

            # Bound before the retry loop so a failure on the very first
            # statement of every attempt cannot leave it undefined or holding
            # the previous row's URLs.
            file_urls = []
            for attempt in range(MAX_RETRIES):
                try:
                    file_urls = []
                    media_url = f"{url}/media"
                    await page.goto(media_url)
                    rows_length = len(await page.querySelectorAll("table.views-table tbody tr"))

                    # First pass: is there at least one Intermediate File row?
                    found_intermediate_file = False
                    for i in range(rows_length):
                        _, media_use_text = await _read_media_row(page, i + 1)
                        if media_use_text is not None and "Intermediate File" in media_use_text:
                            found_intermediate_file = True
                            break

                    if found_intermediate_file:
                        wanted = "Intermediate File"
                        found_message = "Found intermediate file. Getting its URL..."
                    else:
                        wanted = "Original File"
                        found_message = "Found original file (and no intermediate file)."

                    # Second pass: rows are looked up again by position on
                    # every iteration because visiting an edit form navigates
                    # away from the listing and invalidates earlier handles.
                    for i in range(rows_length):
                        row, media_use_text = await _read_media_row(page, i + 1)
                        if row is None or wanted not in media_use_text:
                            continue
                        print(found_message)
                        file_url = await _collect_file_url(page, row, media_url)
                        if file_url:
                            file_urls.append(file_url)

                    mark_as_completed(url)
                    break
                except Exception as e:
                    # Broad on purpose: any browser/navigation failure should
                    # lead to a retry rather than abort the whole run.
                    print(f"Error processing {url}. Error message: {e}.")
                    if attempt < MAX_RETRIES - 1:
                        await asyncio.sleep(10)
                    else:
                        print(f"Failed processing {url} after {MAX_RETRIES} attempts. Skipping...")

            row_data['file'] = '|'.join(file_urls)
            with open('output.csv', 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=row_data.keys())
                writer.writerow(row_data)
    finally:
        # Close the browser even when an unexpected error escapes the loop.
        await browser.close()
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop; calling get_event_loop() with no running loop is
    # deprecated in current Python versions.
    asyncio.run(main())

grab.py retrieves the file URL of each item's "Intermediate File" media (or, when the item has none, of its "Original File" media) and appends it to a CSV file.

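The script can resume after an interruption or error: every URL it finishes is recorded in completed.txt and skipped on the next run. As a precaution, it rewrites production URLs (https://digital.library.jhu.edu) to the staging server (https://stage.digital.library.jhu.edu) before visiting them.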

Additionally, the script handles errors such as timeouts with a try/except-and-retry approach: each URL is attempted up to three times, with a ten-second pause between attempts, before it is skipped.

Setup to run

python3 -m venv .venv
source .venv/bin/activate
python3 -m pip install selenium chromedriver_autoinstaller
python3 -m pip install pyppeteer

# Maybe?!
# sudo apt-get install chromium-browser

To run

Place an input.csv file next to grab.py.

Example of the input.csv CSV file

The input CSV needs a column named "url" and must not contain a "file" column.

id,url,uuid,title,field_collection_number,field_date_available,field_date_published,field_finding_aid,field_item_barcode,field_library_catalog_link,field_oclc_number,field_unique_id,field_abstract,field_access_rights,field_access_terms,field_contributor,field_copyright_and_use,field_description,field_digital_publisher,field_genre,field_model,field_publisher,field_resource_type,field_subject,field_alternative_title,field_featured_item,field_dspace_item_id,field_digital_identifier,field_creator,field_publisher_country,field_spatial_coverage,field_date_copyrighted
1,https://digital.library.jhu.edu/node/11943,d3ba594a-a226-4bab-9185-72fe54cd7328,Is X-ray harmful?,COLL-0008,2021-06-24,1957-01-20,https://aspace.library.jhu.edu/repositories/3/resources/1109,mq2415107mmmmm,https://catalyst.library.jhu.edu/catalog/bib_2415107,54860034,83aeefed-fbc8-4772-b2a8-a9a3865ab6e2,"Lynn Poole discusses x-rays for treatment and diagnosis of disease and displays a recent report from the National Academy or Sciences and National Research Council on the biological effects of radiation. Dr. Russell Morgan, Director of Radiology Dept. at Johns Hopkins University, fields questions from members of the press: Nate Hazeltine, a ""Washington Post"" science writer; Pare Lorentz, a film producer; and Earl Ubell, a reporter and science editor with the ""New York Herald Tribune"". Dr. Morgan explains that x-rays affect both individual cells and the whole body, making them more susceptible to premature aging. He discusses the research by John Lawrence on the effects of radiation on mice and their extrapolation to man. He also notes a study on radiation vs. non-radiation workers that showed no difference in life spans of the two groups. It is the amount of radiation exposure that determines the effects of the damage. For example, a chest x-ray only delivers about 1/20th roentgen, a unit of radiation. However, Dr. Morgan discusses the feasibility of a reporting system for patients' total x-ray exposure and the need for a set of standards. And he does admit that the complexity and amount of radiation exposure is increasing in diagnostic studies and could double by 1960-65. A film clip demonstrates that this radiation exposure can be reduced by filtration, distance from the x-ray machine, length of time of exposure, and protection of areas not being radiated. Mr. Poole points out that Dr. Morgan has developed a fluoroscopy machine reducing by up to ten times the radiation time. In conclusion, Dr. Morgan discusses whether the Atomic Energy Commission or the U. 
S. Public Health Services should be responsible for the public's radiation health problems.",Public digital access,Science Review,"relators:brd:ABC Television Network||relators:drt:Calfee, Kennard||relators:nrt:Chaseman, Joel||relators:prd:Hazeltine, Nate||relators:prd:Lorentz, Pare||relators:prd:Morgan, Russell H. (Russell Hedley), 1911-1986||relators:prd:Poole, Lynn||relators:prd:Ubell, Earl||relators:pro:Geier, Leo, 1926-2017||relators:pro:Poole, Lynn||relators:pro:WAAM (Television station : Baltimore, Md.)||relators:aus:Comte, Gilbert",Copyright Not Evaluated,"Originally broadcast as a segment of the television program Johns Hopkins File 7 on January 20, 1957 from the studios of WAAM in Baltimore, Md. Black and white. Lynn Poole, Leo Geier, producers; Kennard Calfee, director; Gilbert Comte, writer; Joel Chaseman, narrator; produced by WAAM television station in Baltimore, Md. for the ABC Television Network. Lynn Poole, Russell H. Morgan, Nate Hazeltine, Pare Lorentz, Earl Ubell, presenters. Digitized in 2004.",Johns Hopkins University. Sheridan Libraries,Educational television programs,Video,Johns Hopkins University,Moving Image,X-rays--Physiological effect||X-rays,,,,,,,,,
2,https://digital.library.jhu.edu/node/11944,9d4a0ed0-3b08-450f-ad7a-32ab30dc104a,Seeing in the dark,COLL-0008,2021-06-24,1957-01-13,https://aspace.library.jhu.edu/repositories/3/resources/1109,mq2415085mmmmm,https://catalyst.library.jhu.edu/catalog/bib_2415085,54859985,793edfd5-d2f7-4604-8742-bc560016cb1f,"Lynn Poole tells how the tenth century Islamic scholar Alhazan described the workings of the camera obscura. Later, Frenchman Niepce discovered an emulsion that could retain a photographic image. Dr. Walter Driscoll, director of research at Baird-Atomic Inc., then shows a chart of the electromagnetic spectrum and notes that while x-rays yield only shadowy pictures and radar waves detect but don't create pictures, germanium and silicon filters block radiated energy and allow infrared light to pass through to form an image. Dr. Driscoll displays a scanning bolometer, which can see in the dark, but the shapes it creates need to be interpreted. He also shows a snooperscope and a film clip of a sniperscope with infrared scope. Previous research on infrared or thermal detection was done by Sir John Frederick William Herschel. Potter Trainer demonstrates and explains the Evaporagraph (EVA), which is based on the principle that all things radiate heat as infrared rays, and shows some of the actual pictures made from heat rather than light. Dr. Walter Baird describes applications of EVA to industry, such as detecting problem-causing hot spots in electronic equipment or indicating heat escape or insulation deficiency in a building. EVA's resolution is 10 lines/mm at best, and it shows temperature contrast of .2 degree. The machine's weakness is the slow speed of response to small temperature differences and the inability to obtain the temperature scale of the item viewed. Nonetheless, Mr. 
Poole says EVA could play a vital role in civil defense and medicine.",Public digital access,Science Review,"relators:brd:ABC Television Network||relators:drt:Calfee, Kennard||relators:nrt:Chaseman, Joel||relators:prd:Baird, Walter S.||relators:prd:Driscoll, Walter G.||relators:prd:Poole, Lynn||relators:prd:Trainer, Potter.||relators:pro:Geier, Leo, 1926-2017||relators:pro:Poole, Lynn||relators:pro:WAAM (Television station : Baltimore, Md.)||relators:aus:Comte, Gilbert",Copyright Not Evaluated,"Originally broadcast as a segment of the television program Johns Hopkins File 7 on January 13, 1957 from the studios of WAAM in Baltimore, Md. Black and white. Lynn Poole, Leo Geier, producers; Kennard Calfee, director; Gilbert Comte, writer; Joel Chaseman, narrator; produced by WAAM television station in Baltimore, Md. for the ABC Television Network. Lynn Poole, Walter S. Baird, Walter G. Driscoll, Potter Trainer, presenters. Digitized in 2004.",Johns Hopkins University. Sheridan Libraries,Educational television programs,Video,Johns Hopkins University,Moving Image,Infrared radiation||Camera obscuras,,,,,,,,,

Example of the output.csv when complete

id,url,uuid,title,field_collection_number,field_date_available,field_date_published,field_finding_aid,field_item_barcode,field_library_catalog_link,field_oclc_number,field_unique_id,field_abstract,field_access_rights,field_access_terms,field_contributor,field_copyright_and_use,field_description,field_digital_publisher,field_genre,field_model,field_publisher,field_resource_type,field_subject,field_alternative_title,field_featured_item,field_dspace_item_id,field_digital_identifier,field_creator,field_publisher_country,field_spatial_coverage,field_date_copyrighted,field_date_created
1,https://digital.library.jhu.edu/node/11943,d3ba594a-a226-4bab-9185-72fe54cd7328,Is X-ray harmful?,COLL-0008,2021-06-24,1957-01-20,https://aspace.library.jhu.edu/repositories/3/resources/1109,mq2415107mmmmm,https://catalyst.library.jhu.edu/catalog/bib_2415107,54860034,83aeefed-fbc8-4772-b2a8-a9a3865ab6e2,"Lynn Poole discusses x-rays for treatment and diagnosis of disease and displays a recent report from the National Academy or Sciences and National Research Council on the biological effects of radiation. Dr. Russell Morgan, Director of Radiology Dept. at Johns Hopkins University, fields questions from members of the press: Nate Hazeltine, a ""Washington Post"" science writer; Pare Lorentz, a film producer; and Earl Ubell, a reporter and science editor with the ""New York Herald Tribune"". Dr. Morgan explains that x-rays affect both individual cells and the whole body, making them more susceptible to premature aging. He discusses the research by John Lawrence on the effects of radiation on mice and their extrapolation to man. He also notes a study on radiation vs. non-radiation workers that showed no difference in life spans of the two groups. It is the amount of radiation exposure that determines the effects of the damage. For example, a chest x-ray only delivers about 1/20th roentgen, a unit of radiation. However, Dr. Morgan discusses the feasibility of a reporting system for patients' total x-ray exposure and the need for a set of standards. And he does admit that the complexity and amount of radiation exposure is increasing in diagnostic studies and could double by 1960-65. A film clip demonstrates that this radiation exposure can be reduced by filtration, distance from the x-ray machine, length of time of exposure, and protection of areas not being radiated. Mr. Poole points out that Dr. Morgan has developed a fluoroscopy machine reducing by up to ten times the radiation time. In conclusion, Dr. Morgan discusses whether the Atomic Energy Commission or the U. 
S. Public Health Services should be responsible for the public's radiation health problems.",Public digital access,Science Review,"relators:brd:ABC Television Network||relators:drt:Calfee, Kennard||relators:nrt:Chaseman, Joel||relators:prd:Hazeltine, Nate||relators:prd:Lorentz, Pare||relators:prd:Morgan, Russell H. (Russell Hedley), 1911-1986||relators:prd:Poole, Lynn||relators:prd:Ubell, Earl||relators:pro:Geier, Leo, 1926-2017||relators:pro:Poole, Lynn||relators:pro:WAAM (Television station : Baltimore, Md.)||relators:aus:Comte, Gilbert",Copyright Not Evaluated,"Originally broadcast as a segment of the television program Johns Hopkins File 7 on January 20, 1957 from the studios of WAAM in Baltimore, Md. Black and white. Lynn Poole, Leo Geier, producers; Kennard Calfee, director; Gilbert Comte, writer; Joel Chaseman, narrator; produced by WAAM television station in Baltimore, Md. for the ABC Television Network. Lynn Poole, Russell H. Morgan, Nate Hazeltine, Pare Lorentz, Earl Ubell, presenters. Digitized in 2004.",Johns Hopkins University. Sheridan Libraries,Educational television programs,Video,Johns Hopkins University,Moving Image,X-rays--Physiological effect||X-rays,,,,,,,,,,https://stage.digital.library.jhu.edu/system/files/2022-03-17/jhu_coll-0008_A6024.mp4
2,https://digital.library.jhu.edu/node/11944,9d4a0ed0-3b08-450f-ad7a-32ab30dc104a,Seeing in the dark,COLL-0008,2021-06-24,1957-01-13,https://aspace.library.jhu.edu/repositories/3/resources/1109,mq2415085mmmmm,https://catalyst.library.jhu.edu/catalog/bib_2415085,54859985,793edfd5-d2f7-4604-8742-bc560016cb1f,"Lynn Poole tells how the tenth century Islamic scholar Alhazan described the workings of the camera obscura. Later, Frenchman Niepce discovered an emulsion that could retain a photographic image. Dr. Walter Driscoll, director of research at Baird-Atomic Inc., then shows a chart of the electromagnetic spectrum and notes that while x-rays yield only shadowy pictures and radar waves detect but don't create pictures, germanium and silicon filters block radiated energy and allow infrared light to pass through to form an image. Dr. Driscoll displays a scanning bolometer, which can see in the dark, but the shapes it creates need to be interpreted. He also shows a snooperscope and a film clip of a sniperscope with infrared scope. Previous research on infrared or thermal detection was done by Sir John Frederick William Herschel. Potter Trainer demonstrates and explains the Evaporagraph (EVA), which is based on the principle that all things radiate heat as infrared rays, and shows some of the actual pictures made from heat rather than light. Dr. Walter Baird describes applications of EVA to industry, such as detecting problem-causing hot spots in electronic equipment or indicating heat escape or insulation deficiency in a building. EVA's resolution is 10 lines/mm at best, and it shows temperature contrast of .2 degree. The machine's weakness is the slow speed of response to small temperature differences and the inability to obtain the temperature scale of the item viewed. Nonetheless, Mr. 
Poole says EVA could play a vital role in civil defense and medicine.",Public digital access,Science Review,"relators:brd:ABC Television Network||relators:drt:Calfee, Kennard||relators:nrt:Chaseman, Joel||relators:prd:Baird, Walter S.||relators:prd:Driscoll, Walter G.||relators:prd:Poole, Lynn||relators:prd:Trainer, Potter.||relators:pro:Geier, Leo, 1926-2017||relators:pro:Poole, Lynn||relators:pro:WAAM (Television station : Baltimore, Md.)||relators:aus:Comte, Gilbert",Copyright Not Evaluated,"Originally broadcast as a segment of the television program Johns Hopkins File 7 on January 13, 1957 from the studios of WAAM in Baltimore, Md. Black and white. Lynn Poole, Leo Geier, producers; Kennard Calfee, director; Gilbert Comte, writer; Joel Chaseman, narrator; produced by WAAM television station in Baltimore, Md. for the ABC Television Network. Lynn Poole, Walter S. Baird, Walter G. Driscoll, Potter Trainer, presenters. Digitized in 2004.",Johns Hopkins University. Sheridan Libraries,Educational television programs,Video,Johns Hopkins University,Moving Image,Infrared radiation||Camera obscuras,,,,,,,,,,https://stage.digital.library.jhu.edu/system/files/2022-03-17/jhu_coll-0008_A6014.mp4

Running it

./grab.py
. . .
Processing https://stage.digital.library.jhu.edu/node/11855...5663 left to complete.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment