alanmackenzie/scrape_datacite_doi.py

## scrape_datacite_doi.py
"""
Credits: Carl Wilson (Open Preservation Foundation)

Using the search root for datacite: https://search.datacite.org/data-centers?member-id=bl&page=

I iterate through the data centres (in this case those that are BL-assigned), then for each data centre I visit its
works page, e.g. https://search.datacite.org/data-centers/bl.ads and cycle through the works. I stop
(usually on the first work) as soon as I can scrape the DOI from something like
https://doi.org/10.15124/19B43AA8-744A-404B-A8FF-7B0F931BF6D3 where the 3rd part of the URL is the DOI, in this case
for York University.


"""


def scrape_datacite_doi(datacentre_page_rel_url):
    """Scrape the DOI from a datacentre's home page.

    Fetches ``DATACITE_HTML_ROOT + datacentre_page_rel_url``, walks every
    ``<h3 class="work"><a href=...>`` link on the page in document order and
    uses the first href that starts with ``/works`` and has enough
    ``/``-separated parts.

    Args:
        datacentre_page_rel_url: Location of the datacentre page, relative to
            ``DATACITE_HTML_ROOT``.

    Returns:
        A ``(doi, bl_identifier)`` tuple holding the third and fourth
        ``/``-separated parts of the matching href (for an href of the form
        ``/works/<a>/<b>`` that is ``(<a>, <b>)``), or ``(None, None)`` when
        no link on the page qualifies.
    """
    datacentre_url = '{}{}'.format(DATACITE_HTML_ROOT, datacentre_page_rel_url)
    datacentre_page = requests.get(datacentre_url)
    datacentre_tree = html.fromstring(datacentre_page.content)
    for doi_link in datacentre_tree.xpath('//h3[@class="work"]/a'):
        scraped_href = doi_link.get('href')
        # An anchor without an href attribute yields None; skip it rather
        # than crash on .startswith().
        if not scraped_href or not scraped_href.startswith("/works"):
            continue
        href_parts = scraped_href.split('/')
        # "/works/<a>/<b>" splits into ['', 'works', <a>, <b>], so reading
        # indices 2 and 3 needs at least four parts. A shorter href such as
        # "/works/<a>" is skipped instead of raising IndexError.
        if len(href_parts) > 3:
            return href_parts[2], href_parts[3]
    return None, None
	"""
	Credits: Carl Wilson (Open Preservation Foundation)

	Using the search root for datacite: https://search.datacite.org/data-centers?member-id=bl&page=

	I iterate through the data centres (in this case those that are BL-assigned), then for each data centre I visit its
	works page, e.g. https://search.datacite.org/data-centers/bl.ads and cycle through the works. I stop
	(usually on the first work) as soon as I can scrape the DOI from something like
	https://doi.org/10.15124/19B43AA8-744A-404B-A8FF-7B0F931BF6D3 where the 3rd part of the URL is the DOI, in this case
	for York University.


	"""


	def scrape_datacite_doi(datacentre_page_rel_url):
		"""Scrape the DOI from a datacentre's home page.

		Fetches ``DATACITE_HTML_ROOT + datacentre_page_rel_url``, walks every
		``<h3 class="work"><a href=...>`` link on the page in document order and
		uses the first href that starts with ``/works`` and has enough
		``/``-separated parts.

		Args:
			datacentre_page_rel_url: Location of the datacentre page, relative
				to ``DATACITE_HTML_ROOT``.

		Returns:
			A ``(doi, bl_identifier)`` tuple holding the third and fourth
			``/``-separated parts of the matching href (for an href of the form
			``/works/<a>/<b>`` that is ``(<a>, <b>)``), or ``(None, None)`` when
			no link on the page qualifies.
		"""
		datacentre_url = '{}{}'.format(DATACITE_HTML_ROOT, datacentre_page_rel_url)
		datacentre_page = requests.get(datacentre_url)
		datacentre_tree = html.fromstring(datacentre_page.content)
		for doi_link in datacentre_tree.xpath('//h3[@class="work"]/a'):
			scraped_href = doi_link.get('href')
			# An anchor without an href attribute yields None; skip it rather
			# than crash on .startswith().
			if not scraped_href or not scraped_href.startswith("/works"):
				continue
			href_parts = scraped_href.split('/')
			# "/works/<a>/<b>" splits into ['', 'works', <a>, <b>], so reading
			# indices 2 and 3 needs at least four parts. A shorter href such as
			# "/works/<a>" is skipped instead of raising IndexError.
			if len(href_parts) > 3:
				return href_parts[2], href_parts[3]
		return None, None