Skip to content

Instantly share code, notes, and snippets.

@epoz
Last active August 10, 2023 05:47
Show Gist options
  • Save epoz/0c994423101945446938e9a98bf1588e to your computer and use it in GitHub Desktop.
Save epoz/0c994423101945446938e9a98bf1588e to your computer and use it in GitHub Desktop.
OAI Downloader
import os
import traceback
import xml.etree.ElementTree as ET
from urllib.parse import quote

import httpx
# Directory that harvest() creates if missing and writes harvested record files into.
OUTPATH = "./harvest"
def parse(result: str):
    """Extract the records and the resumption token from an OAI-PMH response.

    Args:
        result: The XML text of an OAI-PMH response.

    Returns:
        A ``(records, token)`` tuple. ``records`` is the list of every
        ``record`` element (OAI 2.0 namespace) found anywhere in the
        document. ``token`` is the text of the first ``resumptionToken``
        element, or ``None`` when the element is absent or has no text.
    """
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    root = ET.fromstring(result)
    records = root.findall(f".//{oai_ns}record")
    token_node = root.find(f".//{oai_ns}resumptionToken")
    # An empty <resumptionToken/> has text None, same as a missing element.
    next_token = None if token_node is None else token_node.text
    return records, next_token
def harvest(uri: str, metadata: str) -> None:
    """Download every record of an OAI-PMH repository into ``OUTPATH``.

    Issues a ``ListRecords`` request and then follows resumption tokens
    until the server stops returning one. Each record is serialized to its
    own file, named after the record's OAI identifier.

    Args:
        uri: Base URL of the OAI-PMH endpoint (without a query string).
        metadata: Value sent as ``metadataPrefix`` in the first request.

    Raises:
        Exception: If the first request does not return HTTP 200.

    A transport failure or a non-200 status on a *follow-up* request is
    reported on stdout/stderr and ends the harvest early; records from the
    pages already fetched are kept.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race and also
    # works when OUTPATH is a nested path.
    os.makedirs(OUTPATH, exist_ok=True)

    r = httpx.get(f'{uri}?verb=ListRecords&metadataPrefix={metadata}', timeout=60)
    if r.status_code != 200:
        raise Exception(f"{r.status_code} {r.text}")
    buf, token = parse(r.text)

    while True:
        # Save before looking at the token: the final page carries no token
        # but still carries records.
        _save_records(buf)
        if not token:
            break
        try:
            # Tokens are opaque and may contain '+', '&', '=' or '/', so
            # they must be percent-encoded before going into the URL.
            r = httpx.get(
                f'{uri}?verb=ListRecords&resumptionToken={quote(token, safe="")}',
                timeout=60,
            )
        except httpx.HTTPError:
            traceback.print_exc()
            break
        if r.status_code != 200:
            print(r.text, r.status_code, "Error!")
            break
        buf, token = parse(r.text)


def _record_filename(identifier: str) -> str:
    """Return *identifier* with path separators escaped so it is a single file name."""
    return identifier.replace("/", "%2F").replace("\\", "%5C")


def _save_records(records) -> None:
    """Write each record element to ``OUTPATH/<identifier>`` as serialized XML."""
    for record in records:
        ident = record.find('.//{http://www.openarchives.org/OAI/2.0/}identifier')
        if ident is None or not ident.text:
            # Without an identifier there is no file name to write to.
            print("Skipping record without identifier")
            continue
        target = os.path.join(OUTPATH, _record_filename(ident.text))
        with open(target, "wb") as fh:
            fh.write(ET.tostring(record))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment