Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Downloading PBCore records from https://americanarchive.org
import requests
import json
import glob
# Download the PBCore XML record for every catalog path listed in
# all_urls.json into xml_files/, skipping records saved by an earlier run.
all_xml_files = glob.glob('xml_files/*.pbcore')

# Bare file names of the records already on disk.  A set keeps the per-URL
# membership test O(1).  Backslashes are normalised first because glob
# returns OS-native separators, so stripping a literal 'xml_files/' prefix
# would not match on Windows.
already_done = {
    already_done_file.replace('\\', '/').rsplit('/', 1)[-1]
    for already_done_file in all_xml_files
}

# Close the JSON file deterministically instead of leaking the handle.
# NOTE(review): entries are presumably paths such as '/catalog/<id>' --
# confirm against whatever produces all_urls.json.
with open('all_urls.json') as url_file:
    all_urls = json.load(url_file)

for url in all_urls:
    url = 'https://americanarchive.org' + url + '.pbcore'
    print(url)
    filename = url.replace('https://americanarchive.org/catalog/', '')
    if filename in already_done:
        print('skipping', filename)
        continue

    # A timeout stops one stalled connection from hanging the whole run;
    # a failed request is reported and skipped so a re-run can retry it.
    try:
        r = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print('failed', filename, err)
        continue

    # Never save an error page as a record: it would be treated as
    # "already done" on the next run and is not valid PBCore XML.
    if r.status_code != 200:
        print('failed', filename, r.status_code)
        continue

    # Write the raw bytes so the file keeps the exact encoding the server
    # sent, instead of re-encoding guessed text with the locale's codec.
    with open('xml_files/' + filename, 'wb') as outfile:
        outfile.write(r.content)
import glob
import xml.etree.ElementTree as ET
# Print the mean year over every pbcoreAssetDate element that is a direct
# child of the root in each downloaded record under xml_files/.
PBCORE_ASSET_DATE_TAG = (
    '{http://www.pbcore.org/PBCore/PBCoreNamespace.html}pbcoreAssetDate'
)


def extract_year(date_text):
    """Return the leading year of a '-'-separated date string.

    Args:
        date_text: Text of a date element, e.g. '1985-03-02'; may be None
            when the element is empty.

    Returns:
        The part before the first '-' as an int, or None when the text is
        missing or that part is not an integer.
    """
    if not date_text:
        return None
    try:
        return int(date_text.split('-')[0])
    except ValueError:
        return None


all_dates_count = 0
total_dates = 0
for file in glob.glob('xml_files/*.pbcore'):
    print(file)
    # One malformed file should not abort the whole aggregation.
    try:
        tree = ET.parse(file)
    except ET.ParseError as err:
        print('skipping unparsable file', file, err)
        continue
    root = tree.getroot()
    for child in root:
        if child.tag != PBCORE_ASSET_DATE_TAG:
            continue
        year = extract_year(child.text)
        if year is None:
            # Empty or non-numeric dates carry no usable year.
            continue
        total_dates = total_dates + year
        all_dates_count = all_dates_count + 1

# Guard the division: with no usable dates the mean is undefined.
if all_dates_count:
    print(total_dates / all_dates_count)
else:
    print('no usable pbcoreAssetDate values found')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment