# thisismattmiller/get_links.py

## get_links.py
import requests
from bs4 import BeautifulSoup
import json

# Walk the paginated catalog search (online-accessible items, 100 per page,
# pages 1 through 609) and collect the href of the first link in every result
# article. The accumulated list is written to all_urls.json after each page.
all_item_urls = []

counter = 1


while counter <= 609:

	url = "https://americanarchive.org/catalog?f%5Baccess_types%5D%5B%5D=online&per_page=100&q="

	url = url + "&page=" + str(counter)
	print(url)

	# A timeout keeps one stalled connection from hanging the whole crawl, and
	# an HTTP error aborts loudly instead of being parsed as a page with no
	# results (which would silently drop items from the list).
	r = requests.get(url, timeout=60)
	r.raise_for_status()

	soup = BeautifulSoup(r.text, features="html.parser")

	articles = soup.find_all('article', {'class':'col-md-9'})

	for article in articles:
		alink = article.find('a')
		# Skip result cards without a usable link rather than crashing on None.
		if alink is None or not alink.get('href'):
			continue
		all_item_urls.append(alink['href'])

	# Rewrite the full list after every page so an interrupted run still
	# leaves the URLs gathered so far on disk; the context manager closes the
	# handle each time instead of leaking one per page.
	with open('all_urls.json', 'w') as outfile:
		json.dump(all_item_urls, outfile, indent=2)

	counter = counter + 1

## get_xml.py
import requests
import json
import glob


# Download the .pbcore XML record for every catalog path listed in
# all_urls.json into xml_files/, skipping records already present on disk.
all_xml_files = glob.glob('xml_files/*.pbcore')

# A set keeps the per-URL "already downloaded?" check constant-time; with a
# list the check is a linear scan for every URL.
already_done = set()

for already_done_file in all_xml_files:

	already_done.add(already_done_file.replace('xml_files/',''))

with open('all_urls.json') as infile:
	all_urls = json.load(infile)

for url in all_urls:

	url = 'https://americanarchive.org' + url + '.pbcore'
	print(url)

	filename = url.replace('https://americanarchive.org/catalog/','')

	if filename in already_done:
		print('skipping',filename)
		continue

	r = requests.get(url, timeout=60)

	# Never save an error page as a .pbcore file: it would be treated as
	# "already done" on the next run and would not be valid XML.
	if not r.ok:
		print('failed', r.status_code, filename)
		continue

	# Write the raw bytes so the file keeps the encoding the server sent;
	# r.text would be re-encoded with the platform default encoding.
	with open('xml_files/'+ filename, 'wb') as outfile:

		outfile.write(r.content)

	# Guard against the same path appearing twice in all_urls.json.
	already_done.add(filename)


## parse_xml.py
import glob
import xml.etree.ElementTree as ET

# Average the year of every pbcoreAssetDate element found directly under the
# root of each xml_files/*.pbcore document and print the result.
PBCORE_ASSET_DATE_TAG = '{http://www.pbcore.org/PBCore/PBCoreNamespace.html}pbcoreAssetDate'


def extract_year(date_text):
	"""Return the leading year of a dash-separated date string.

	Returns None when the text is missing/empty or when the part before the
	first '-' is not an integer.
	"""
	if not date_text:
		return None
	try:
		return int(date_text.split('-')[0])
	except ValueError:
		return None


all_dates_count = 0
total_dates = 0
for file in glob.glob('xml_files/*.pbcore'):
	print(file)

	# One malformed document should not abort the whole run.
	try:
		tree = ET.parse(file)
	except ET.ParseError as err:
		print('skipping unparseable file', file, err)
		continue

	root = tree.getroot()
	for child in root:

		if child.tag != PBCORE_ASSET_DATE_TAG:
			continue

		year = extract_year(child.text)
		if year is None:
			print('skipping unparseable date', repr(child.text), 'in', file)
			continue

		total_dates = total_dates + year
		all_dates_count = all_dates_count + 1

# Avoid ZeroDivisionError when no file contained a usable date.
if all_dates_count:
	print(total_dates / all_dates_count)
else:
	print('no asset dates found')
	import requests
	from bs4 import BeautifulSoup
	import json

	# NOTE(review): this section repeats the catalog-crawl script with its
	# indentation flattened; the loop nesting is restored here.
	#
	# Walk the paginated catalog search (online-accessible items, 100 per
	# page, pages 1 through 609) and collect the href of the first link in
	# every result article, rewriting all_urls.json after each page.
	all_item_urls = []

	counter = 1


	while counter <= 609:

		url = "https://americanarchive.org/catalog?f%5Baccess_types%5D%5B%5D=online&per_page=100&q="

		url = url + "&page=" + str(counter)
		print(url)

		# A timeout keeps one stalled connection from hanging the crawl, and
		# an HTTP error aborts loudly instead of being parsed as an empty page.
		r = requests.get(url, timeout=60)
		r.raise_for_status()

		soup = BeautifulSoup(r.text, features="html.parser")

		articles = soup.find_all('article', {'class':'col-md-9'})

		for article in articles:
			alink = article.find('a')
			# Skip result cards without a usable link rather than crashing.
			if alink is None or not alink.get('href'):
				continue
			all_item_urls.append(alink['href'])

		# Saved every page so an interrupted run keeps its progress; the
		# context manager closes the handle each time.
		with open('all_urls.json', 'w') as outfile:
			json.dump(all_item_urls, outfile, indent=2)

		counter = counter + 1
	import requests
	import json
	import glob


	# NOTE(review): this section repeats the XML download script with its
	# indentation flattened; the loop/branch nesting is restored here.
	#
	# Download the .pbcore XML record for every catalog path listed in
	# all_urls.json into xml_files/, skipping records already on disk.
	all_xml_files = glob.glob('xml_files/*.pbcore')

	# A set keeps the per-URL "already downloaded?" check constant-time.
	already_done = set()

	for already_done_file in all_xml_files:

		already_done.add(already_done_file.replace('xml_files/',''))

	with open('all_urls.json') as infile:
		all_urls = json.load(infile)

	for url in all_urls:

		url = 'https://americanarchive.org' + url + '.pbcore'
		print(url)

		filename = url.replace('https://americanarchive.org/catalog/','')

		if filename in already_done:
			print('skipping',filename)
			continue

		r = requests.get(url, timeout=60)

		# Never save an error page as a .pbcore file: it would be skipped as
		# "already done" on the next run and would not be valid XML.
		if not r.ok:
			print('failed', r.status_code, filename)
			continue

		# Write the raw bytes so the file keeps the encoding the server sent.
		with open('xml_files/'+ filename, 'wb') as outfile:

			outfile.write(r.content)

		# Guard against the same path appearing twice in all_urls.json.
		already_done.add(filename)
	import glob
	import xml.etree.ElementTree as ET

	# NOTE(review): this section repeats the date-averaging script with its
	# indentation flattened; the loop/branch nesting is restored here.
	#
	# Average the year of every pbcoreAssetDate element found directly under
	# the root of each xml_files/*.pbcore document and print the result.
	PBCORE_ASSET_DATE_TAG = '{http://www.pbcore.org/PBCore/PBCoreNamespace.html}pbcoreAssetDate'


	def extract_year(date_text):
		"""Return the leading year of a dash-separated date string.

		Returns None when the text is missing/empty or when the part before
		the first '-' is not an integer.
		"""
		if not date_text:
			return None
		try:
			return int(date_text.split('-')[0])
		except ValueError:
			return None


	all_dates_count = 0
	total_dates = 0
	for file in glob.glob('xml_files/*.pbcore'):
		print(file)

		# One malformed document should not abort the whole run.
		try:
			tree = ET.parse(file)
		except ET.ParseError as err:
			print('skipping unparseable file', file, err)
			continue

		root = tree.getroot()
		for child in root:

			if child.tag != PBCORE_ASSET_DATE_TAG:
				continue

			year = extract_year(child.text)
			if year is None:
				print('skipping unparseable date', repr(child.text), 'in', file)
				continue

			total_dates = total_dates + year
			all_dates_count = all_dates_count + 1

	# Avoid ZeroDivisionError when no file contained a usable date.
	if all_dates_count:
		print(total_dates / all_dates_count)
	else:
		print('no asset dates found')