import urllib2
from bs4 import BeautifulSoup
import re
import csv
import requests

#Checks all of the pages listing records for the record urls
for i in range(0, 5):
    url = ('http://www.qub.ac.uk/imagining-history/resources/short/index.php?pageNum_rstCallShort=' + str(i))
    x = urllib2.urlopen(url)
    soup = BeautifulSoup(x, "html.parser")
    soup_links = soup.find_all(href=re.compile("record"))
    for link in soup_links:
        rec = link.get("href")
        #Adds the record number to the end of a url to generate a list of records for recursive scraping
        page = ('http://www.qub.ac.uk/imagining-history/resources/short/' + rec)
        #Makes the soup
        y = urllib2.urlopen(page)
        soupy = BeautifulSoup(y, "html.parser")
        #Scrapes the html content of all record pages
        res = requests.get(page)
        res.raise_for_status()
        file = open(rec + '.txt', 'wb')
        for chunk in res.iter_content(100000):
            file.write(chunk)
        file.close()
        #Creates a csv file with a list of the record page urls
        # with open('dart_pages.csv', 'a') as f:
        #     writer = csv.writer(f, delimiter=",")
        #     writer.writerow([page])
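
# Note (not part of the original gist): urllib2 exists only on Python 2, and the
# record pages are fetched twice above (once with urllib2, once with requests).
# The commented-out block below is a minimal Python 3 sketch of the same scrape
# using only requests and BeautifulSoup. The base URL and page range are taken
# from the script above; the filename sanitising is an added assumption, since
# the raw href may contain characters such as '?' that some filesystems reject.
#
# import re
# import requests
# from bs4 import BeautifulSoup
#
# base = 'http://www.qub.ac.uk/imagining-history/resources/short/'
# for i in range(5):
#     listing = requests.get(base + 'index.php?pageNum_rstCallShort=' + str(i))
#     listing.raise_for_status()
#     soup = BeautifulSoup(listing.text, 'html.parser')
#     for link in soup.find_all(href=re.compile('record')):
#         rec = link.get('href')
#         res = requests.get(base + rec)
#         res.raise_for_status()
#         # Replace characters that are not safe in filenames (an assumption,
#         # not in the original script).
#         safe_name = re.sub(r'[^A-Za-z0-9._-]', '_', rec)
#         with open(safe_name + '.txt', 'wb') as f:
#             f.write(res.content)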