import urllib2
from bs4 import BeautifulSoup
import re
import csv
import requests

#Checks all of the pages listing records for the record urls
for i in range(0, 5):
    url = ('http://www.qub.ac.uk/imagining-history/resources/short/index.php?pageNum_rstCallShort=' + str(i))
    x = urllib2.urlopen(url)
    soup = BeautifulSoup(x, "html.parser")
    soup_links = soup.find_all(href=re.compile("record"))
    for link in soup_links:
        rec = link.get("href")
        #Adds the record number to the end of a url to generate a list of records for recursive scraping
        page = ('http://www.qub.ac.uk/imagining-history/resources/short/' + rec)
        #Makes the soup
        y = urllib2.urlopen(page)
        soupy = BeautifulSoup(y, "html.parser")
        #Scrapes the html content of all record pages
        res = requests.get(page)
        res.raise_for_status()
        file = open(rec + '.txt', 'wb')
        for chunk in res.iter_content(100000):
            file.write(chunk)
        file.close()
        #Creates a csv file with a list of the record page urls
        # with open('dart_pages.csv', 'a') as f:
        #     writer = csv.writer(f, delimiter=",")
        #     writer.writerow([page])
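
# Note (not part of the original gist): urllib2 exists only on Python 2, and the
# record pages are fetched twice above (once with urllib2, once with requests).
# The commented-out block below is a minimal Python 3 sketch of the same scrape
# using only requests and BeautifulSoup. The base URL and page range are taken
# from the script above; the filename sanitising is an added assumption, since
# the raw href may contain characters such as '?' that some filesystems reject.
#
# import re
# import requests
# from bs4 import BeautifulSoup
#
# base = 'http://www.qub.ac.uk/imagining-history/resources/short/'
# for i in range(5):
#     listing = requests.get(base + 'index.php?pageNum_rstCallShort=' + str(i))
#     listing.raise_for_status()
#     soup = BeautifulSoup(listing.text, 'html.parser')
#     for link in soup.find_all(href=re.compile('record')):
#         rec = link.get('href')
#         res = requests.get(base + rec)
#         res.raise_for_status()
#         # Replace characters that are not safe in filenames (an assumption,
#         # not in the original script).
#         safe_name = re.sub(r'[^A-Za-z0-9._-]', '_', rec)
#         with open(safe_name + '.txt', 'wb') as f:
#             f.write(res.content)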