Skip to content

Instantly share code, notes, and snippets.

Created July 28, 2016 16:44
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save peakera/c7041e6c779b37fa60d60b5b506ae11c to your computer and use it in GitHub Desktop.
import urllib2
from bs4 import BeautifulSoup
import re
import csv
import requests
# Walks the listing pages, collects every link whose href contains "record",
# and saves the raw content of each linked record page to "<href>.txt".
#
# NOTE(review): both base-URL literals below are empty strings, so as written
# the first request is for the bare string "0". They look like placeholders
# that must be filled in with the real site prefix before running -- confirm.

# Compiled once and reused for every listing page.
record_href = re.compile("record")

# Checks all of the pages listing records for the record urls
for i in range(0, 5):
    url = ('' + str(i))
    listing = urllib2.urlopen(url)
    soup = BeautifulSoup(listing, "html.parser")

    for link in soup.find_all(href=record_href):
        rec = link.get("href")
        # Adds the record href to the end of a url to build the address of
        # the record page to scrape
        page = ('' + rec)

        # Each record page is fetched once. raise_for_status() keeps the
        # script failing loudly on an HTTP error status instead of saving an
        # error page as if it were a record.
        res = requests.get(page)
        res.raise_for_status()

        # Scrapes the html content of the record page to disk. The "with"
        # block closes the file even if a write fails part-way through.
        # NOTE(review): rec is used verbatim as the file name; if the hrefs
        # contain "/" this will point into a subdirectory -- verify.
        with open(rec + '.txt', 'wb') as out_file:
            for chunk in res.iter_content(100000):
                out_file.write(chunk)

        # Optional (disabled): creates a csv file with a list of the record
        # page urls
        # with open('dart_pages.csv', 'a') as f:
        #     writer = csv.writer(f, delimiter=",")
        #     writer.writerow([page])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment