Skip to content

Instantly share code, notes, and snippets.

@zmjones
Last active January 2, 2016 06:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zmjones/8261146 to your computer and use it in GitHub Desktop.
Save zmjones/8261146 to your computer and use it in GitHub Desktop.
scrape all of the Peace Science presentations from the 2013 meeting
import requests
from bs4 import BeautifulSoup
def find_links(url, find_file=False, timeout=30):
    """Return the unique links found in the ``content-core`` div of a page.

    Args:
        url: Address of the page to fetch and parse.
        find_file: When false, each result is the anchor's ``href``. When
            true, each result is the ``href`` joined with ``'/'`` and the
            stripped third child node of the anchor (presumably the text
            node holding the file name -- TODO confirm against the site
            markup).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of unique link strings in the order they first appear on
        the page, or ``None`` when the page cannot be fetched, has no
        ``content-core`` div, or an anchor lacks the expected structure.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as parse failures.
        return None

    # Name the parser explicitly so results do not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', id='content-core')
    if container is None:
        return None

    anchors = container.find_all('a')
    try:
        if find_file:
            links = [anchor['href'] + '/' + anchor.contents[2].strip()
                     for anchor in anchors]
        else:
            links = [anchor['href'] for anchor in anchors]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Missing href, fewer than three child nodes, or a third child
        # that is not a string.
        return None

    # De-duplicate while keeping document order, so that indexing into the
    # result (e.g. taking element 0) is deterministic between runs.
    seen = set()
    unique_links = []
    for link in links:
        if link not in seen:
            seen.add(link)
            unique_links.append(link)
    return unique_links
def download_file(url, directory, timeout=30):
    """Stream the resource at ``url`` into ``directory``.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to download.
        directory: Folder to write into; created if it does not exist.
            A trailing slash is optional.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file name that was written, or ``None`` when the URL has no
        file name component, the request fails, or the server answers
        with an HTTP error status.
    """
    import os

    filename = url.split('/')[-1]
    if not filename:
        # URL ends with '/', so there is nothing to name the file after.
        return None

    try:
        r = requests.get(url, stream=True, timeout=timeout)
    except requests.RequestException:
        return None

    try:
        if not r.ok:
            # Do not save an error page as if it were the requested file.
            return None
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        with open(os.path.join(directory, filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until closed.
        r.close()
    return filename
# Crawl the presentation index two levels deep, then look up the file link
# on every resulting page and download it into ./pss2013/.
main = find_links('http://forms2.la.psu.edu/pss/pss-presentations/') or []

# find_links() returns None on failure; treat that as "no links" so one bad
# page does not abort the whole crawl.
first_level = []
for link in main:
    first_level.extend(find_links(link) or [])

pages = []
for page in first_level:
    pages.extend(find_links(page) or [])

papers = [find_links(page, True) for page in pages]

# Only the first entry of each non-empty result is downloaded.
for paper in papers:
    if isinstance(paper, list) and paper:
        download_file(paper[0], './pss2013/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment