Skip to content

Instantly share code, notes, and snippets.

@zmjones zmjones/pss2013.py
Last active Jan 2, 2016

Embed
What would you like to do?
scrape all of the Peace Science presentations from the 2013 meeting
import requests
from bs4 import BeautifulSoup
def find_links(url, find_file=False):
    """Collect the unique link targets inside a page's ``content-core`` div.

    Args:
        url: Address of the page to fetch and parse.
        find_file: When true, each result is the anchor's ``href`` followed
            by ``/`` and the stripped third child node of the anchor
            (presumably the file name shown in the listing -- confirm
            against the site's markup).  When false, each result is the
            bare ``href`` value.

    Returns:
        A list of unique links in the order they first appear on the page,
        or ``None`` when the page does not have the expected structure (no
        ``content-core`` div, an anchor without ``href``, or, in
        ``find_file`` mode, an anchor whose third child is missing or is
        not a string).

    Errors raised by ``requests.get`` itself are not caught here.
    """
    # Name the parser explicitly so the parse result does not depend on
    # which optional parser libraries happen to be installed.
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    try:
        anchors = soup.find('div', id='content-core').find_all('a')
        if find_file:
            links = [anchor['href'] + '/' + anchor.contents[2].strip()
                     for anchor in anchors]
        else:
            links = [anchor['href'] for anchor in anchors]
    except (AttributeError, KeyError, IndexError, TypeError):
        # AttributeError: find() returned None (no content-core div).
        # KeyError: an anchor has no href attribute.
        # IndexError / TypeError: an anchor's children are not laid out
        # as find_file mode expects.
        return None
    # De-duplicate while keeping page order, so a caller that takes the
    # first element gets a reproducible result.
    seen = set()
    unique = []
    for link in links:
        if link not in seen:
            seen.add(link)
            unique.append(link)
    return unique
def download_file(url, directory):
    """Stream the resource at ``url`` into a file inside ``directory``.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to download.
        directory: Destination directory.  It is created when missing, and
            a trailing path separator is optional.

    Returns:
        The file name that was written, or ``None`` when nothing was saved:
        the URL has no trailing file name, the request failed, or the
        server answered with an HTTP error status.
    """
    import os

    filename = url.split('/')[-1]
    if not filename:
        # A URL ending in '/' names no file; there is nothing to save to.
        return None
    try:
        response = requests.get(url, stream=True)
    except requests.RequestException:
        return None
    try:
        if not response.ok:
            # Do not store an error page under the paper's file name.
            return None
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        with open(os.path.join(directory, filename), 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()
    return filename
def _follow(urls):
    """Return the links found on every page in ``urls`` as one flat list.

    Pages for which ``find_links`` returns ``None`` contribute nothing, and
    ``urls`` itself may be ``None``.
    """
    found = []
    for url in urls or []:
        found.extend(find_links(url) or [])
    return found


# Start from the presentations index and follow links two levels down.
main = find_links('http://forms2.la.psu.edu/pss/pss-presentations/')
pages = _follow(main)
pages = _follow(pages)
# In file mode each entry is a list of file links, or None when the page
# does not have the expected layout.
papers = [find_links(page, True) for page in pages]
for paper in papers:
    # Skip pages that yielded None or no links; only the first link of
    # each remaining page is downloaded.
    if paper:
        download_file(paper[0], './pss2013/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.