Skip to content

Instantly share code, notes, and snippets.

@jjjake
Created February 5, 2014 19:05
Show Gist options
  • Save jjjake/8830793 to your computer and use it in GitHub Desktop.
Save jjjake/8830793 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""Get PDF URLs from metmuseum.org"""
import requests
from BeautifulSoup import BeautifulSoup
def get_pdfs(page, timeout=60):
    """Return guessed PDF URLs for one page of the MetPublications listing.

    Fetches the "titles with full text online" listing from metmuseum.org
    for the requested page, collects every anchor whose ``href`` starts with
    ``/research/metpublications/``, and rewrites each such path into a URL
    under ``resources.metmuseum.org/resources/metpublications/pdf/``.

    The returned URLs are constructed from the link paths only; nothing
    here checks that a PDF actually exists at each URL. Every matching
    anchor contributes one entry, so the list may contain duplicates.

    Args:
        page: Page number, sent to the site as the ``pg`` query parameter.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        A list of PDF URL strings in the order the links appear on the page.

    Raises:
        Whatever ``requests.get`` raises, including its timeout error once
        ``timeout`` elapses.
    """
    listing_url = ('http://www.metmuseum.org/research/metpublications/'
                   'titles-with-full-text-online')
    # NOTE(review): 'rpp' is presumably "results per page" and 'searchtype'
    # a site-specific filter -- confirm against the site before changing.
    query = {
        'searchtype': 'F',
        'rpp': '12',
        'pg': page,
    }
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    response = requests.get(listing_url, params=query, timeout=timeout)
    soup = BeautifulSoup(response.content)

    link_prefix = '/research/metpublications/'
    pdf_base = 'http://resources.metmuseum.org/resources/metpublications/pdf/'

    pdfs = []
    for anchor in soup.findAll('a'):
        href = anchor.get('href')
        if not href or not href.startswith(link_prefix):
            continue
        # Drop the query string, then strip only the leading prefix; a
        # blanket str.replace would also remove any later occurrence of
        # the prefix inside the path.
        slug = href.split('?')[0][len(link_prefix):]
        pdfs.append(pdf_base + slug + '.pdf')
    return pdfs
if __name__ == '__main__':
    # Walk listing pages 1 through 29 (inclusive) and write each PDF URL
    # on its own line to standard output.
    first_page, last_page = 1, 29
    for page_number in range(first_page, last_page + 1):
        for pdf_url in get_pdfs(page_number):
            print(pdf_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment