Skip to content

Instantly share code, notes, and snippets.

@wckd
Last active August 29, 2015 14:16
Show Gist options
  • Save wckd/92b817f177af2a724980 to your computer and use it in GitHub Desktop.
Save wckd/92b817f177af2a724980 to your computer and use it in GitHub Desktop.
Download issues in PDF form from idunn.no
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import os
## Settings
base_url = 'http://www.idunn.no'
## `folder` is both the path segment appended to base_url for the page to
## scrape and the local directory the PDFs are written to.
folder = 'skatt' ## change this
scrape_url = '{0}/{1}'.format(base_url, folder)
## Common prefix of every PDF download URL; the per-issue parts are
## appended to it later.
pdf_url = '{0}/{1}'.format(base_url, 'file/pdf')
## Lists used internally
issueList = []      ## hrefs of the issue links found on the scraped page
redirectList = []   ## issues whose PDF request was answered with a redirect
## Start
## Format the message *before* printing: `print(...) % x` applies `%` to the
## return value of print(), which only works as a Python 2 print statement
## and raises TypeError on Python 3. This form runs on both.
print('Starting scrape of %s' % scrape_url)
page = requests.get(scrape_url).text
print('Reading page ...')
## Name the parser explicitly so the parse result does not depend on which
## optional parsers happen to be installed (bs4 warns when none is given).
soup = BeautifulSoup(page, 'html.parser')
print('Doing some magic ...')
## Build issue list: remember the link target of every <a class="issueInfo">
## element on the scraped page, in document order.
issueList.extend(
    anchor['href']
    for anchor in soup.find_all('a', {'class': 'issueInfo'})
)
## Build url for issue page, visit the page and get pdf info
for issue in issueList:
    url = '{0}{1}'.format(base_url, issue)
    issue_page = requests.get(url).text
    ## Explicit parser: keeps parsing independent of which optional parsers
    ## are installed.
    issue_soup = BeautifulSoup(issue_page, 'html.parser')
    for i in issue_soup('div', {'id': 'accessinfo'}):
        ## Entries whose product id contains '-1' are skipped
        ## (presumably "no product / no PDF available" -- confirm).
        if '-1' in i['data-product-id']:
            continue
        ## Build the download URL in its own variable. Assigning the result
        ## back to `pdf_url` would make every later issue extend the
        ## previous issue's full URL instead of the shared prefix.
        issue_pdf_url = '{0}/{1}/{2}'.format(
            pdf_url, i['data-product-id'], i['data-issue-pdf-url'])
        if not os.path.exists(folder):
            os.makedirs(folder)
        remote_pdf = requests.get(issue_pdf_url)
        ## A non-empty redirect history means the request did not land
        ## directly on the PDF (presumably an access/login redirect --
        ## confirm); record the issue and stop processing it.
        if remote_pdf.history:
            redirectList.append(issue)
            break
        ## The file name comes from remote HTML, so keep only its last path
        ## component to make sure the file is written inside `folder`.
        pdf_local = os.path.join(
            folder, os.path.basename(i['data-issue-pdf-url']))
        with open(pdf_local, 'wb') as local_file:
            local_file.write(remote_pdf.content)
        ## Format inside the print call: `print(...).format(...)` calls
        ## .format on print()'s return value and fails on Python 3.
        print('Downloaded {0}'.format(issue_pdf_url))
print('{0} issue(s) skipped because of redirection'.format(len(redirectList)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment