Skip to content

Instantly share code, notes, and snippets.

@gg4u
Created December 8, 2016 22:04
Show Gist options
  • Save gg4u/529173829df7c2bb51a4c2d3b7c0cdea to your computer and use it in GitHub Desktop.
Save gg4u/529173829df7c2bb51a4c2d3b7c0cdea to your computer and use it in GitHub Desktop.
Scraper for free PDF books from the United Diversity library
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
import wget
import os
def _is_child_link(href):
    """Return True if *href* points at a file or folder below the current page."""
    if not href or '..' in href:
        return False
    # Links starting with '/', '?' or '#' are absolute/parent links, query
    # links (e.g. the column-sort links auto-generated directory indexes
    # commonly emit -- the server's exact listing format is not shown here)
    # or in-page fragments. None of them is a child entry to mirror.
    if href[0] in '/?#':
        return False
    # Scheme-qualified links lead off-site or are not downloadable.
    if '://' in href or href.startswith('mailto:'):
        return False
    return True


def get_folder(q=None):
    """Recursively mirror one directory of the United Diversity library.

    Fetches the HTML index at the base URL plus ``q``, downloads every
    linked file into a local directory with the same relative path under
    the current working directory, and recurses into every linked
    sub-directory (an href ending in ``/``).

    Args:
        q: Directory path relative to the library root, e.g. ``'Books/'``.
            ``None`` or ``''`` means the root itself.

    Returns:
        None. Progress is printed to stdout; files are written to disk.
    """
    base_url = 'http://library.uniteddiversity.coop/'

    # Normalise the relative path: None/'' both mean the root, a leading
    # slash must not escape the working directory, and a trailing slash is
    # required so that child hrefs can simply be appended.
    rel = str(q).lstrip('/') if q else ''
    if rel and not rel.endswith('/'):
        rel += '/'
    url = base_url + rel
    print('q %s' % q)

    target = os.path.join(os.getcwd(), rel)
    if not os.path.exists(target):
        os.makedirs(target)
        print('making directory %s' % target)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    if not r.ok:
        # Do not parse an error page as if it were a directory listing.
        print('skipping %s (HTTP %s)' % (url, r.status_code))
        return

    soup = BeautifulSoup(r.content, 'html.parser')
    # href=True filters out anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        print('%s\n' % href)
        if not _is_child_link(href):
            continue
        if href.endswith('/'):
            # Carry the parent path along so nested folders resolve to the
            # right URL and the right local directory.
            get_folder(rel + href)
        else:
            wget.download(url + href, out=target)
# Kick off the scrape with no argument, so q is None and the base library
# URL is the first page fetched.
get_folder()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment