Skip to content

Instantly share code, notes, and snippets.

@wilbeibi
Created January 31, 2015 19:46
Show Gist options
  • Save wilbeibi/3644e800dd048004dbe1 to your computer and use it in GitHub Desktop.
PyQuery
from pyquery import PyQuery as pq
from multiprocessing.dummy import Pool as ThreadPool
import urllib
class Download(object):
    """Scrape a page for links ending in a given file extension and
    download each matching file concurrently into the current directory.
    """

    def __init__(self, ext, url):
        # ext: file extension WITHOUT the leading dot, e.g. 'pdf'
        # url: page to scrape for <a href="..."> links
        self.ext = ext
        self.url = url

    def _matches_ext(self, href):
        """Return True when *href* ends with '.<ext>'.

        The original sliced a fixed 4 characters (`href[-4:]`), which
        only worked for 3-letter extensions; endswith handles any length
        and also rejects None/short strings.
        """
        return bool(href) and href.endswith('.' + self.ext)

    def getlinks(self):
        """Fetch self.url and return the list of matching href strings."""
        doc = pq(url=self.url)
        return [
            href
            for href in (pq(a).attr('href') for a in doc('a'))
            if self._matches_ext(href)
        ]

    def download_single(self, link):
        """Download one URL, named after its last path segment."""
        # Py2/Py3 compatibility: the module-level `import urllib` only
        # provides urlretrieve on Python 2.
        try:
            from urllib.request import urlretrieve  # Python 3
        except ImportError:
            from urllib import urlretrieve  # Python 2
        # [-1] instead of [1]: rsplit on a link with no '/' returns a
        # one-element list, so [1] would raise IndexError.
        fname = link.rsplit('/', 1)[-1] or ('download.' + self.ext)
        urlretrieve(link, fname)
        print('download %s success' % (fname))

    def download_all(self):
        """Fetch every matching link using a small thread pool."""
        links = self.getlinks()
        if not links:
            return
        # `//` keeps this an int on Python 3; max(1, ...) avoids the
        # original crash from ThreadPool(0) when there are < 3 links.
        pool = ThreadPool(max(1, len(links) // 3))
        try:
            pool.map(self.download_single, links)
        finally:
            # Original leaked the pool; always release its threads.
            pool.close()
            pool.join()
if __name__ == '__main__':
    # Example run: grab every PDF linked from the lecture-collection page.
    downloader = Download(
        'pdf',
        'http://netsmell.com/principles-of-distributed-computing-lecture-collection.html',
    )
    downloader.download_all()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment