Created
January 31, 2015 19:46
-
-
Save wilbeibi/3644e800dd048004dbe1 to your computer and use it in GitHub Desktop.
PyQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyquery import PyQuery as pq | |
from multiprocessing.dummy import Pool as ThreadPool | |
import urllib | |
class Download(object):
    """Download every file with a given extension linked from a web page.

    Attributes:
        ext: File extension to look for, without the leading dot
            (for example ``'pdf'``).
        url: Address of the page whose ``<a>`` links are scanned.
    """

    def __init__(self, ext, url):
        """Remember the wanted extension and the page to scan."""
        self.ext = ext
        self.url = url

    def getlinks(self):
        """Return the ``href`` values on the page that end in ``.<ext>``.

        The hrefs are returned exactly as written in the page, so they may
        be relative; ``download_single`` resolves them before fetching.

        Returns:
            list of str: matching hrefs, in document order.
        """
        # endswith() works for an extension of any length, whereas comparing
        # a fixed four-character tail only ever matched three-letter ones.
        suffix = '.' + self.ext
        page = pq(url=self.url)
        hrefs = (pq(anchor).attr('href') for anchor in page('a'))
        return [href for href in hrefs if href and href.endswith(suffix)]

    def download_single(self, link):
        """Fetch one link into the current directory.

        Args:
            link: An href as returned by ``getlinks``; relative values are
                resolved against ``self.url``.
        """
        # Imported locally so the method runs on both Python 2 and Python 3
        # without touching the module-level imports.
        try:
            from urllib.parse import urljoin
            from urllib.request import urlretrieve
        except ImportError:  # Python 2
            from urlparse import urljoin
            from urllib import urlretrieve

        # [-1] instead of [1]: an href without any '/' is its own file name.
        fname = link.rsplit('/', 1)[-1]
        urlretrieve(urljoin(self.url, link), fname)
        print('download %s success' % (fname))

    def download_all(self):
        """Download every matching link using a pool of worker threads."""
        links = self.getlinks()
        if not links:
            # Nothing to fetch; a pool cannot be created with zero workers.
            return

        # Roughly one worker per three files, but never fewer than one.
        pool = ThreadPool(max(1, len(links) // 3))
        try:
            pool.map(self.download_single, links)
        finally:
            # Release the worker threads even if a download raised.
            pool.close()
            pool.join()
if __name__ == '__main__':
    # Script entry point: fetch every PDF linked from the lecture page.
    Download(
        'pdf',
        'http://netsmell.com/principles-of-distributed-computing-lecture-collection.html',
    ).download_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment