Created
September 15, 2013 09:52
-
-
Save beyondkmp/6569260 to your computer and use it in GitHub Desktop.
简单说一下:download类继承了threading.Thread类,并重写了run函数,目的是只要队列不为空,则不停的从队列中取出资源真实链接地址调用wget下载,如果为空则退出线程。startDown函数是多线程下载的接口,里面的参数分别为:url--资源的网页,rule--正则表达式匹配方式,num--开启的线程数,start--正则中匹配真实链接的起始位置,end--正则中匹配真实链接的结束位置,decoding--资源页面采用的编码方式,默认是utf8。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib2 import urlopen | |
import re | |
import Queue | |
import threading | |
import os | |
class download(threading.Thread):
    """Worker thread that drains a shared queue of resource links.

    Each worker keeps taking links off ``que`` and handling them until the
    queue is empty, at which point the thread exits.
    """

    def __init__(self, que):
        """Remember the queue this worker will consume.

        Args:
            que: Queue holding the links to process; the same queue object
                may be handed to several workers.
        """
        threading.Thread.__init__(self)
        self.que = que

    def run(self):
        """Process queued links until none are left, then return."""
        while True:
            try:
                # A separate empty() check followed by a blocking get() can
                # hang forever: another worker may take the last item between
                # the two calls. A non-blocking get closes that window.
                link = self.que.get_nowait()
            except Queue.Empty:
                break
            # Placeholder action: the link is only printed. A real download
            # (for example shelling out to wget) would go here.
            print(link)
def startDown(url, rule, num, start, end, decoding=None):
    """Fetch a page, extract links with a regex and hand them to workers.

    The function returns right after starting the worker threads; it does
    not wait for them to finish.

    Args:
        url: Address of the page to scan.
        rule: Regular-expression pattern; every match found in the page
            body is treated as one link.
        num: Number of ``download`` worker threads to start.
        start: Slice start applied to each match to cut out the real link.
        end: Slice end applied to each match.
        decoding: Encoding used to decode the page body; falls back to
            ``'utf8'`` when omitted or empty.
    """
    if not decoding:
        decoding = 'utf8'
    response = urlopen(url)
    try:
        body = response.read().decode(decoding)
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()
    pattern = re.compile(rule)
    que = Queue.Queue()
    # Fill the queue completely before any worker starts: a worker exits as
    # soon as it finds the queue empty.
    for match in pattern.findall(body):
        que.put(match[start:end])
    for _ in range(num):
        download(que).start()
if __name__ == '__main__':
    # Page whose HTML is scanned for resource links.
    page_url = 'https://class.coursera.org/algo-004/lecture/index'
    # Matches anchor tags that carry target="_new" together with their href.
    link_pattern = '<a target="_new" href=".*"'
    # The slice [23:-1] drops the 23-character prefix `<a target="_new" href="`
    # and the closing quote from each match; 10 worker threads are started.
    startDown(page_url, link_pattern, 10, 23, -1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment