Skip to content

Instantly share code, notes, and snippets.

@imebeh
Created July 11, 2013 16:23
Show Gist options
  • Save imebeh/5976925 to your computer and use it in GitHub Desktop.
Save imebeh/5976925 to your computer and use it in GitHub Desktop.
Traverse SimpleCD movie listings and collect ed2k download links
#coding: utf-8
# Scraper for simplecd.me: walks the movie-category listing pages, visits each
# entry, and prints the ed2k:// download links found on the copy-mode download
# page. Python 2 (urllib2 / StringIO / Queue / ur'' literals).
from threading import Thread
import urllib
import urllib2
import StringIO
import gzip
import Queue
import re
import time
# Entry id inside a listing page's <td class="entry-info"> cell.
repageid = re.compile(ur'<td class\="entry-info">[\n\s\t]+?<a href="/entry/([a-zA-Z0-9]+?)/')
# Per-file download rid inside an entry page (mode=seperate links).
relinkid = re.compile(ur'/download/\?mode=seperate&rid=([a-zA-Z0-9]+?)">')
# A complete ed2k link: ed2k://|file|...|/
reed2k = re.compile(ur'(ed2k://\|file\|[^\n]+?\|/)')
URL_LIST = 'http://simplecd.me/category/电影/?page=%s'  # listing URL, %s = page number
FROM_PAGE = 1   # first listing page to scan (inclusive)
TO_PAGE = 10    # last listing page to scan (inclusive)
URL_PAGE = u'http://simplecd.me/entry/%s/'  # entry page, %s = entry id
# Download page; multiple rids may be joined as &rid=a&rid=b (see example).
URL_DLOD = u'http://simplecd.me/download/?mode=copy&rid=%s' # &rid=fzPrxpg9&rid=fzPrxK1W
pageids = Queue.Queue()  # entry ids produced by travelList, consumed by travelPages
linkids = Queue.Queue()  # batches of rids produced by travelPages, consumed by travelEd2kLinks
bListDone = False  # set once travelList has queued every listing page's entries
bPageDone = False  # set once travelPages has queued every entry's rids
####
# Browser-like request headers; gzip is accepted and decompressed in request().
header = {
    'Accept': r'text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1',
    'User-Agent': r'safari/4.99/zh-cn',
    'Accept-Language': 'zh-cn,zh;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Connection': 'keep-alive'
}
# Shared opener with in-memory cookie handling for all requests.
cookie = urllib2.HTTPCookieProcessor()
opener = urllib2.build_opener(cookie)
def request(url, method='GET', data=None, timeout=60):
if method.upper() == 'POST':
if not data: data = {}
data = urllib.urlencode(data).encode('utf-8')
req = urllib2.Request(url=url,
headers=header,
data=data)
elif method.upper() == 'GET':
if data:
data = urllib.urlencode(data).encode('utf-8')
req = urllib2.Request(url=url + ('?%s' % data),
headers=header)
else:
req = urllib2.Request(url=url,
headers=header)
else:
return None
try:
response = opener.open(req, timeout= timeout)
x = response.read()
except Exception, e:
print url
print 'request error: %s' % e.message
return None
if 'gzip' in str(response.info().get('Content-Encoding')).lower():
buf = StringIO.StringIO(buf=x)
f = gzip.GzipFile(fileobj=buf)
x = f.read()
try:
ret = x.decode('utf-8')
except UnicodeDecodeError:
ret = x
response.close()
return ret
def searchEd2kLinks(html):
    """Return the unique ed2k:// links found in *html*.

    Links are returned in first-seen order; the previous dict-keys dedup
    returned them in arbitrary hash order under Python 2.
    """
    seen = {}
    links = []
    for link in reed2k.findall(html):
        if link not in seen:
            seen[link] = None
            links.append(link)
    return links
def travelList():
    """Producer thread: scan listing pages FROM_PAGE..TO_PAGE and queue every
    entry id found onto pageids; flag bListDone when the range is exhausted."""
    global bListDone
    page = FROM_PAGE
    while page <= TO_PAGE:
        body = request(url=URL_LIST % page, method='GET')
        if body:
            # An empty findall simply yields no iterations, so no guard needed.
            for entry in repageid.findall(body):
                pageids.put(entry)
        page += 1
    bListDone = True
def travelPages():
global bListDone
global bPageDone
while True:
print 'pageids: ', pageids.qsize()
if bListDone and pageids.qsize()==0:
break
elif pageids.qsize()==0:
time.sleep(0.5)
if pageids.qsize() > 0:
di = pageids.get()
url = URL_PAGE % di
html = request(url= url, method= 'GET')
if html:
ids= relinkid.findall(html)
if ids:
for i in range(1, len(ids), 5):
linkids.put(ids[i:i+5])
#time.sleep(2)
bPageDone = True
def travelEd2kLinks():
global bPageDone
while True:
print 'linkids: ', linkids.qsize()
if bPageDone and linkids.qsize()==0:
break
elif linkids.qsize()==0:
time.sleep(0.5)
if linkids.qsize() >0:
ids= linkids.get()
dlurl= URL_DLOD % '&rid='.join(ids)
dlhtml= request(url= dlurl, method= 'GET')
if dlhtml:
links= searchEd2kLinks(dlhtml)
if len(links):
for x in links:
print x
time.sleep(2)
# Pipeline: t1 fills pageids, t2 turns entry ids into rid batches, t3 prints
# the ed2k links. t1 needs no explicit join — t2 only exits after bListDone.
t1= Thread(target= travelList)
t2= Thread(target= travelPages)
t3= Thread(target= travelEd2kLinks)
t1.start()
t2.start()
t3.start()
t2.join()
t3.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment