@mopemope
Created September 9, 2010 09:58
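"""Scrape empflix search result pages with an eventlet green pool and
download each video's flv, resuming partial files via HTTP Range requests."""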
import re
import urllib
from os import path
from urlparse import urlparse

import eventlet
from eventlet.green import urllib2
from pyquery import PyQuery as pq
from werkzeug import secure_filename

# Search result pages to crawl; %s is filled in with the page number.
search_urls = [
    'http://www.empflix.com/browsecat.php?page=%s&chid=17&category=rd',
]
detail_urls = []
id_mode = True  # prefix saved files with the video id taken from the url
save_path = "/home/ma2/Public/empflix/"
pool = eventlet.GreenPool(2)

# Pulls the player config url out of the embedded flash loader script.
download_re = re.compile(r"\s*so\.addVariable\('config',\s*'([\w\d\.:/%=_-]*)'\);", re.M)
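# For reference, the loader line this pattern targets looks roughly like
# (reconstructed from the regex itself, not captured from the live site):
#   so.addVariable('config', 'http%3A%2F%2Fcdn.example%2Fconfig.php%3Fid%3D123');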
def get_pagelist(url, page=1):
    # Collect the detail-page links from one search result page.
    q = []
    conn = urllib2.urlopen(url % page)
    html = conn.read()
    d = pq(html)
    for span in d(".thumb"):
        detail_url = pq(span.find("a")).attr.href
        q.append(detail_url)
    return q
def _get_flv(page):
    # Fallback: read the flash player config to find the flv url.
    match = download_re.search(page)
    if not match:
        return None, None
    config_url = match.group(1)
    #config_url = "http://cdn.tnaflix.com/" + config_url
    config_url = urllib.unquote(config_url)
    conn = urllib2.urlopen(config_url)
    data = conn.read()
    d = pq(data)
    download_url = d("file").text()
    d = pq(page)
    file_name = secure_filename(d("h2:first").text() + ".flv")
    return download_url, file_name
def get_download_url(url):
    # Return (detail url, direct download url, file name) for one page.
    conn = urllib2.urlopen(url)
    page = conn.read()
    d = pq(page)
    #download_url = d(".linkRight a:first").attr.href
    download_url = d(".downloadButton").attr.href
    if download_url:
        parsed = urlparse(download_url)
        file_name = parsed.path.split("/")[-1]
    else:
        download_url, file_name = _get_flv(page)
    return url, download_url, file_name
def download_flv(url, down_url, file_name):
    print "'%s' ---- Try Download ----" % url
    if not file_name:
        print "'%s' ** Not Found Link ** " % url
        return
    out_path = path.join(save_path, file_name)
    partial = False
    try:
        conn = urllib2.urlopen(down_url)
        length = int(conn.info()['Content-Length'])
        # Skip anything under 50MB or over 900MB.
        if length < 1024 * 1024 * 50 or length > 1024 * 1024 * 900:
            print "*** '%s' size out of range! Skip!!! '%s' ***" % (url, length)
            return
        if path.exists(out_path):
            size = path.getsize(out_path)
            if size < length:
                # Resume the partial file with an HTTP Range request.
                r = "bytes=%s-" % size
                req = urllib2.Request(down_url, headers={"Range": r})
                conn = urllib2.urlopen(req)
                print "'%s' == Resume!! '%s' ==" % (url, file_name)
                print "'%s' == File '%s' Size: %d/%d ==" % (url, file_name, size, length)
                partial = True
            else:
                print "'%s' == Downloaded '%s' ==" % (url, file_name)
                return
    except Exception:
        import traceback
        print traceback.format_exc()
        # Requeue the whole detail page on any network error.
        pool.spawn_n(download, url)
        return
    if partial:
        f = open(out_path, "rb+")
        f.seek(0, 2)  # append from the end of the partial file
    else:
        f = open(out_path, "wb")
    print "'%s' == Start '%s' ==" % (url, file_name)
    try:
        while True:
            data = conn.read(1024 * 512)
            if not data:
                break
            f.write(data)
            #per = path.getsize(out_path) / float(length) * 100.0
            #print "'%s' == '%s' %d%% done. ==" % (url, file_name, per)
    finally:
        f.close()
    print "'%s' == Finish '%s' ==" % (url, file_name)
def download(url):
    url, download_url, file_name = get_download_url(url)
    # The detail url query looks like "id=NNNNN"; strip the "id=" prefix.
    video_id = urlparse(url).query[3:]
    if id_mode and file_name:
        file_name = video_id + "_" + file_name
    if download_url and not download_url.startswith('#'):
        # Skip censored ("mosaic") videos.
        if file_name.lower().find('mosaic') == -1:
            download_flv(url, download_url, file_name)
q = []

def start(url, min_page=1, max_page=12):
    # Gather detail urls for each result page, then feed them to the pool.
    for i in xrange(min_page, max_page + 1):
        urls = get_pagelist(url, page=i)
        q.extend(urls)
    q.reverse()
    while q:
        url = q.pop()
        pool.spawn_n(download, url)

def read_detail_urls(file='empflix.txt'):
    # Seed detail_urls from a text file, one url per line.
    for href in open(file):
        href = href.strip()
        if href:
            detail_urls.append(href)

if __name__ == '__main__':
    for url in search_urls:
        start(url=url)
    pool.waitall()
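# read_detail_urls() is defined above but never called; a minimal sketch of
# driving the pool from the url list instead of the search pages (this entry
# point is not part of the original script, only an assumed alternative):
#
#   read_detail_urls('empflix.txt')
#   for url in detail_urls:
#       pool.spawn_n(download, url)
#   pool.waitall()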