@ymotongpoo
Created February 7, 2011 16:35
Download flv files from a video site
# -*- coding: utf-8 -*-
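# Pipeline: search -> collect video page URLs -> resolve direct flv URLs
# (concurrently, via eventlet green threads) -> download each file with
# HTTP Range-based resume support.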
import re
import os.path
import urllib
import cookielib
from urlparse import urlparse
from functools import partial

import eventlet
from eventlet.green import urllib2
from pyquery import PyQuery as pq
from werkzeug import secure_filename

pool = eventlet.GreenPool()
search_url = ur"http://www.empflix.com/search.php"
method = "GET"
query = [u"creampie"]
charset = "utf-8"
# URL of the page that hosts the flv, e.g.
# http://cdn.empflix.com/empflv/xxxxxxxxxx
target_url_ptn = re.compile("""
so\.addVariable\('config',\ '  # part passed to SWFObject
(?P<url>http://\S+)            # URL to the flv file
'\)
""", re.VERBOSE)
flv_url_ptn = re.compile("""
<!--<file>
(?P<url>http://\S+\.flv)
</file>-->
""", re.VERBOSE)
download_dir = "./download/empflix"
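
# Build a urllib2 opener that carries a cookie jar and follows redirects.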
def build_opener():
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar),
                                  urllib2.HTTPRedirectHandler())
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (compatible; python)"),
                         ("Connection", "keep-alive")]
    return opener
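
# Query the search page and collect the links to the individual video pages.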
def get_search_result(opener, query, page=1):
    view_urls = []
    for q in query:
        form_dict = {u'what': q,
                     u'page': unicode(page)}
        params = urllib.urlencode(form_dict)
        print params
        conn = None
        if method.upper() == "GET":
            conn = opener.open(search_url + u'?' + params)
        elif method.upper() == "POST":
            conn = opener.open(search_url, params)
        else:
            raise ValueError(method)
        if conn:
            # keep the response under its own name so the next query
            # does not send this HTML as its "page" parameter
            html = conn.read().decode(charset)
            d = pq(html)
            for span in d(".thumb"):
                view_url = pq(span.find("a")).attr.href
                view_urls.append(view_url)
    return view_urls
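
# Resolve one video page into (direct download URL, local file name).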
def _get_download_url(opener, view_url):
    conn = opener.open(view_url)
    data = conn.read()
    d = pq(data)
    download_url = d(".downloadButton").attr.href
    if download_url:
        parsed = urlparse(download_url)
        filename = parsed.path.split("/")[-1]
    else:
        # no download button; dig the flv link out of the player config
        download_url, filename = _parse_extra_page(opener, data)
    return download_url, filename
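
# Fallback path: follow the SWF player config page to find the raw flv URL.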
def _parse_extra_page(opener, data):
    # file name, taken from the page heading
    dom = pq(data)
    filename = secure_filename(dom("h2:first").text()) + ".flv"
    # follow the config page and pick up the direct link to the flv
    download_url = None
    match = target_url_ptn.search(data)
    if match:
        d = match.groupdict()
        extra_page_url = urllib.unquote(d['url'])
        conn = opener.open(extra_page_url)
        data = conn.read()
        ematch = flv_url_ptn.search(data)
        if ematch:
            ed = ematch.groupdict()
            download_url = ed['url']
    return download_url, filename
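
# Download (or resume) a single flv, using an HTTP Range request to pick up
# where a previous partial download left off.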
def download_flv(opener, download_url, filename):
    if not filename:
        print "%s *** no filename found ***" % download_url
        return
    try:
        conn = opener.open(download_url)
        length = int(conn.info()['Content-Length'])
        save_path = os.path.join(download_dir, filename)
        resume = False
        if os.path.exists(save_path):
            resume = True
            size = os.path.getsize(save_path)
            if size < length:
                # ask the server for only the missing tail of the file
                since = "bytes=%s-" % size
                req = urllib2.Request(download_url)
                req.add_header("Range", since)
                conn = opener.open(req)
            else:
                print "%s ====> downloaded '%s'" % (download_url, filename)
                return
    except:
        import traceback
        print traceback.format_exc()
        return
    fp = open(save_path, 'ab+') if resume else open(save_path, 'wb')
    try:
        if resume:
            print "%s ----> resume start from %d byte" % (filename, size)
        print "%s ----> download start" % filename
        # read in 512 KB chunks so large files never sit whole in memory
        while True:
            data = conn.read(1024 * 512)
            if not data:
                break
            fp.write(data)
    except:
        import traceback
        print traceback.format_exc()
    finally:
        fp.close()
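
# Search, resolve the download URLs concurrently on the green thread pool,
# then download the files one by one.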
def main():
    # make sure the download directory exists before saving into it
    if not os.path.isdir(download_dir):
        os.makedirs(download_dir)
    opener = build_opener()
    view_urls = get_search_result(opener, query)
    get_download_url = partial(_get_download_url, opener)
    results = []
    for result in pool.imap(get_download_url, view_urls):
        results.append(result)
    for download_url, filename in results:
        download_flv(opener, download_url, filename)

if __name__ == "__main__":
    main()