@X-Wei
Created December 31, 2016 11:29
jikexueyuan video downloader
# coding: utf-8
import requests
from lxml import etree
import os, time, sys
import cPickle as pk
reload(sys)
sys.setdefaultencoding('utf-8') # to avoid encoding problems
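# Overview: scrape jikexueyuan.com course pages with requests + lxml, collect
# (video_url, local_file_path) pairs for every lesson, pickle the list with
# cPickle, then download the files in parallel through a multiprocessing Pool.
# Python 2 only (print statements, cPickle, tuple-unpacking function parameters).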
hea = {
'Host': 'www.jikexueyuan.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.90 Safari/537.36',
'Connection': 'keep-alive',
'Cookie': 'gr_user_id=87ea5cdc-d227-4bed-8e9d-369149cc3fda; stat_uuid=1468588589952565216740; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221560a141fc686-0baf1a4503416-24414032-15f900-1560a141fc88%22%7D; Hm_lvt_f3c68d41bda15331608595c98e9c3915=1469047902; looyu_id=e0e3fc304a4c5523ddf6454e3fbdad6579_20001269%3A5; undefined=; connect.sid=s%3AeXmKsrfkcmNUVGMVdQ04ZaXFXT0znDBF.ChCxa6%2F9sHM4PQPmOy3r0lPFAKKvek6%2Bvy8lPMvK3t8; QINGCLOUDELB=84b10773c6746376c2c7ad1fac354ddfd562b81daa2a899c46d3a1e304c7eb2b|WCroE|WCroE; _ga=GA1.2.1510658653.1468588590; _gat=1; gr_session_id_aacd01fff9535e79=258f42eb-cd9f-4f17-97f0-9bd5bf9cc3b1; uname=jike_jddymx; uid=3514144; code=MPDZNL; authcode=0198Sfj%2FuyDWiApnRaJJxScBWwrupZo9w0oQkE1yBaN9D8tnIT4RmcJf6%2BpbYw4Ba711cMARquMX1reWaTrFa3JLAaqFDZRba3CZszzbnK9P2MgT8ua1Fq4T213pC0k; avatar=http%3A%2F%2Fassets.jikexueyuan.com%2Fuser%2Favtar%2Fdefault.gif; ca_status=0; vip_status=1; level_id=3; is_expire=0; domain=0Jjgkqkqq',
}
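# NOTE: the Cookie above belongs to a specific logged-in session; to run the
# script yourself, replace it (and, if needed, the User-Agent) with the header
# values from your own browser session on www.jikexueyuan.com.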
# useful helper function
def download((url, fpath), headers={}):
    fname = os.path.split(fpath)[-1]
    print 'start downloading %s ...' % fname
    with open(fpath, 'wb') as f:
        while 1:
            resp = requests.get(url, stream=True, headers=headers); time.sleep(1.0)
            if resp.ok: break
            print resp.status_code
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    print 'download finished: %s' % fpath
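# Usage note: download() takes a single (url, fpath) tuple (Python 2 tuple
# parameter unpacking) so it can be handed directly to Pool.map in
# download_videos(); it can also be called on its own, e.g. (hypothetical
# url and path):
#~ download(('http://example.com/lesson/1.mp4', 'videos/1.intro.mp4'), headers=hea)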
def gethtml(url, delay=0.01):
    while 1:
        try:
            html = requests.get(url, headers=hea).content
            time.sleep(delay)  # avoid Error 104 (reset by peer)
            break
        except Exception: pass
    return html
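# Note: gethtml() retries indefinitely until a request succeeds, sleeping
# `delay` seconds after each successful fetch to avoid connection resets.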
def geturls_course(course_url='http://www.jikexueyuan.com/course/1287.html', folder='videos', idx=None):
    html = gethtml(course_url)
    sel = etree.HTML(html)
    course_name = sel.xpath('//span[@class="tit"]/text()')[0].strip().encode('utf-8')
    if idx is not None:
        course_name = '%d-%s' % (idx, course_name)
    video_folder = os.path.join(folder, course_name)
    if not os.path.exists(video_folder):
        os.makedirs(video_folder)
    print 'getting course "%s"......' % course_name
    lessons = sel.xpath('//div[@class="lesson-box"]/ul/li/div/h2/a')
    video_urls_list = []  # contains (url, fpath) tuples
    for i, les in enumerate(lessons, 1):
        les_name = les.xpath('string(.)').strip().encode('utf-8')
        sys.stdout.write('\r %d-%s...' % (i, les_name)); sys.stdout.flush()
        les_url = les.xpath('@href')[0]
        les_html = gethtml(les_url)
        video_url = etree.HTML(les_html).xpath('//source/@src')[0]
        fpath = os.path.join(video_folder, '%d.%s.mp4' % (i, les_name))
        video_urls_list.append((str(video_url), str(fpath)))
    print ''
    print 'Got all video urls in course "%s"!' % course_name
    # get the course zip file
    def download_course_zip(courseid):
        json_url = 'http://www.jikexueyuan.com/course/downloadRes?course_id=%d' % courseid
        import json
        jsdict = json.loads(gethtml(json_url))
        if 'url' not in jsdict['data']:
            return  # some courses don't have a zip file to download
        zip_url = jsdict['data']['url']
        fpath = os.path.join(video_folder, '%s.zip' % course_name)
        download((zip_url, fpath), headers=hea)
    #~ courseid = int(course_url[:-5].split('/')[-1])
    #~ download_course_zip(courseid)  # downloading the zip always gets a 403 error!
    download_videos(video_urls_list)
    return video_urls_list
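# Example: grab a single course into the default 'videos' folder (url taken
# from the default argument above):
#~ geturls_course('http://www.jikexueyuan.com/course/1287.html')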
def geturls_series(series_url='http://ke.jikexueyuan.com/xilie/116'):
    html = requests.get(series_url).content; time.sleep(0.01)
    sel = etree.HTML(html)
    series_name = sel.xpath('//dd/h2/text()')[0].strip()
    print '===getting series "%s"===' % series_name
    folder = os.path.join('videos', 'series-' + series_name)
    courses = sel.xpath('//div[@class="lesson-item"]/a/@href')
    video_urls_list = []  # contains (url, fpath) tuples
    for i, course_url in enumerate(courses, 1):
        print i, course_url
        video_urls_list += geturls_course(course_url, folder=folder, idx=i)
        time.sleep(0.2)
    print '===got all video urls in series "%s"===' % series_name
    with open('%s-urls.pk' % series_name, 'wb') as f:
        pk.dump(video_urls_list, f)
    return video_urls_list
def geturls_career_path(career_url='http://ke.jikexueyuan.com/zhiye/android/'):
    '''
    course_url_xpath = '//section[@class="lesson-unit"]/table[@class="table lesson-step"]//a[@class="inner"]/@href'
    step_title_xpath = '//section[@class="lesson-unit"]/table[@class="table lesson-step"]//th/text()'
    unit_title_xpath = '//section[@class="lesson-unit"]//h3/text()'
    '''
    html = requests.get(career_url).content; time.sleep(0.01)
    sel = etree.HTML(html)
    career_path_name = 'career-path-' + sel.xpath('//h1[@class="caption"]/text()')[0].strip()
    units = sel.xpath('//section[@class="lesson-unit"]')
    video_urls_list = []
    for unit in units:
        unit_name = unit.xpath('header/h3/text()')[0].strip()
        print '===unit %s===' % unit_name
        steps = unit.xpath('table[@class="table lesson-step"]')
        for s, step in enumerate(steps, 1):
            step_name = '%d-%s' % (s, step.xpath('thead/tr/th/text()')[0].strip())
            print '===step %s===' % step_name
            for i, course_url in enumerate(step.xpath('.//a[@class="inner"]/@href'), 1):
                folder = os.path.join('videos', career_path_name, unit_name, step_name)
                print i, course_url
                video_urls_list += geturls_course(course_url, folder=folder, idx=i)
                time.sleep(0.2)
            print '===step %s finished===' % step_name
        print '===unit %s finished===' % unit_name
    with open('%s-urls.pk' % career_path_name, 'wb') as f:
        pk.dump(video_urls_list, f)
    return video_urls_list
def download_videos(video_urls_list):
    print 'downloading %d files in parallel...' % len(video_urls_list)
    from multiprocessing import Pool
    pool = Pool(processes=4)
    pool.map(download, video_urls_list)
    pool.close()
    pool.join()
    print 'all downloads finished!'
if __name__ == '__main__':
    #~ geturls_series('http://ke.jikexueyuan.com/xilie/108')
    video_urls_list = geturls_career_path('http://ke.jikexueyuan.com/zhiye/web/')
    download_videos(video_urls_list)
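    # Sketch: to redo the downloads from a previously saved url list without
    # re-scraping, load the pickle written by geturls_series()/geturls_career_path()
    # (the filename depends on the scraped series or career-path name):
    #~ with open('career-path-Web-urls.pk', 'rb') as f:  # hypothetical filename
    #~     video_urls_list = pk.load(f)
    #~ download_videos(video_urls_list)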