Skip to content

Instantly share code, notes, and snippets.

@vickychijwani
Forked from sumodx/ai-class.py
Created November 12, 2011 10:37
Show Gist options
  • Save vickychijwani/1360366 to your computer and use it in GitHub Desktop.
Save vickychijwani/1360366 to your computer and use it in GitHub Desktop.
Download lecture videos of ai-class, with HTTP proxy and basic resume support
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Deepak.G.R."
__credits__ = "Sumod Hajela"
__license__ = 'Public Domain'
"""
usage:
Go to command line and type
python ai-class.py "topic-name"
topic-names can be "Welcome to AI", "Problem Solving"
If the download is interrupted, delete the last partially downloaded file and restart. The script will skip all existing files and continue with the next new file.
To use HTTP proxy, change the <proxy_url> and <proxy_port> in the proxies dictionary below.
PS: Python 2.6.2 should be installed on your system.
Let me know if you have any problems.
"""
import urllib
from urllib2 import *
from urlparse import *
from sgmllib import SGMLParser
import re
import pdb
import sys
import json
from os import *
import os.path
# Prefix that a YouTube video id is appended to in order to form a watch URL.
url_youtube = 'http://www.youtube.com/watch?v='

# Name of the unit (topic) to download, taken from the first command-line
# argument, e.g. "Welcome to AI" or "Problem Solving".
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError traceback.
    sys.exit('usage: python ai-class.py "topic-name"')
req_unit = sys.argv[1]

# Maps quiz id -> YouTube video id; populated by init_quiz_hash().
quiz_hash = {}

# Proxy mapping handed to ProxyHandler in init_quiz_hash(); replace
# <proxy_url> and <proxy_port> with real values.
proxies = {
    'http': 'http://<proxy_url>:<proxy_port>',
}
class UrlLister(SGMLParser):
    """Collect YouTube watch URLs for the videos of the requested unit.

    The parser keeps a flag that is reset to 0 whenever a topic link
    (``/course/topic/<n>``) is seen and set to 1 when text equal to the
    requested unit name is encountered while the flag is 0.  Video links
    (``/course/video/<name>/<n>``) seen while the flag is 1 are turned into
    YouTube URLs and appended to ``self.urls``.
    """

    def reset(self):
        """Reset the parser state and the collected results."""
        SGMLParser.reset(self)
        self.urls = []            # collected YouTube watch URLs
        self.flag = 0             # 1 once the requested unit name was seen
        self.req_unit = req_unit  # unit name requested on the command line
        self.names = []

    @staticmethod
    def _first_attr(attrs, wanted):
        """Return the value of the first attribute named *wanted*, or None."""
        for name, value in attrs:
            if name == wanted:
                return value
        return None

    @staticmethod
    def _letters_only(text):
        """Return *text* with every character outside A-Z/a-z removed."""
        return re.sub(r'[^A-Za-z]', '', text)

    def start_a(self, attrs):
        """Handle an ``<a>`` tag; record the video URL if it belongs to the unit."""
        href = self._first_attr(attrs, 'href')
        if href is None:
            # Anchors without an href can be neither topic nor video links.
            return
        href = str(href)
        if re.search(r'/course/topic/(\d)+', href):
            # A topic link starts a new unit: stop collecting until the
            # requested unit name is seen again.
            self.flag = 0
        if self.flag != 1 or not re.search(r'/course/video/\w+/\d+$', href):
            return
        element_id = self._first_attr(attrs, 'id')
        video_id = None
        if element_id is not None:
            if 'quiz' in element_id:
                # Quiz links carry only a quiz id; the YouTube id is looked
                # up in the table built by init_quiz_hash().
                quiz_ids = re.findall(r'quiz_(\d+)', element_id)
                if quiz_ids:
                    video_id = quiz_hash.get(quiz_ids[0])
            else:
                video_ids = re.findall(r'video_\d+_(.+)', element_id)
                if video_ids:
                    video_id = video_ids[0]
        if video_id is None:
            # Skip this link rather than abort the whole run.
            sys.stderr.write('WARNING: no video id found for %s\n' % href)
            return
        self.urls.append(url_youtube + video_id)

    def handle_data(self, text):
        """Set the flag when *text* is the requested unit name."""
        if self.flag != 0:
            return
        heading = self._letters_only(text)
        wanted = self._letters_only(self.req_unit)
        # Compare whole names (letters only, case-insensitive).  A substring
        # search would also select "Planning" when "Advanced Planning" was
        # requested.
        if heading and heading.lower() == wanted.lower():
            self.flag = 1
def init_quiz_hash():
    """Fill the module-level ``quiz_hash`` with quiz id -> YouTube id entries.

    Fetches the QuizQuestion JSON feed and, for every item in its ``data``
    list, extracts the ``youtube_id`` / ``quiz_question`` pair from the item's
    string representation.  Items without such a pair are ignored.

    If the ``proxies`` mapping has been filled in (its placeholders replaced),
    a proxy-aware opener is installed first, so this and every later
    ``urlopen`` call goes through the proxy.
    """
    print('STATUS: Initializing quiz_id hash')
    quiz_url = 'http://www.ai-class.com/course/json/filter/QuizQuestion'
    # A ProxyHandler only takes effect once it is part of the opener used by
    # urlopen().  Skip it while the placeholder values are still in place,
    # otherwise every request would be sent to a non-existent proxy host.
    if proxies and not any('<proxy_' in url for url in proxies.values()):
        install_opener(build_opener(ProxyHandler(proxies)))
    response = urlopen(quiz_url)
    try:
        data = json.load(response)
    finally:
        response.close()
    # Matched against str(item), i.e. the Python 2 repr of the decoded JSON
    # object, hence the u'...' in the pattern.
    pair_pattern = re.compile(
        r"'youtube_id': u'(.+?)',.*?'quiz_question': (\d+?),")
    for item in data['data']:
        pairs = pair_pattern.findall(str(item))
        if pairs:
            video_id, quiz_id = pairs[0]
            quiz_hash[quiz_id] = video_id
    print('STATUS: quiz_id Initialized.')
def _find_format45_link(get_vars):
    """Return the download URL of the format-45 stream, or None if absent."""
    entries = get_vars.get('fmt_list', [''])[0].split(',')
    # Position of the first fmt_list entry that starts with "45"; the
    # 'itag' list is indexed with the same position.
    index = next(
        (i for i, entry in enumerate(entries) if entry.startswith('45')),
        None)
    streams = get_vars.get('itag', [])
    if index is None or index >= len(streams):
        return None
    links = re.findall(r'45,url=(.*)', streams[index])
    return links[0] if links else None


def download_video(urls):
    """Download each video in *urls* into the ``lecture <req_unit>`` directory.

    The directory is created in the current working directory if needed; the
    working directory itself is not changed.  A video whose target file
    already exists is skipped.  Data is first written to ``<title>.flv.part``
    and renamed after the transfer finished, so an interrupted download is
    not mistaken for a complete file on the next run.  Videos without a
    title or without a format-45 stream are skipped with a message.

    urls -- iterable of YouTube watch URLs carrying a ``v`` query parameter.
    """
    dirname = 'lecture ' + str(req_unit)
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    for video_url in urls:
        video_id = parse_qs(urlparse(video_url).query)['v'][0]
        info = urlopen(
            "http://www.youtube.com/get_video_info?video_id=" + video_id)
        try:
            get_vars = parse_qs(unquote(info.read()))
        finally:
            info.close()
        titles = get_vars.get('title')
        if not titles:
            print('-->Skipping %s: no video information available' % video_id)
            continue
        title = titles[0] + '.flv'
        # Path separators in a title would otherwise point outside dirname.
        for separator in (os.sep, os.altsep):
            if separator:
                title = title.replace(separator, '_')
        target = os.path.join(dirname, title)
        if os.path.isfile(target):
            continue
        link = _find_format45_link(get_vars)
        if link is None:
            print('-->Skipping %s: format 45 not available' % title)
            continue
        # '%s %s' reproduces the spacing of a comma-separated print statement.
        print('%s %s' % ('\n-->Downloading, Title: ', title))
        partial = target + '.part'
        urllib.urlretrieve(link, partial)
        os.rename(partial, target)
def main():
    """Build the quiz table, collect the unit's video URLs and download them."""
    init_quiz_hash()
    page = urlopen("http://www.ai-class.com/home/")
    try:
        html_source = page.read()
    finally:
        # Release the connection even if reading the page fails.
        page.close()
    parser = UrlLister()
    print('STATUS: Fetching video urls.')
    parser.feed(html_source)
    print('STATUS: SUCCESS')
    # close() makes the parser process any data it still has buffered, so it
    # must run before the collected URLs are counted.
    parser.close()
    # '%s %s' reproduces the spacing of a comma-separated print statement.
    print('%s %s' % ('Number of videos: ', len(parser.urls)))
    print('STATUS: Starting download.')
    download_video(parser.urls)
    print('\n\n*********Download Finished*********')


if __name__ == "__main__":
    main()
@milimetric
Copy link

Awesome, works perfectly, thank you very much. It got both "Planning" and "Advanced Planning" when I did python ai-class.py "Advanced Planning", not sure if that was intended. Either way, thank you.

@swairshah
Copy link

thanks a lot!

@giorno
Copy link

giorno commented Dec 22, 2011

Excellent script. Neat and powerful. Thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment