sumodx/ai-class.py

## ai-class.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Deepak.G.R."
__credits__ = "Sumod Hajela"
__license__ = 'Public Domain'

"""
usage:
Go to command line and type

python ai-class.py "topic-name"

topic-names can be "Welcome to AI", "Problem Solving"

If download is interrupted, delete the last partial downloaded file restart. The script will skip all existing files and start with next new file.

PS: Python2.6.2 should be installed in your system.

Let me know if you have any problems.

"""

from urllib import *
from urlparse import *
from sgmllib import SGMLParser
import re
import pdb
import sys
import json
from os import *
import os.path

url_youtube = 'http://www.youtube.com/watch?v='
#req_unit = 'problem solving'
#req_unit = 'welcome to AI'
req_unit = sys.argv[1]
quiz_hash = dict();

class UrlLister(SGMLParser):

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.flag = 0;
        self.req_unit = req_unit;
        self.names = [];

    def start_a(self, attrs):
        href = [value for name, value in attrs if name == 'href']
        topic = re.search(r'/course/topic/(\d)+', str(href[0]))

        if topic:
            self.flag = 0

        match = re.search(r'/course/video/\w+/\d+$', str(href[0]))

        if match and self.flag == 1:
            category = [value for name, value in attrs if name == 'id']

            if 'quiz' in category[0]:
                quiz_id = re.findall(r'quiz_(\d+)', category[0])[0]
                video_id = quiz_hash[quiz_id]
            else:
                video_id = re.findall(r'video_\d+_(.+)', category[0])[0]

            link = url_youtube + video_id
            self.urls.append(link)

    def handle_data(self, text):
        if self.flag == 0:
            text = text.strip();
            text = re.sub(r'[^A-Za-z]', '', text)
            self.req_unit = re.sub(r'[^A-Za-z]', '', self.req_unit)
            match = re.search(text, self.req_unit, re.IGNORECASE)

            if match and len(text) != 0:
                self.flag = 1


def init_quiz_hash():
    print 'STATUS: Initializing quiz_id hash'
    quiz_url = 'http://www.ai-class.com/course/json/filter/QuizQuestion'
    quiz_url = urlopen(quiz_url)
    data = json.load(quiz_url)
    quiz_id = list()

    for ind in xrange(len(data['data'])):
        piece = str(data['data'][ind])
        match = re.findall('\'youtube_id\': u\'(.+?)\',.*?\'quiz_question\': (\d+?),', piece)

        if match:
            quiz_id.append(match[0])

    for v, i in quiz_id:
        quiz_hash[i] = v
    print 'STATUS: quiz_id Initialized.'

def download_video(urls):
    dirname = 'lecture ' + str(req_unit)
    py_path = path.abspath(sys.argv[0])
    py_path = path.dirname(py_path)
    if not os.path.exists(dirname):
        mkdir(dirname)
    chdir(dirname)
    for video_url in urls:
        video_id = parse_qs(urlparse(video_url).query)['v'][0]
        get_vars = parse_qs(unquote(urlopen("http://www.youtube.com/get_video_info?video_id="+video_id).read()))
        title = get_vars['title'][0] + '.flv'
        if os.path.isfile(title):
            continue;
        i = 0
        entries = (get_vars['fmt_list'][0]).split(',')
        for entry in entries:
            match = re.search(r'^45.*', entry)
            if match:
                break;
            i = i + 1;

        link = get_vars['itag'][i]
        link = re.findall(r'45,url=(.*)', link)[0]

        print '\n-->Downloading, Title: ', title
        urlretrieve(link, title)
        """
        for v in get_vars.keys():
            print v, '\n', get_vars[v], '\n\n'
        pdb.set_trace()
        """
    chdir(py_path)

def main():

    init_quiz_hash();
    page = urlopen("http://www.ai-class.com/home/")
    htmlSource = page.read()
    parser = UrlLister()
    print 'STATUS: Fetching video urls.'
    parser.feed(htmlSource)
    print 'STATUS: SUCCESS'
    page.close()
    parser.close()
    i = 0
    """
    for url in parser.urls:
        print 'url: ', url, '\n'
        i = i + 1
    """
    print 'Number of videos: ', len(parser.urls);
    print 'STATUS: Starting download.'
    download_video(parser.urls)
    print '\n\n*********Download Finished*********'

if __name__  == "__main__":
    main()

	#!/usr/bin/env python
	# -- coding: utf-8 --
	__author__ = "Deepak.G.R."
	__credits__ = "Sumod Hajela"
	__license__ = 'Public Domain'

	"""
	usage:
	Go to command line and type

	python ai-class.py "topic-name"

	topic-names can be "Welcome to AI", "Problem Solving"

	If download is interrupted, delete the last partial downloaded file restart. The script will skip all existing files and start with next new file.

	PS: Python2.6.2 should be installed in your system.

	Let me know if you have any problems.

	"""

	from urllib import *
	from urlparse import *
	from sgmllib import SGMLParser
	import re
	import pdb
	import sys
	import json
	from os import *
	import os.path

	url_youtube = 'http://www.youtube.com/watch?v='
	#req_unit = 'problem solving'
	#req_unit = 'welcome to AI'
	req_unit = sys.argv[1]
	quiz_hash = dict();

	class UrlLister(SGMLParser):

	def reset(self):
	SGMLParser.reset(self)
	self.urls = []
	self.flag = 0;
	self.req_unit = req_unit;
	self.names = [];

	def start_a(self, attrs):
	href = [value for name, value in attrs if name == 'href']
	topic = re.search(r'/course/topic/(\d)+', str(href[0]))

	if topic:
	self.flag = 0

	match = re.search(r'/course/video/\w+/\d+$', str(href[0]))

	if match and self.flag == 1:
	category = [value for name, value in attrs if name == 'id']

	if 'quiz' in category[0]:
	quiz_id = re.findall(r'quiz_(\d+)', category[0])[0]
	video_id = quiz_hash[quiz_id]
	else:
	video_id = re.findall(r'video_\d+_(.+)', category[0])[0]

	link = url_youtube + video_id
	self.urls.append(link)

	def handle_data(self, text):
	if self.flag == 0:
	text = text.strip();
	text = re.sub(r'[^A-Za-z]', '', text)
	self.req_unit = re.sub(r'[^A-Za-z]', '', self.req_unit)
	match = re.search(text, self.req_unit, re.IGNORECASE)

	if match and len(text) != 0:
	self.flag = 1


	def init_quiz_hash():
	print 'STATUS: Initializing quiz_id hash'
	quiz_url = 'http://www.ai-class.com/course/json/filter/QuizQuestion'
	quiz_url = urlopen(quiz_url)
	data = json.load(quiz_url)
	quiz_id = list()

	for ind in xrange(len(data['data'])):
	piece = str(data['data'][ind])
	match = re.findall('\'youtube_id\': u\'(.+?)\',.*?\'quiz_question\': (\d+?),', piece)

	if match:
	quiz_id.append(match[0])

	for v, i in quiz_id:
	quiz_hash[i] = v
	print 'STATUS: quiz_id Initialized.'

	def download_video(urls):
	dirname = 'lecture ' + str(req_unit)
	py_path = path.abspath(sys.argv[0])
	py_path = path.dirname(py_path)
	if not os.path.exists(dirname):
	mkdir(dirname)
	chdir(dirname)
	for video_url in urls:
	video_id = parse_qs(urlparse(video_url).query)['v'][0]
	get_vars = parse_qs(unquote(urlopen("http://www.youtube.com/get_video_info?video_id="+video_id).read()))
	title = get_vars['title'][0] + '.flv'
	if os.path.isfile(title):
	continue;
	i = 0
	entries = (get_vars['fmt_list'][0]).split(',')
	for entry in entries:
	match = re.search(r'^45.*', entry)
	if match:
	break;
	i = i + 1;

	link = get_vars['itag'][i]
	link = re.findall(r'45,url=(.*)', link)[0]

	print '\n-->Downloading, Title: ', title
	urlretrieve(link, title)
	"""
	for v in get_vars.keys():
	print v, '\n', get_vars[v], '\n\n'
	pdb.set_trace()
	"""
	chdir(py_path)

	def main():

	init_quiz_hash();
	page = urlopen("http://www.ai-class.com/home/")
	htmlSource = page.read()
	parser = UrlLister()
	print 'STATUS: Fetching video urls.'
	parser.feed(htmlSource)
	print 'STATUS: SUCCESS'
	page.close()
	parser.close()
	i = 0
	"""
	for url in parser.urls:
	print 'url: ', url, '\n'
	i = i + 1
	"""
	print 'Number of videos: ', len(parser.urls);
	print 'STATUS: Starting download.'
	download_video(parser.urls)
	print '\n\n*******Download Finished*******'

	if __name__ == "__main__":
	main()