# lorantto/ai-class.py

## ai-class.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "Deepak.G.R."
__license__ = 'Public Domain'

"""
usage:
Go to command line and type

python ai-class.py "topic-name"

topic-names can be "Welcome to AI", "Problem Solving"

PS: Python 2.7.2 should be installed on your system.

Let me know if you run into any problems.
"""

from urllib import *
from urlparse import *
from sgmllib import SGMLParser
from os import *
from json import *
import re
import pdb
import sys
import json
import urllib2


# Module-level configuration shared by the functions below.

# Value that match_highest_res() compares against the fourth numeric field of
# each "fmt_list" entry when picking the preferred stream.
video_code = 9

# Used in this block only to choose the file extension of saved videos.
code = 22
# NOTE(review): the string below labels 35 as the default although `code` is
# set to 22, and "1270*720" is presumably meant to be 1280*720 -- confirm.
"""
code = 34 for 640*360
code = 35 for 854*480(Default)
code = 22 for 1270*720
"""
if code == 22:
    video_fmt = '.mp4'
else:
    video_fmt = '.flv'


# Prefix to which a video id is appended to form a watch URL.
url_youtube = 'http://www.youtube.com/watch?v='
# Filled by init_quiz_hash(): quiz id (string of digits) -> list of video ids.
quiz_hash = dict();

# Requested unit/topic name, taken from the first command-line argument.
# Evaluated at import time; raises IndexError when no argument is given.
req_unit = sys.argv[1]

class UrlLister(SGMLParser):
    """Collect YouTube watch URLs for the videos of the requested unit.

    The parser walks the course page and keeps a small state flag:

    * ``flag == 0`` -- still looking for the heading of the requested unit;
    * ``flag == 1`` -- inside the requested unit, video links are collected.

    Any link to ``/course/topic/<n>`` resets the flag to 0; the text that
    follows is then compared with the requested unit name in handle_data().
    Collected URLs are stored, in document order, in ``self.urls``.
    """

    def reset(self):
        """Reset the parser state (SGMLParser calls this on construction)."""
        SGMLParser.reset(self)
        self.urls = []
        self.flag = 0
        self.req_unit = req_unit
        self.names = []

    def _first_attr(self, attrs, wanted):
        """Return the value of the first attribute named *wanted*, or None."""
        for name, value in attrs:
            if name == wanted:
                return value
        return None

    def start_a(self, attrs):
        """Handle an ``<a>`` tag given its list of (name, value) attributes.

        Anchors without an ``href`` or, for video links, without a usable
        ``id`` are ignored instead of raising IndexError/KeyError.
        """
        href = self._first_attr(attrs, 'href')
        if href is None:
            # e.g. <a name="...">: nothing to inspect.
            return
        href = str(href)

        # A topic link starts a new unit, so stop collecting until the
        # following text is matched against the requested unit again.
        if re.search(r'/course/topic/(\d)+', href):
            self.flag = 0

        if self.flag != 1 or not re.search(r'/course/video/\w+/\d+$', href):
            return

        element_id = self._first_attr(attrs, 'id')
        if not element_id:
            return

        if 'quiz' in element_id:
            quiz_ids = re.findall(r'quiz_(\d+)', element_id)
            if not quiz_ids:
                return
            # A quiz may map to several videos; unknown quizzes add nothing.
            for video_id in quiz_hash.get(quiz_ids[0], []):
                self.urls.append(url_youtube + video_id)
        else:
            video_ids = re.findall(r'video_\d+_(.+)', element_id)
            if video_ids:
                self.urls.append(url_youtube + video_ids[0])

    def handle_data(self, text):
        """Start collecting once text matching the requested unit is seen.

        Both strings are reduced to ASCII letters only.  The page text is
        used as the pattern, so it matches (case-insensitively) when the
        requested unit name starts with those letters.
        """
        if self.flag != 0:
            return

        text = re.sub(r'[^A-Za-z]', '', text.strip())
        self.req_unit = re.sub(r'[^A-Za-z]', '', self.req_unit)
        # An empty pattern would match anything, so require some letters.
        if text and re.match(text, self.req_unit, re.IGNORECASE):
            self.flag = 1


def init_quiz_hash():
    """Fill the module-level ``quiz_hash`` from the course quiz JSON feed.

    Each item of the feed's ``data`` list is converted to its string form
    and scanned for ``youtube_id`` / ``quiz_question`` pairs.  Every pair
    adds the video id to the list stored under the quiz number (kept as a
    string of digits, which is how UrlLister looks it up).
    """
    print('STATUS: Initializing quiz_id hash')
    quiz_url = 'http://www.ai-class.com/course/json/filter/QuizQuestion'

    response = urllib2.urlopen(quiz_url)
    try:
        data = json.load(response)
    finally:
        # Release the connection even when the payload is not valid JSON.
        response.close()

    # Matches the Python 2 repr of an item: 'youtube_id': u'<id>', ...
    # 'quiz_question': <digits>,
    pair_pattern = re.compile(
        r"'youtube_id': u'(.+?)',.*?'quiz_question': (\d+?),")

    for item in data['data']:
        for youtube_id, quiz_number in pair_pattern.findall(str(item)):
            quiz_hash.setdefault(quiz_number, []).append(youtube_id)

    print('STATUS: quiz_id Initialized.')

def match_highest_res(entries, preferred_code=None):
    """Pick the highest-resolution entry from a list of format descriptors.

    Each entry is expected to look like ``"<tag>/<width>x<height>/<n>/..."``;
    entries that do not match are skipped.  Entries whose fourth numeric
    field equals *preferred_code* are preferred: the one with the largest
    width*height wins.  If no entry has that code, the largest entry overall
    is returned instead.

    Args:
        entries: iterable of descriptor strings.
        preferred_code: value to look for in the fourth numeric field;
            defaults to the module-level ``video_code``.

    Returns:
        ``(index, tag)`` of the chosen entry, where ``index`` is its position
        in *entries* and ``tag`` is the first field as a string, or
        ``(-1, '')`` when no entry could be parsed.
    """
    if preferred_code is None:
        preferred_code = video_code

    pattern = re.compile(r'^([0-9]+)\/([0-9]+)x([0-9]+)\/([0-9]+)\/.*')

    # Track the preferred and the overall winner independently, so that the
    # fallback really is the largest entry and not merely the last one seen.
    best_index, best_tag, best_area = -1, '', 0
    pref_index, pref_tag, pref_area = -1, '', 0

    for index, entry in enumerate(entries):
        found = pattern.search(entry)
        if not found:
            continue

        tag = found.group(1)
        area = int(found.group(2)) * int(found.group(3))
        fmt_code = int(found.group(4))

        if area > best_area:
            best_index, best_tag, best_area = index, tag, area
        if fmt_code == preferred_code and area > pref_area:
            pref_index, pref_tag, pref_area = index, tag, area

    if pref_index >= 0:
        return (pref_index, pref_tag)
    return (best_index, best_tag)

def download_video(urls):
    """Download every video in *urls* into a directory named after the unit.

    The directory is named after ``req_unit`` and is created in the current
    working directory when missing.  When it already exists, its most
    recently modified file is deleted first (presumably because it may be an
    incomplete download -- confirm).  Videos whose target file already exists
    are skipped.  On return, including after an error, the working directory
    is the directory containing this script.

    Args:
        urls: iterable of watch URLs carrying the video id in the ``v``
            query parameter.
    """
    dirname = str(req_unit)
    py_path = path.dirname(path.abspath(sys.argv[0]))

    if path.exists(dirname):
        # delete_recent_video() also changes into dirname.
        delete_recent_video(dirname)
    else:
        mkdir(dirname)
        chdir(dirname)

    try:
        for video_url in urls:
            video_id = parse_qs(urlparse(video_url).query)['v'][0]

            response = urlopen(
                "http://www.youtube.com/get_video_info?video_id=" + video_id)
            try:
                raw_info = response.read()
            finally:
                response.close()
            get_vars = parse_qs(unquote(raw_info))

            # '/' cannot appear in a file name; keep the file inside dirname.
            title = get_vars['title'][0].replace('/', '_') + video_fmt

            if path.isfile(title):
                continue

            entries = (get_vars['fmt_list'][0]).split(',')
            i, _tag = match_highest_res(entries)
            if i < 0:
                print('ERROR: Couldn\'t Download video:  %s' % (title,))
                continue

            try:
                link = re.findall(r'\d+,url=(.*)', get_vars['itag'][i])[0]
            except (KeyError, IndexError):
                # No stream URL for the chosen format: skip this video
                # rather than aborting the remaining downloads.
                print('ERROR: Couldn\'t Download video:  %s' % (title,))
                continue

            print('\n-->Downloading, Title:  %s' % (title,))

            urlretrieve(link, title)
    finally:
        # Leave the download directory even if a download failed.
        chdir(py_path)

def delete_recent_video(dirname):
    """Change into *dirname* and delete its most recently modified file.

    Only regular files are considered.  When the directory contains no
    files, nothing is removed.  The working directory stays *dirname*
    afterwards; download_video() relies on that.

    Args:
        dirname: path of an existing directory.
    """
    chdir(dirname)

    newest_name = None
    newest_mtime = None
    for entry in listdir('.'):
        if not path.isfile(entry):
            # remove() cannot delete directories; ignore them.
            continue
        modified = stat(entry).st_mtime
        if newest_mtime is None or modified > newest_mtime:
            newest_name, newest_mtime = entry, modified

    if newest_name is not None:
        remove(newest_name)

def main():
    """Fetch the course page, collect the unit's video URLs and download them.

    Steps: build the quiz lookup table, parse the course home page with
    UrlLister, report how many video URLs were found, then hand them to
    download_video().
    """
    init_quiz_hash()

    page = urllib2.urlopen("http://www.ai-class.com/home/")
    parser = UrlLister()
    try:
        htmlSource = page.read()
        print('STATUS: Fetching video urls.')
        parser.feed(htmlSource)
        print('STATUS: SUCCESS')
    finally:
        # Close the connection even when reading or parsing fails.
        page.close()
    # Lets the parser process any data it still has buffered.
    parser.close()

    print('Number of videos:  %d' % len(parser.urls))
    print('STATUS: Starting download.')

    download_video(parser.urls)

    print('\n\n*********Download Finished*********')

# Script entry point.  The text that used to follow this guard was a second,
# indentation-stripped copy of this whole script; it repeated the definitions
# above and made the file impossible to parse, so it has been removed.
if __name__ == "__main__":
    main()