# emersonmoretto/ml-class_subtitle_down.py

## ml-class_subtitle_down.py
import urllib2
from cStringIO import StringIO
import re
import os
import glob
import sys,codecs


## To Run: python ml-class_subtitle_down.py

## This script scans the $path folder for *.mp4 files (videos downloaded from ml-class), downloads the matching subtitles, and converts them to 'srt'.
## You must edit the $path and $cookie variables below.


## YOU MUST EDIT THESE 2 VARS:

# Put your own cookie here (copy the Cookie request header from the Firebug Net tab).
# This string is sent verbatim as the Cookie header on the subtitle requests.
# NOTE(review): this looks like a real session cookie hard-coded in source --
# consider keeping it out of the committed file.
cookie = "__utma=38404479.953217282.1319653074.1319653074.1319653074.1; __utmb=38404479.3.10.1319653074; __utmc=38404479; __utmz=38404479.1319653074.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); session=NDlhNDBlNDE1OTFkMjg2Y2YyOWMzZTNmMzFjNzM0YWYyZGQ5YzhiM3w5bEc5TnNKdzZwalBkc1FTL1pXeElMcFpiSGpzdThDNHNuOUp2M0F5YllYSHpoOGRtYlYwMzdjbmhmRHhpd0lFV2RrbUJSYWlLNEt2UFlhNEVnV3VDRXRzVGlkWS9IYnJFc0RDWXpnL1VCWHZIamU0RkRnb252TDdhVDZ6UlZEcE5TbGFiSnNjRnR4RVB4TTAxSVdUU3dUNlNwRm9pRVdJVFdNbnhaM2ZRRWlLT3AzNjhZcHpRRWUxVEY0V3R3bnRNTE8reTFZYjZqWFVzNWlIWUtBajBuNDhGcERIaXAzZlc0bG80QWM4L0pJPQ%3D%3D"

# Directory that holds your downloaded video files; it is searched for *.mp4.
path = 'Downloads/'


## You probably do not need to edit anything below this line.

# xml2srt was taken from https://gist.github.com/1277162
# NOTE(review): `limit` is not referenced anywhere in the visible code;
# presumably the per-field bounds of an h:m:s.ms timestamp -- confirm before removing.
limit=[60,60,60,1000]
# End time given to the final cue, which has no following cue to borrow from.
_LAST_CUE_END = "99:59:59.999"


def _first_quoted(fragment):
    """Return the text between the first pair of double quotes in *fragment*."""
    start = fragment.index('"')
    end = fragment.index('"', start + 1)
    return fragment[start + 1:end]


def _srt_stamp(stamp):
    """Turn an ``HH:MM:SS.fff`` clock value into SubRip's ``HH:MM:SS,mmm``."""
    whole, _, frac = stamp.partition('.')
    fields = [int(part) for part in whole.split(':')]
    # The fraction is a decimal fraction of a second, so right-pad it to
    # exactly three digits (".5" means 500 ms, not 5 ms); a missing fraction
    # counts as zero milliseconds.
    fields.append(int((frac + '000')[:3]))
    return "%02d:%02d:%02d,%03d" % tuple(fields)


def xml2srt(fi,fo):
    """Convert an XML subtitle stream into SubRip (.srt) cues.

    The first 9 and the last 4 lines of the input are discarded, the
    remaining lines are concatenated, and the result is split on ``</p>``.
    For every non-empty fragment the first double-quoted value is used as
    the cue's start time and everything after the first ``>`` as its text.
    A cue ends where the next one starts; the final cue ends at
    99:59:59.999.

    Args:
        fi: readable file-like object holding the XML text.
        fo: writable file-like object that receives the SRT text.
            Neither object is closed by this function.

    Raises:
        ValueError: if a fragment has no quoted timestamp or no ``>``, or
            if a timestamp field is not an integer.
        TypeError: if a timestamp does not have exactly three ``:`` fields.
    """
    body = ''.join(fi.read().split('\n')[9:-4]).strip()
    # Whatever follows the final '</p>' is trailing markup, not a cue.
    cues = body.split('</p>')[:-1]
    total = len(cues)
    for position, cue in enumerate(cues):
        if not cue:
            continue
        begin = _first_quoted(cue)
        if position + 1 < total:
            finish = _first_quoted(cues[position + 1])
        else:
            finish = _LAST_CUE_END
        text = cue[cue.index('>') + 1:].replace('\n', ' ') + ' \n\n\n'
        # Cue numbers follow the fragment position, so skipped empty
        # fragments leave gaps exactly as before.
        fo.write(str(position + 1) + ' \n')
        fo.write("%s --> %s \n" % (_srt_stamp(begin), _srt_stamp(finish)))
        fo.write(text)


# Install a global opener so that every urllib2.urlopen() call below sends
# the session cookie in its Cookie header.
opener = urllib2.build_opener()
opener.addheaders = [('Cookie', cookie)]
urllib2.install_opener(opener)

print('Getting subtitles...')

# filter (default *.mp4)
ext = "*.mp4"


# For every video found in `path`: download its subtitle XML into the current
# directory, then convert that XML to an .srt file next to it.
for infile in glob.glob( os.path.join(path, ext) ):
    # basename() works whether or not `path` ends with a separator, unlike
    # stripping the `path` string out of the matched name.
    infile = os.path.basename(infile)
    stem = os.path.splitext(infile)[0]
    try:
        lecture = int(infile[:2])
    except ValueError:
        # Not named like a lecture video ("NN...."), so there is no subtitle
        # URL to derive for it; move on instead of aborting the whole run.
        print('Skipping (no two-digit lecture prefix): ' + infile)
        continue
    # Only the extension is swapped, so an "mp4"/"xml" substring elsewhere in
    # the file name is left alone.
    if lecture >= 6:
        infile = stem + "-subtitles.xml"	# videos after 06.** have -subtitles suffix
    else:
        infile = stem + ".xml"
    url = "http://s3.amazonaws.com/stanford_videos/cs229/subtitles/"+infile+"?refresh=922262637"
    print("Getting  " + url)
    # Download before creating the local file so a failed request does not
    # leave an empty .xml behind.
    response = urllib2.urlopen(url)
    try:
        xml_data = response.read()
    finally:
        response.close()
    with open(infile, 'w') as fo:
        fo.write(xml_data)
    print("Converting  " + infile)
    srt_name = os.path.splitext(infile)[0] + ".srt"
    # The with-blocks guarantee the .srt is flushed and both handles closed.
    with open(infile, 'r') as fi:
        with open(srt_name, 'w') as fo:
            xml2srt(fi, fo)
    print('Done: ' + infile)