Skip to content

Instantly share code, notes, and snippets.

@emersonmoretto
Created October 26, 2011 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emersonmoretto/1317503 to your computer and use it in GitHub Desktop.
Save emersonmoretto/1317503 to your computer and use it in GitHub Desktop.
ML-Class subtitles downloader and converter (to srt) script
import urllib2
from cStringIO import StringIO
import re
import os
import glob
import sys,codecs
## To run: python ml-class_subtitle_down.py
## This script scans the $path folder for *.mp4 files (videos downloaded from
## ml-class), downloads the subtitles for each one, then converts them to 'srt'.
## You must edit the $path and $cookie vars before running.
## YOU MUST EDIT THESE 2 VARS:
# Put your cookie here (get it from the Firebug Net tab). It is sent verbatim
# as the 'Cookie' header of the subtitle requests.
# NOTE(review): the value carries a `session=` token -- presumably a login
# credential; avoid publishing a real one.
cookie = "__utma=38404479.953217282.1319653074.1319653074.1319653074.1; __utmb=38404479.3.10.1319653074; __utmc=38404479; __utmz=38404479.1319653074.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); session=NDlhNDBlNDE1OTFkMjg2Y2YyOWMzZTNmMzFjNzM0YWYyZGQ5YzhiM3w5bEc5TnNKdzZwalBkc1FTL1pXeElMcFpiSGpzdThDNHNuOUp2M0F5YllYSHpoOGRtYlYwMzdjbmhmRHhpd0lFV2RrbUJSYWlLNEt2UFlhNEVnV3VDRXRzVGlkWS9IYnJFc0RDWXpnL1VCWHZIamU0RkRnb252TDdhVDZ6UlZEcE5TbGFiSnNjRnR4RVB4TTAxSVdUU3dUNlNwRm9pRVdJVFdNbnhaM2ZRRWlLT3AzNjhZcHpRRWUxVEY0V3R3bnRNTE8reTFZYjZqWFVzNWlIWUtBajBuNDhGcERIaXAzZlc0bG80QWM4L0pJPQ%3D%3D"
# Path of the folder that holds your video files.
path = 'Downloads/'
## Nothing below here should need editing (maybe).
# xml2srt stolen from https://gist.github.com/1277162
# NOTE(review): not referenced by any code visible in this file; kept so that
# anything else relying on the module-level name keeps working.
limit=[60,60,60,1000]


def _first_quoted(fragment):
    """Return the text between the first pair of double quotes in *fragment*.

    Raises ValueError when *fragment* does not contain two double quotes.
    """
    start = fragment.index('"') + 1
    return fragment[start:fragment.index('"', start)]


def _srt_timestamp(stamp):
    """Convert an ``HH:MM:SS[.fraction]`` stamp to SRT's ``HH:MM:SS,mmm`` form.

    The fraction is a decimal fraction of a second, so it is padded or
    truncated to exactly three digits (``.5`` -> 500 ms) instead of being read
    as a raw millisecond count.  A stamp without a fraction yields ``,000``.
    """
    clock, _, fraction = stamp.partition('.')
    fields = [int(part) for part in clock.split(':')]
    fields.append(int((fraction + '000')[:3]))
    return "%02d:%02d:%02d,%03d" % tuple(fields)


def xml2srt(fi,fo):
    """Convert a subtitle XML document read from *fi* into SRT written to *fo*.

    fi -- readable text file object holding the XML subtitles.
    fo -- writable text file object receiving the SRT output.

    The first 9 and last 4 lines of the input are discarded, the remaining
    lines are glued together and split on ``</p>``.  For every non-empty
    piece, the first double-quoted value is taken as the cue's start time and
    the text after the first ``>`` as its caption.  A cue ends where the next
    one starts; the last cue ends at 99:59:59.999.

    Neither file object is closed here; that is left to the caller.
    Raises ValueError when a piece lacks the expected quotes / ``>`` or holds
    a non-numeric timestamp.
    """
    lines = fi.read().split('\n')
    pieces = ''.join(lines[9:-4]).strip().split('</p>')
    # Whatever follows the final '</p>' is never treated as a cue.
    cue_count = len(pieces) - 1

    for idx in range(cue_count):
        cue = pieces[idx]
        if not cue:
            continue

        begin = _first_quoted(cue)
        if idx + 1 < cue_count:
            end = _first_quoted(pieces[idx + 1])
        else:
            # No following cue to borrow a start time from.
            end = "99:59:59.999"
        caption = cue[cue.index('>') + 1:].replace('\n', ' ')

        fo.write(str(idx + 1) + ' \n')
        fo.write("%s --> %s \n" % (_srt_timestamp(begin), _srt_timestamp(end)))
        fo.write(caption + ' \n\n\n')
# Install a global urllib2 opener that sends the session cookie with every
# request made through urllib2.urlopen below.
opener = urllib2.build_opener()
opener.addheaders = [('Cookie', cookie)]
urllib2.install_opener(opener)

print('Getting subtitles...')

# filter (default *.mp4)
ext = "*.mp4"

for video in glob.glob(os.path.join(path, ext)):
    # basename/splitext only touch the directory and the extension, unlike
    # str.replace, which rewrites every occurrence inside the name and
    # depends on `path` ending with a separator.
    video_name = os.path.basename(video)
    stem = os.path.splitext(video_name)[0]

    # File names are expected to start with a two-digit number.
    try:
        lecture = int(video_name[:2])
    except ValueError:
        print("Skipping (name does not start with a number): " + video_name)
        continue

    if lecture >= 6:
        infile = stem + "-subtitles.xml"  # videos after 06.** have a -subtitles suffix
    else:
        infile = stem + ".xml"
    srt_name = os.path.splitext(infile)[0] + ".srt"

    url = "http://s3.amazonaws.com/stanford_videos/cs229/subtitles/"+infile+"?refresh=922262637"
    print("Getting  " + url)

    # Download before creating the file so a failed request does not leave an
    # empty .xml behind, and always release the connection.
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    with open(infile, 'w') as fo:
        fo.write(payload)

    print("Converting  " + infile)
    # Context managers make sure the .srt is flushed and both handles closed.
    with open(infile, 'r') as fi:
        with open(srt_name, 'w') as fo:
            xml2srt(fi, fo)
    print('Done: ' + infile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment