Skip to content

Instantly share code, notes, and snippets.

@BadUncleX
Last active April 11, 2018 05:43
Show Gist options
  • Save BadUncleX/f0fb1afdfe3770cd2c74 to your computer and use it in GitHub Desktop.
Save BadUncleX/f0fb1afdfe3770cd2c74 to your computer and use it in GitHub Desktop.
python 抓取 coursera 字幕 (machine learning)
## Converted from Python 2 to Python 3 with 2to3
#Download all subtitle of videos on coursera for Machine Learning Course by Andrew Ng
#Author: Hebi
#Note: Before running the script create a folder named 'subtitle-cn' in the same directory
import urllib.request, urllib.error, urllib.parse
import os
import base64
# Download the subtitle file of every lecture (query ids 1..114) and save each
# one under ``output_dir``, named after the filename the server advertises in
# its Content-Disposition header.
url = 'https://class.coursera.org/ml-008/lecture/subtitles?q='
url2 = '_zh&format=txt'
#url2 = '_en&format=txt'
output_dir = 'subtitle-cn'

# Create the target folder up front so a missing directory cannot abort the
# run on the first write.
os.makedirs(output_dir, exist_ok=True)

print("Downloaded Subtitle No. ", end=' ')
for i in range(1, 115):
    try:
        # The context manager closes the HTTP response even when reading fails.
        with urllib.request.urlopen(url + str(i) + url2) as remoteurl:
            # get_filename() parses the Content-Disposition header and returns
            # None when the header or its filename parameter is absent.
            advertised_name = remoteurl.info().get_filename()
            page = remoteurl.read()
    except (urllib.error.HTTPError, urllib.error.URLError):
        print('\nProblem Downloading file: ', i)
        print('OR Connection Error')
        continue

    if advertised_name:
        filename = urllib.parse.unquote(advertised_name.strip("\"'"))
    else:
        # No usable name from the server: fall back to a numbered file rather
        # than crashing and losing the rest of the run.
        filename = 'subtitle_' + str(i) + '.txt'
    # The name comes from the remote server; neutralise path separators so it
    # cannot point outside output_dir.
    filename = filename.replace("/", "_").replace("\\", "_")
    print("filename:", filename)

    # read() returns bytes, so the file must be opened in binary mode; writing
    # the raw bytes also preserves the server's text encoding unchanged.
    with open(os.path.join(output_dir, filename), 'wb') as f:
        f.write(page)
    print(i, end=' ')

print('\nDownload Finished\nHappy Coding!!!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment