Last active
March 6, 2016 12:26
-
-
Save hiepnd/3b65d6f239b35fc5c6be to your computer and use it in GitHub Desktop.
Just for fun!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
import os | |
import urllib | |
from threading import Thread | |
from Queue import Queue | |
from multiprocessing import Pool | |
import time | |
class CourseraSpider(scrapy.Spider):
    """Scrape a Coursera lecture index page and download the linked files.

    ``parse`` walks every section header on the page, collects the resource
    links of the lectures listed under it and hands them to ``download``,
    which puts the individual files on a shared queue. The queue is consumed
    by the ``DownloadWorker`` threads started in ``__init__``.
    """

    name = 'CourseraSprider'
    start_urls = ['https://class.coursera.org/ml-005/lecture']
    # Class-level queue: the same object is handed to every worker thread.
    queue = Queue()

    def __init__(self, name=None, **kwargs):
        """Start ten daemon download workers and record the start time."""
        super(CourseraSpider, self).__init__(name, **kwargs)
        for _ in range(10):
            worker = DownloadWorker(self.queue)
            # Daemon threads do not keep the process alive once the crawl ends.
            worker.daemon = True
            worker.start()
        self.ts = time.time()

    def parse(self, response):
        """Collect the lectures of every section and queue their files.

        Blocks until every queued download has been marked done, then prints
        the elapsed time since the spider was constructed.
        """
        for header in response.css('.course-item-list-header'):
            item = {
                'name': header.xpath('h3//text()').extract_first(),
                'lectures': [],
            }
            # The lecture list is the first sibling following the header.
            lecture_list = header.xpath('following-sibling::*[position()=1]')
            for lecture in lecture_list.xpath('child::*'):
                links = [
                    href.extract()
                    for href in lecture.css(
                        '.course-lecture-item-resource a::attr(href)')
                ]
                item['lectures'].append({
                    'name': lecture.xpath('a/text()').extract_first(),
                    # Only PDF links are collected; '.mp4' links are
                    # deliberately left out here, although download() knows
                    # how to name them.
                    'files': [link for link in links if '.pdf' in link],
                })
            self.download(item)
        self.queue.join()
        print('Download finish %s' % (time.time() - self.ts))

    def download(self, item):
        """Create the section directory and queue every file of the section.

        ``item`` is a dict with a ``name`` entry (section title, may be
        ``None``) and a ``lectures`` entry: a list of dicts, each with a
        ``name`` and a ``files`` list of URLs.
        """
        dir_name = self.trim(item['name'])
        if not dir_name:
            # Without a usable title there is no directory to create;
            # os.makedirs('') would raise.
            print('\nSkipping lesson without a name')
            return
        print('\nDownloading lesson: %s' % dir_name)
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)
        # Separate running counters number the videos and the PDFs of a section.
        video_count = pdf_count = 0
        for lecture in item['lectures']:
            lecture_name = self.trim(lecture['name'])
            print('\tDownloading lecture: %s' % lecture_name)
            for file_url in lecture['files']:
                print('\t\t%s' % file_url)
                if '.pdf' in file_url:
                    pdf_count += 1
                    # NOTE(review): PDFs are named after the section directory
                    # while videos are named after the lecture - confirm this
                    # asymmetry is intended.
                    self.queue_download(
                        file_url, dir_name,
                        '%d - %s.pdf' % (pdf_count, dir_name))
                if '.mp4' in file_url:
                    video_count += 1
                    self.queue_download(
                        file_url, dir_name,
                        '%d - %s.mp4' % (video_count, lecture_name))

    def trim(self, name):
        """Return ``name`` cleaned up for use as a file or directory name.

        Surrounding whitespace is stripped, everything from the first ``(``
        onwards is dropped (unless the name starts with it) and path
        separators are replaced by spaces. ``None`` yields an empty string.
        """
        if name is None:
            return ''
        name = name.strip()
        paren_at = name.find('(')
        if paren_at > 0:
            name = name[:paren_at].strip()
        return name.replace(os.path.sep, ' ')

    def queue_download(self, url, dir_name, file_name):
        """Queue ``url`` for download unless the target file already exists."""
        if not os.path.isfile(os.path.join(dir_name, file_name)):
            self.queue.put((url, dir_name, file_name))
class DownloadWorker(Thread):
    """Thread that downloads the files placed on a shared queue.

    Every queue entry is a ``(url, dir_name, file_name)`` tuple.
    """

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queue entries forever.

        A failed download is reported and skipped. The entry is always marked
        done, otherwise one error would kill this worker and leave anyone
        waiting in ``queue.join()`` blocked forever.
        """
        while True:
            # Get the work from the queue and expand the tuple
            url, dir_name, file_name = self.queue.get()
            try:
                self.do_download(url, dir_name, file_name)
            except Exception as exc:
                # %r keeps the message printable whatever the error text holds.
                print('\t\t\t !!! Failed to download %s: %r' % (file_name, exc))
            finally:
                self.queue.task_done()

    def do_download(self, url, dir_name, file_name):
        """Fetch ``url`` into ``dir_name/file_name``.

        Any exception from the transfer is re-raised after a partially
        written target file has been removed.
        """
        print('\t\t\t ==>  %s' % file_name)
        target = os.path.join(dir_name, file_name)
        try:
            urllib.urlretrieve(url, target)
        except Exception:
            # Files that already exist are never queued again, so a truncated
            # file left behind here would be skipped on the next run.
            if os.path.isfile(target):
                os.remove(target)
            raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment