Skip to content

Instantly share code, notes, and snippets.

@hiepnd
Last active March 6, 2016 12:26
Show Gist options
  • Save hiepnd/3b65d6f239b35fc5c6be to your computer and use it in GitHub Desktop.
Save hiepnd/3b65d6f239b35fc5c6be to your computer and use it in GitHub Desktop.
Just for fun!
import scrapy
import os
import urllib
from threading import Thread
from Queue import Queue
from multiprocessing import Pool
import time
class CourseraSpider(scrapy.Spider):
    """Scrape a Coursera lecture index page and download lecture resources.

    ``parse`` walks every section header on the page, collects the resource
    links of the lectures listed under it, and pushes the wanted files onto
    ``queue``, where a pool of ``DownloadWorker`` threads picks them up.
    """

    # NOTE(review): 'Sprider' looks like a typo for 'Spider'; left untouched
    # because this string is the spider's runtime name.
    name = 'CourseraSprider'
    start_urls = ['https://class.coursera.org/ml-005/lecture']
    # Defined on the class, so it is shared by all instances and their workers.
    queue = Queue()
    # Number of DownloadWorker threads started by __init__.
    worker_count = 10
    # A resource link is kept when it contains one of these substrings.
    # Add '.mp4' to fetch the videos as well; download() already handles them.
    download_extensions = ('.pdf',)

    def __init__(self, name=None, **kwargs):
        """Initialise the spider, start the worker threads, record the start time."""
        super(CourseraSpider, self).__init__(name, **kwargs)
        # int() tolerates a value supplied as a string (e.g. a spider argument).
        for _ in range(int(self.worker_count)):
            worker = DownloadWorker(self.queue)
            # Daemon threads do not keep the process alive once the crawl ends.
            worker.daemon = True
            worker.start()
        self.ts = time.time()

    def parse(self, response):
        """Collect the files of every section and queue them for download.

        Blocks until the queue has been fully processed, then prints the
        time elapsed since the spider was created.
        """
        for header in response.css('.course-item-list-header'):
            item = dict(name=header.xpath('h3//text()').extract_first())
            item['lectures'] = list()
            # The lecture list is the first sibling following the header.
            lectures = header.xpath('following-sibling::*[position()=1]')
            for lecture in lectures.xpath('child::*'):
                data = dict()
                data['name'] = lecture.xpath('a/text()').extract_first()
                # Each lecture has several resources; keep only the wanted kinds.
                links = lecture.css(
                    '.course-lecture-item-resource a::attr(href)').extract()
                data['files'] = [
                    link for link in links
                    if any(ext in link for ext in self.download_extensions)
                ]
                item['lectures'].append(data)
            self.download(item)
        # Wait until the workers have marked every queued file as done.
        self.queue.join()
        print('Download finish %s' % (time.time() - self.ts))

    def download(self, item):
        """Create the section directory and queue every file of its lectures.

        ``item`` is a dict with a ``'name'`` and a ``'lectures'`` list; each
        lecture is a dict with a ``'name'`` and a ``'files'`` list of URLs.
        """
        # Fall back to a fixed name so a section without a usable title does
        # not make os.makedirs('') fail.
        dir_name = self.trim(item['name']) or 'Untitled'
        print('\nDownloading lesson: %s' % dir_name)
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)
        # Running counters, per section, used as file-name prefixes.
        video_count = pdf_count = 0
        for lecture in item['lectures']:
            lecture_name = self.trim(lecture['name'])
            print('\tDownloading lecture: %s' % lecture_name)
            for url in lecture['files']:
                print('\t\t%s' % url)
                if '.pdf' in url:
                    pdf_count += 1
                    # NOTE(review): PDFs are named after the section while
                    # videos use the lecture name. Kept as is so files from
                    # earlier runs are still recognised; confirm the intent.
                    self.queue_download(
                        url, dir_name,
                        '%d - %s.pdf' % (pdf_count, dir_name))
                if '.mp4' in url:
                    video_count += 1
                    self.queue_download(
                        url, dir_name,
                        '%d - %s.mp4' % (video_count, lecture_name))

    def trim(self, name):
        """Return ``name`` cleaned up for use as a file or directory name.

        Surrounding whitespace is stripped, everything from the first ``(``
        onwards is dropped, and path separators are replaced by spaces.
        ``None`` (no text found on the page) yields an empty string.
        """
        if name is None:
            return ''
        name = name.strip()
        paren = name.find('(')
        # '> 0' on purpose: a name that starts with '(' is kept whole rather
        # than being reduced to an empty string.
        if paren > 0:
            name = name[:paren].strip()
        # os.path.altsep is None on POSIX and '/' on Windows.
        for separator in (os.path.sep, os.path.altsep):
            if separator:
                name = name.replace(separator, ' ')
        return name

    def queue_download(self, url, dir_name, file_name):
        """Queue ``url`` for download unless the target file already exists."""
        if not os.path.isfile(os.path.join(dir_name, file_name)):
            self.queue.put((url, dir_name, file_name))
class DownloadWorker(Thread):
    """Thread that downloads the files described by tuples taken from a queue.

    Each queue entry is a ``(url, dir_name, file_name)`` tuple.
    """

    def __init__(self, queue):
        """Remember the queue this worker consumes from."""
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queue entries forever, one download at a time.

        A failing download is reported and skipped. ``task_done()`` is called
        for every entry no matter what, so a single error can neither kill
        the worker nor leave a ``queue.join()`` caller blocked forever.
        """
        while True:
            job = self.queue.get()
            try:
                # Unpack inside the try so a malformed entry is also reported
                # instead of terminating the thread.
                url, dir_name, file_name = job
                self.do_download(url, dir_name, file_name)
            except Exception as exc:
                print('\t\t\t !!  download failed for %r: %s' % (job, exc))
            finally:
                self.queue.task_done()

    def do_download(self, url, dir_name, file_name):
        """Fetch ``url`` and store it as ``file_name`` inside ``dir_name``."""
        print('\t\t\t ==>  %s' % file_name)
        urllib.urlretrieve(url, os.path.join(dir_name, file_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment