Create a gist now

Instantly share code, notes, and snippets.

@cornchz /scraper.py Secret
Last active Aug 29, 2015

What would you like to do?
# -*- coding: utf-8 -*-
import requests
# urllib2
from urlparse import urljoin
from scrapy.selector import Selector
# lxml
# beautifulsoup
from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
# multiprocessing
def fetch_page(url, timeout=10):
    """1. Download a web page.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 10 so existing callers keep working.

    Returns:
        The decoded response body (``requests`` picks the encoding).
    """
    # Without an explicit timeout, requests.get blocks forever on a
    # stalled connection -- a well-known footgun for crawlers.
    r = requests.get(url, timeout=timeout)
    return r.text
def talk_links_from_listpage(url):
    """2. Extract the talk-page links from a listing page.

    Downloads *url*, pulls every talk link out of the listing markup,
    and returns them as absolute URLs.
    """
    document = fetch_page(url)
    selector = Selector(text=document)
    hrefs = selector.css('.talk-link .media__message a::attr(href)').extract()
    # The hrefs on the page are relative; resolve each against the
    # listing page's own URL to make them absolute.
    return [urljoin(url, href) for href in hrefs]
def talk_from_page(url):
'''3. 강의 페이지에서 강의 메타정보 추출'''
print url
html = fetch_page(url)
sel = Selector(text=html)
title = sel.css('.talk-hero__title::text').extract()
description = sel.css('.talk-description::text').extract()
return {
'title': title.pop(),
'description': description.pop(),
} # coding convention: PEP-8
def latest_talks(page=1):
    """4. Return metadata dicts for the talks on one listing page.

    Args:
        page: 1-based listing-page number on ted.com.

    Returns:
        A list of talk dicts, in the same order as the listing.
    """
    listing_url = 'http://www.ted.com/talks/browse?page={page}'.format(page=page)
    talk_urls = talk_links_from_listpage(listing_url)
    # Fetch all talk pages concurrently on a pool of 20 greenlets;
    # pool.map preserves the order of talk_urls in its result.
    workers = Pool(20)
    return workers.map(talk_from_page, talk_urls)
# talks = []
# for link in links:
# talk = talk_from_page(link)
# talks.append(talk)
if __name__ == '__main__':
    # Demo crawl of listing page 2. Guarded so that importing this
    # module does not kick off network requests as a side effect.
    from pprint import pprint
    pprint(latest_talks(2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment