-
-
Save cornchz/c340bd469d6a2f31827d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
# urllib2 | |
from urlparse import urljoin | |
from scrapy.selector import Selector | |
# lxml | |
# beautifulsoup | |
from gevent import monkey; monkey.patch_all() | |
from gevent.pool import Pool | |
# multiprocessing | |
def fetch_page(url, timeout=10):
    """1. Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. Passed
            straight through to ``requests.get``; pass ``None`` to wait
            indefinitely, which is what this function did before the
            parameter existed.

    Returns:
        The response body decoded to text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    # requests applies no timeout unless told to, so a single stalled
    # connection would otherwise hang the caller forever.
    response = requests.get(url, timeout=timeout)
    return response.text
def talk_links_from_listpage(url):
    """2. Extract the talk links from a list page.

    Downloads ``url``, collects the ``href`` attribute of every anchor
    matching ``.talk-link .media__message a`` and resolves each one against
    ``url`` with ``urljoin``.

    Returns:
        A list of the resolved link URLs.
    """
    page = Selector(text=fetch_page(url))
    hrefs = page.css('.talk-link .media__message a::attr(href)').extract()
    # Built eagerly as a list; a generator expression would also work for a
    # caller that only iterates once.
    return [urljoin(url, href) for href in hrefs]
def talk_from_page(url):
    """3. Extract a talk's metadata from its page.

    Args:
        url: Address of a single talk page.

    Returns:
        A dict with the keys ``'title'`` and ``'description'``. Each value
        is the last text node matched by the corresponding CSS selector, or
        ``None`` when that selector matches nothing on the page.
    """
    # Parenthesised so the line also parses on Python 3; on Python 2 it
    # prints exactly what the bare ``print url`` statement printed.
    print(url)
    html = fetch_page(url)
    sel = Selector(text=html)
    title = sel.css('.talk-hero__title::text').extract()
    description = sel.css('.talk-description::text').extract()
    # ``extract()`` returns an empty list when nothing matches; indexing
    # behind a guard avoids the IndexError that ``pop()`` raised in that
    # case, so one oddly-shaped page no longer fails the whole call.
    return {
        'title': title[-1] if title else None,
        'description': description[-1] if description else None,
    }
def latest_talks(page=1, concurrency=20):
    """4. Return the talks listed on one page of the TED browse listing.

    Args:
        page: Page number substituted into the browse URL.
        concurrency: Size of the gevent pool, i.e. the maximum number of
            talk pages fetched at the same time. Defaults to 20, the value
            that used to be hard-coded.

    Returns:
        A list with one ``talk_from_page`` result per link found on the
        list page.
    """
    url = 'http://www.ted.com/talks/browse?page={page}'.format(page=page)
    links = talk_links_from_listpage(url)
    pool = Pool(concurrency)
    # Concurrent form of ``[talk_from_page(link) for link in links]``.
    return pool.map(talk_from_page, links)
# Script entry: fetch page 2 of the listing and pretty-print the talks.
from pprint import pprint

pprint(latest_talks(page=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment