Forked from makerj/asyncio-uvloop-aiohttp_crawler.py
Created
February 23, 2017 10:54
-
-
Save hmphu/7d9f972a03845058d9865dc78c86dea4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
from concurrent.futures import ProcessPoolExecutor | |
from concurrent.futures import ThreadPoolExecutor | |
from functools import partial | |
import aiohttp | |
import bs4 | |
import uvloop | |
import mailer | |
# Install uvloop's event loop policy before any loop is created, so the loops
# created below come from uvloop.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Create and register the main loop explicitly.  Calling
# asyncio.get_event_loop() when no loop is current is deprecated and raises
# RuntimeError on recent Python versions; new_event_loop() + set_event_loop()
# gives the same result (a current loop for this thread) on every version.
main_loop = asyncio.new_event_loop()
asyncio.set_event_loop(main_loop)
# parse_page() calls main_loop.run_in_executor(None, ...), so this process
# pool is where the HTML parsing runs.
main_loop.set_default_executor(ProcessPoolExecutor())

# Separate loop whose default executor is a thread pool; main() uses it via
# mail_loop.run_in_executor(None, ...) to send notification mails.
mail_loop = asyncio.new_event_loop()
mail_loop.set_default_executor(ThreadPoolExecutor())
def _parse(body):
    """Extract post titles from the HTML of one board list page.

    The page is parsed with BeautifulSoup using the ``lxml`` parser.  The
    rows (``<tr>``) of the ``<tbody>`` of the table with class
    ``board_list_table`` are walked; rows whose class list contains
    ``notice`` are skipped, and so are rows whose ``divsn`` cell text is one
    of the excluded categories.

    Args:
        body: HTML of the page, as handed over by ``parse_page``.

    Returns:
        A list with the stripped link text of the ``subject`` cell of every
        remaining row.  An empty list is returned when an expected element
        (table, tbody, cell or link) is missing anywhere on the page.
    """
    excluded_categories = {'게임S/W', '게임H/W', '휴대폰'}
    titles = []
    try:
        soup = bs4.BeautifulSoup(body, 'lxml')
        table = soup.find('table', class_='board_list_table')
        for row in table.tbody.find_all('tr'):
            # Use Tag.get() rather than row['class']: a <tr> without a class
            # attribute would raise KeyError, which the AttributeError
            # handler below does not catch.
            if 'notice' in row.get('class', []):
                continue
            category = row.find('td', class_='divsn').text.strip()
            if category not in excluded_categories:
                titles.append(row.find('td', class_='subject').a.text.strip())
    except AttributeError:
        # A find() call returned None (missing element): treat the whole page
        # as unparseable, as the original code did.
        return []
    return titles
async def parse_page(body):
    """Run ``_parse(body)`` in ``main_loop``'s default executor.

    Args:
        body: Page HTML to hand to ``_parse``.

    Returns:
        Whatever ``_parse`` returns for ``body``.
    """
    # Passing None selects the loop's default executor.
    parse_job = main_loop.run_in_executor(None, _parse, body)
    return await parse_job
async def process_page(session, url):
    """Download ``url`` through ``session`` and parse the response body.

    Args:
        session: Object whose ``get(url)`` yields an async context manager
            for the response (main() passes an aiohttp ClientSession).
        url: Address of the page to fetch.

    Returns:
        The result of ``parse_page`` for the downloaded body.
    """
    async with session.get(url) as resp:
        raw_html = await resp.read()
        parsed_items = await parse_page(raw_html)
    return parsed_items
async def _watch_board(board_url, polling_delay, initial_pages):
    """Snapshot the first list pages, then poll the board and mail new titles.

    Runs until cancelled or until a fetch raises.
    """
    # ClientSession has to be entered with ``async with`` from inside a
    # coroutine; the synchronous ``with`` form is rejected by current aiohttp.
    async with aiohttp.ClientSession() as session:
        first_pages = await asyncio.gather(*[
            process_page(session, board_url + '/list?page={}'.format(page))
            for page in range(1, initial_pages + 1)
        ])
        # Titles already known; only titles outside this set trigger a mail.
        seen_titles = set()
        for titles in first_pages:
            seen_titles.update(titles)
        print(seen_titles)

        while True:
            await asyncio.sleep(polling_delay)
            print('polling...')
            new_titles = set(await process_page(session, board_url)) - seen_titles
            if new_titles:
                seen_titles |= new_titles
                # Fire-and-forget: the job is handed to mail_loop's default
                # executor and the returned future is not awaited, so the
                # outcome of send_mail is not observed here.
                mail_loop.run_in_executor(None, partial(
                    mailer.send_mail, 'ahenwkqdj@naver.com', 'new post!', str(new_titles)
                ))
                print(new_titles)


def main(board_url='http://bbs.ruliweb.com/community/board/300143',
         polling_delay=5, initial_pages=4):
    """Watch a board for new posts and send a mail when some appear.

    The first ``initial_pages`` list pages are crawled once to build the set
    of known titles.  After that, ``board_url`` is fetched every
    ``polling_delay`` seconds; titles not seen before are printed, mailed and
    added to the known set.  The call blocks on ``main_loop`` and only ends
    when the polling coroutine raises or the process is interrupted.

    Args:
        board_url: Base URL of the board.  The default is the Humor board
            (eligible for testing); the main target is the Hot Deal board,
            'http://bbs.ruliweb.com/news/board/1020'.
        polling_delay: Seconds to sleep between two polls.
        initial_pages: Number of list pages (starting at page 1) used for the
            initial snapshot.
    """
    main_loop.run_until_complete(
        _watch_board(board_url, polling_delay, initial_pages)
    )
# Start the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment