Last active
February 23, 2017 10:54
-
-
Save makerj/b0ab6a5bbe912ab660e0609f399fa2d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
from concurrent.futures import ProcessPoolExecutor | |
from concurrent.futures import ThreadPoolExecutor | |
from functools import partial | |
import aiohttp | |
import bs4 | |
import uvloop | |
import mailer | |
# Install uvloop's policy first so that the loops obtained/created below are
# produced by it.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
# Loop that drives fetching and polling. Its default executor is a process
# pool, which is what ``run_in_executor(None, ...)`` on this loop dispatches to
# (used by parse_page for HTML parsing).
main_loop = asyncio.get_event_loop()
main_loop.set_default_executor(ProcessPoolExecutor())
# Separate loop whose default executor is a thread pool; main() hands the
# blocking mailer.send_mail call to it.
# NOTE(review): nothing in the visible code ever runs this loop -- it is only
# used as a handle for submitting work to its executor.
mail_loop = asyncio.new_event_loop()
mail_loop.set_default_executor(ThreadPoolExecutor())
def _parse(body):
    """Extract post titles from the HTML of one board list page.

    The page is expected to contain a ``<table class="board_list_table">``
    whose ``<tbody>`` holds one ``<tr>`` per post. Rows whose ``class``
    attribute contains ``notice`` are ignored. For every remaining row the
    text of the ``td.divsn`` cell is read as the category, and the link text
    inside ``td.subject`` is collected when the category is *not* one of
    '게임S/W', '게임H/W' or '휴대폰'.

    Args:
        body: Raw HTML (bytes or str) of the page.

    Returns:
        A list of stripped title strings. An empty list is returned if the
        expected table/cell structure is missing anywhere on the page
        (an ``AttributeError`` from chaining through a ``None`` lookup).
    """
    interested_items = []
    try:
        soup = bs4.BeautifulSoup(body, 'lxml')
        table = soup.find('table', class_='board_list_table')
        rows = table.tbody.find_all('tr')
        for row in rows:
            # Tag.get() instead of row['class']: indexing raises KeyError for
            # a <tr> without a class attribute, which the AttributeError
            # handler below would not catch.
            if 'notice' in row.get('class', []):
                continue
            category = row.find('td', class_='divsn').text.strip()
            if category not in {'게임S/W', '게임H/W', '휴대폰'}:
                interested_items.append(
                    row.find('td', class_='subject').a.text.strip())
    except AttributeError:
        # A find() returned None: the markup is not what we expect, so treat
        # the whole page as yielding nothing (best effort, as before).
        return []
    return interested_items
async def parse_page(body):
    """Run ``_parse`` on *body* in ``main_loop``'s default executor.

    Passing ``None`` as the executor selects the loop's default one, so the
    parsing happens off the event-loop thread.

    Args:
        body: Raw HTML of a board list page, forwarded unchanged to ``_parse``.

    Returns:
        Whatever ``_parse(body)`` returns (a list of title strings).
    """
    job = partial(_parse, body)
    titles = await main_loop.run_in_executor(None, job)
    return titles
async def process_page(session, url):
    """Download *url* and return the titles parsed from its body.

    Args:
        session: Object whose ``get(url)`` yields an async context manager
            around a response with an awaitable ``read()`` (main() passes an
            ``aiohttp.ClientSession``).
        url: Address of the board list page to fetch.

    Returns:
        The result of ``parse_page`` for the downloaded body.
    """
    async with session.get(url) as resp:
        html = await resp.read()
        titles = await parse_page(html)
    return titles
async def _watch_board(board_url, polling_delay):
    """Poll *board_url* forever and mail any titles not seen before.

    Pages 1-4 of the board are fetched concurrently to build the initial set
    of known titles. Afterwards the board's front page is re-fetched every
    *polling_delay* seconds; titles missing from the known set are added to
    it, mailed and printed.

    Args:
        board_url: Base URL of the board (without the ``/list?page=N`` part).
        polling_delay: Seconds to sleep between two polls.
    """
    # ``async with`` is the supported way to manage a ClientSession; the plain
    # ``with`` form was deprecated in aiohttp 2 and raises TypeError on 3+.
    # No explicit ``loop=`` is needed: this coroutine runs on main_loop, which
    # is the loop the session picks up by default.
    async with aiohttp.ClientSession() as session:
        first_pages = await asyncio.gather(*(
            process_page(session, board_url + '/list?page={}'.format(page))
            for page in range(1, 5)
        ))
        # One union instead of sum(lists, []), which re-copies the growing
        # list for every page.
        seen = set().union(*first_pages)
        print(seen)

        while True:
            await asyncio.sleep(polling_delay)
            print('polling...')
            fresh = set(await process_page(session, board_url)) - seen
            if fresh:
                seen |= fresh
                # Fire-and-forget: the blocking send runs in mail_loop's
                # thread pool so polling is not held up.
                # NOTE(review): mail_loop is never run in the visible code, so
                # the returned future is never observed and a failure inside
                # send_mail goes unreported -- confirm this is acceptable.
                mail_loop.run_in_executor(None, partial(
                    mailer.send_mail, 'ahenwkqdj@naver.com', 'new post!', str(fresh)
                ))
                print(fresh)


def main():
    """Watch the configured board and mail newly appearing post titles.

    Blocks forever (until the process is interrupted or a fetch raises) by
    running the polling coroutine on ``main_loop``.
    """
    # board_url = 'http://bbs.ruliweb.com/news/board/1020'  # Hot Deal (our main target)
    board_url = 'http://bbs.ruliweb.com/community/board/300143'  # Humor (eligible for testing)
    polling_delay = 5
    main_loop.run_until_complete(_watch_board(board_url, polling_delay))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment