Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save hmphu/7d9f972a03845058d9865dc78c86dea4 to your computer and use it in GitHub Desktop.
Save hmphu/7d9f972a03845058d9865dc78c86dea4 to your computer and use it in GitHub Desktop.
import asyncio
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import aiohttp
import bs4
import uvloop
import mailer
# Install uvloop's event loop policy before any loop is obtained, so the loops
# created below come from that policy.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
# Loop that drives the HTTP fetching; parse_page also dispatches parsing
# through this loop's default executor.
main_loop = asyncio.get_event_loop()
# Default executor is a process pool, so run_in_executor(None, ...) on this
# loop runs the callable in a worker process.
# NOTE(review): newer Python releases appear to accept only a
# ThreadPoolExecutor as the default executor -- verify against the target
# interpreter version.
main_loop.set_default_executor(ProcessPoolExecutor())
# Separate loop whose thread-pool default executor is used by main() to send
# notification mail.
mail_loop = asyncio.new_event_loop()
mail_loop.set_default_executor(ThreadPoolExecutor())
def _parse(body):
    """Extract the titles of the interesting posts from one board page.

    Rows whose ``class`` attribute contains ``notice`` are skipped, and so
    are rows whose category cell (``td.divsn``) holds one of the excluded
    categories.

    Args:
        body: HTML of a board listing page, as accepted by BeautifulSoup.

    Returns:
        A list with the stripped title text of every remaining row, in page
        order. An empty list is returned when the page does not have the
        expected markup (missing table, body, category cell or title link).
    """
    excluded_categories = {'게임S/W', '게임H/W', '휴대폰'}
    interested_items = []
    try:
        soup = bs4.BeautifulSoup(body, 'lxml')
        table = soup.find('table', class_='board_list_table')
        for row in table.tbody.find_all('tr'):
            # Tag.get() rather than row['class']: a <tr> without a class
            # attribute would raise KeyError, which the handler below does
            # not catch. Such rows are simply treated as non-notice rows.
            if 'notice' in row.get('class', ()):
                continue
            category = row.find('td', class_='divsn').text.strip()
            if category not in excluded_categories:
                interested_items.append(
                    row.find('td', class_='subject').a.text.strip()
                )
    except AttributeError:
        # find() returned None somewhere: the page layout is not the one we
        # expect, so report no items for this page.
        return []
    return interested_items
async def parse_page(body):
    """Run ``_parse`` on *body* via ``main_loop``'s default executor.

    Returns whatever ``_parse`` returns for the given page body.
    """
    pending = main_loop.run_in_executor(None, _parse, body)
    return await pending
async def process_page(session, url):
    """Download *url* with *session* and return the titles parsed from it.

    Args:
        session: Client session used to issue the GET request.
        url: Address of the board page to fetch.

    Returns:
        The result of ``parse_page`` applied to the raw response body.
    """
    async with session.get(url) as resp:
        raw_html = await resp.read()
        titles = await parse_page(raw_html)
        return titles
def main():
    """Watch a board and mail a notification whenever new posts appear.

    Pages 1-4 of the board are fetched concurrently to build the initial set
    of known titles. The board is then polled every ``polling_delay``
    seconds; titles not seen before are added to the known set, handed to
    ``mailer.send_mail`` and printed. The function runs on ``main_loop``
    until it is interrupted or a fetch raises.
    """
    # board_url = 'http://bbs.ruliweb.com/news/board/1020'  # Hot Deal (our main target)
    board_url = 'http://bbs.ruliweb.com/community/board/300143'  # Humor (eligible for testing)
    polling_delay = 5

    async def _poll_forever():
        # ClientSession has to be entered with ``async with``; a plain
        # ``with`` is rejected by current aiohttp releases. Created inside a
        # coroutine, the session binds to the loop that is running it
        # (main_loop), so no explicit ``loop=`` argument is needed.
        async with aiohttp.ClientSession() as session:
            pages = await asyncio.gather(*[
                process_page(
                    session, board_url + '/list?page={}'.format(page)
                )
                for page in range(1, 5)
            ])
            # Union of all page results; avoids the quadratic list
            # concatenation of sum(lists, []).
            known_titles = set().union(*pages)
            print(known_titles)

            while True:
                await asyncio.sleep(polling_delay)
                print('polling...')
                new_titles = set(await process_page(session, board_url))
                new_titles -= known_titles
                if new_titles:
                    known_titles |= new_titles
                    # Fire-and-forget: the returned future is not awaited,
                    # so sending mail never delays the next poll.
                    mail_loop.run_in_executor(None, partial(
                        mailer.send_mail,
                        'ahenwkqdj@naver.com', 'new post!', str(new_titles)
                    ))
                    print(new_titles)

    main_loop.run_until_complete(_poll_forever())


# Start polling only when run as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment