
@wtneal
Last active April 17, 2019 07:38
asyncio scraper
import asyncio
import logging
import os
import re
import sys

import aiofiles
import aiohttp
import lxml.html

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36'
}

MAX_PICS = 10
SUBREDDIT = 'earthporn'


class WorkerPool:
    """Run `worker_count` copies of `coro` against a shared work queue."""

    def __init__(self, loop, coro, worker_count, options):
        self.loop = loop or asyncio.get_event_loop()
        self.result = None
        self.q = asyncio.Queue(loop=self.loop)
        self.coro = coro
        self.worker_count = worker_count
        self.options = options

    async def run(self):
        workers = [asyncio.Task(self.coro(self.loop, self.q, self.options))
                   for _ in range(self.worker_count)]
        await self.q.join()
        for w in workers:
            w.cancel()


async def _fetch(loop, q, options):
    # loop forever so each worker keeps pulling jobs until it is cancelled
    while True:
        try:
            # pull a url from the queue and request it
            url, utype = await q.get()
            logger.debug('url: %s, url type: %s', url, utype)
            async with aiohttp.ClientSession(loop=loop, headers=HEADERS) as session:
                async with session.get(url) as resp:
                    if utype == 'seed':
                        # subreddit listing page: queue every post link found on it
                        text = await resp.text()
                        for link in _parse_links(text):
                            q.put_nowait((link, 'post'))
                    elif utype == 'post':
                        # post page: queue the direct image/video url
                        text = await resp.text()
                        q.put_nowait((_get_image_link(text), 'img'))
                    else:
                        # direct image url: download it to disk
                        outdir = os.path.join('/tmp', SUBREDDIT, options['mods'].replace('/', '_'))
                        await _get_image(resp, outdir)
            logger.debug('about to finish task, queue size: %s', q.qsize())
            q.task_done()
        except asyncio.CancelledError:
            break
        except Exception:
            logger.exception('error')
            raise


def _parse_links(text):
    logger.debug('getting post links')
    html = lxml.html.fromstring(text)
    # post links carry a class containing "title"
    links = html.xpath("//a[contains(@class, 'title')]/@href")
    logger.debug(links)
    links = ['http://imgur.com' + link for link in links
             if re.search('^/r/{}/[a-zA-Z0-9]{{7}}'.format(SUBREDDIT), link, re.I)]
    logger.debug(links)
    return links[:MAX_PICS]


def _get_image_link(text):
    logger.debug('getting actual image link')
    html = lxml.html.fromstring(text)
    # posts that host a video expose the file via a meta contentURL tag
    video_link = html.xpath('//div[@class="post-image"]//video/following-sibling::meta[@itemprop="contentURL"]/@content')
    if video_link:
        logger.debug('returning video link')
        return video_link[0]
    link = html.xpath('//div[@class="post-image"]//img/@src')[0]
    return link


async def _get_image(resp, outdir):
    logger.debug('saving image')
    # resp.url may be a yarl.URL in newer aiohttp, so cast to str before splitting
    fn = str(resp.url).split('/')[-1]
    async with aiofiles.open('{}/{}'.format(outdir, fn), 'wb') as f:
        data = await resp.read()
        await f.write(data)


def main(mods=''):
    try:
        outdir = os.path.join('/tmp', SUBREDDIT, mods.replace('/', '_'))
        os.makedirs(outdir)
    except OSError:
        logger.warning("oops couldn't create folder")
    try:
        loop = asyncio.get_event_loop()
        wp = WorkerPool(loop, _fetch, 5, options={'sr': SUBREDDIT, 'mods': mods})
        seed_url = 'http://imgur.com/r/{}{}'.format(SUBREDDIT, mods)
        wp.q.put_nowait((seed_url, 'seed'))
        loop.run_until_complete(wp.run())
    except KeyboardInterrupt:
        sys.stderr.flush()
    except Exception:
        logger.exception('error with loop')
    finally:
        loop.close()


if __name__ == '__main__':
    main(mods='/top/all')
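
A minimal usage sketch, assuming the file is saved locally as scraper.py; the module name and the '/new' listing below are illustrative assumptions, not part of the gist:

import scraper

# scrape the subreddit mirror's current listing instead of the all-time
# top posts; images land in /tmp/earthporn/_new/
scraper.main(mods='/new')
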
@xiaowenjie21

Hello wtneal, running this code I get an error. Please help take a look:

about to finish task, queue size: 6
url: //i.imgur.com/sATBfbPr.jpg, url type: img
error
Traceback (most recent call last):
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\connector.py", line 608, in _create_direct_connection
local_addr=self._local_addr)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\base_events.py", line 695, in create_connection
raise exceptions[0]
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\base_events.py", line 682, in create_connection
yield from self.sock_connect(sock, address)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\futures.py", line 361, in iter
yield self # This tells Task to wait for completion.
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\tasks.py", line 296, in _wakeup
future.result()
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\futures.py", line 274, in result
raise self._exception
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\selector_events.py", line 414, in _sock_connect
sock.connect(address)
OSError: [WinError 10049] The requested address is not valid in its context.

Exception:

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "C:/Users/admin/PycharmProjects/untitled/new-aysncio/example-github/crawl-asyncio/web-asyncio.py", line 47, in _fetch
async with session.get(url) as resp:
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\client.py", line 529, in aenter
self._resp = yield from self._coro
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\client.py", line 165, in _request
conn = yield from self._connector.connect(req)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\connector.py", line 316, in connect
.format(key, exc.strerror)) from exc
aiohttp.errors.ClientOSError: [Errno 10049] Cannot connect to host i.imgur.com:None ssl:False [Can not connect to i.imgur.com:None [The requested address is not valid in its context.]]
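
The host "i.imgur.com:None" in the last line suggests the cause: the queued image URL is protocol-relative ("//i.imgur.com/sATBfbPr.jpg"), so aiohttp has no scheme from which to resolve a port. A minimal sketch of one possible workaround in the 'post' branch of _fetch, normalising the link before it is queued (the http prefix is an assumption):

text = await resp.text()
link = _get_image_link(text)
if link.startswith('//'):
    # imgur returns scheme-less src attributes; give aiohttp a full URL
    link = 'http:' + link
q.put_nowait((link, 'img'))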
