Last active

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

An example of a scraper using asyncio and aiohttp

View scrapper.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
import asyncio
import aiohttp
import bs4
import tqdm
 
 
@asyncio.coroutine
def get(*args, **kwargs):
    """GET a URL via aiohttp and return the decoded response body."""
    resp = yield from aiohttp.request('GET', *args, **kwargs)
    body = yield from resp.read_and_close(decode=True)
    return body
 
 
@asyncio.coroutine
def wait_with_progress(coros):
    """Drain *coros* in completion order while showing a tqdm progress bar."""
    finished = asyncio.as_completed(coros)
    for future in tqdm.tqdm(finished, total=len(coros)):
        yield from future
 
 
def first_magnet(page):
    """Return the href of the first magnet link found in *page* (HTML text).

    Raises TypeError (subscripting None) if no matching anchor exists,
    matching the original behavior.
    """
    # Name the parser explicitly: bs4 warns when it has to guess, and the
    # resulting tree can differ depending on which parsers are installed.
    soup = bs4.BeautifulSoup(page, 'html.parser')
    anchor = soup.find('a', title='Download this torrent using magnet')
    return anchor['href']
 
 
@asyncio.coroutine
def print_magnet(query, semaphore=None):
    """Search thepiratebay for *query* and print its first magnet link.

    semaphore: optional asyncio.Semaphore bounding concurrent requests.
        Defaults to the module-level `sem` for backward compatibility, but
        passing it explicitly keeps the dependency visible to callers.
    """
    if semaphore is None:
        semaphore = sem  # fall back to the module-level global
    url = 'http://thepiratebay.se/search/{}/0/7/0'.format(query)
    # Hold the semaphore only for the network request; parsing and printing
    # do not need to count against the concurrency limit.
    with (yield from semaphore):
        page = yield from get(url, compress=True)
    magnet = first_magnet(page)
    print('{}: {}'.format(query, magnet))
 
 
distros = ['archlinux', 'ubuntu', 'debian']
sem = asyncio.Semaphore(5)  # cap concurrent HTTP requests

if __name__ == '__main__':
    # Guard so importing this module does not fire network requests.
    loop = asyncio.get_event_loop()
    tasks = [print_magnet(d) for d in distros]
    try:
        # Use wait_with_progress (previously defined but dead code) so the
        # user sees per-task progress instead of a silent asyncio.wait().
        loop.run_until_complete(wait_with_progress(tasks))
    finally:
        loop.close()
bsima commented

FYI it's spelled "scraper" not "scrapper"

Perhaps I'm being picky, but you should pass your variable sem to print_magnet() to keep data encapsulation. It was confusing me, as I had no idea what this variable was. Apart from that, this was helpful :)

Hi, I did a "POST" version with connection pooling after I'd read the example.

import asyncio
import aiohttp
import ujson
import tqdm

# Request body reused for every POST; ujson.dumps is a faster drop-in for json.dumps.
payload = ujson.dumps({'hello': 'world'})
url = "http://127.0.0.1:8000"

# Bounds the number of concurrently in-flight requests.
sem = asyncio.Semaphore(5)

@asyncio.coroutine
def wait_with_progress(coros):
    """Yield from each coroutine as it completes, updating a progress bar."""
    bar = tqdm.tqdm(asyncio.as_completed(coros), total=len(coros))
    for fut in bar:
        yield from fut

@asyncio.coroutine
def post():
    """POST the module-level `payload` to `url`; return the body as text.

    NOTE(review): relies on the module-level connector `conn`, which is only
    created inside the ``__main__`` block below — calling this from an import
    context would raise NameError. Consider passing the connector explicitly.
    """
    resp = yield from aiohttp.request('post', url, data=payload, connector=conn)
    return (yield from resp.text())

@asyncio.coroutine
def print_status(i):
    """Issue one POST under the shared semaphore; *i* identifies the request."""
    with (yield from sem):
        # Response body is read (inside post) but intentionally discarded.
        yield from post()

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    # One shared TCPConnector gives connection pooling across all requests.
    # Note: `conn` must keep this name — post() reads it as a global.
    conn = aiohttp.connector.TCPConnector()
    pending = [print_status(n) for n in range(5000)]
    loop.run_until_complete(wait_with_progress(pending))

And the echo server:

from flask import Flask
from flask import request

app = Flask(__name__)

@app.route('/', methods=['POST'])
def echo():
    """Echo endpoint: return the raw POST body to the client unchanged."""
    body = request.data
    return body

if __name__ == '__main__':
    # Port must match the one hard-coded in the client script's `url`.
    app.run(port=8000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.