
An example of a scrapper using asyncio and aiohttp
import asyncio
import aiohttp
import bs4
import tqdm


@asyncio.coroutine
def get(*args, **kwargs):
    response = yield from aiohttp.request('GET', *args, **kwargs)
    return (yield from response.read_and_close(decode=True))


@asyncio.coroutine
def wait_with_progress(coros):
    # drives the coroutines while tqdm draws a progress bar as each one finishes
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        yield from f


def first_magnet(page):
    # pick the first magnet link out of the search results page
    soup = bs4.BeautifulSoup(page)
    a = soup.find('a', title='Download this torrent using magnet')
    return a['href']


@asyncio.coroutine
def print_magnet(query):
    url = 'http://thepiratebay.se/search/{}/0/7/0'.format(query)
    with (yield from sem):  # at most 5 requests in flight at once
        page = yield from get(url, compress=True)
        magnet = first_magnet(page)
        print('{}: {}'.format(query, magnet))


distros = ['archlinux', 'ubuntu', 'debian']
sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
f = asyncio.wait([print_magnet(d) for d in distros])
loop.run_until_complete(f)
@bsima
bsima commented Mar 4, 2014

FYI it's spelled "scraper" not "scrapper"

@sampeka
sampeka commented Jun 24, 2014

Perhaps I'm being picky, but you should pass your variable sem to print_magnet() to keep the data encapsulated. It was confusing me, as I had no idea what this variable was. Apart from that, this was helpful :)
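
For reference, a minimal sketch of that change (the rest of the gist — get() and first_magnet() — stays the same); print_magnet() takes the semaphore as an argument instead of reaching for the module-level sem:

@asyncio.coroutine
def print_magnet(query, sem):
    url = 'http://thepiratebay.se/search/{}/0/7/0'.format(query)
    with (yield from sem):  # the caller decides how much concurrency to allow
        page = yield from get(url, compress=True)
        magnet = first_magnet(page)
        print('{}: {}'.format(query, magnet))

distros = ['archlinux', 'ubuntu', 'debian']
sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
f = asyncio.wait([print_magnet(d, sem) for d in distros])
loop.run_until_complete(f)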

@Vayn
Vayn commented Dec 14, 2014

Hi, I did a "POST" version with connection pooling after I'd read the example.

import asyncio
import aiohttp
import ujson
import tqdm

payload = ujson.dumps({'hello': 'world'})
url = "http://127.0.0.1:8000"

sem = asyncio.Semaphore(5)  # allow at most 5 requests in flight at once

@asyncio.coroutine
def wait_with_progress(coros):
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        yield from f

@asyncio.coroutine
def post():
    resp = yield from aiohttp.request('post', url, data=payload, connector=conn)
    return (yield from resp.text())

@asyncio.coroutine
def print_status(i):
    with (yield from sem):
        status = yield from post()
        #print('#' + str(i) + ' ' + status)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    conn = aiohttp.connector.TCPConnector()  # shared connector pools TCP connections across all requests
    f = [print_status(i) for i in range(5000)]
    loop.run_until_complete(wait_with_progress(f))

And the echo server:

from flask import Flask
from flask import request

app = Flask(__name__)

@app.route('/', methods=['POST'])
def echo():
    return request.data

if __name__ == '__main__':
    app.run(port=8000)
@alexjj
alexjj commented Mar 14, 2016

The aiohttp method ClientResponse.read_and_close() is gone as of December 2015.

You should change line 10 to:

return (yield from response.text())
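
Applied to the gist above, the get() coroutine would then read as follows (a minimal sketch, assuming an aiohttp release where ClientResponse.text() has replaced read_and_close()):

@asyncio.coroutine
def get(*args, **kwargs):
    response = yield from aiohttp.request('GET', *args, **kwargs)
    return (yield from response.text())  # text() decodes the body for you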