Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
An example of a scrapper using asyncio and aiohttp
import asyncio
import aiohttp
import bs4
import tqdm
@asyncio.coroutine
def get(*args, **kwargs):
    """Issue an asynchronous GET request and return the decoded body text.

    All positional and keyword arguments are forwarded to ``aiohttp.request``.
    """
    response = yield from aiohttp.request('GET', *args, **kwargs)
    # ClientResponse.read_and_close() was removed from aiohttp (Dec 2015);
    # text() reads the body, decodes it, and releases the connection.
    return (yield from response.text())
@asyncio.coroutine
def wait_with_progress(coros):
    """Drain *coros* as each one finishes, showing a tqdm progress bar."""
    progress = tqdm.tqdm(asyncio.as_completed(coros), total=len(coros))
    for finished in progress:
        yield from finished
def first_magnet(page):
    """Return the href of the first magnet-download link in the HTML *page*.

    Raises TypeError (subscripting None) if no matching anchor is present.
    """
    # Name the parser explicitly: calling BeautifulSoup(page) with no parser
    # emits a warning and makes results depend on which parsers are installed.
    soup = bs4.BeautifulSoup(page, 'html.parser')
    a = soup.find('a', title='Download this torrent using magnet')
    return a['href']
@asyncio.coroutine
def print_magnet(query):
    """Search ThePirateBay for *query* and print its first magnet link."""
    search_url = 'http://thepiratebay.se/search/{}/0/7/0'.format(query)
    # The module-level semaphore caps the number of concurrent fetches.
    with (yield from sem):
        html = yield from get(search_url, compress=True)
    link = first_magnet(html)
    print('{}: {}'.format(query, link))
# Script entry point: look up the first magnet link for each distro,
# with at most five requests in flight at once.
distros = ['archlinux', 'ubuntu', 'debian']
sem = asyncio.Semaphore(5)
loop = asyncio.get_event_loop()
searches = [print_magnet(d) for d in distros]
f = asyncio.wait(searches)
loop.run_until_complete(f)
@bsima

This comment has been minimized.

Copy link

@bsima bsima commented Mar 4, 2014

FYI it's spelled "scraper" not "scrapper"

@sampeka

This comment has been minimized.

Copy link

@sampeka sampeka commented Jun 24, 2014

Perhaps I'm being picky, but you should pass your variable sem to print_magnet() to keep data encapsulation. Was confusing me as I had no idea what this variable was. Apart from that, this was helpful :)

@Vayn

This comment has been minimized.

Copy link

@Vayn Vayn commented Dec 14, 2014

Hi, I did a "POST" version with connection pooling after I'd read the example.

import asyncio
import aiohttp
import ujson
import tqdm

# JSON body sent with every request; serialized once up front.
payload = ujson.dumps({'hello': 'world'})
# Local echo server (the Flask app further below).
url = "http://127.0.0.1:8000"

# Caps the number of in-flight requests at five.
sem = asyncio.Semaphore(5)

@asyncio.coroutine
def wait_with_progress(coros):
    """Yield from each coroutine as it completes, updating a progress bar."""
    bar = tqdm.tqdm(asyncio.as_completed(coros), total=len(coros))
    for done in bar:
        yield from done

@asyncio.coroutine
def post():
    """POST the module-level payload to *url* and return the body text."""
    # NOTE(review): relies on the module-level connector ``conn``, which is
    # only created inside the __main__ guard — confirm before reusing this
    # function outside the script context.
    response = yield from aiohttp.request('post', url, data=payload, connector=conn)
    body = yield from response.text()
    return body

@asyncio.coroutine
def print_status(i):
    # One POST per call, gated by the module-level semaphore so at most
    # five requests run concurrently.  The response body is read but
    # deliberately discarded; the print is kept commented for debugging.
    with (yield from sem):
        status = yield from post()
        #print('#' + str(i) + ' ' + status)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    # A single shared TCP connector gives connection pooling across
    # all 5000 requests.
    conn = aiohttp.connector.TCPConnector()
    jobs = [print_status(i) for i in range(5000)]
    loop.run_until_complete(wait_with_progress(jobs))

And the echo server:

from flask import Flask
from flask import request

app = Flask(__name__)

# Echo endpoint: returns the raw POST body unchanged.
@app.route('/', methods=['POST'])
def echo():
    return request.data

if __name__ == '__main__':
    # Listen on the port the async client targets (127.0.0.1:8000).
    app.run(port=8000)
@alexjj

This comment has been minimized.

Copy link

@alexjj alexjj commented Mar 14, 2016

The aiohttp method ClientResponse.read_and_close() is gone as of December 2015

Should change line 10 to:

return (yield from response.text())
@Wikidepia

This comment has been minimized.

Copy link

@Wikidepia Wikidepia commented Oct 19, 2020

import asyncio
import aiohttp
import bs4
import tqdm


async def get(*args, **kwargs):
    """Fetch a URL with a throwaway ClientSession and return the body text."""
    async with aiohttp.ClientSession() as session:
        response = await session.get(*args, **kwargs)
        async with response:
            return await response.text()


def first_magnet(page):
    """Return the href of the first magnet-download anchor in *page*."""
    anchor = bs4.BeautifulSoup(page, features="lxml").find(
        'a', title='Download this torrent using magnet')
    return anchor['href']


async def print_magnet(query):
    """Search for *query* and print the first magnet link found."""
    search_url = 'http://thepiratebay.se/search/{}/0/7/0'.format(query)
    # Only the network fetch is gated by the semaphore; parsing and
    # printing happen after the slot is released.
    async with sem:
        html = await get(search_url, compress=True)
    print('{}: {}'.format(query, first_magnet(html)))


# Script entry point: fetch the first magnet link for each distro.
distros = ['archlinux', 'ubuntu', 'debian']
sem = asyncio.Semaphore(5)


async def _main():
    # asyncio.wait() stopped accepting bare coroutines (deprecated in 3.8,
    # removed in 3.11); gather() wraps them in tasks while the loop runs.
    await asyncio.gather(*(print_magnet(d) for d in distros))


loop = asyncio.get_event_loop()
loop.run_until_complete(_main())

Tested with Python 3.8, but rip ThePirateBay.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.