Skip to content

Instantly share code, notes, and snippets.

@DimasInchidi
Created July 13, 2018 14:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DimasInchidi/aa1e455ae975d3d40dd5e898ca89568c to your computer and use it in GitHub Desktop.
Save DimasInchidi/aa1e455ae975d3d40dd5e898ca89568c to your computer and use it in GitHub Desktop.
simple python web scraper example
#!/usr/bin/env python
import asyncio
import re
import time
import aiohttp # pip install aiohttp
from bs4 import BeautifulSoup # pip install beautifulsoup4
async def get_response_text(url):
    """Fetch *url* over HTTP and return the response body as text."""
    # A later context manager may reference an earlier one, so the session
    # and the request can share a single async-with statement.
    async with aiohttp.ClientSession() as session, session.get(url) as resp:
        return await resp.text()
async def crawl(i):
    """Fetch page *i* of passiton.com's inspirational quotes and print each one.

    The quote text lives in the ``alt`` attribute of the image inside each
    ``div.portfolio-image`` element; a trailing " #<Author:...>" marker is
    stripped before printing.
    """
    url = f"https://www.passiton.com/inspirational-quotes?page={i}"
    html = await get_response_text(url)
    soup = BeautifulSoup(html, 'html.parser')
    for result in soup.find_all('div', {'class': 'portfolio-image'}):
        quote = result('a')[0]('img')[0]['alt']
        # BUG FIX: str.replace returns a new string; the original discarded
        # the result, so none of this normalization ever took effect.
        # NOTE(review): the original third replace looked like a copy-mangled
        # non-breaking-space ('\xa0') -> space substitution — confirm.
        quote = quote.replace('\n\n', '\n').replace('\n', ' ').replace('\xa0', ' ')
        quote = re.sub(' #<Author:.+>', '', quote)
        print(quote, end='\n\n')
def start(chunk):
    """Run crawl(i) concurrently for every i in *chunk*, blocking until done.

    Closes the event loop on completion, so this is a run-once entry point.
    """
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(crawl(i)) for i in chunk]
    # BUG FIX: asyncio.wait raises ValueError when given an empty task set,
    # so an empty chunk used to crash instead of being a harmless no-op.
    if tasks:
        loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
if __name__ == '__main__':
    max_page = 50
    # range(1, max_page) yields pages 1..max_page-1, i.e. max_page-1 pages.
    pages = range(1, max_page)
    # BUG FIX: the summary used max_page (50) although only len(pages) (49)
    # pages are crawled — report the actual count.
    num_pages = len(pages)
    start_time = time.time()
    start(pages)
    end_time = time.time()
    print(
        f"Total crawl time of {num_pages} page{'' if num_pages < 2 else 's'} "
        f"in {end_time - start_time} seconds"
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment