Skip to content

Instantly share code, notes, and snippets.

@Chestermozhao
Created July 30, 2020 08:16
Show Gist options
  • Save Chestermozhao/5847186262fea47fe0d17374263b5f9d to your computer and use it in GitHub Desktop.
import asyncio
import requests
from bs4 import BeautifulSoup as bs
from requests.adapters import HTTPAdapter
from concurrent.futures import ThreadPoolExecutor
def fetch(session, url):
    """Download *url* with *session*, extract the ``#main-content`` text and
    hand each extracted text chunk to ``_segment``.

    Parameters
    ----------
    session : requests.Session
        Session to issue the GET request with (retry adapters are mounted
        by the caller).
    url : str
        URL to fetch.

    Returns
    -------
    None
        Results are consumed via the ``_segment`` side effect only.
    """
    # NOTE(review): the original paste lost all indentation; this is the most
    # conservative reconstruction. No raise_for_status() is called, so error
    # pages are parsed like any other response — confirm that is intended.
    with session.get(url) as response:
        src_html = response.text
        soup = bs(src_html, "html.parser")
        main_content = soup.select("#main-content")
        # Calling _segment inside the loop guards against a NameError on
        # main_c when the selector matches nothing (empty result list).
        for d in main_content:
            main_c = d.get_text()
            _segment(main_c)
async def get_data_asynchronous(urls):
    """Fetch every URL in *urls* concurrently by running the blocking
    ``fetch`` helper in a thread pool from within the event loop.

    Parameters
    ----------
    urls : iterable of str
        URLs to download; each is passed to ``fetch``.

    Returns
    -------
    None
        ``fetch`` produces its output via side effects (``_segment``).
    """
    with ThreadPoolExecutor(max_workers=20) as executor:
        with requests.Session() as session:
            # Mount retrying adapters before any request is made so that
            # transient connection failures are retried up to 3 times.
            session.mount('http://', HTTPAdapter(max_retries=3))
            session.mount('https://', HTTPAdapter(max_retries=3))
            # get_running_loop() is the correct call inside a coroutine;
            # get_event_loop() is deprecated here and warns on 3.10+.
            loop = asyncio.get_running_loop()
            tasks = [
                loop.run_in_executor(executor, fetch, session, url)
                for url in urls
            ]
            # Await all downloads; fetch returns None, so the gathered
            # results are intentionally discarded.
            await asyncio.gather(*tasks)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment