# sync_scrape.py (needs Python 3.7+)
import re
import time

import requests


def fetch_url(url):
    """Download a page and report how long the request took."""
    t = time.perf_counter()
    html = requests.get(url).text
    print(f"time of fetch_url({url}): {time.perf_counter() - t:.2f}s")
    return html


def scrape_data(html):
    """Extract all href targets from the fetched HTML."""
    return re.findall(r'href="([^"]*)"', html)


urls = [
    "https://www.ietf.org/rfc/rfc2616.txt",
    "https://en.wikipedia.org/wiki/Asynchronous_I/O",
]

extracted_data = {}
t = time.perf_counter()

# Fetch and scrape each URL one after the other (synchronously),
# so the total time is the sum of all the individual request times.
for url in urls:
    html = fetch_url(url)
    extracted_data[url] = scrape_data(html)

print("> extracted data:", extracted_data)
print(f"time elapsed: {time.perf_counter() - t:.2f}s")