Scrape site for ChatGPT Training

Async Web Crawler with Playwright

This project is an asynchronous web crawler built with Python and Playwright. It navigates through web pages, extracts their text content and links, and appends the content to a JSON Lines file, using an asyncio semaphore to limit concurrency.

Features

  • Asynchronous crawling for efficiency and speed.
  • Uses Playwright for browser automation, capable of handling JavaScript-rendered pages.
  • Saves crawled content incrementally to a file.
  • Configurable concurrency limit (see the concurrency sketch below).
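
Concurrency is bounded with an asyncio.Semaphore: only a fixed number of pages are fetched at any one time. Below is a minimal standalone sketch of that pattern; the names (fetch, run, the example URLs) are illustrative and not taken from the crawler code further down.

# semaphore_sketch.py -- illustrative only, not part of the gist
import asyncio

async def fetch(url, semaphore):
    async with semaphore:            # at most N coroutines run this block at once
        print(f"fetching {url}")
        await asyncio.sleep(0.1)     # stand-in for real page work
        return url

async def run():
    semaphore = asyncio.Semaphore(5)  # same default limit the crawler uses
    urls = [f"https://example.com/page{i}" for i in range(20)]
    results = await asyncio.gather(*(fetch(u, semaphore) for u in urls))
    print(len(results), "pages fetched")

asyncio.run(run())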

Requirements

  • Python 3.7+
  • Playwright
  • aiofiles
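
Both dependencies can be installed with pip; Playwright also needs a one-time browser download. A typical setup (assuming pip is on the path):

pip install playwright aiofiles
playwright install chromium   # one-time download of the browser binary Playwright drives
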
# config.py
CONFIG = {
    "url": ["https://docs.rundeck.com"],  # List format so multiple starting URLs could be supported
    "match": "https://docs.rundeck.com",  # Not referenced by the crawler below; links are filtered against url[0]
    "selector": "*",
    "maxPagesToCrawl": 1000,
    "outputFileName": "output.json"
}
# MIT License
# Copyright (c) 2024 Justyn Roberts
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from playwright.async_api import async_playwright
import asyncio
import aiofiles  # see the install step above
import json

from config import CONFIG  # CONFIG is defined in config.py


async def get_page_text(page, selector):
    """Extract the inner text of the specified selector on a given page."""
    content = await page.inner_text(selector)
    return content


async def extract_links(page, base_url):
    """Return all unique links on the page that start with the base URL."""
    links = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a')).map(a => a.href);
    }''')
    return list(set(filter(lambda link: link.startswith(base_url), links)))


async def crawl_page(browser, url, visited_urls, results_file, max_pages, semaphore):
    """Crawl a single page, save its content, and return its links; stops once max_pages have been visited."""
    async with semaphore:  # Use the semaphore to limit concurrent pages
        if len(visited_urls) >= max_pages or url in visited_urls:
            return
        visited_urls.add(url)
        page = await browser.new_page()
        await page.goto(url)
        content = await get_page_text(page, CONFIG['selector'])
        print(f"Crawled: {url}")
        # Append the result to the output file as a JSON line
        async with aiofiles.open(results_file, mode='a') as f:
            await f.write(json.dumps({"url": url, "content": content}) + '\n')
        # Filter links against the first starting URL
        links = await extract_links(page, CONFIG['url'][0])
        await page.close()
        return links


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        visited_urls = set()
        results_file = CONFIG['outputFileName']
        concurrency_limit = 5  # Adjust as needed
        semaphore = asyncio.Semaphore(concurrency_limit)

        async def limited_crawl_page(url):
            return await crawl_page(browser, url, visited_urls, results_file,
                                    CONFIG['maxPagesToCrawl'], semaphore)

        base_url = CONFIG['url'][0]

        # Ensure the output file is empty before starting
        async with aiofiles.open(results_file, mode='w') as f:
            pass  # Just to clear/initialize the file

        # Create an initial task for the base URL
        tasks = [asyncio.create_task(limited_crawl_page(base_url))]
        while tasks:
            finished_tasks, _ = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
            new_links = []
            for task in finished_tasks:
                links = await task
                if links:
                    new_links.extend(links)
            # Schedule the next wave of pages, respecting the crawl limit
            tasks = [asyncio.create_task(limited_crawl_page(link))
                     for link in new_links
                     if link not in visited_urls and len(visited_urls) < CONFIG['maxPagesToCrawl']]

        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())
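
Each crawled page is appended to the output file as one JSON object per line (JSON Lines), so the results can be read back incrementally. A minimal sketch of consuming the output, assuming the default outputFileName and that the crawler script is saved as crawler.py next to config.py and run with python crawler.py:

# read_output.py -- illustrative only, not part of the gist
import json

with open("output.json", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["url"], len(record["content"]), "characters")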