Scrape site for ChatGPT Training

Async Web Crawler with Playwright

This project is an asynchronous web crawler built with Python and Playwright. It navigates through web pages, extracts their text content and links, and appends the content to a JSON Lines file, using an asyncio semaphore to limit concurrency.

Features

  • Asynchronous crawling for efficiency and speed.
  • Uses Playwright for browser automation, capable of handling JavaScript-rendered pages.
  • Saves crawled content incrementally to a file.
  • Configurable concurrency limit (see the concurrency sketch below).
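
Concurrency is bounded with an asyncio.Semaphore: only a fixed number of pages are fetched at any one time. Below is a minimal standalone sketch of that pattern; the names (fetch, run, the example URLs) are illustrative and not taken from the crawler code further down.

# semaphore_sketch.py -- illustrative only, not part of the gist
import asyncio

async def fetch(url, semaphore):
    async with semaphore:            # at most N coroutines run this block at once
        print(f"fetching {url}")
        await asyncio.sleep(0.1)     # stand-in for real page work
        return url

async def run():
    semaphore = asyncio.Semaphore(5)  # same default limit the crawler uses
    urls = [f"https://example.com/page{i}" for i in range(20)]
    results = await asyncio.gather(*(fetch(u, semaphore) for u in urls))
    print(len(results), "pages fetched")

asyncio.run(run())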

Requirements

  • Python 3.7+
  • Playwright
  • aiofiles
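
Both dependencies can be installed with pip; Playwright also needs a one-time browser download. A typical setup (assuming pip is on the path):

pip install playwright aiofiles
playwright install chromium   # one-time download of the browser binary Playwright drives
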
# config.py
CONFIG = {
    "url": ["https://docs.rundeck.com"],  # List format so multiple starting URLs could be supported
    "match": "https://docs.rundeck.com",  # Not referenced by the crawler below; links are filtered against url[0]
    "selector": "*",
    "maxPagesToCrawl": 1000,
    "outputFileName": "output.json"
}
# MIT License
# Copyright (c) 2024 Justyn Roberts
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from playwright.async_api import async_playwright
import asyncio
import aiofiles  # see the install step above
import json

from config import CONFIG  # CONFIG is defined in config.py


async def get_page_text(page, selector):
    """Extract the inner text of the specified selector on a given page."""
    content = await page.inner_text(selector)
    return content


async def extract_links(page, base_url):
    """Return all unique links on the page that start with the base URL."""
    links = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a')).map(a => a.href);
    }''')
    return list(set(filter(lambda link: link.startswith(base_url), links)))


async def crawl_page(browser, url, visited_urls, results_file, max_pages, semaphore):
    """Crawl a single page, save its content, and return its links; stops once max_pages have been visited."""
    async with semaphore:  # Use the semaphore to limit concurrent pages
        if len(visited_urls) >= max_pages or url in visited_urls:
            return
        visited_urls.add(url)
        page = await browser.new_page()
        await page.goto(url)
        content = await get_page_text(page, CONFIG['selector'])
        print(f"Crawled: {url}")
        # Append the result to the output file as a JSON line
        async with aiofiles.open(results_file, mode='a') as f:
            await f.write(json.dumps({"url": url, "content": content}) + '\n')
        # Filter links against the first starting URL
        links = await extract_links(page, CONFIG['url'][0])
        await page.close()
        return links


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        visited_urls = set()
        results_file = CONFIG['outputFileName']
        concurrency_limit = 5  # Adjust as needed
        semaphore = asyncio.Semaphore(concurrency_limit)

        async def limited_crawl_page(url):
            return await crawl_page(browser, url, visited_urls, results_file,
                                    CONFIG['maxPagesToCrawl'], semaphore)

        base_url = CONFIG['url'][0]

        # Ensure the output file is empty before starting
        async with aiofiles.open(results_file, mode='w') as f:
            pass  # Just to clear/initialize the file

        # Create an initial task for the base URL
        tasks = [asyncio.create_task(limited_crawl_page(base_url))]
        while tasks:
            finished_tasks, _ = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
            new_links = []
            for task in finished_tasks:
                links = await task
                if links:
                    new_links.extend(links)
            # Schedule the next wave of pages, respecting the crawl limit
            tasks = [asyncio.create_task(limited_crawl_page(link))
                     for link in new_links
                     if link not in visited_urls and len(visited_urls) < CONFIG['maxPagesToCrawl']]

        await browser.close()


if __name__ == '__main__':
    asyncio.run(main())
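
Each crawled page is appended to the output file as one JSON object per line (JSON Lines), so the results can be read back incrementally. A minimal sketch of consuming the output, assuming the default outputFileName and that the crawler script is saved as crawler.py next to config.py and run with python crawler.py:

# read_output.py -- illustrative only, not part of the gist
import json

with open("output.json", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        print(record["url"], len(record["content"]), "characters")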