|
# MIT License
#
# Copyright (c) 2024 Justyn Roberts
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
|
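"""Asynchronous site crawler built on Playwright.

Starting from the first URL in CONFIG['url'], the script opens pages in a headless
browser, extracts the text behind CONFIG['selector'], appends one JSON line per page
to CONFIG['outputFileName'], and follows same-site links breadth-first until
CONFIG['maxPagesToCrawl'] pages have been visited.
"""
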
import asyncio
import json

import aiofiles  # third-party dependency: pip install aiofiles
from playwright.async_api import async_playwright

from config import CONFIG  # CONFIG is expected to live in config.py (see the sketch below)
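
# config.py is not part of this file, so the dictionary below is only an illustrative
# sketch (assumed keys and example values) of what this script reads from CONFIG:
#
#     CONFIG = {
#         "url": ["https://example.com/docs/"],  # list of URLs; the first entry is the crawl root
#         "selector": "main",                    # CSS selector whose inner text is saved
#         "maxPagesToCrawl": 50,                 # stop after this many pages have been visited
#         "outputFileName": "output.jsonl",      # JSON Lines file the results are appended to
#     }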
|
|
|
async def get_page_text(page, selector):
    """Extracts inner text from the specified selector on a given page."""
    content = await page.inner_text(selector)
    return content
|
|
|
async def extract_links(page, base_url):
    """Extracts and returns all unique links on the given page that start with the base URL."""
    links = await page.evaluate('''() => {
        return Array.from(document.querySelectorAll('a')).map(a => a.href);
    }''')
    return list(set(filter(lambda link: link.startswith(base_url), links)))
|
|
|
async def crawl_page(browser, url, visited_urls, results_file, max_pages, semaphore):
    """Crawls a single page, saves its text content, and returns the links it found,
    stopping once the maximum number of pages has been visited; the semaphore limits
    how many pages are processed concurrently."""
    async with semaphore:  # Use semaphore to control concurrency
        if len(visited_urls) >= max_pages or url in visited_urls:
            return None
        visited_urls.add(url)

        page = await browser.new_page()
        try:
            await page.goto(url)
            content = await get_page_text(page, CONFIG['selector'])
            print(f"Crawled: {url}")

            # Append the result to the output file as a JSON line
            async with aiofiles.open(results_file, mode='a') as f:
                await f.write(json.dumps({"url": url, "content": content}) + '\n')

            links = await extract_links(page, CONFIG['url'][0])
        finally:
            await page.close()  # Close the page even if navigation or extraction fails

        return links
|
|
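# Each crawled page becomes one line in the output file; an illustrative (made-up) example:
#
#     {"url": "https://example.com/docs/intro", "content": "Welcome to the docs ..."}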
|
async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        visited_urls = set()
        results_file = CONFIG['outputFileName']
        concurrency_limit = 5  # Adjust as needed
        semaphore = asyncio.Semaphore(concurrency_limit)

        async def limited_crawl_page(url):
            return await crawl_page(browser, url, visited_urls, results_file,
                                    CONFIG['maxPagesToCrawl'], semaphore)

        base_url = CONFIG['url'][0]

        # Ensure the output file is empty before starting
        async with aiofiles.open(results_file, mode='w') as f:
            pass  # Just to clear/initialize the file

        # Start with a single task for the base URL
        tasks = [asyncio.create_task(limited_crawl_page(base_url))]

        # Breadth-first crawl: wait for the current batch of pages, then queue tasks
        # for any newly discovered links that have not been visited yet
        while tasks:
            finished_tasks, _ = await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
            new_links = []
            for task in finished_tasks:
                links = await task
                if links:
                    new_links.extend(links)

            tasks = [
                asyncio.create_task(limited_crawl_page(link))
                for link in new_links
                if link not in visited_urls and len(visited_urls) < CONFIG['maxPagesToCrawl']
            ]

        await browser.close()
|
|
|
if __name__ == '__main__':
    asyncio.run(main())