@guriandoro · Created May 17, 2024 21:36
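"""Search the Percona Monitoring and Management (PMM) 2.x release notes for a keyword.

Starting from the release-notes index page, this script follows every link that
looks like a 2.x release-notes page and prints the URL of each page whose main
content contains the given keyword (case-insensitive).
"""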
import requests
from bs4 import BeautifulSoup
import re
import sys

def crawl_and_search(base_url, keyword):
    # Define the regex pattern for version 2.x pages
    #pattern = re.compile(r'/percona-monitoring-and-management/release-notes/2\.\d+/')
    pattern = re.compile(r'2\.\d+')

    # Function to get all links from a page
    def get_links(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Print the current page URL
        #print(f"Extracting links from: {url}")
        links = [a['href'] for a in soup.find_all('a', href=True) if pattern.search(a['href'])]
        #print(f"Found {len(links)} links: {links}")
        return links
    # Function to search for the keyword in the main HTML text
    def search_keyword(url, keyword):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract the main content text from the div with class "md-content"
        main_content = soup.find('div', class_='md-content')
        if main_content:
            page_text = main_content.get_text()
            if keyword.lower() in page_text.lower():
                print(f"=> Keyword '{keyword}' found on page: {url}\n")
        else:
            print(f"No main content found on page: {url}")
    # Start crawling from the base URL
    initial_links = get_links(base_url)
    if not initial_links:
        print("No links found on the base URL.")
        return
    crawled_urls = set()
    for link in initial_links:
        link = 'https://docs.percona.com/percona-monitoring-and-management/release-notes/' + link  # Construct the full URL
        if link not in crawled_urls:
            print(f"Crawling page: {link}", file=sys.stderr)
            search_keyword(link, keyword)
            crawled_urls.add(link)
        # else:
        #     print(f"Skipping already crawled URL: {link}")

if __name__ == "__main__":
    base_url = "https://docs.percona.com/percona-monitoring-and-management/release-notes/index.html"
    if len(sys.argv) > 1:
        keyword = ' '.join(sys.argv[1:])  # Join all arguments to form the keyword string
    else:
        keyword = input("Enter the keyword to search for: ")
    crawl_and_search(base_url, keyword)
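A minimal usage sketch, assuming the script is saved as pmm_release_notes_search.py (the file name is arbitrary; the multi-word keyword needs no quoting because the script joins all arguments):

    python3 pmm_release_notes_search.py high availability

Matching pages are printed to stdout, while per-page crawl progress goes to stderr, so the two streams can be redirected separately.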