# ABUCKY0/VexParseTournamentRulesPage.py

## VexParseTournamentRulesPage.py
##################################
# Parse VEX Tournament Rule Pages
##################################
MAX_WORKER_THREADS = 6

### IMPORTS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from markdownify import markdownify as md
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
### RETRIEVING THE LIST
# Browser configuration for fetching the rules index page.
chrome_options = Options()
#chrome_options.add_argument("--headless")
#chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1280,720")
# Mimic a popular browser's user-agent
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
# Base URL of the rules index; get_page() appends each rule's link to it.
url = "https://www.robotevents.com/storage/game_manual/VRC_2023-2024_Over_Under/rules/"
try:
    driver.get(url)
    page_source = driver.page_source
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # whereas close() only closes the current window. Running it in a
    # finally block releases the browser even when the page load fails.
    driver.quit()

### PARSING WEBPAGE
soup = BeautifulSoup(page_source, "html.parser")
# href=True skips anchors that have no href attribute; indexing link['href']
# on such an anchor would raise KeyError and abort the whole run.
rules_list = [
    {'Rule': link.text, 'Link': link['href']}
    for link in soup.find_all('a', href=True)
]
print(rules_list)

### Parsing the rules
"""
This section defines the worker that opens one rule link from rules_list, retrieves the page source,
converts it to Markdown, and saves the result to a file before closing the browser.
"""
def get_page(rule, output_dir="C:/Users/buckn/OneDrive/Desktop/.PROS Vex Code/rules"):
    """Fetch one rule page in headless Chrome and save it as a Markdown file.

    Args:
        rule: Dict with a 'Rule' entry (the link text, used for the output
            file name) and a 'Link' entry (the href appended to the
            module-level ``url``).
        output_dir: Directory that receives ``<rule name>.md``. It is created
            when missing. The default is the location the script has always
            written to.

    Returns:
        None. Failures while fetching, converting or writing are printed
        rather than raised, so one bad rule does not stop the other workers.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    #chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1280,720")
    # Mimic a popular browser's user-agent
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        print(rule)
        print("Executing RL: " + str(rule['Link']))
        driver.get(url+rule['Link'])
        rule_page_source = driver.page_source

        # Create the directory that is actually written to. exist_ok=True
        # keeps this safe when several worker threads get here together.
        os.makedirs(output_dir, exist_ok=True)

        # Link text is arbitrary page content: collapse whitespace runs and
        # replace characters that are path separators or are rejected in
        # Windows file names, so the name cannot escape output_dir or make
        # open() fail.
        forbidden = str.maketrans(dict.fromkeys('<>:"/\\|?*', "_"))
        file_name = " ".join(rule['Rule'].split()).translate(forbidden)

        # NOTE(review): 'exclude' does not appear to be one of markdownify's
        # documented options (it documents 'strip' and 'convert') - confirm
        # this keyword has the intended effect.
        markdown_text = md(rule_page_source, exclude=["script", "style"])

        # The with-block closes the file; no explicit close() is needed.
        with open(os.path.join(output_dir, f"{file_name}.md"), "w", encoding='utf-16') as file:
            file.write(markdown_text)
    except Exception as e:
        print(f"An error occurred while processing rule {rule['Rule']}: {e}")
    finally:
        # quit() ends the WebDriver session and its driver process; close()
        # would only close the window and leave one driver per rule behind.
        driver.quit()


"""
Open MAX_WORKER_THREADS threads to process the rules concurrently.
"""
with ThreadPoolExecutor(max_workers=MAX_WORKER_THREADS) as executor:
    # Submit one get_page task per scraped rule.
    futures = [executor.submit(get_page, rule) for rule in rules_list]
    # Wait for all futures to complete
    for future in as_completed(futures):
        # result() re-raises anything get_page did not handle itself.
        future.result()

# No driver cleanup here: the index-page driver was already shut down right
# after the rule list was scraped, and each worker disposes of its own driver.
# Closing the module-level driver a second time would raise.
	##################################
	# Parse VEX Tournament Rule Pages
	##################################
	MAX_WORKER_THREADS = 6

	### IMPORTS
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	from webdriver_manager.chrome import ChromeDriverManager
	from bs4 import BeautifulSoup
	import time
	from markdownify import markdownify as md
	import os
	import threading
	from concurrent.futures import ThreadPoolExecutor, as_completed
	### RETRIEVING THE LIST
	# Browser configuration for fetching the rules index page.
	chrome_options = Options()
	#chrome_options.add_argument("--headless")
	#chrome_options.add_argument("--disable-gpu")
	chrome_options.add_argument("--window-size=1280,720")
	# Mimic a popular browser's user-agent
	chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
	service = Service(ChromeDriverManager().install())
	driver = webdriver.Chrome(service=service, options=chrome_options)
	# Base URL of the rules index; get_page() appends each rule's link to it.
	url = "https://www.robotevents.com/storage/game_manual/VRC_2023-2024_Over_Under/rules/"
	try:
		driver.get(url)
		page_source = driver.page_source
	finally:
		# quit() ends the whole WebDriver session (browser and driver process),
		# whereas close() only closes the current window. Running it in a
		# finally block releases the browser even when the page load fails.
		driver.quit()

	### PARSING WEBPAGE
	soup = BeautifulSoup(page_source, "html.parser")
	# href=True skips anchors that have no href attribute; indexing link['href']
	# on such an anchor would raise KeyError and abort the whole run.
	rules_list = [
		{'Rule': link.text, 'Link': link['href']}
		for link in soup.find_all('a', href=True)
	]
	print(rules_list)

	### Parsing the rules
	"""
	This section defines the worker that opens one rule link from rules_list, retrieves the page source,
	converts it to Markdown, and saves the result to a file before closing the browser.
	"""
	def get_page(rule, output_dir="C:/Users/buckn/OneDrive/Desktop/.PROS Vex Code/rules"):
		"""Fetch one rule page in headless Chrome and save it as a Markdown file.

		Args:
			rule: Dict with a 'Rule' entry (the link text, used for the output
				file name) and a 'Link' entry (the href appended to the
				module-level ``url``).
			output_dir: Directory that receives ``<rule name>.md``. It is created
				when missing. The default is the location the script has always
				written to.

		Returns:
			None. Failures while fetching, converting or writing are printed
			rather than raised, so one bad rule does not stop the other workers.
		"""
		chrome_options = Options()
		chrome_options.add_argument("--headless")
		#chrome_options.add_argument("--disable-gpu")
		chrome_options.add_argument("--window-size=1280,720")
		# Mimic a popular browser's user-agent
		chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
		service = Service(ChromeDriverManager().install())
		driver = webdriver.Chrome(service=service, options=chrome_options)
		try:
			print(rule)
			print("Executing RL: " + str(rule['Link']))
			driver.get(url+rule['Link'])
			rule_page_source = driver.page_source

			# Create the directory that is actually written to. exist_ok=True
			# keeps this safe when several worker threads get here together.
			os.makedirs(output_dir, exist_ok=True)

			# Link text is arbitrary page content: collapse whitespace runs and
			# replace characters that are path separators or are rejected in
			# Windows file names, so the name cannot escape output_dir or make
			# open() fail.
			forbidden = str.maketrans(dict.fromkeys('<>:"/\\|?*', "_"))
			file_name = " ".join(rule['Rule'].split()).translate(forbidden)

			# NOTE(review): 'exclude' does not appear to be one of markdownify's
			# documented options (it documents 'strip' and 'convert') - confirm
			# this keyword has the intended effect.
			markdown_text = md(rule_page_source, exclude=["script", "style"])

			# The with-block closes the file; no explicit close() is needed.
			with open(os.path.join(output_dir, f"{file_name}.md"), "w", encoding='utf-16') as file:
				file.write(markdown_text)
		except Exception as e:
			print(f"An error occurred while processing rule {rule['Rule']}: {e}")
		finally:
			# quit() ends the WebDriver session and its driver process; close()
			# would only close the window and leave one driver per rule behind.
			driver.quit()


	"""
	Open MAX_WORKER_THREADS threads to process the rules concurrently.
	"""
	with ThreadPoolExecutor(max_workers=MAX_WORKER_THREADS) as executor:
		# Submit one get_page task per scraped rule.
		futures = [executor.submit(get_page, rule) for rule in rules_list]
		# Wait for all futures to complete
		for future in as_completed(futures):
			# result() re-raises anything get_page did not handle itself.
			future.result()

	# No driver cleanup here: the index-page driver was already shut down right
	# after the rule list was scraped, and each worker disposes of its own driver.
	# Closing the module-level driver a second time would raise.