Skip to content

Instantly share code, notes, and snippets.

@ABUCKY0
Created June 26, 2024 05:58
Show Gist options
  • Save ABUCKY0/c041c3701f4b3cf8c053043371cad807 to your computer and use it in GitHub Desktop.
Save ABUCKY0/c041c3701f4b3cf8c053043371cad807 to your computer and use it in GitHub Desktop.
##################################
# Parse VEX Tournament Rule Pages
##################################
# Upper bound on the worker threads used to download rule pages concurrently.
# Each worker launches its own Chrome instance, so this also caps how many
# browsers run at the same time.
MAX_WORKER_THREADS = 6
### IMPORTS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from markdownify import markdownify as md
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
### RETRIEVING THE LIST
# Fetch the rule index page in a visible Chrome window and collect every link on it.
chrome_options = Options()
# Uncomment to fetch the index without opening a browser window.
#chrome_options.add_argument("--headless")
#chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1280,720")
# Mimic a popular browser's user-agent
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
# Base URL of the rule index; rule links are resolved against it later on.
url = "https://www.robotevents.com/storage/game_manual/VRC_2023-2024_Over_Under/rules/"
try:
    driver.get(url)
    page_source = driver.page_source
finally:
    # Release the browser even when the page load fails. quit() ends the whole
    # WebDriver session (browser and chromedriver process); close() would only
    # close the current window.
    driver.quit()
### PARSING WEBPAGE
soup = BeautifulSoup(page_source, "html.parser")
# href=True skips anchors without an href attribute, which would otherwise
# raise KeyError on link['href'].
rules_list = [
    {'Rule': link.text, 'Link': link['href']}
    for link in soup.find_all('a', href=True)
]
print(rules_list)
### Parsing the rules
"""
For each entry in rules_list, this section opens the rule's link, retrieves the page source,
converts the page to Markdown, saves the result to a file, and then closes the browser.
"""
# Maps every character Windows rejects in file names (and ASCII control
# characters) to an underscore.
_FILENAME_TRANSLATION = str.maketrans(
    {ch: "_" for ch in '<>:"/\\|?*' + "".join(map(chr, range(32)))}
)


def _safe_filename(text):
    """Return *text* trimmed and with characters invalid in file names replaced by '_'."""
    # Trailing dots and spaces are stripped as well because Windows drops or
    # rejects them at the end of a file name.
    return str(text).strip().translate(_FILENAME_TRANSLATION).rstrip(". ")


def get_page(rule, output_dir="C:/Users/buckn/OneDrive/Desktop/.PROS Vex Code/rules"):
    """Download one rule page with headless Chrome and save it as Markdown.

    Args:
        rule: Mapping with a 'Rule' entry (used for the output file name) and
            a 'Link' entry (resolved against the module-level ``url``).
        output_dir: Directory that receives ``<rule name>.md``. It is created
            when missing. Defaults to the location that used to be hard-coded.

    Returns:
        None. The Markdown is written to disk as a side effect.

    Errors raised while loading, converting, or writing the page are caught
    and printed, so a single bad rule does not stop the other workers.
    Failures while starting Chrome itself still propagate to the caller.
    """
    # Local import: urljoin is only needed here and is not imported at file level.
    from urllib.parse import urljoin

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    #chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1280,720")
    # Mimic a popular browser's user-agent
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        print(rule)
        print("Executing RL: " + str(rule['Link']))
        # urljoin matches plain concatenation for simple relative links and
        # also resolves absolute or root-relative hrefs correctly.
        driver.get(urljoin(url, rule['Link']))
        rule_page_source = driver.page_source
        # Create the directory that is actually written to. exist_ok=True
        # avoids the check-then-create race between concurrent workers.
        os.makedirs(output_dir, exist_ok=True)
        # Fall back to the link (then a fixed name) when the link text is empty.
        file_name = _safe_filename(rule['Rule']) or _safe_filename(rule['Link']) or "rule"
        target_path = os.path.join(output_dir, f"{file_name}.md")
        # NOTE(review): UTF-16 is kept from the original; confirm that the
        # consumers of these files expect it rather than UTF-8.
        with open(target_path, "w", encoding='utf-16') as file:
            # NOTE(review): confirm markdownify honors an `exclude` option; its
            # documented tag filters appear to be `strip` and `convert`.
            file.write(md(rule_page_source, exclude=["script", "style"]))
    except Exception as e:
        print(f"An error occurred while processing rule {rule['Rule']}: {e}")
    finally:
        # quit() ends the session and the chromedriver process; close() would
        # leave one chromedriver process behind per processed rule.
        driver.quit()
"""
Open MAX_WORKER_THREADS threads to process the rules concurrently.
"""
with ThreadPoolExecutor(max_workers=MAX_WORKER_THREADS) as executor:
    # Submit all tasks, remembering which rule each future belongs to so a
    # failure can be reported with context.
    futures = {executor.submit(get_page, rule): rule for rule in rules_list}
    # Collect results as they finish; one failing worker must not stop the
    # results of the remaining workers from being collected.
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Worker for rule {futures[future]['Rule']} failed: {e}")
# No driver.close() here: the index-page driver is already closed right after
# rules_list is built, and every worker cleans up its own driver, so closing
# it again would only raise.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment