Skip to content

Instantly share code, notes, and snippets.

@myselfshravan
Created September 17, 2023 11:23
Show Gist options
  • Save myselfshravan/5248e85a22b5f6be6e2939fa79063cf9 to your computer and use it in GitHub Desktop.
Save myselfshravan/5248e85a22b5f6be6e2939fa79063cf9 to your computer and use it in GitHub Desktop.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import fitz
import requests
import json
# Scrape every hyperlink target from the MSTC "Mine Block Summary" page.
# The page is loaded in headless Chrome and the resulting HTML is parsed
# with BeautifulSoup.
url = "https://www.mstcecommerce.com/auctionhome/container.jsp?title_id=Mine%20Block%20Summary&linkid=0&main_link=y&sublink=n&main_link_name=203&portal=mlcl"

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode

# Using the driver as a context manager shuts the browser down even if the
# page load raises.
with webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options) as driver:
    driver.get(url)
    # NOTE(review): an implicit wait only affects element lookups
    # (find_element/find_elements); none are made here, so this call
    # probably has no effect on what page_source contains -- confirm
    # whether an explicit wait for the content is needed instead.
    driver.implicitly_wait(4)
    print("Page URL:", driver.current_url)
    page_source = driver.page_source

# Parsing only needs the captured HTML, so it happens after the browser
# has been released.
soup = BeautifulSoup(page_source, 'html.parser')
pdf_links = []  # not populated by this script yet

# href=True restricts the match to anchors that actually carry an href
# attribute; indexing a['href'] on an anchor without one raises KeyError.
link = soup.find_all("a", href=True)
hrefs = []
for a in link:
    href = a['href']
    hrefs.append(href)
    print(href)

print(len(hrefs))
print(hrefs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment