Skip to content

Instantly share code, notes, and snippets.

@risingsunomi
Last active May 13, 2024 21:53
Show Gist options
  • Save risingsunomi/27969876263181478856a94e695ad436 to your computer and use it in GitHub Desktop.
Save risingsunomi/27969876263181478856a94e695ad436 to your computer and use it in GitHub Desktop.
Simple Noggin scraper
import requests
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent as faUserAgent
import logging
# Configure root logging once at import time; all module output goes
# through this named logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def download_pdf(url: str, filename: str) -> None:
    """
    Download a PDF from *url* and write it to *filename*.

    Args:
        url: Direct link to the PDF file.
        filename: Local path the downloaded bytes are written to.

    Raises:
        requests.RequestException: on network failure or HTTP error status.
        OSError: if the destination file cannot be written.
    """
    try:
        # Timeout so a hung server cannot stall the whole scrape.
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of saving an error page as a ".pdf".
        response.raise_for_status()
        with open(filename, "wb") as file:
            file.write(response.content)
        logger.info(f"Downloaded: {filename}")
    except Exception as err:
        logger.error(f"Failed to download {url}: {err}")
        raise
def parse_search_results(search_results: list) -> None:
    """
    Scan selenium anchor elements from a Google results page and
    download every link that points at a PDF.

    Args:
        search_results: list of selenium WebElement anchors; each must
            expose ``get_attribute("href")``.

    Raises:
        Exception: re-raised after logging if extraction or a download fails.
    """
    try:
        for result in search_results:
            url = result.get_attribute("href")
            # get_attribute can return None (e.g. javascript anchors);
            # guard before calling .endswith on it.
            if not url or not url.endswith(".pdf"):
                continue
            logger.info(f"PDF found @ {url}")
            file_name = os.path.basename(url)
            file_path = f"pdfs/{file_name}"
            if os.path.exists(file_path):
                logger.info(f"{file_path} already exists.")
            else:
                # Ensure the destination directory exists before writing.
                os.makedirs("pdfs", exist_ok=True)
                logger.info(f"Downloading {file_name}")
                download_pdf(url, file_path)
    except Exception as err:
        logger.error(f"Search parse failed: {err}")
        raise
def search_all_site_pdfs_selenium(base_url: str) -> None:
    """
    Run an automated Google ``site:`` search for *base_url* and download
    every PDF link found, paging through results until the last page.

    Args:
        base_url: URL prefix the Google search is restricted to.

    Raises:
        Exception: re-raised after logging if the search itself fails.
    """
    # Restrict results to pages under base_url.
    query = f"site:{base_url}"

    # Randomized desktop user agent to reduce bot detection.
    fua = faUserAgent(platforms="pc")
    chrome_options = Options()
    chrome_options.add_argument(f"--user-agent={str(fua.random)}")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://www.google.com")
        # 10s wait (not 1s) so a slow page load doesn't abort the run.
        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "q"))
        )
        search_input.send_keys(query)
        search_input.submit()

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, "search"))
        )
        parse_search_results(
            driver.find_elements(By.CSS_SELECTOR, ".yuRUbf > div > span > a")
        )

        # Page through results until the "next" button disappears.
        while True:
            logger.info("\nLooping through pages\n")
            try:
                driver.find_element(By.ID, "pnnext").click()
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.ID, "search"))
                )
                parse_search_results(
                    driver.find_elements(By.CSS_SELECTOR, ".yuRUbf > div > span > a")
                )
            except Exception as err:
                # Missing "next" button (or a stale page) means the last
                # results page was reached; log instead of swallowing silently.
                logger.info(f"Stopping pagination: {err}")
                break
    except Exception as err:
        logger.error(f"failed: {err}")
        raise
    finally:
        # Always release the browser, even on failure.
        driver.quit()
if __name__ == "__main__":
    # Scrape PDFs from Noggin's upload directory via a Google site: search.
    search_all_site_pdfs_selenium("https://www.noggin.com/app/uploads/")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment