Skip to content

Instantly share code, notes, and snippets.

@zyryc
Created August 20, 2023 13:31
Show Gist options
  • Save zyryc/ca83344e0c27c4ff420ece89f10842da to your computer and use it in GitHub Desktop.
Save zyryc/ca83344e0c27c4ff420ece89f10842da to your computer and use it in GitHub Desktop.
Fetch wallpapers from the internet and download them
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import os
# Module-level registry of pagination URLs already crawled, so the
# recursive scrape_website() does not revisit (or loop on) a page.
Visited_pages = set()
def get_href_values(url, driver_path='path_to_chromedriver'):
    """Fetch *url* with headless Chrome and return the rendered page source.

    Selenium is used (rather than a plain HTTP GET) so that
    JavaScript-generated markup is present in the returned HTML.

    Args:
        url: Page to load.
        driver_path: Path to the ChromeDriver executable (replace the
            default with the actual path on your machine).

    Returns:
        The page's HTML source as a string, after JavaScript execution.
    """
    selenium_service = Service(driver_path)
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome without a window.
    driver = webdriver.Chrome(service=selenium_service, options=chrome_options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always release the browser process, even if the page load fails,
        # so repeated calls do not leak Chrome instances.
        driver.quit()
def extract_links(page_source):
    """Split the album page's anchor hrefs into pagination and image links.

    Args:
        page_source: Rendered HTML of an album page.

    Returns:
        A (pages, images) tuple of href lists: hrefs containing "?page="
        go into pages; of the remainder, hrefs containing "/image/" go
        into images.  All other hrefs are discarded.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    # Anchors inside the album grid only.
    anchors = soup.select("body.is-main main.layout-dynamic div.container.container_width_wide div.album.album_category a[href]")
    hrefs = [anchor["href"] for anchor in anchors]
    pages = [href for href in hrefs if "?page=" in href]
    # Mirror the original if/elif: an href with "?page=" never counts as an image.
    images = [href for href in hrefs if "?page=" not in href and "/image/" in href]
    return pages, images
def visit_image_and_download(url, driver_path='path_to_chromedriver'):
    """Open an image detail page, record its download link, and download it.

    Loads *url* in headless Chrome, locates the wallpaper download button,
    appends its href to href.txt (a crude progress/audit log), and passes
    the href to download_image().

    Args:
        url: The image detail page to visit.
        driver_path: Path to the ChromeDriver executable (replace the
            default with the actual path on your machine).
    """
    selenium_service = Service(driver_path)
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome without a window.
    driver = webdriver.Chrome(service=selenium_service, options=chrome_options)
    try:
        driver.get(url)
        time.sleep(2)  # Crude wait for the page's JavaScript to finish.
        download_button = driver.find_element(By.CSS_SELECTOR, "html body.is-main main.layout-dynamic div.container.container_width_wide div.wallpaper div.wallpaper__main div.grid div.grid__col.grid__col_xs_12.grid__col_md_3 div.wallpaper__right div.wallpaper__buttons a.btn.btn_block.wallpaper__download")
        href = download_button.get_attribute('href')
        with open('href.txt', 'a') as f:
            f.write('\n' + href)
        download_image(href)
    finally:
        # Quit even when the selector lookup or download raises, so Chrome
        # processes are not leaked across a long crawl.
        driver.quit()
def download_image(image_url):
    """Download *image_url* into the current working directory.

    The local filename is the last path segment of the URL.  A failed
    (non-200) response is reported but does not raise.

    Args:
        image_url: Direct URL of the image file to fetch.
    """
    response = requests.get(image_url)
    if response.status_code == 200:
        # Extract the image filename from the URL.
        filename = image_url.split("/")[-1]
        # Save the image to the current directory.
        with open(filename, "wb") as f:
            f.write(response.content)
        # Bug fix: the message now reports the saved filename instead of
        # the literal placeholder "(unknown)".
        print(f"Downloaded: {filename}")
    else:
        # Previously failures were silent; report them so missing files
        # are diagnosable.
        print(f"Failed to download {image_url} (HTTP {response.status_code})")
def scrape_website(url):
    """Recursively crawl an album page and download every image it links.

    Renders *url*, downloads each "/image/" link found on it, then recurses
    into every not-yet-visited "?page=" pagination link.  The module-level
    Visited_pages set prevents revisiting pages.

    Args:
        url: Album (or pagination) page URL to crawl.
    """
    # Render the page with Selenium, then pull out its links.
    markup = get_href_values(url)
    page_links, image_links = extract_links(markup)
    # Download every image linked from this page.
    for image_link in image_links:
        visit_image_and_download(image_link)
    # Recurse into pagination links we have not seen before.
    for page_link in page_links:
        if page_link in Visited_pages:
            continue
        Visited_pages.add(page_link)
        scrape_website(page_link)
# Script entry point: crawl the album and save all wallpapers into ./images.
# Guarded so importing this module does not trigger a full crawl.
if __name__ == "__main__":
    # Create a directory to store the downloaded images, and work inside it
    # so download_image() writes files there.
    os.makedirs("images", exist_ok=True)
    os.chdir("images")
    # URL to start scraping from.
    start_url = "https://wallspic.com/album/nature/1920x1080"
    scrape_website(start_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment