Created
August 20, 2023 13:31
-
-
Save zyryc/ca83344e0c27c4ff420ece89f10842da to your computer and use it in GitHub Desktop.
Fetch wallpapers from the internet and download them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.options import Options | |
from bs4 import BeautifulSoup | |
import time | |
import os | |
# Pagination URLs already crawled; lets scrape_website() skip pages it has
# seen and so prevents infinite recursion over circular pagination links.
Visited_pages = set()
def get_href_values(url):
    """Load *url* in headless Chrome and return the fully rendered HTML.

    Selenium is used (rather than a plain HTTP fetch) so that content the
    site inserts via JavaScript is present in the returned source.

    Args:
        url: The page to fetch.

    Returns:
        The page's HTML source after JavaScript execution.
    """
    # Replace 'path_to_chromedriver' with the actual path to your
    # ChromeDriver executable.
    selenium_service = Service('path_to_chromedriver')
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome without a window
    driver = webdriver.Chrome(service=selenium_service, options=chrome_options)
    try:
        driver.get(url)
        # Captured after navigation, so it reflects the post-JS DOM.
        return driver.page_source
    finally:
        # Bug fix: always release the browser process, even when
        # driver.get() raises — previously a failure leaked Chrome.
        driver.quit()
def extract_links(page_source):
    """Partition album-grid links into pagination links and image links.

    Args:
        page_source: Rendered HTML of an album listing page.

    Returns:
        A ``(pages, images)`` tuple of href strings: ``pages`` are links
        containing ``?page=``; ``images`` are the remaining links that
        contain ``/image/``.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    # Every anchor inside the album grid on this page.
    anchors = soup.select("body.is-main main.layout-dynamic div.container.container_width_wide div.album.album_category a[href]")
    hrefs = [anchor["href"] for anchor in anchors]
    # Pagination wins over image: a href carrying "?page=" is never
    # treated as an image link (mirrors the original if/elif precedence).
    pages = [href for href in hrefs if "?page=" in href]
    images = [href for href in hrefs if "?page=" not in href and "/image/" in href]
    return pages, images
def visit_image_and_download(url):
    """Open one wallpaper page, resolve its download link, and download it.

    The resolved href is appended to ``href.txt`` as an audit trail, then
    the actual file transfer is delegated to ``download_image``.

    Args:
        url: URL of a single wallpaper image page.
    """
    # Replace 'path_to_chromedriver' with the actual path to your
    # ChromeDriver executable.
    selenium_service = Service('path_to_chromedriver')
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome without a window
    driver = webdriver.Chrome(service=selenium_service, options=chrome_options)
    try:
        driver.get(url)
        # Crude fixed wait for the page to load; an explicit WebDriverWait
        # on the download button would be more robust.
        time.sleep(2)
        download_button = driver.find_element(By.CSS_SELECTOR, "html body.is-main main.layout-dynamic div.container.container_width_wide div.wallpaper div.wallpaper__main div.grid div.grid__col.grid__col_xs_12.grid__col_md_3 div.wallpaper__right div.wallpaper__buttons a.btn.btn_block.wallpaper__download")
        href = download_button.get_attribute('href')
        # Log every resolved download link before attempting the transfer.
        with open('href.txt', 'a') as f:
            f.write('\n' + href)
        download_image(href)
    finally:
        # Bug fix: always release the browser process — previously a
        # failed element lookup or navigation leaked headless Chrome.
        driver.quit()
def download_image(image_url):
    """Download *image_url* into the current working directory.

    The filename is the last path segment of the URL (query string
    stripped). Failures are reported but do not raise, so one bad link
    does not abort the whole crawl.

    Args:
        image_url: Direct URL of the image file.
    """
    # Timeout added so a stalled server cannot hang the crawler forever.
    response = requests.get(image_url, timeout=30)
    if response.status_code == 200:
        # Last URL path segment; drop any ?query suffix so the saved
        # filename is clean (e.g. "wall.jpg?dl=1" -> "wall.jpg").
        filename = image_url.split("/")[-1].split("?")[0]
        with open(filename, "wb") as f:
            f.write(response.content)
        # Bug fix: the original printed a literal placeholder instead of
        # interpolating the filename.
        print(f"Downloaded: {filename}")
    else:
        # Best-effort: report the failure and continue crawling.
        print(f"Failed to download {image_url}: HTTP {response.status_code}")
def scrape_website(url):
    """Recursively crawl an album page.

    Downloads every image linked from *url*, then follows each pagination
    link that has not been visited yet.

    Args:
        url: Album page URL to crawl.
    """
    # Render the page (JavaScript included) and pull out its links.
    rendered_html = get_href_values(url)
    page_links, image_links = extract_links(rendered_html)

    # Download each image found on this page.
    for image_link in image_links:
        visit_image_and_download(image_link)

    # Recurse into unseen pagination links; the module-level Visited_pages
    # set guards against revisiting (and thus against cycles).
    for page_link in page_links:
        if page_link in Visited_pages:
            continue
        Visited_pages.add(page_link)
        scrape_website(page_link)
# All downloads land in ./images: download_image() writes to the current
# working directory, so we create the folder and move into it first.
os.makedirs("images", exist_ok=True)
os.chdir("images")

# Entry point: start crawling the 1920x1080 nature album.
start_url = "https://wallspic.com/album/nature/1920x1080"
scrape_website(start_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment