@tm9k1
Created January 31, 2024 09:26
Script to scrape bwallpaperhd.com for Microsoft Spotlight wallpapers
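The script targets Python 3 and uses three third-party packages visible in its imports: requests, beautifulsoup4, and tqdm. Assuming those are installed (e.g. via pip), it can be saved and run directly; re-running it later only fetches wallpapers that are not already in wallpaper_save_path.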
#!/usr/bin/python3
import os
from bs4 import BeautifulSoup
import requests
import concurrent.futures
from tqdm import tqdm
# change these two to whatever you like.
# NOTE: the next run will re-download any wallpapers you move out of wallpaper_save_path.
wallpaper_save_path = '/mnt/hdd/baadal/tm9k1/images/wallpapers/horizontal/spotlight'
webpages_save_path = '/tmp/webpages/'
# make sure both paths end in a slash, since filenames are appended to them below
if not wallpaper_save_path.endswith('/'):
    wallpaper_save_path += '/'
if not webpages_save_path.endswith('/'):
    webpages_save_path += '/'

# This is the link I use to get the wallpapers. The site is updated daily, with the latest wallpapers at the top.
sitemap_url = 'https://www.bwallpaperhd.com/sitemap.html'
sitemap_filename = webpages_save_path + sitemap_url.split('/')[-1]
# download a file at download_url to file_name
def download_a_file(download_url: str, file_name: str):
    # build a sibling path with an 0INCOMPLETE_ prefix on the filename
    incomplete_file_name = file_name[:file_name.rfind('/')+1] + '0INCOMPLETE_' + file_name[file_name.rfind('/')+1:]
    # remove any stale download of the file
    if os.path.exists(incomplete_file_name):
        os.remove(incomplete_file_name)
    # download the file
    response = requests.get(download_url, stream=True)
    if response.status_code == 200:
        # write it as 0INCOMPLETE_filename so broken files can be handled later
        # (see the optional cleanup sketch at the end of the script)
        with open(incomplete_file_name, 'wb') as f:
            for chunk in response:
                f.write(chunk)
        # rename the file to file_name only once the whole body has been written
        os.rename(incomplete_file_name, file_name)
# returns a list of tuples as [(Wallpaper Name, http://wallpaper.webpage/url.html), ...]
def extract_links_from_sitemap(sitemap_url: str, sitemap_filename: str):
    # remove any stale download of the sitemap
    if os.path.exists(sitemap_filename):
        os.remove(sitemap_filename)
    # download the sitemap
    download_a_file(sitemap_url, sitemap_filename)
    with open(sitemap_filename, encoding='utf-8') as f:
        soup = BeautifulSoup(f, "html.parser")
    divs = soup.find_all('div', attrs={'id': 'content'})
    list_of_wallpapers = []
    for div in divs:
        # only the "Latest WallPapers" section of the sitemap is of interest
        h3 = div.find_all('h3', string='Latest WallPapers')
        if h3:
            anchor_tags = div.find_all('a')
            for anchor_tag in anchor_tags:
                wallpaper_name = anchor_tag.contents[0]
                wallpaper_webpage_url = anchor_tag.get('href')
                list_of_wallpapers.append((wallpaper_name, wallpaper_webpage_url))
    return list_of_wallpapers
# expects a tuple from list_of_wallpapers: (Wallpaper Name, http://wallpaper.webpage/url.html)
def download_a_wallpaper(tup: tuple):
    wallpaper_name, wallpaper_webpage_url = tup
    wallpaper_webpage_filename = wallpaper_webpage_url.split('/')[-1]
    # the wallpaper image shares its basename with its webpage
    wallpaper_filename = wallpaper_webpage_filename[:wallpaper_webpage_filename.find('.')]
    wallpaper_filename = wallpaper_filename + '.jpg'  # hopefully they won't ditch jpg anytime soon!
    # skip wallpapers that were already downloaded on a previous run
    if not os.path.exists(wallpaper_save_path + wallpaper_filename):
        if not os.path.exists(webpages_save_path + wallpaper_webpage_filename):
            download_a_file(wallpaper_webpage_url, webpages_save_path + wallpaper_webpage_filename)
        with open(webpages_save_path + wallpaper_webpage_filename, encoding='utf-8') as f:
            soup = BeautifulSoup(f, "html.parser")
        divs = soup.find_all('div', attrs={'class': 'download'})
        for div in divs:
            anchors = div.find_all('a')
            for anchor in anchors:
                # the "Original" link points at the full-resolution image
                if anchor.contents[0].strip().lower() == 'original':
                    wallpaper_url = anchor.get('href')
                    download_a_file(wallpaper_url, wallpaper_save_path + wallpaper_filename)
        # the webpage is no longer needed once the wallpaper has been fetched
        os.remove(webpages_save_path + wallpaper_webpage_filename)
    return wallpaper_name
## MAIN ACTION IS HERE
# make the necessary directories in case they don't exist
os.makedirs(wallpaper_save_path, exist_ok=True)
os.makedirs(webpages_save_path, exist_ok=True)
# extract the list of wallpapers with links to each wallpaper's own webpage
list_of_wallpapers = extract_links_from_sitemap(sitemap_url, sitemap_filename)
# get the total count of wallpapers we can expect by the end of processing
total_wallpapers = len(list_of_wallpapers)
# create a progress bar so the user knows what's going on
progress_bar = tqdm(total=total_wallpapers, desc="Downloading Wallpapers", colour="blue",
                    bar_format='{l_bar}{bar}| [{n_fmt}/{total_fmt}] [Elapsed: {elapsed}]')
# distribute the work across multiple threads; 10 seems to be a safe count for now
with concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix='Wallpaper_Downloader_') as executor:
    futures = [executor.submit(download_a_wallpaper, wallpaper_entry) for wallpaper_entry in list_of_wallpapers]
    # tick the progress bar as each wallpaper finishes, in completion order
    for future in concurrent.futures.as_completed(futures):
        progress_bar.n += 1
        progress_bar.refresh()
progress_bar.close()
# clean up the downloaded sitemap webpage
os.remove(sitemap_filename)
print('Done! Enjoy your new collection! =)')
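
## OPTIONAL: a hypothetical cleanup pass, not part of the original script.
# download_a_file() already clears its own stale 0INCOMPLETE_ file before it
# retries a download, but a wallpaper that later drops off the sitemap can
# leave a partial file behind forever. Assuming the paths configured above,
# a sweep like this removes those leftovers:
import glob
for leftover in glob.glob(wallpaper_save_path + '0INCOMPLETE_*'):
    os.remove(leftover)
    print('Removed stale partial download:', leftover)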