#!/usr/bin/env python3
# Python - 1. TEMPLATE including changing the working directory.
# 2. Scrapes all .htm links on a page, puts them in a list, then downloads them.
# Change working directory
import os

project_path = '/home/WHATEVER'
try:
    os.chdir(project_path)
    print("Current working directory: {0}".format(os.getcwd()))
except FileNotFoundError:
    print("Directory: {0} does not exist".format(project_path))
except NotADirectoryError:
    print("{0} is not a directory".format(project_path))
except PermissionError:
    print("You do not have permissions to change to {0}".format(project_path))
import os.path
import requests
from bs4 import BeautifulSoup
# for sleeping:
from time import sleep
import random
def make_list_of_links_from_URL(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    list_of_links = []
    for link in soup.find_all('a', href=True):
        if link['href'].lower().endswith(".htm"):  # only .htm
            list_of_links.append(link.get('href'))
    return list_of_links
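
# Sketch (not part of the original gist): the hrefs collected above can be relative
# (e.g. "page2.htm"), which requests cannot fetch on their own. Resolving them against
# the page URL with urllib.parse.urljoin makes them downloadable. The helper name
# make_absolute_links is assumed here for illustration.
from urllib.parse import urljoin

def make_absolute_links(base_url, links):
    # Resolve possibly-relative hrefs against the page they were scraped from.
    return [urljoin(base_url, href) for href in links]
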
def download_file_from_URL(URL, save_dir):
    file_name = URL.split('/')[-1]
    file_path_and_name = os.path.join(save_dir, file_name)
    if os.path.isfile(file_path_and_name):
        print("File already exists: " + file_path_and_name)
    else:
        req = requests.get(URL, allow_redirects=True)
        with open(file_path_and_name, 'wb') as file:
            file.write(req.content)
        print("Saved file: " + file_path_and_name)
        # Sleep a random interval between downloads to avoid hammering the server.
        sleeptime = random.uniform(1, 65)
        print("sleeping for", sleeptime, "seconds")
        sleep(sleeptime)
        # the following for a stream download:
        # req = requests.get(URL, stream=True)
        # with open(file_path_and_name, 'wb') as f:
        #     for chunk in req.iter_content(chunk_size=1024*1024):
        #         if chunk:
        #             f.write(chunk)
    return
def download_list_of_URLs(list_of_URLs, save_dir):
    for URL in list_of_URLs:
        download_file_from_URL(URL, save_dir)
    print("All files downloaded!")
    return
# TO RUN:
URL = "https://placeholder.com"
save_dir = "downloads"  # directory must already exist
list_of_links = make_list_of_links_from_URL(URL)
download_list_of_URLs(list_of_links, save_dir)
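# Sketch: if the scraped hrefs are relative, resolve them first with the
# make_absolute_links helper sketched above before downloading:
# list_of_links = make_absolute_links(URL, list_of_links)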