# Python - 1. TEMPLATE including changing working dir. 2. Code scrapes all .htm links on a page, puts them in a list, then downloads them.
#!/usr/bin/env python3
# Python - scrape all .htm links on a page, put them in a list, then download them.
# Change the working directory to the project folder.
import os

project_path = '/home/WHATEVER'
try:
    os.chdir(project_path)
    print("Current working directory: {0}".format(os.getcwd()))
except FileNotFoundError:
    print("Directory: {0} does not exist".format(project_path))
except NotADirectoryError:
    print("{0} is not a directory".format(project_path))
except PermissionError:
    print("You do not have permissions to change to {0}".format(project_path))
import os.path
import requests
from bs4 import BeautifulSoup

# For sleeping between downloads:
from time import sleep
import random
def make_list_of_links_from_URL(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    list_of_links = []
    for link in soup.find_all('a', href=True):
        if link['href'].lower().endswith(".htm"):  # keep only .htm links
            list_of_links.append(link.get('href'))
    return list_of_links
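
# The hrefs collected above may be relative (e.g. "page.htm"), which the
# downloader below cannot fetch directly. A minimal sketch of a variant that
# resolves each href against the page URL with urllib.parse.urljoin; the
# function name here is hypothetical, not part of the original:
from urllib.parse import urljoin

def make_list_of_absolute_links(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    return [urljoin(url, link['href'])
            for link in soup.find_all('a', href=True)
            if link['href'].lower().endswith(".htm")]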
def download_file_from_URL(URL, save_dir):
    file_name = URL.split('/')[-1]
    file_path_and_name = os.path.join(save_dir, file_name)
    if os.path.isfile(file_path_and_name):
        print("File already exists: " + file_path_and_name)
    else:
        req = requests.get(URL, allow_redirects=True)
        with open(file_path_and_name, 'wb') as file:
            file.write(req.content)
        print("Saved file: " + file_path_and_name)
        # Sleep a random interval between downloads to go easy on the server:
        sleeptime = random.uniform(1, 65)
        print("sleeping for", sleeptime, "seconds")
        sleep(sleeptime)
        # The following for a stream download (chunked writes, for large files):
        # req = requests.get(URL, stream=True)
        # with open(file_path_and_name, 'wb') as f:
        #     for chunk in req.iter_content(chunk_size=1024*1024):
        #         if chunk:
        #             f.write(chunk)
    return
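
# requests.get does not raise on HTTP error statuses, so a 404 error page
# would be written to disk as if it were the file. A minimal guard, if that
# matters for your use, goes right after the requests.get call above:
# req.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses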
def download_list_of_URLs(list_of_URLs, save_dir):
    for URL in list_of_URLs:
        download_file_from_URL(URL, save_dir)
    print("All files downloaded!")
    return
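
# If the save directory may not exist yet, it can be created up front before
# downloading; a small sketch (exist_ok=True makes this a no-op when the
# directory is already there):
# os.makedirs(save_dir, exist_ok=True)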
# TO RUN:
URL = "https://placeholder.com"
save_dir = "downloads"  # choose the directory to save files into
list_of_links = make_list_of_links_from_URL(URL)
download_list_of_URLs(list_of_links, save_dir)