# Python - 1. TEMPLATE including changing working dir. 2. Code scrapes all .htm links on a page, puts them in a list, then downloads them.
#!/usr/bin/env python3
# Python - scrape all .htm links on a page, put them in a list, then download them.
# Change the working directory to the project folder.
import os

project_path = '/home/WHATEVER'
try:
    os.chdir(project_path)
    print("Current working directory: {0}".format(os.getcwd()))
except FileNotFoundError:
    print("Directory: {0} does not exist".format(project_path))
except NotADirectoryError:
    print("{0} is not a directory".format(project_path))
except PermissionError:
    print("You do not have permissions to change to {0}".format(project_path))
import os.path
import requests
from bs4 import BeautifulSoup

# For sleeping between downloads:
from time import sleep
import random
def make_list_of_links_from_URL(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    list_of_links = []
    for link in soup.find_all('a', href=True):
        if link['href'].lower().endswith(".htm"):  # keep only .htm links
            list_of_links.append(link.get('href'))
    return list_of_links
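
# The hrefs collected above may be relative (e.g. "page.htm"), which the
# downloader below cannot fetch directly. A minimal sketch of a variant that
# resolves each href against the page URL with urllib.parse.urljoin; the
# function name here is hypothetical, not part of the original:
from urllib.parse import urljoin

def make_list_of_absolute_links(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    return [urljoin(url, link['href'])
            for link in soup.find_all('a', href=True)
            if link['href'].lower().endswith(".htm")]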
def download_file_from_URL(URL, save_dir):
    file_name = URL.split('/')[-1]
    file_path_and_name = os.path.join(save_dir, file_name)
    if os.path.isfile(file_path_and_name):
        print("File already exists: " + file_path_and_name)
    else:
        req = requests.get(URL, allow_redirects=True)
        with open(file_path_and_name, 'wb') as file:
            file.write(req.content)
        print("Saved file: " + file_path_and_name)
        # Sleep a random interval between downloads to go easy on the server:
        sleeptime = random.uniform(1, 65)
        print("sleeping for", sleeptime, "seconds")
        sleep(sleeptime)
        # The following for a stream download (chunked writes, for large files):
        # req = requests.get(URL, stream=True)
        # with open(file_path_and_name, 'wb') as f:
        #     for chunk in req.iter_content(chunk_size=1024*1024):
        #         if chunk:
        #             f.write(chunk)
    return
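
# requests.get does not raise on HTTP error statuses, so a 404 error page
# would be written to disk as if it were the file. A minimal guard, if that
# matters for your use, goes right after the requests.get call above:
# req.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses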
def download_list_of_URLs(list_of_URLs, save_dir):
    for URL in list_of_URLs:
        download_file_from_URL(URL, save_dir)
    print("All files downloaded!")
    return
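
# If the save directory may not exist yet, it can be created up front before
# downloading; a small sketch (exist_ok=True makes this a no-op when the
# directory is already there):
# os.makedirs(save_dir, exist_ok=True)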
# TO RUN:
URL = "https://placeholder.com"
save_dir = "downloads"  # choose the directory to save files into
list_of_links = make_list_of_links_from_URL(URL)
download_list_of_URLs(list_of_links, save_dir)