@droidzone
Created June 16, 2021 20:59
Web scraper to extract links from subpages and download files
#!/usr/bin/python3
import os

from multithread import Downloader
from selenium import webdriver

my_url = 'https://readasterix.blogspot.com/2017/01/download-asterix-adventures-in-pdf-en.html'
chromedriver = "/usr/bin/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

# Get the initial page, which links to the subpages that hold the actual file links.
driver.get(my_url)
elements = driver.find_elements_by_tag_name('a')

# First collect the links to the subpages where the PDF files can be found.
links = []
for el in elements:
    href = el.get_attribute('href')
    if href is not None and "e.filing.ml/p/" in href:
        links.append(href)

# Now we have a list of subpages hosting the files. Visit them one by one
# and download each file into a directory created for them.
dirpath = "AsterixComics"
try:
    os.makedirs(dirpath, exist_ok=True)  # exist_ok lets the script resume on a re-run.
    os.chdir(dirpath)
except OSError as e:
    print(f"Failed to create directory: {e}")

for i, link in enumerate(links, start=1):
    print(f'{i} - {link} ..Visiting..')
    driver.get(link)
    # On these subpages the download links carry the text 'Download Document',
    # so select the anchor by its link text and extract the PDF URL from it.
    newel = driver.find_element_by_link_text('Download Document')
    download_link = newel.get_attribute('href')
    file_name = download_link.split('/')[-1]  # Extract the file name from the link.
    print(f"Downloading {download_link} => {file_name}")
    # Download only if the file doesn't already exist. This is helpful when the
    # server resets our connection, which is very common when mass-downloading
    # or mirroring sites.
    if not os.path.exists(file_name):
        # The multithread library downloads each file over several connections,
        # which is much faster. The wget module, or simply the ubiquitous
        # requests module, would work here as well.
        download_object = Downloader(download_link, file_name)
        download_object.start()
    else:
        print("Already downloaded.")
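Note that the find_elements_by_tag_name and find_element_by_link_text helpers used above were removed in Selenium 4, and passing the driver path positionally to webdriver.Chrome was dropped as well. If you run this script against a current Selenium, the equivalent calls use the By locator class and a Service object; a minimal sketch of just the changed lines:

# Selenium 4 equivalents of the locator and driver-setup calls used above.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service("/usr/bin/chromedriver"))
elements = driver.find_elements(By.TAG_NAME, 'a')
newel = driver.find_element(By.LINK_TEXT, 'Download Document')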