Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Web scraper to extract links from subpages and download files
#!/usr/bin/python3
import os

from multithread import Downloader
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Index page: it links to one subpage per comic, and each subpage holds the real file link.
my_url = 'https://readasterix.blogspot.com/2017/01/download-asterix-adventures-in-pdf-en.html'

# Launch Chrome through the chromedriver binary found at this path.
chromedriver = "/usr/bin/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

# Load the index page and grab every anchor element on it.
driver.get(my_url)
elements = driver.find_elements_by_tag_name('a')

# Keep only the hrefs that point at a subpage where a PDF can be found;
# anchors that carry no href at all are dropped.
hrefs = (anchor.get_attribute('href') for anchor in elements)
links = [href for href in hrefs if href is not None and "e.filing.ml/p/" in href]
# Now we have a list of links where the files are present. Let's visit them one by one and download each file
# Create a Directory for Downloading the files
dirpath = "AsterixComics"  # Directory that receives the downloaded files.
try:
    # exist_ok=True lets a re-run reuse the directory left by an earlier run.
    # With a plain mkdir the "already exists" error skipped the chdir below, so
    # resumed runs downloaded into the launch directory and never saw the files
    # that were already fetched.
    os.makedirs(dirpath, exist_ok=True)
    os.chdir(dirpath)
except OSError as e:
    # Best effort: report the problem and carry on in the current directory.
    print(f"Failed to create directory: {str(e)}")
# Visit each collected subpage, find its "Download Document" anchor and download
# the file that anchor points to into the current working directory.
# The browser session is always shut down afterwards, even if a page load or a
# download raises, so no Chrome/chromedriver process is left behind.
try:
    for i, link in enumerate(links, start=1):
        print(f'{i} - {link} ..Visiting..')
        driver.get(link)

        # On these subpages the file link is the anchor with this exact text.
        try:
            newel = driver.find_element_by_link_text('Download Document')
        except NoSuchElementException:
            # One page without the expected link should not abort the whole run.
            print(f"No 'Download Document' link found on {link}, skipping.")
            continue

        download_link = newel.get_attribute('href')
        if not download_link:
            # An anchor without an href gives nothing to download.
            print(f"'Download Document' link on {link} has no href, skipping.")
            continue

        # The last path segment of the URL is used as the local file name.
        file_name = download_link.split('/')[-1]
        print(f"Downloading {download_link} => {file_name}")

        # Skip files that already exist so an interrupted run (e.g. after the
        # server reset the connection) can be resumed by starting the script again.
        if os.path.exists(file_name):
            print("Already downloaded.")
            continue

        # Downloader comes from the third-party `multithread` package; per the
        # original author's note it fetches the file using multiple threads.
        download_object = Downloader(download_link, file_name)
        download_object.start()
finally:
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment