Last active December 30, 2017 22:38
A simple Python 3 script to download all Linux Journal archives! -- Only available till 31 Dec :( -- (needs the `requests` and `bs4` packages)
import requests
import bs4
import re
import os
# Download every PDF linked from the archive index page into the current
# working directory. A file already on disk is kept only when its size matches
# the Content-Length reported by a HEAD request; otherwise it is fetched again.

# NOTE(review): both URL literals are empty strings in this copy of the script
# and look like they were stripped; fill them in before running -- TODO confirm.
INDEX_URL = ''      # page that lists the per-issue links
DOWNLOAD_BASE = ''  # prefix prepended to the href found on each issue page

# Compiled once: the same pattern is used for the index and for every issue page.
PDF_HREF = re.compile("pdf")


def _filename_from_disposition(header):
    """Return the bare file name carried by a Content-Disposition header.

    Args:
        header: The raw header value, or None when the server sent none.

    Returns:
        The file name with any directory components removed.

    Raises:
        ValueError: If the header is missing or carries no usable file name.
    """
    match = re.search(r'filename\s*=\s*"?([^";]+)"?', header or '')
    if match is None:
        raise ValueError('no filename in content-disposition header: {0!r}'.format(header))
    # The name comes from the server: keep only the last path component so it
    # cannot point outside the current directory.
    name = os.path.basename(match.group(1).strip().replace('\\', '/'))
    if not name:
        raise ValueError('empty filename in content-disposition header: {0!r}'.format(header))
    return name


def _download(url, filename):
    """Fetch `url` and write the response body to `filename`.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as if it were the PDF.
    """
    response = requests.get(url)
    response.raise_for_status()
    with open(filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print('Wrote {0}'.format(filename))


full_html_response = requests.get(INDEX_URL)
full_html = bs4.BeautifulSoup(full_html_response.text, 'html.parser')
all_tags = full_html.find_all(href=PDF_HREF)
print('Number of download links found :: {0}'.format(len(all_tags)))

for tag in all_tags:
    # Per-link boundary: a failure on one issue must not abort the whole run.
    try:
        issue_url = tag.attrs.get('href')
        redirected_link = requests.get(issue_url)
        link_html = bs4.BeautifulSoup(redirected_link.text, 'html.parser')
        link_tag = link_html.find(href=PDF_HREF)
        if link_tag is None:
            raise ValueError('no PDF link found on {0}'.format(issue_url))
        download_url = DOWNLOAD_BASE + link_tag.attrs.get('href')

        # HEAD only: learn the name and size without transferring the PDF.
        content_response_head = requests.head(download_url)
        raw_length = content_response_head.headers.get('Content-Length')
        # None means the server did not report a size; the comparison below
        # then never matches, so the file is downloaded again.
        content_length = int(raw_length) if raw_length is not None else None
        content_disposition_header = content_response_head.headers.get('content-disposition')
        print('content disposition header : {0}'.format(content_disposition_header))
        filename = _filename_from_disposition(content_disposition_header)
        print('Current file -> {0}'.format(filename))

        if os.path.isfile(filename):
            print('Checking filename {0}'.format(filename))
            # Size from file metadata; no need to read the PDF into memory.
            existing_file_content_length = os.path.getsize(filename)
            print('Existing file length {0} | download content length {1}'.format(existing_file_content_length, content_length))
            if existing_file_content_length != content_length:
                _download(download_url, filename)
            else:
                print('File {0} exists.. skipping'.format(filename))
        else:
            _download(download_url, filename)
    except Exception as exc:
        print('Exception occurred, skipping: {0!r}'.format(exc))

print('Done -- PDFs Downloaded!')
Including skipping if the filename exists! :)

Added sha1 checksum checking

Check the file's Content-Length with a HEAD request, so that the file is not downloaded just to check whether it is already available. :)

