@jinie
Created August 8, 2019 10:09
Script to archive all electronic issues of Linux Journal
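(Requires the third-party requests and beautifulsoup4 packages.)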
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import re

account_no = '000000'  # LJ account number, zero-prefixed
baseurl = 'https://secure2.linuxjournal.com'

def get_filename_from_cd(cd):
    """
    Get the filename from a Content-Disposition header
    """
    if not cd:
        return None
    fname = re.findall('filename=(.+)', cd)
    if len(fname) == 0:
        return None
    return fname[0]
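# Sanity check of the parsing above, with a hypothetical header value
# (the real one comes back from the download server):
#   get_filename_from_cd('attachment; filename="dlj2019-08.pdf"')
#   -> '"dlj2019-08.pdf"'  (the surrounding quotes are stripped later,
#      in download_file)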
def soup_filter(tag):
    """
    Find all download tags
    """
    return (tag.name == 'a' and
            tag.parent.name == 'div' and
            'downloadbtn' in tag.parent.get('class', []))
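# Example of the markup this filter matches (hypothetical, mirroring the
# structure of the LJ archive page):
#   <div class="downloadbtn"><a href="/pdf/get?...">Download PDF</a></div>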
def get_archive_list(url):
    """
    Return a list of all download links: PDF, EPUB and MOBI
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    return [l['href'] for l in soup.find_all(soup_filter)]
def login(session):
    """
    Log the session in by posting the account number
    """
    try:
        data = {'ucLJFooter_accountnumber': account_no}
        return session.post('https://secure2.linuxjournal.com/pdf/dljdownload.php', data)
    except Exception as e:
        print(e)
        raise
def get_download_link(session, url):
    """
    Return the real download link from the "your download should begin soon" page
    """
    r = session.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    ret = soup.find('a')
    return ret['href']
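# The interstitial page is assumed to carry the actual file link as its
# first anchor, e.g. (hypothetical markup):
#   <a href="/pdf/dl/dlj2019-08.pdf">click here</a>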
def download_file(session, url):
    """
    Download the binary file
    """
    url = baseurl + url
    local_filename = url.split('/')[-1]  # fallback in case Content-Disposition holds no filename
    with session.get(url, stream=True) as r:
        r.raise_for_status()
        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        if filename is not None:
            local_filename = filename.strip('"')
        print(local_filename)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
if __name__ == '__main__':
    s = requests.Session()
    # The login response already contains the archive page, so parse it
    # directly for download links
    r = login(s)
    soup = BeautifulSoup(r.content, 'html.parser')
    archive = [l['href'] for l in soup.find_all(soup_filter)]
    for a in archive:
        dlink = get_download_link(s, a)
        download_file(s, dlink)
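# Usage sketch: set account_no at the top to your own (zero-prefixed)
# LJ account number, then run the script; each issue is saved to the
# current directory under its server-supplied filename.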
@robomfeinberg

Awesome 👍

@jinie (Author) commented Aug 8, 2019

It’s not pretty, but considering it’s a one-time thing and I spent only about 20 minutes writing it, I think it’s “good enough”(TM) :-)

@sebastienmasson

+1

@sgargbugreporter

This looks good, but how does one find the account number? For me, the way to access the site is by using an email address and ZIP code ...
