@jinie
Created August 8, 2019 10:09
Script to archive all electronic issues of Linux Journal
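(Requires the third-party requests and beautifulsoup4 packages.)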
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import re

account_no = '000000'  # LJ account number, zero-prefixed
baseurl = 'https://secure2.linuxjournal.com'

def get_filename_from_cd(cd):
    """
    Get the filename from a Content-Disposition header
    """
    if not cd:
        return None
    fname = re.findall('filename=(.+)', cd)
    if len(fname) == 0:
        return None
    return fname[0]
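# Sanity check of the parsing above, with a hypothetical header value
# (the real one comes back from the download server):
#   get_filename_from_cd('attachment; filename="dlj2019-08.pdf"')
#   -> '"dlj2019-08.pdf"'  (the surrounding quotes are stripped later,
#      in download_file)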
def soup_filter(tag):
    """
    Find all download tags
    """
    return (tag.name == 'a' and
            tag.parent.name == 'div' and
            'downloadbtn' in tag.parent.get('class', []))
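# Example of the markup this filter matches (hypothetical, mirroring the
# structure of the LJ archive page):
#   <div class="downloadbtn"><a href="/pdf/get?...">Download PDF</a></div>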
def get_archive_list(url):
    """
    Return a list of all download links: PDF, EPUB and MOBI
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    return [l['href'] for l in soup.find_all(soup_filter)]
def login(session):
    """
    Log the session in by posting the account number
    """
    try:
        data = {'ucLJFooter_accountnumber': account_no}
        return session.post('https://secure2.linuxjournal.com/pdf/dljdownload.php', data)
    except Exception as e:
        print(e)
        raise
def get_download_link(session, url):
    """
    Return the real download link from the "your download should begin soon" page
    """
    r = session.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    ret = soup.find('a')
    return ret['href']
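# The interstitial page is assumed to carry the actual file link as its
# first anchor, e.g. (hypothetical markup):
#   <a href="/pdf/dl/dlj2019-08.pdf">click here</a>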
def download_file(session, url):
    """
    Download the binary file
    """
    url = baseurl + url
    local_filename = url.split('/')[-1]  # fallback in case Content-Disposition holds no filename
    with session.get(url, stream=True) as r:
        r.raise_for_status()
        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        if filename is not None:
            local_filename = filename.strip('"')
        print(local_filename)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
if __name__ == '__main__':
    s = requests.Session()
    # The login response already contains the archive page, so parse it
    # directly for download links
    r = login(s)
    soup = BeautifulSoup(r.content, 'html.parser')
    archive = [l['href'] for l in soup.find_all(soup_filter)]
    for a in archive:
        dlink = get_download_link(s, a)
        download_file(s, dlink)
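# Usage sketch: set account_no at the top to your own (zero-prefixed)
# LJ account number, then run the script; each issue is saved to the
# current directory under its server-supplied filename.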
@robomfeinberg

Awesome 👍

@jinie (Author) commented Aug 8, 2019

It’s not pretty, but considering it’s a one-time thing and I spent only about 20 minutes writing it, I think it’s “good enough”(TM) :-)

@sebastienmasson

+1

@sgargbugreporter

This looks good, but how does one find the account number? For me, the way to access the site is by using an email address and ZIP code ...
