@stzsch
Created November 4, 2019 21:12
magzdb.org download
# Requires wget to be installed on the system
# Instructions: log in to magzdb.org and export your cookies to a cookies.txt file in the same directory as this script, then run it
import os
import re

import requests
from bs4 import BeautifulSoup
url = 'http://magzdb.org'
# TODO: make mag a cmd parameter (a possible argparse sketch follows the script)
mag = '/j/1341' # 2600 Hackers Quarterly
r = requests.get(url+mag)
soup = BeautifulSoup(r.text, 'html.parser')
# Strip the site name, drop colons, and replace spaces with underscores for the filename
title = soup.title.text.split(' | ')[0].replace(":","").replace(" ","_")
# yl references the yellow links corresponding to available issues
# TODO: report which issues got downloaded and which ones are missing
for yl in soup.find_all(style="background-color: yellow"):
    # The parent link's title attribute has the form "<year> №<issue>"
    [yl_year, yl_issue] = yl.parent['title'].split(' №')
    yl_href = yl.parent['href']
    yl_r = requests.get(url + yl_href)
    yl_soup = BeautifulSoup(yl_r.text, 'html.parser')
    # The issue page links to the file through a relative "../file/..." href
    yl_dl = yl_soup.find(href=re.compile("file"))['href']
    yl_dl_url = yl_dl.split('..')[1]
    # Shelling out to the system wget seemed like less of a hassle
    # (a pure-requests alternative is sketched after the script)
    # TODO: use directories instead of filenames?
    wget_cmd = ('wget --load-cookies=./cookies.txt ' + url + yl_dl_url
                + ' -O "' + title + '_' + yl_year + '_' + yl_issue + '.pdf"')
    os.system(wget_cmd)
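
One way to handle the mag TODO above is argparse. This is a minimal sketch; the --mag flag name and its default are my additions, not part of the original gist:

import argparse

parser = argparse.ArgumentParser(description='Download all available issues of a magazine from magzdb.org')
# --mag is a hypothetical flag; it defaults to the value hardcoded in the gist
parser.add_argument('--mag', default='/j/1341',
                    help='magazine path on magzdb.org, e.g. /j/1341 for 2600 Hackers Quarterly')
args = parser.parse_args()
mag = args.mag  # would replace the hardcoded assignment above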
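
The wget shellout could also be replaced with a pure-requests download that reuses the same cookies.txt. A sketch, assuming the exported file is in the Netscape format that wget reads (http.cookiejar.MozillaCookieJar parses that format); download_issue is a hypothetical helper, not part of the original gist:

import http.cookiejar

import requests

def download_issue(dl_url, out_path):
    # Load the same Netscape-format cookies.txt that wget consumes
    jar = http.cookiejar.MozillaCookieJar('cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)
    # Stream the PDF to disk instead of shelling out to wget
    with requests.get(dl_url, cookies=jar, stream=True) as resp:
        resp.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

# Inside the loop this would be called as:
# download_issue(url + yl_dl_url, title + '_' + yl_year + '_' + yl_issue + '.pdf')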