@stzsch
Created November 4, 2019 21:12
magzdb.org download
# Requires wget to be installed on the system
# Instructions: log in to magzdb.org and export your cookies to a cookies.txt file in the same directory as this script, then run it
import os
import re

import requests
from bs4 import BeautifulSoup
url = 'http://magzdb.org'
# TODO: make mag a cmd parameter (a possible argparse sketch follows the script)
mag = '/j/1341' # 2600 Hackers Quarterly
r = requests.get(url+mag)
soup = BeautifulSoup(r.text, 'html.parser')
# Strip the site name, drop colons, and replace spaces with underscores for the filename
title = soup.title.text.split(' | ')[0].replace(":","").replace(" ","_")
# yl references the yellow links corresponding to available issues
# TODO: report which issues got downloaded and which ones are missing
for yl in soup.find_all(style="background-color: yellow"):
    # The parent link's title attribute has the form "<year> №<issue>"
    [yl_year, yl_issue] = yl.parent['title'].split(' №')
    yl_href = yl.parent['href']
    yl_r = requests.get(url + yl_href)
    yl_soup = BeautifulSoup(yl_r.text, 'html.parser')
    # The issue page links to the file through a relative "../file/..." href
    yl_dl = yl_soup.find(href=re.compile("file"))['href']
    yl_dl_url = yl_dl.split('..')[1]
    # Shelling out to the system wget seemed like less of a hassle
    # (a pure-requests alternative is sketched after the script)
    # TODO: use directories instead of filenames?
    wget_cmd = ('wget --load-cookies=./cookies.txt ' + url + yl_dl_url
                + ' -O "' + title + '_' + yl_year + '_' + yl_issue + '.pdf"')
    os.system(wget_cmd)
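
One way to handle the mag TODO above is argparse. This is a minimal sketch; the --mag flag name and its default are my additions, not part of the original gist:

import argparse

parser = argparse.ArgumentParser(description='Download all available issues of a magazine from magzdb.org')
# --mag is a hypothetical flag; it defaults to the value hardcoded in the gist
parser.add_argument('--mag', default='/j/1341',
                    help='magazine path on magzdb.org, e.g. /j/1341 for 2600 Hackers Quarterly')
args = parser.parse_args()
mag = args.mag  # would replace the hardcoded assignment above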
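
The wget shellout could also be replaced with a pure-requests download that reuses the same cookies.txt. A sketch, assuming the exported file is in the Netscape format that wget reads (http.cookiejar.MozillaCookieJar parses that format); download_issue is a hypothetical helper, not part of the original gist:

import http.cookiejar

import requests

def download_issue(dl_url, out_path):
    # Load the same Netscape-format cookies.txt that wget consumes
    jar = http.cookiejar.MozillaCookieJar('cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)
    # Stream the PDF to disk instead of shelling out to wget
    with requests.get(dl_url, cookies=jar, stream=True) as resp:
        resp.raise_for_status()
        with open(out_path, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)

# Inside the loop this would be called as:
# download_issue(url + yl_dl_url, title + '_' + yl_year + '_' + yl_issue + '.pdf')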