Skip to content

Instantly share code, notes, and snippets.

@Kinjalrk2k
Created April 12, 2020 12:17
Show Gist options
  • Save Kinjalrk2k/a48f007494b23e6115bb4328da32b901 to your computer and use it in GitHub Desktop.
Save Kinjalrk2k/a48f007494b23e6115bb4328da32b901 to your computer and use it in GitHub Desktop.
Download PDFs from Shattak in one go!
"""Download every PDF linked from a Shattak subject page in one go.

The subject page lists ``./download...`` sub-pages; each sub-page in turn
links to one or more ``.pdf`` files. Every PDF found is saved into the
``files`` directory next to where the script is run.
"""
import os
import re
import urllib.request
from urllib.parse import urljoin

import wget
from bs4 import BeautifulSoup

SUBJECT_URL = (
    "https://www.shattak.com/quordenet/subject?code=APT-101&name=tapas-sir"
)
DOWNLOAD_DIR = "files"


def _fetch_soup(url):
    """Fetch *url* and return its HTML parsed into a ``BeautifulSoup`` tree.

    The HTTP response is closed as soon as the document has been read.
    """
    with urllib.request.urlopen(url) as response:
        # Name the parser explicitly: without it bs4 warns and picks
        # whichever parser happens to be installed on the machine.
        return BeautifulSoup(response, "html.parser")


def _hrefs(soup):
    """Return the ``href`` value of every ``<a>`` tag that has one."""
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]


def main():
    """Collect the download sub-pages, then fetch every PDF they link to."""
    # Resolve hrefs against the page they came from instead of gluing
    # strings together, which produced ".../quordenet//download..." URLs.
    links = [
        urljoin(SUBJECT_URL, href)
        for href in _hrefs(_fetch_soup(SUBJECT_URL))
        if href.startswith("./download")
    ]

    # wget.download does not create missing directories.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    pdf_links = []
    for page_url in links:
        for href in _hrefs(_fetch_soup(page_url)):
            if not href.endswith(".pdf"):
                continue
            pdf_links.append(href)
            pdf_url = urljoin(page_url, href)
            print(pdf_url)
            fname = pdf_url.split("/")[-1]
            wget.download(pdf_url, os.path.join(DOWNLOAD_DIR, fname))
            # wget's progress bar does not end with a newline.
            print()

    print(links)
    print(pdf_links)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment