Skip to content

Instantly share code, notes, and snippets.

@RascalTwo
Last active August 4, 2021 00:21
Show Gist options
  • Save RascalTwo/9c624e5a94292ad6f99852840ecd8245 to your computer and use it in GitHub Desktop.
Save RascalTwo/9c624e5a94292ad6f99852840ecd8245 to your computer and use it in GitHub Desktop.
Downloads almost all the PDFs linked from https://github.com/terryum/awesome-deep-learning-papers; thrown together in about an hour.
# Download the PDFs linked from the awesome-deep-learning-papers README.
#
# The README page is fetched, every "[pdf]" link is extracted together with
# the paper title that precedes it, and each PDF is saved under "output/".
# `requests` is used when it is installed; otherwise the script falls back to
# the standard library's urllib.
import html
import os
import re

try:
    import requests
    has_requests = True
except ImportError:
    # requests is optional; urllib from the standard library is the fallback.
    import urllib.request
    has_requests = False

SOURCE_URL = "https://github.com/terryum/awesome-deep-learning-papers"
OUTPUT_DIR = "output"
# Text that closes every "[pdf]" anchor in the rendered README.
PDF_LINK_MARKER = '">[pdf]</a>'
# Socket timeout in seconds, so one stalled server cannot hang the whole run.
REQUEST_TIMEOUT = 60

_TAG_PATTERN = re.compile(r"<[^>]+>")
_PATH_SEPARATORS = re.compile(r"[\\/]")
_WINDOWS_RESERVED = re.compile(r'[:*?"<>|]')


def fetch_page(url):
    """Return the text of the page at *url*."""
    if has_requests:
        return requests.get(url, timeout=REQUEST_TIMEOUT).text
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode("utf-8")


def clean_title(raw_title):
    """Return *raw_title* with HTML tags removed and entities decoded."""
    return html.unescape(_TAG_PATTERN.sub("", raw_title)).strip()


def extract_pdf_links(page):
    """Return a list of ``{"title": ..., "url": ...}`` dicts for *page*.

    One entry is produced for every ``[pdf]`` anchor found in the HTML text
    *page*. The title is the text of the enclosing list item up to and
    including its first ``)``; the URL is the ``href`` of the ``[pdf]``
    anchor itself. Anchors without a recognisable ``href`` are skipped.
    """
    entries = []
    # The text after the last marker belongs to no "[pdf]" link, so drop it.
    for chunk in page.split(PDF_LINK_MARKER)[:-1]:
        item = chunk.split("<li>")[-1]
        if 'href="' not in item:
            continue
        # The "[pdf]" anchor is the last one in the chunk; cut its href at the
        # closing quote so attributes such as rel="nofollow" are not included.
        url = item.rpartition('href="')[2].split('"')[0]
        title = item.split(")")[0] + ")"
        entries.append({
            "title": clean_title(title),
            "url": html.unescape(url),
        })
    return entries


def output_path(title):
    """Return the path the PDF called *title* is saved to.

    Path separators in the title are replaced with ``_`` so the title cannot
    point into a different directory; on Windows the characters that are not
    allowed in file names are replaced as well.
    """
    name = _PATH_SEPARATORS.sub("_", title)
    if os.name == "nt":
        name = _WINDOWS_RESERVED.sub("_", name)
    return "output/{}.pdf".format(name)


def fetch_pdf(url):
    """Return ``(content_type, body)`` for *url*.

    *body* is the response bytes, or ``None`` when the Content-Type header
    does not mention ``pdf`` (a missing header counts as not a PDF).
    """
    if has_requests:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        content_type = response.headers.get("Content-Type", "")
        body = response.content if "pdf" in content_type else None
        return content_type, body
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        content_type = response.headers.get("Content-Type", "")
        # Only read the body when it is worth keeping.
        body = response.read() if "pdf" in content_type else None
        return content_type, body


def main():
    """Download every linked PDF and print a summary of the failures."""
    pdfs = extract_pdf_links(fetch_page(SOURCE_URL))
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    fails = []
    for pdf in pdfs:
        path = output_path(pdf["title"])
        print("File: " + path)
        print("URL: " + pdf["url"])
        try:
            if os.path.isfile(path):
                print("File already exists, skipping.")
                continue
            content_type, body = fetch_pdf(pdf["url"])
            if body is None:
                # Each backend keeps the message it has always printed here.
                print("Not a PDF" if has_requests else content_type)
                fails.append(pdf)
                continue
            # The body is fully downloaded before the file is created, so a
            # failed transfer never leaves a partial file that a later run
            # would skip as "already exists".
            with open(path, "wb") as output:
                output.write(body)
        except Exception as exception:
            # Best-effort: one bad link must not stop the remaining downloads.
            print("Unable to download.")
            print("Reason: {}".format(exception))
            fails.append(pdf)

    print("Finished.")
    for fail in fails:
        print(fail["title"])
        print(fail["url"])
        print("")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment