Skip to content

Instantly share code, notes, and snippets.

@fronbasal
Created July 6, 2024 19:28
Show Gist options
  • Save fronbasal/d2e3688016b4b07fd27647aaf9c4f142 to your computer and use it in GitHub Desktop.
Save fronbasal/d2e3688016b4b07fd27647aaf9c4f142 to your computer and use it in GitHub Desktop.
SciHub downloader
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Inspired by https://github.com/stormsuresh92/Bulk-download-Sci-Hub-papers
class SciHub:
def __init__(self, doi: str, path: Path, url='https://sci-hub.se/', timeout=60):
self.url = url
self.timeout = timeout
self.path = path
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
}
self.payload = {
'sci-hub-plugin-check': '',
'request': str(doi)
}
def _send_request(self):
res = requests.post(self.url, headers=self.headers, data=self.payload, timeout=self.timeout)
assert res.ok, 'Failed to fetch %s, status code: %d' % (self.url, res.status_code)
return res
def _extract_url(self, response):
soup = BeautifulSoup(response.content, 'html.parser')
try:
content_url = soup.find(id='pdf').get('src').replace('#navpanes=0&view=FitH', '').replace('//', '/')
if not content_url.endswith('.pdf'):
raise AttributeError()
except AttributeError:
print("Failed to find ", self.payload['request'])
return
if content_url.startswith('/downloads'):
return 'https://sci-hub.se' + content_url
elif content_url.startswith('/tree'):
return 'https://sci-hub.se' + content_url
elif content_url.startswith('/uptodate'):
return 'https://sci-hub.se' + content_url
else:
return 'https:/' + content_url
def fetch(self):
response = self._send_request()
pdf_url = self._extract_url(response)
if pdf_url is None:
return
pdf_name = pdf_url.split('/')[-1]
self.path.joinpath(pdf_name).write_bytes(requests.get(pdf_url).content)
print("Downloaded %s" % pdf_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment