Skip to content

Instantly share code, notes, and snippets.

@tpoisot
Created February 14, 2016 15:53
Show Gist options
  • Save tpoisot/931590ac85a2f6f9f059 to your computer and use it in GitHub Desktop.
Save tpoisot/931590ac85a2f6f9f059 to your computer and use it in GitHub Desktop.
#ICanHazPDF
#! /usr/bin/env python
import re
import requests
import tempfile
import urllib.request
import random
import sys
import os
# ANSI escape codes for terminal colors
class acol:
    """ANSI escape sequences used to colorize terminal output.

    Each attribute is a plain string meant to be concatenated into printed
    text. ``END`` resets the formatting switched on by the other codes.
    """

    END = '\033[0m'       # reset all attributes
    BOLD = '\033[1m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    CYAN = '\033[96m'
def get_scihub_pdf(doi, timeout=30):
    """Look up the direct PDF link for *doi* on a Sci-Hub mirror.

    A mirror is picked at random, the DOI page is fetched, and the returned
    HTML is searched for the ``<iframe ... id = "pdf">`` element that embeds
    the document.

    Args:
        doi: DOI of the article; appended verbatim to the mirror URL.
        timeout: Value passed to ``requests.get`` as its ``timeout`` so a
            silent mirror cannot hang the script forever.

    Returns:
        The ``src`` URL of the embedded PDF, exactly as found in the page.

    Raises:
        ValueError: If the request fails, the final URL no longer contains
            the chosen mirror's host name, or no PDF iframe is found. The
            underlying reason is attached as ``__cause__``.
    """
    mirror = random.choice(["sci-hub.io", "sci-hub.cc"])
    doi_url = "http://" + mirror + "/" + doi
    print("\t" + acol.MAGENTA + "SciHub:\t" + acol.END + doi_url + acol.END)
    # Raw string avoids the invalid "\." escape; [^"]+ keeps the match inside
    # the src attribute instead of greedily running to a later '.pdf"'.
    pdf_frame = re.compile(r'<iframe src = "([^"]+\.pdf)" id = "pdf">')
    try:
        # One request is enough: the response already carries both the final
        # (post-redirect) URL and the page body.
        response = requests.get(doi_url, timeout=timeout)
        landing_url = response.url
        if mirror not in landing_url:
            print("\t" + acol.MAGENTA + "Goto:\t" + acol.END + landing_url + acol.END)
            raise ValueError("Redirected")
        match = pdf_frame.search(response.text)
        if match is None:
            raise ValueError("Unable to read PDF")
    except (requests.RequestException, ValueError) as err:
        # Same exception type and message as before, but the real cause is
        # chained, and KeyboardInterrupt/SystemExit are no longer swallowed.
        raise ValueError("No PDF known to SciHub") from err
    pdf_url = match.group(1)
    print("\t" + acol.BLUE + "PDF at:\t" + acol.END + pdf_url + acol.END)
    return pdf_url
def download_file(url, fname, timeout=30):
    """Download the file itself.

    Streams the body of *url* to disk in 1 KiB chunks.

    Args:
        url: Address of the resource to fetch.
        fname: Path of the file to create or overwrite.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        *fname*, unchanged.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Nothing is written to *fname* in that case.
    """
    # Identify as a desktop browser instead of the default requests client.
    header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
    r = requests.get(url, stream=True, headers=header, timeout=timeout)
    try:
        # Fail before opening the file so an HTTP error page is never saved
        # under the PDF's name.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until closed.
        r.close()
    return fname
def _looks_like_pdf(path):
    """Return True if the file at *path* carries a PDF header."""
    # Read in binary mode: decoding a real PDF as text is what made the old
    # check unreliable. The "%PDF-" marker is expected near the file start.
    with open(path, "rb") as fh:
        return b"%PDF-" in fh.read(1024)


def main(argv=None):
    """Fetch the PDF for the DOI given on the command line.

    Args:
        argv: Argument vector (program name followed by the DOI). Defaults
            to ``sys.argv``.

    Raises:
        SystemExit: On wrong usage or when the download itself fails.
        ValueError: If no PDF link could be obtained for the DOI; the
            original error is chained as ``__cause__``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        sys.exit("usage: ./icanhazpdf.py doi")
    doi = argv[1]
    try:
        url = get_scihub_pdf(doi)
    except ValueError as err:
        raise ValueError("Something went wrong.") from err
    print("\t" + acol.YELLOW + "PDF URL:\t" + acol.END + url)
    fname = doi.replace('/', '.') + ".pdf"
    try:
        download_file(url, fname)
    except OSError as err:
        # requests' exceptions derive from IOError/OSError, so this covers
        # both network failures and problems writing the file.
        sys.exit("Download failed: " + str(err))
    # Sci Hub might answer with a captcha page instead of the document, so
    # only keep the file when it really is a PDF.
    if _looks_like_pdf(fname):
        print("\t" + acol.BOLD + acol.GREEN + "#ICanHazPDF! " + acol.END + "\n")
    else:
        os.remove(fname)
        print("\t" + acol.BOLD + acol.RED + "Captcha'ed, try again later" + acol.END + "\n")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment