Last active
February 2, 2018 07:20
-
-
Save tatsy/fc12cf675759a4868c57ea9b659fab9b to your computer and use it in GitHub Desktop.
Download open access paper from Google Scholar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
import sys
import urllib.parse
import urllib.request
import webbrowser

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the Scholar request — presumably to avoid
# being served a blocked/captcha page with the default urllib agent (TODO confirm).
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
# Google Scholar search endpoint (Japanese mirror) used to build query URLs.
base_url = 'https://scholar.google.co.jp/scholar'
def get_confirm_token(response):
    """Return the value of the first 'download_warning*' cookie, or None.

    Servers that interpose a "confirm download" page (e.g. Google Drive)
    signal it via such a cookie on the first response.
    """
    warnings = (value for key, value in response.cookies.items()
                if key.startswith('download_warning'))
    return next(warnings, None)
def save_response_content(response, destination):
    """Stream the body of *response* into the file *destination*.

    Prints a running megabyte counter to stdout while downloading and a
    final "Saved to" line when done.

    Args:
        response: an object with ``iter_content(chunk_size)`` yielding bytes
            (e.g. a ``requests.Response`` opened with ``stream=True``).
        destination: output file path; overwritten if it already exists.
    """
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        downloaded = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # skip keep-alive / empty chunks
                downloaded += len(chunk)
                f.write(chunk)
                sys.stdout.write('\r%.2f MB downloaded...' % (downloaded / 1.0e6))
                sys.stdout.flush()
        sys.stdout.write('\nSaved to: %s\n' % destination)
        sys.stdout.flush()
def download(url, dest):
    """Download *url* to the local file *dest*.

    Handles the Google-Drive-style "confirm" interstitial: if the first
    response sets a download-warning cookie, the request is retried with the
    token passed back as the ``confirm`` query parameter so the server sends
    the actual file instead of the warning page.  (The original code retried
    with the identical URL, so the token was retrieved but never used.)
    """
    session = requests.Session()
    response = session.get(url, stream=True)

    token = get_confirm_token(response)
    if token:
        # Re-request WITH the token — this is the fix; without it the
        # confirm flow can never succeed.
        response = session.get(url, params={'confirm': token}, stream=True)

    print('Downloading:', url)
    save_response_content(response, dest)
def build_fname(title, author):
    """Build a lower-case PDF file name from a paper title and author name.

    The name is ``<author-last-word>_<title-word-1>_<title-word-2>.pdf``,
    e.g. ``smith_deep_learning.pdf``.  Unlike the original, this tolerates
    titles with fewer than two words (uses whatever words exist) and strips
    surrounding whitespace so a leading space cannot produce an empty token.

    Args:
        title: paper title.
        author: author string; the last whitespace-separated word is treated
            as the surname.
    Returns:
        The file name as a lower-case string ending in ``.pdf``.
    """
    # Raw strings for the regexes: '\s' in a plain literal is a
    # DeprecationWarning in modern Python.
    title_words = re.split(r'\s+', title.strip())
    author_words = re.split(r'\s+', author.strip())
    # Up to the first two title words — slicing avoids the IndexError the
    # original raised on one-word titles.
    parts = [author_words[-1]] + title_words[:2]
    return ('_'.join(parts) + '.pdf').lower()
def main():
    """Search Google Scholar for a paper, let the user pick one, and fetch it.

    Flow: query Scholar with the given title, list up to ``--count`` hits,
    prompt for a number, then download the direct PDF if one is linked or
    open the paper's page in the browser otherwise.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description='Get a literature from Google Scholar.')
    parser.add_argument('-n', '--name', type=str, required=True,
                        help='Name of the paper that you are looking for.')
    parser.add_argument('-c', '--count', type=int, default=10,
                        help='Number of paper candidates listed in the console.')
    parser.add_argument('--since', type=int, default=-1,
                        help='The program searches papers published after the year specified for this parameter.')
    args = parser.parse_args()

    # Build the query URL.
    data = {'hl': 'en', 'q': args.name}
    if args.since >= 0:
        data['as_ylo'] = args.since
    url_query = base_url + '?' + urllib.parse.urlencode(data)

    # Fetch the result page with a browser-like User-Agent (presumably to
    # avoid Scholar rejecting the default urllib agent — see USER_AGENT).
    # The original wrapped this in `except Exception as e: raise e`, which
    # is a no-op; errors simply propagate.
    req = urllib.request.Request(url=url_query, headers={'User-Agent': USER_AGENT})
    resp = urllib.request.urlopen(req)
    html = resp.read().decode('utf-8')

    # Paper list
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', attrs={'class': 'gs_r gs_or gs_scl'})
    count = min(len(items), args.count)
    print('\n----- Papers -----')
    for i in range(count):
        try:
            title = items[i].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
            print('[{0:2d}] {1}'.format(i + 1, title))
        except AttributeError:
            # Result entry without a linked title (e.g. citation-only hit).
            continue
    print('[ 0] Not found')
    print('')

    # Choose paper: keep prompting until a number in [0, count] is entered.
    num = -1
    while num < 0 or num > count:
        try:
            num = int(input('Choose number >> '))
        except ValueError:
            num = -1
            print('Invalid number!')

    # 0 means "not in the list": open the search page in the browser instead.
    if num == 0:
        print('Sorry! Open Google Scholar page.')
        print('Redirect to: {0}'.format(url_query))
        # webbrowser is portable and, unlike os.system('open "..."'),
        # cannot be shell-injected through the query string.
        webbrowser.open(url_query)
        return

    # Get PDF link; fall back to opening the paper's landing page.
    try:
        a_tag = items[num - 1].find('div', attrs={'class': 'gs_ggs gs_fl'}).find('a')
        pdf_link = a_tag.get('href')
        title = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
        first_author = items[num - 1].find('div', attrs={'class': 'gs_a'}).find('a').get_text()
        download(pdf_link, build_fname(title, first_author))
    except AttributeError:
        # Missing PDF sidebar / title / author anchor all surface here.
        print('PDF not found! Open default URL.')
        a_tag = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a')
        page_link = a_tag.get('href')
        webbrowser.open(page_link)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dependencies
Setup
With Miniconda or Anaconda,
Usage
Just with the paper title.
$ python scholar.py -n "<PAPER NAME>"
Specify the year since.
$ python scholar.py --since 2018 -n "<PAPER NAME>"