@tatsy
Last active February 2, 2018 07:20
Download an open-access paper from Google Scholar
import os
import sys
import re
import argparse
import urllib.parse
import urllib.request
import requests
from bs4 import BeautifulSoup
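
# Pose as a regular desktop browser; Google Scholar tends to reject
# requests from clients without a browser-like user agent.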
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
base_url = 'https://scholar.google.co.jp/scholar'

def get_confirm_token(response):
    # Some hosts (notably Google Drive) answer large downloads with a
    # confirmation page; the token to bypass it is stored in a cookie
    # whose name starts with 'download_warning'.
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        dl = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive chunks
                dl += len(chunk)
                f.write(chunk)
                mb = dl / 1.0e6
                sys.stdout.write('\r%.2f MB downloaded...' % mb)
                sys.stdout.flush()
    sys.stdout.write('\nSaved to: %s\n' % destination)
    sys.stdout.flush()

def download(url, dest):
    session = requests.Session()
    response = session.get(url, stream=True)
    token = get_confirm_token(response)
    if token:
        # Retry with the confirmation token so the host serves the file itself.
        response = session.get(url, params={'confirm': token}, stream=True)
    print('Downloading:', url)
    save_response_content(response, dest)

def build_fname(title, author):
    # Name the file "<author's last word>_<first two title words>.pdf".
    title_words = re.split(r'\s+', title)
    author_words = re.split(r'\s+', author)
    fname = author_words[-1] + '_' + title_words[0] + '_' + title_words[1] + '.pdf'
    return fname.lower()
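
# For instance, build_fname('Generative Adversarial Networks', 'I Goodfellow')
# would return 'goodfellow_generative_adversarial.pdf' (hypothetical inputs).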

def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Fetch a paper from Google Scholar.')
    parser.add_argument('-n', '--name', type=str, required=True,
                        help='Title of the paper that you are looking for.')
    parser.add_argument('-c', '--count', type=int, default=10,
                        help='Number of paper candidates listed in the console.')
    parser.add_argument('--since', type=int, default=-1,
                        help='Only list papers published in or after this year.')
    args = parser.parse_args()

    # Build the search query
    data = {}
    data['hl'] = 'en'
    data['q'] = args.name
    if args.since >= 0:
        data['as_ylo'] = args.since
    url_data = urllib.parse.urlencode(data)
    url_query = base_url + '?' + url_data

    # Fetch the result page
    req = urllib.request.Request(url=url_query, headers={'User-Agent': USER_AGENT})
    try:
        resp = urllib.request.urlopen(req)
        html = resp.read().decode('utf-8')
    except Exception:
        print('Failed to fetch the result page (Scholar may be blocking automated queries).')
        raise
    # Paper list
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', attrs={'class': 'gs_r gs_or gs_scl'})
    count = min(len(items), args.count)

    print('\n----- Papers -----')
    for i in range(count):
        try:
            title = items[i].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
            print('[{0:2d}] {1}'.format(i + 1, title))
        except AttributeError:
            # Some entries (e.g. citation-only results) have no title link.
            continue
    print('[ 0] Not found')
    print('')
    # Choose paper
    num = -1
    while num < 0 or num > count:
        num = input('Choose number >> ')
        try:
            num = int(num)
        except ValueError:
            num = -1
            print('Invalid number!')

    # Not found: fall back to opening the search page itself
    if num == 0:
        print('Sorry! Opening the Google Scholar page.')
        print('Redirect to: {0}'.format(url_query))
        os.system('open "{0}"'.format(url_query))  # 'open' is macOS; use 'xdg-open' on Linux
        return

    # Get PDF link
    try:
        a_tag = items[num - 1].find('div', attrs={'class': 'gs_ggs gs_fl'}).find('a')
        pdf_link = a_tag.get('href')
        title = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
        first_author = items[num - 1].find('div', attrs={'class': 'gs_a'}).find('a').get_text()
        download(pdf_link, build_fname(title, first_author))
    except Exception:
        # No direct PDF link (or the download failed); open the paper's page instead.
        print('PDF not found! Opening the default URL.')
        a_tag = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a')
        page_link = a_tag.get('href')
        os.system('open "{0}"'.format(page_link))


if __name__ == '__main__':
    main()
tatsy commented Jan 7, 2018

Dependencies

  • Python 3.5
  • Requests
  • Beautiful Soup 4

Setup

With Miniconda or Anaconda,

$ conda install requests beautifulsoup4
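
Alternatively, the same packages are available on PyPI (assuming a Python 3 environment):

$ pip install requests beautifulsoup4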

Usage

Search with just the paper title:

$ python scholar.py -n "<PAPER NAME>"

Restrict results to papers published since a given year:

$ python scholar.py --since 2018 -n "<PAPER NAME>"
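
The number of candidates listed can be changed with -c/--count (the script defaults to 10):

$ python scholar.py -c 20 -n "<PAPER NAME>"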
