Last active
February 2, 2018 07:20
-
-
Save tatsy/fc12cf675759a4868c57ea9b659fab9b to your computer and use it in GitHub Desktop.
Download open access paper from Google Scholar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
import sys
import urllib.parse
import urllib.request
import webbrowser

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the Scholar request — presumably to avoid
# being served a blocked/captcha page with the default urllib agent (TODO confirm).
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
# Google Scholar search endpoint (Japanese mirror) used to build query URLs.
base_url = 'https://scholar.google.co.jp/scholar'
def get_confirm_token(response):
    """Return the value of the first 'download_warning*' cookie, or None.

    Servers that interpose a "confirm download" page (e.g. Google Drive)
    signal it via such a cookie on the first response.
    """
    warnings = (value for key, value in response.cookies.items()
                if key.startswith('download_warning'))
    return next(warnings, None)
def save_response_content(response, destination):
    """Stream the body of *response* into the file *destination*.

    Prints a running megabyte counter to stdout while downloading and a
    final "Saved to" line when done.

    Args:
        response: an object with ``iter_content(chunk_size)`` yielding bytes
            (e.g. a ``requests.Response`` opened with ``stream=True``).
        destination: output file path; overwritten if it already exists.
    """
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        downloaded = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # skip keep-alive / empty chunks
                downloaded += len(chunk)
                f.write(chunk)
                sys.stdout.write('\r%.2f MB downloaded...' % (downloaded / 1.0e6))
                sys.stdout.flush()
        sys.stdout.write('\nSaved to: %s\n' % destination)
        sys.stdout.flush()
def download(url, dest):
    """Download *url* to the local file *dest*.

    Handles the Google-Drive-style "confirm" interstitial: if the first
    response sets a download-warning cookie, the request is retried with the
    token passed back as the ``confirm`` query parameter so the server sends
    the actual file instead of the warning page.  (The original code retried
    with the identical URL, so the token was retrieved but never used.)
    """
    session = requests.Session()
    response = session.get(url, stream=True)

    token = get_confirm_token(response)
    if token:
        # Re-request WITH the token — this is the fix; without it the
        # confirm flow can never succeed.
        response = session.get(url, params={'confirm': token}, stream=True)

    print('Downloading:', url)
    save_response_content(response, dest)
def build_fname(title, author):
    """Build a lower-case PDF file name from a paper title and author name.

    The name is ``<author-last-word>_<title-word-1>_<title-word-2>.pdf``,
    e.g. ``smith_deep_learning.pdf``.  Unlike the original, this tolerates
    titles with fewer than two words (uses whatever words exist) and strips
    surrounding whitespace so a leading space cannot produce an empty token.

    Args:
        title: paper title.
        author: author string; the last whitespace-separated word is treated
            as the surname.
    Returns:
        The file name as a lower-case string ending in ``.pdf``.
    """
    # Raw strings for the regexes: '\s' in a plain literal is a
    # DeprecationWarning in modern Python.
    title_words = re.split(r'\s+', title.strip())
    author_words = re.split(r'\s+', author.strip())
    # Up to the first two title words — slicing avoids the IndexError the
    # original raised on one-word titles.
    parts = [author_words[-1]] + title_words[:2]
    return ('_'.join(parts) + '.pdf').lower()
def main():
    """Search Google Scholar for a paper, let the user pick one, and fetch it.

    Flow: query Scholar with the given title, list up to ``--count`` hits,
    prompt for a number, then download the direct PDF if one is linked or
    open the paper's page in the browser otherwise.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description='Get a literature from Google Scholar.')
    parser.add_argument('-n', '--name', type=str, required=True,
                        help='Name of the paper that you are looking for.')
    parser.add_argument('-c', '--count', type=int, default=10,
                        help='Number of paper candidates listed in the console.')
    parser.add_argument('--since', type=int, default=-1,
                        help='The program searches papers published after the year specified for this parameter.')
    args = parser.parse_args()

    # Build the query URL.
    data = {'hl': 'en', 'q': args.name}
    if args.since >= 0:
        data['as_ylo'] = args.since
    url_query = base_url + '?' + urllib.parse.urlencode(data)

    # Fetch the result page with a browser-like User-Agent (presumably to
    # avoid Scholar rejecting the default urllib agent — see USER_AGENT).
    # The original wrapped this in `except Exception as e: raise e`, which
    # is a no-op; errors simply propagate.
    req = urllib.request.Request(url=url_query, headers={'User-Agent': USER_AGENT})
    resp = urllib.request.urlopen(req)
    html = resp.read().decode('utf-8')

    # Paper list
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', attrs={'class': 'gs_r gs_or gs_scl'})
    count = min(len(items), args.count)
    print('\n----- Papers -----')
    for i in range(count):
        try:
            title = items[i].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
            print('[{0:2d}] {1}'.format(i + 1, title))
        except AttributeError:
            # Result entry without a linked title (e.g. citation-only hit).
            continue
    print('[ 0] Not found')
    print('')

    # Choose paper: keep prompting until a number in [0, count] is entered.
    num = -1
    while num < 0 or num > count:
        try:
            num = int(input('Choose number >> '))
        except ValueError:
            num = -1
            print('Invalid number!')

    # 0 means "not in the list": open the search page in the browser instead.
    if num == 0:
        print('Sorry! Open Google Scholar page.')
        print('Redirect to: {0}'.format(url_query))
        # webbrowser is portable and, unlike os.system('open "..."'),
        # cannot be shell-injected through the query string.
        webbrowser.open(url_query)
        return

    # Get PDF link; fall back to opening the paper's landing page.
    try:
        a_tag = items[num - 1].find('div', attrs={'class': 'gs_ggs gs_fl'}).find('a')
        pdf_link = a_tag.get('href')
        title = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a').get_text()
        first_author = items[num - 1].find('div', attrs={'class': 'gs_a'}).find('a').get_text()
        download(pdf_link, build_fname(title, first_author))
    except AttributeError:
        # Missing PDF sidebar / title / author anchor all surface here.
        print('PDF not found! Open default URL.')
        a_tag = items[num - 1].find('h3', attrs={'class': 'gs_rt'}).find('a')
        page_link = a_tag.get('href')
        webbrowser.open(page_link)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dependencies
Setup
With Miniconda or Anaconda,
Usage
Just with the paper title.
$ python scholar.py -n "<PAPER NAME>"
Specify the year since.
$ python scholar.py --since 2018 -n "<PAPER NAME>"