@dartar
Created May 19, 2012 16:49
Modified version of scholar.py (a Python Google Scholar parser by Christian Kreibich) that exposes the direct PDF download URL when available
#! /usr/bin/env python
"""
This module provides classes for querying Google Scholar and parsing
returned results. It currently *only* processes the first results
page. It is not a recursive crawler.
"""
# Version: 1.3 -- $Date: 2012-02-01 16:51:16 -0800 (Wed, 01 Feb 2012) $
#
# ChangeLog
# ---------
#
# 1.3: Updates to reflect changes in Scholar's page rendering.
#
# 1.2: Minor tweaks, mostly thanks to helpful feedback from Dan Bolser.
# Thanks Dan!
#
# 1.1: Made author field explicit, added --author option.
#
# pylint: disable-msg=C0111
#
# Copyright 2010--2012 Christian Kreibich. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import optparse
import sys
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
class Article():
"""
A class representing articles listed on Google Scholar. The class
provides basic dictionary-like behavior.
"""
def __init__(self):
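        # Each attribute maps to a [value, display label, sort order]
        # triple; the order controls column/row ordering in the output.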
        self.attrs = {'title':         [None, 'Title',          0],
                      'url':           [None, 'URL',            1],
                      'num_citations': [0,    'Citations',      2],
                      'num_versions':  [0,    'Versions',       3],
                      'url_citations': [None, 'Citations list', 4],
                      'url_versions':  [None, 'Versions list',  5],
                      'direct_url':    [None, 'Direct URL',     6]}

    def __getitem__(self, key):
        if key in self.attrs:
            return self.attrs[key][0]
        return None

    def __setitem__(self, key, item):
        if key in self.attrs:
            self.attrs[key][0] = item
        else:
            self.attrs[key] = [item, key, len(self.attrs)]

    def __delitem__(self, key):
        if key in self.attrs:
            del self.attrs[key]

    def as_txt(self):
        # Get items sorted in specified order:
        items = sorted(self.attrs.values(), key=lambda item: item[2])
        # Find largest label length:
        max_label_len = max([len(str(item[1])) for item in items])
        fmt = '%%%ds %%s' % max_label_len
        return '\n'.join([fmt % (item[1], item[0]) for item in items])

    def as_csv(self, header=False, sep='|'):
        # Get keys sorted in specified order:
        keys = [pair[0] for pair in \
                sorted([(key, val[2]) for key, val in self.attrs.items()],
                       key=lambda pair: pair[1])]
        res = []
        if header:
            res.append(sep.join(keys))
        res.append(sep.join([str(self.attrs[key][0]) for key in keys]))
        return '\n'.join(res)

class ScholarParser():
"""
ScholarParser can parse HTML document strings obtained from Google
Scholar. It invokes the handle_article() callback on each article
that was parsed successfully.
"""
SCHOLAR_SITE = 'http://scholar.google.com'
def __init__(self, site=None):
self.soup = None
self.article = None
self.site = site or self.SCHOLAR_SITE
def handle_article(self, art):
"""
In this base class, the callback does nothing.
"""
def parse(self, html):
"""
This method initiates parsing of HTML content.
"""
self.soup = BeautifulSoup(html)
for div in self.soup.findAll(ScholarParser._tag_checker):
self._parse_article(div)
def _parse_article(self, div):
self.article = Article()
for tag in div:
if not hasattr(tag, 'name'):
continue
if tag.name == 'div' and tag.get('class') == 'gs_rt' and \
tag.h3 and tag.h3.a:
self.article['title'] = ''.join(tag.h3.a.findAll(text=True))
self.article['url'] = self._path2url(tag.h3.a['href'])
if tag.name == 'font':
for tag2 in tag:
if not hasattr(tag2, 'name'):
continue
if tag2.name == 'span' and tag2.get('class') == 'gs_fl':
self._parse_links(tag2)
if self.article['title']:
self.handle_article(self.article)
def _parse_links(self, span):
for tag in span:
if not hasattr(tag, 'name'):
continue
if tag.name != 'a' or tag.get('href') == None:
continue
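            # "Cited by N" links point at /scholar?cites=..., while
            # "All N versions" links point at /scholar?cluster=...: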
            if tag.get('href').startswith('/scholar?cites'):
                if hasattr(tag, 'string') and tag.string.startswith('Cited by'):
                    self.article['num_citations'] = \
                        self._as_int(tag.string.split()[-1])
                self.article['url_citations'] = self._path2url(tag.get('href'))

            if tag.get('href').startswith('/scholar?cluster'):
                if hasattr(tag, 'string') and tag.string.startswith('All '):
                    self.article['num_versions'] = \
                        self._as_int(tag.string.split()[1])
                self.article['url_versions'] = self._path2url(tag.get('href'))

    @staticmethod
    def _tag_checker(tag):
        if tag.name == 'div' and tag.get('class') == 'gs_r':
            return True
        return False

    def _as_int(self, obj):
        try:
            return int(obj)
        except ValueError:
            return None

    def _path2url(self, path):
        if path.startswith('http://'):
            return path
        if not path.startswith('/'):
            path = '/' + path
        return self.site + path

class ScholarParser120201(ScholarParser):
"""
This class reflects update to the Scholar results page layout that
Google recently.
"""
def _parse_article(self, div):
self.article = Article()
for tag in div:
if not hasattr(tag, 'name'):
continue
if tag.name == 'h3' and tag.get('class') == 'gs_rt' and tag.a:
self.article['title'] = ''.join(tag.a.findAll(text=True))
self.article['url'] = self._path2url(tag.a['href'])
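            # The 'gs_ggs gs_fl' div carries the direct full-text link
            # (typically a PDF) when Scholar found one; parsing it is
            # the modification that exposes 'direct_url'.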
            if tag.name == 'div' and tag.get('class') == 'gs_ggs gs_fl' and tag.a:
                self.article['direct_url'] = self._path2url(tag.a['href'])

            if tag.name == 'div' and tag.get('class') == 'gs_fl':
                self._parse_links(tag)

        if self.article['title']:
            self.handle_article(self.article)

class ScholarQuerier():
"""
ScholarQuerier instances can conduct a search on Google Scholar
with subsequent parsing of the resulting HTML content. The
articles found are collected in the articles member, a list of
Article instances.
"""
SCHOLAR_URL = 'http://scholar.google.com/scholar?hl=en&q="%(query)s"+author:%(author)s&btnG=Search&as_subj=eng&as_sdt=1,5&as_ylo=&as_vis=0'
"""
Older URLs:
http://scholar.google.com/scholar?q=%s&hl=en&btnG=Search&as_sdt=2001&as_sdtp=on
"""
UA = 'Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.2.9) Gecko/20100913 Firefox/3.6.9'
class Parser(ScholarParser120201):
def __init__(self, querier):
ScholarParser.__init__(self)
self.querier = querier
def handle_article(self, art):
self.querier.add_article(art)
def __init__(self, author='', scholar_url=None):
self.articles = []
self.author = author
self.scholar_url = scholar_url or self.SCHOLAR_URL
def query(self, search):
"""
This method initiates a query with subsequent parsing of the
response.
"""
        req = urllib2.Request(url=self.scholar_url \
                              % {'query': urllib.quote(search),
                                 'author': urllib.quote(self.author)},
                              headers={'User-Agent': self.UA})
        hdl = urllib2.urlopen(req)
        html = hdl.read()
        hdl.close()
        self.parse(html)

    def parse(self, html):
        """
        This method allows parsing of existing HTML content.
        """
        parser = self.Parser(self)
        parser.parse(html)

    def add_article(self, art):
        self.articles.append(art)

def txt(query, author, count):
    querier = ScholarQuerier(author=author)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    for art in articles:
        print art.as_txt() + '\n'

def csv(query, author, count, header=False, sep='|'):
    querier = ScholarQuerier(author=author)
    querier.query(query)
    articles = querier.articles
    if count > 0:
        articles = articles[:count]
    for art in articles:
        result = art.as_csv(header=header, sep=sep)
        print result.encode('utf-8')
        header = False

def main():
usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar."""
fmt = optparse.IndentedHelpFormatter(max_help_position=50,
width=100)
parser = optparse.OptionParser(usage=usage, formatter=fmt)
parser.add_option('-a', '--author',
help='Author name')
parser.add_option('--csv', action='store_true',
help='Print article data in CSV format (separator is "|")')
parser.add_option('--csv-header', action='store_true',
help='Like --csv, but print header line with column names')
parser.add_option('--txt', action='store_true',
help='Print article data in text format')
parser.add_option('-c', '--count', type='int',
help='Maximum number of results')
parser.set_defaults(count=0, author='')
options, args = parser.parse_args()
if len(args) == 0:
print 'Hrrrm. I need a query string.'
sys.exit(1)
query = ' '.join(args)
if options.csv:
csv(query, author=options.author, count=options.count)
if options.csv_header:
csv(query, author=options.author, count=options.count, header=True)
if options.txt:
txt(query, author=options.author, count=options.count)
if __name__ == "__main__":
main()
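Beyond the command line, the querier can also be driven programmatically. A minimal sketch (the author and query strings below are placeholders):

    querier = ScholarQuerier(author='Einstein')
    querier.query('photoelectric effect')
    for art in querier.articles:
        if art['direct_url']:
            print art['direct_url']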
dartar commented May 19, 2012

A DOI/OA resolver

What should it do?

  • lookup a DOI and return a direct link to an OA version of the paper (e.g. a downloadable PDF)
  • lookup a DOI and return TRUE if a direct link to an OA version of the paper exists, FALSE otherwise
  • lookup a DOI and return an embeddable JS widget with an OA icon and the direct OA link

How would it work?

Use existing services that expose OA status and direct download links associated with a DOI. Scholar, Mendeley, and CiteULike could be good starting points; the code above uses Scholar as a data source. A sketch of the first lookup follows.
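
A rough sketch under those assumptions (Scholar offers no official DOI lookup, so this simply searches for the bare DOI string via the ScholarQuerier above; the helper names are hypothetical):

    def resolve_doi(doi):
        # Hypothetical helper: return a direct OA link for a DOI, or None.
        # Assumes a Scholar search for the bare DOI string surfaces the paper.
        querier = ScholarQuerier()
        querier.query(doi)
        for art in querier.articles:
            if art['direct_url']:
                return art['direct_url']
        return None

    def is_oa(doi):
        # The TRUE/FALSE lookup from the list above.
        return resolve_doi(doi) is not None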

Is this really OA?

Depending on the data source used, this service will return a paper's full-text availability status, whether that availability is temporary or permanent, and whether the paper is hosted on a platform (journal or repository) that includes bibliographic metadata. Note that this is not OA in the strict sense, and it could potentially include copyright-infringing content.

Isn't this what Google Scholar does?

It's roughly the same idea, but the difference is that this service should provide an API, not a go-to app. It should be ubiquitous (displaying OA status not just in search results but anywhere a citation can be found: reference managers, online articles, personal homepages with publication lists, CVs, etc.) and it should use an OA icon.

What would a consumer app look like?

A browser extension that detects DOIs and adds DOI/OA results next to them.
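
The detection half can be sketched in Python (a real extension would do this in JavaScript against the page DOM; the regex only approximates the common 10.<registrant>/<suffix> DOI shape):

    import re

    # "10." + 4-9 digit registrant code + "/" + suffix characters.
    DOI_RE = re.compile(r'\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+')

    def find_dois(text):
        return DOI_RE.findall(text)

    print find_dois('See doi:10.1371/journal.pone.0012345 for details.')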
