Skip to content

Instantly share code, notes, and snippets.

@rgiot
Last active October 10, 2015 12:37
Show Gist options
  • Save rgiot/3690666 to your computer and use it in GitHub Desktop.
Save rgiot/3690666 to your computer and use it in GitHub Desktop.
Python script that extracts some metrics from a Google Scholar citations page
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Python script to extract citations numbers from google citation pages
Extracts information from a Google Scholar citations page (number of citations, h-index, i10-index).
The main program outputs LaTeX definitions so that the results can be used automatically
in a LaTeX document (e.g. an academic CV).
"""
# imports
from bs4 import BeautifulSoup
import httplib, sys
# HTTP request headers: a browser-like User-Agent string.
HEADERS = dict([
    ('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'),
])
def extract_information_for(user):
    """Extract the citation metrics for the required user.

    Parameters
    ==========
    - user: str
        Code of the user of google citation (sent as the ``user`` query
        parameter).

    Returns
    =======
    - tuple of three int: ``(nb_citations, h_index, i10_index)`` read from
      the table whose id is ``stats``.

    Raises
    ======
    - ValueError: the downloaded page contains no ``<table id="stats">``.

    When the HTTP status is not 200, the status and reason are written to
    stderr and the process exits with status -1.
    """
    conn = httplib.HTTPConnection('scholar.google.co.uk')
    try:
        # A GET request carries no body; only the headers are supplied.
        conn.request("GET", '/citations?user=%s' % user, headers=HEADERS)
        resp = conn.getresponse()
        if resp.status != 200:
            # Report on stderr so stdout only ever carries the real output.
            sys.stderr.write('Error: %s %s\n' % (resp.status, resp.reason))
            sys.exit(-1)
        # Download stuff; non-ASCII bytes are dropped.
        html = resp.read().decode('ascii', 'ignore')
    finally:
        # Always release the socket, including on the error/exit path.
        conn.close()

    soup = BeautifulSoup(html, 'html5lib')

    # Search interesting part (only the first matching table is used).
    table = soup.find('table', {'id': 'stats'})
    if table is None:
        raise ValueError(
            'no <table id="stats"> found in the citation page of user %r'
            % (user,))

    # The three metrics sit in three consecutive sibling nodes; in each of
    # them the value is the node right after the first child element.
    row = table.next_element.next_element.next_sibling
    nb_citations = int(row.next_element.next_sibling.text)
    row = row.next_sibling
    h_index = int(row.next_element.next_sibling.text)
    row = row.next_sibling
    i10_index = int(row.next_element.next_sibling.text)
    return nb_citations, h_index, i10_index
if __name__ == '__main__':
    # Use the user code given as first command-line argument, falling back
    # to a built-in default when none is supplied.
    user = sys.argv[1] if len(sys.argv) >= 2 else 'bwwARrQAAAAJ'

    # Build a latex content
    nb_citations, h_index, i10_index = extract_information_for(user)

    # Raw strings keep the LaTeX backslashes literal without relying on
    # invalid escape sequences (``\d``, ``\s``); the emitted text is the same.
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(r"\def\scholarnbcitations{%d}" % nb_citations)
    print(r"\def\scholarhindex{%d}" % h_index)
    print(r"\def\scholaritenindex{%d}" % i10_index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment