rgiot/google_citation_export.py

## google_citation_export.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Python script to extract citations numbers from google citation pages

Extracts information from a Google citation page (number of citations, h-index, i10-index).
The main program outputs LaTeX definitions so that the results can be used automatically
in a LaTeX document (e.g. an academic CV).
"""

# imports
from bs4 import BeautifulSoup
import httplib, sys

# Request headers sent along with the Google Scholar query: a browser-like
# User-Agent (presumably so the request is not rejected as coming from a
# script -- the reason is not stated in this file).
HEADERS = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}


def extract_information_for(user):
    """Fetch the citation figures of one Google Scholar profile.

    Requests ``/citations?user=<user>`` from scholar.google.co.uk, parses
    the answer with BeautifulSoup/html5lib and reads three integers out of
    the first ``<table id="stats">`` element of the page.

    Parameters
    ==========
     - user: str
     Code of the user of google citation

    Returns
    =======
     - (nb_citations, h_index, i10_index): tuple of int

    Raises
    ======
     - SystemExit (status -1) when the server does not answer with HTTP 200;
       the status and reason are written to stderr first.
     - ValueError when the page holds no ``<table id="stats">`` element.
    """
    conn = httplib.HTTPConnection('scholar.google.co.uk')
    try:
        # A GET request carries no body: pass None rather than an empty
        # dict, which is not a valid body object for httplib.
        conn.request("GET", '/citations?user=%s' % user, None, HEADERS)
        resp = conn.getresponse()
        status, reason = resp.status, resp.reason
        # Download the page before the connection is closed.
        html = resp.read() if status == 200 else None
    finally:
        # Release the socket on success, HTTP error and exception alike.
        conn.close()

    if status != 200:
        # Diagnostics go to stderr: stdout is where the main program
        # emits the LaTeX definitions.
        sys.stderr.write('Error: \n%s %s\n' % (status, reason))
        sys.exit(-1)

    # Non-ASCII bytes are dropped; only the numeric figures are read.
    soup = BeautifulSoup(html.decode('ascii', 'ignore'), 'html5lib')

    # Search interesting part: the first table with id "stats".
    stats_table = soup.find('table', {'id': 'stats'})
    if stats_table is None:
        raise ValueError(
            'no <table id="stats"> found in the citation page of user %r'
            % (user,))

    # The three figures sit at the same relative position under three
    # consecutive siblings (presumably the data rows of the table -- the
    # exact markup is not visible from this file), so walk the siblings
    # instead of repeating the whole navigation chain for every value.
    node = stats_table.next_element.next_element.next_sibling
    figures = []
    for _ in range(3):
        figures.append(int(node.next_element.next_sibling.text))
        node = node.next_sibling

    nb_citations, h_index, i10_index = figures
    return nb_citations, h_index, i10_index

if __name__ == '__main__':
    # The Scholar user code is the first command-line argument; without
    # one, fall back to the built-in default profile.
    user = sys.argv[1] if len(sys.argv) >= 2 else 'bwwARrQAAAAJ'

    # Build a latex content
    nb_citations, h_index, i10_index = extract_information_for(user)

    # Raw strings keep the LaTeX backslashes literal without relying on
    # "\d" and "\s" being unrecognised escape sequences. The parenthesised
    # single-argument print behaves the same on Python 2 and Python 3.
    print(r"\def\scholarnbcitations{%d}" % nb_citations)
    print(r"\def\scholarhindex{%d}" % h_index)
    print(r"\def\scholaritenindex{%d}" % i10_index)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""Python script to extract citations numbers from google citation pages

	Extracts information from a Google citation page (number of citations, h-index, i10-index).
	The main program outputs LaTeX definitions so that the results can be used automatically
	in a LaTeX document (e.g. an academic CV).
	"""

	# imports
	from bs4 import BeautifulSoup
	import httplib, sys

	# Request headers sent along with the Google Scholar query: a browser-like
	# User-Agent (presumably so the request is not rejected as coming from a
	# script -- the reason is not stated in this file).
	HEADERS = {
		'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
	}


	def extract_information_for(user):
		"""Fetch the citation figures of one Google Scholar profile.

		Requests ``/citations?user=<user>`` from scholar.google.co.uk, parses
		the answer with BeautifulSoup/html5lib and reads three integers out of
		the first ``<table id="stats">`` element of the page.

		Parameters
		==========
		- user: str
		Code of the user of google citation

		Returns
		=======
		- (nb_citations, h_index, i10_index): tuple of int

		Raises
		======
		- SystemExit (status -1) when the server does not answer with HTTP 200;
		  the status and reason are written to stderr first.
		- ValueError when the page holds no ``<table id="stats">`` element.
		"""
		conn = httplib.HTTPConnection('scholar.google.co.uk')
		try:
			# A GET request carries no body: pass None rather than an empty
			# dict, which is not a valid body object for httplib.
			conn.request("GET", '/citations?user=%s' % user, None, HEADERS)
			resp = conn.getresponse()
			status, reason = resp.status, resp.reason
			# Download the page before the connection is closed.
			html = resp.read() if status == 200 else None
		finally:
			# Release the socket on success, HTTP error and exception alike.
			conn.close()

		if status != 200:
			# Diagnostics go to stderr: stdout is where the main program
			# emits the LaTeX definitions.
			sys.stderr.write('Error: \n%s %s\n' % (status, reason))
			sys.exit(-1)

		# Non-ASCII bytes are dropped; only the numeric figures are read.
		soup = BeautifulSoup(html.decode('ascii', 'ignore'), 'html5lib')

		# Search interesting part: the first table with id "stats".
		stats_table = soup.find('table', {'id': 'stats'})
		if stats_table is None:
			raise ValueError(
				'no <table id="stats"> found in the citation page of user %r'
				% (user,))

		# The three figures sit at the same relative position under three
		# consecutive siblings (presumably the data rows of the table -- the
		# exact markup is not visible from this file), so walk the siblings
		# instead of repeating the whole navigation chain for every value.
		node = stats_table.next_element.next_element.next_sibling
		figures = []
		for _ in range(3):
			figures.append(int(node.next_element.next_sibling.text))
			node = node.next_sibling

		nb_citations, h_index, i10_index = figures
		return nb_citations, h_index, i10_index

	if __name__ == '__main__':
		# The Scholar user code is the first command-line argument; without
		# one, fall back to the built-in default profile.
		user = sys.argv[1] if len(sys.argv) >= 2 else 'bwwARrQAAAAJ'

		# Build a latex content
		nb_citations, h_index, i10_index = extract_information_for(user)

		# Raw strings keep the LaTeX backslashes literal without relying on
		# "\d" and "\s" being unrecognised escape sequences. The parenthesised
		# single-argument print behaves the same on Python 2 and Python 3.
		print(r"\def\scholarnbcitations{%d}" % nb_citations)
		print(r"\def\scholarhindex{%d}" % h_index)
		print(r"\def\scholaritenindex{%d}" % i10_index)