Skip to content

Instantly share code, notes, and snippets.

@jxtx
Created September 17, 2014 15:58
Show Gist options
  • Save jxtx/fd6fed522541e56b3f40 to your computer and use it in GitHub Desktop.
Save jxtx/fd6fed522541e56b3f40 to your computer and use it in GitHub Desktop.
/**
* usage: node scrape_gs.js USERKEY
*
* Report the total citations and h-index of the papers published in or after
* each year found in a Google Scholar profile. The USERKEY is the value of the
* "user" parameter in your Google Scholar citations page URL.
*/
var request = require('request');
var cheerio = require('cheerio');
var _ = require('underscore');
/**
 * Build the Google Scholar citations URL for a profile.
 *
 * @param {string} key - value of the "user" query parameter of the profile.
 * @returns {string} profile URL that asks for a page size of 1000.
 */
function buildProfileUrl(key) {
  // Encode the key so characters such as "&" or "=" cannot alter the query.
  return "http://scholar.google.com/citations?user=" + encodeURIComponent(key) + "&pagesize=1000";
}

// Named cliArgs rather than `arguments`: inside a CommonJS module that name
// already refers to the module wrapper's arguments object, and it cannot be
// declared at all in strict mode.
var cliArgs = process.argv.slice(2);
var userKey = cliArgs[0];
var url = buildProfileUrl(userKey);
/**
 * Compute the h-index of a list of citation counts.
 *
 * @param {number[]} citationsDesc - citation counts sorted in descending order.
 * @returns {number} the largest h such that h papers have at least h
 *   citations each (0 for an empty list).
 */
function hIndex(citationsDesc) {
  var h = 0;
  // The paper at 0-based rank h counts towards the index only if it has more
  // than h citations. Running off the end of the list means every paper
  // counts, so h ends up equal to the list length.
  while (h < citationsDesc.length && citationsDesc[h] > h) {
    h++;
  }
  return h;
}

/**
 * Fetch the profile page and print one line per publication year, newest
 * first: the year, the total citations and the h-index over all papers
 * published in that year or later. Rows whose year cannot be parsed are
 * grouped under year 0.
 */
request(url, function(err, resp, body) {
  if (err) {
    console.error("Request failed: " + err.message);
    process.exitCode = 1;
    return;
  }
  if (resp.statusCode !== 200) {
    console.error("Request failed: HTTP status " + resp.statusCode);
    process.exitCode = 1;
    return;
  }

  var $ = cheerio.load(body);
  // .get() turns the cheerio collection returned by .map() into a plain array.
  var values = $(".gsc_a_tr").map(function() {
    var row = $(this);
    // Missing or non-numeric cells parse to NaN and fall back to 0.
    var citations = parseInt(row.find("a.gsc_a_ac").text(), 10) || 0;
    var year = parseInt(row.find("span.gsc_a_h").text(), 10) || 0;
    return { year: year, citations: citations };
  }).get();

  console.log("Year TotalCitations h-index");
  var byYear = _.groupBy(values, "year");
  // groupBy keys are strings; compare them numerically, newest year first.
  var yearsDesc = _.keys(byYear).sort(function(a, b) { return b - a; });
  var cumCitations = [];
  _.each(yearsDesc, function(year) {
    // Accumulate this year's papers on top of all later years' papers.
    cumCitations = cumCitations.concat(_.pluck(byYear[year], "citations"));
    cumCitations.sort(function(a, b) { return b - a; });
    var totalCitations = _.reduce(cumCitations, function(a, b) { return a + b; }, 0);
    console.log(year, totalCitations, hIndex(cumCitations));
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment