Skip to content

Instantly share code, notes, and snippets.

@Tessmore
Last active March 21, 2023 11:57
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tessmore/11099509 to your computer and use it in GitHub Desktop.
Save Tessmore/11099509 to your computer and use it in GitHub Desktop.
Multi-import of Google Scholar BibTeX references
<?php
/*
For an answer to:
http://stackoverflow.com/questions/8217769/is-there-a-way-to-download-bibtex-from-google-scholar-using-php
This is rate-limited / prohibited by Google
*/
/**
 * Fetch a URL with cURL and hand back the response.
 *
 * @param string $url Address to request.
 * @return string|bool Whatever curl_exec() yields; with
 *                     CURLOPT_RETURNTRANSFER enabled that is the response
 *                     body as a string, or false when the transfer fails.
 */
function get_page($url) {
    $handle = curl_init();

    // Same three options as before, applied in the same order.
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_CUSTOMREQUEST  => 'GET',
        CURLOPT_RETURNTRANSFER => true,
    ));

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Helper: parse the HTML found at a location and wrap it for XPath queries.
 *
 * @param string $query_url Location handed to DOMDocument::loadHTMLFile().
 * @return DOMXPath XPath evaluator bound to the loaded document.
 */
function get_xpath($query_url) {
    $document = new DOMDocument();

    // The @ operator mutes the diagnostics the loader would otherwise emit.
    @$document->loadHTMLFile($query_url);

    // "User behavior": pause for one second after every page load.
    sleep(1);

    $xpath = new DOMXPath($document);
    return $xpath;
}
/**
 * Load a Google Scholar page and collect the onclick attributes of the
 * reference links that need to be clicked.
 *
 * @param string $query_url Page to load.
 * @return DOMNodeList|false Result of DOMXPath::query(): the onclick
 *                           attribute nodes of anchors with class "gs_nph"
 *                           found below the body element.
 */
function get_reference_nodes($query_url) {
    return get_xpath($query_url)->query('//body//a[@class="gs_nph"]/@onclick');
}
/**
 * Pull the reference ID out of an onclick handler string.
 *
 * The handler is expected to contain a call shaped like
 * gs_ocit(event,'ID','NUMBER'); the first quoted argument is the ID.
 *
 * @param string $href Text of the onclick attribute.
 * @return string|null The captured ID (may be an empty string, since the
 *                     pattern allows zero characters), or null when the
 *                     text does not contain such a call.
 */
function get_reference_id($href) {
    $pattern  = "/return gs_ocit\(event,'([a-zA-Z0-9_\-]*)','[0-9]*'\)/";
    $captures = array();

    if (preg_match($pattern, $href, $captures) === 1) {
        // Group 1 always participates in a successful match.
        return $captures[1];
    }

    return null;
}
/**
 * Convert the onclick attribute nodes of a result page into reference IDs.
 *
 * Input  : list of attribute nodes (anything iterable whose items expose
 *          a ->value string), e.g. the result of get_reference_nodes().
 * Output : list of reference IDs.
 *
 * Nodes for which get_reference_id() finds no ID (null) or only an empty
 * ID are skipped, so the returned list contains usable IDs only. A
 * non-iterable input (for instance the false that DOMXPath::query() can
 * return) produces an empty list instead of a foreach warning.
 *
 * @param DOMNodeList|array|false $nodes Attribute nodes to inspect.
 * @return array List of non-empty reference ID strings.
 */
function extract_reference_ids($nodes) {
    $ids = array();

    if (!is_array($nodes) && !($nodes instanceof Traversable)) {
        return $ids;
    }

    foreach ($nodes as $node) {
        $id = get_reference_id($node->value);

        // A null/empty ID would only generate a useless follow-up request.
        if ($id === null || $id === '') {
            continue;
        }

        $ids[] = $id;
    }

    return $ids;
}
/**
 * Look up the BibTeX import link for a reference, based on its cite page.
 *
 * @param string $ref_id Reference ID as extracted from a result page.
 * @return DOMNodeList|false Result of DOMXPath::query(): href attribute
 *                           nodes of the matching "gs_citi" anchors on the
 *                           cite page; the caller uses the first one as the
 *                           BibTeX import link.
 */
function get_bibtex_id($ref_id) {
    $cite_url = "http://scholar.google.nl/scholar?q=info:{$ref_id}:scholar.google.com/&output=cite";

    // The [1] predicate targets the first link (the BibTeX import one).
    return get_xpath($cite_url)->query('//body//a[@class="gs_citi"][1]/@href');
}
/**
 * Download the BibTeX entry for every reference ID.
 *
 * For each ID the cite page is queried through get_bibtex_id() and the
 * first href it yields is fetched with get_page(). IDs whose cite page
 * yields no link (empty or failed XPath result) are skipped instead of
 * dereferencing a missing node and downloading the bare host page.
 *
 * @param array $ref_ids List of reference IDs.
 * @return array One get_page() result per ID that had a link: the fetched
 *               text, or false where the download itself failed.
 */
function get_bibtex_imports($ref_ids) {
    $results = array();

    foreach ($ref_ids as $id) {
        $hrefs = get_bibtex_id($id);

        // No usable link on the cite page: nothing to download for this ID.
        if ($hrefs === false || $hrefs->length === 0) {
            continue;
        }

        // BibTex entry is the first link on the page
        $href = $hrefs->item(0)->value;

        // Relative hrefs are resolved against the Scholar host; an href that
        // is already an absolute http(s) URL is used unchanged.
        if (preg_match('#^https?://#i', $href) === 1) {
            $link = $href;
        } else {
            $link = 'http://scholar.google.nl' . $href;
        }

        // Get the bibtex entry
        $results[] = get_page($link);
    }

    return $results;
}
// Initial page request
$query = "Virtualization"; // example query

// URL-encode the search terms so queries containing spaces, '&', '#', etc.
// still produce a valid request URL (a plain word is left unchanged).
$query_url = "http://scholar.google.nl/scholar?q=" . urlencode($query);

// Get the bibtex entries:
//   1. onclick attribute nodes of the reference links on the result page
//   2. reference IDs extracted from those attributes
//   3. one downloaded BibTeX import per reference ID
$nodes   = get_reference_nodes($query_url);
$ref_ids = extract_reference_ids($nodes);
$bib_ids = get_bibtex_imports($ref_ids);

// List of all bibtex imports, could export them to a file or whatever.
var_dump($bib_ids);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment