Last active
March 21, 2023 11:57
-
-
Save Tessmore/11099509 to your computer and use it in GitHub Desktop.
Bulk import of Google Scholar BibTeX references
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/*
Written as an answer to:
http://stackoverflow.com/questions/8217769/is-there-a-way-to-download-bibtex-from-google-scholar-using-php
Note: Google rate-limits and prohibits automated access to Scholar, so this script may be blocked.
*/
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Redirects are followed (up to a fixed limit) and connect/total timeouts are
 * set, so a redirected request yields the final page and a stalled server
 * cannot hang the script indefinitely.
 *
 * @param string $url Absolute URL to request with GET.
 * @return string|false The response body, or false when the cURL handle could
 *                      not be created or the transfer failed.
 */
function get_page($url) {
    $ch = curl_init();
    if ($ch === false) {
        // No handle to configure; report failure the same way curl_exec() does.
        return false;
    }

    curl_setopt_array($ch, array(
        CURLOPT_URL            => $url,
        // Plain GET; a custom request string is not needed for the default method.
        CURLOPT_HTTPGET        => true,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ));

    $response = curl_exec($ch);
    curl_close($ch);

    return $response;
}
/**
 * Helper: load the HTML document at a URL and return an XPath evaluator for it.
 *
 * Warnings raised while loading are suppressed and the load result is not
 * checked, so an evaluator is returned even when loading fails.
 *
 * @param string $query_url Location passed to DOMDocument::loadHTMLFile().
 * @return DOMXpath Evaluator bound to the loaded document.
 */
function get_xpath($query_url) {
    $document = new DOMDocument();

    @$document->loadHTMLFile($query_url);

    $evaluator = new DOMXpath($document);

    // "User behavior": pause one second after every page load.
    sleep(1);

    return $evaluator;
}
/**
 * Load a Google Scholar page and select the onclick attributes of its links
 * with class "gs_nph". The reference IDs are later extracted from these
 * handler strings.
 *
 * @param string $query_url URL of the Google Scholar page to load.
 * @return DOMNodeList|false The result of DOMXPath::query() for the expression.
 */
function get_reference_nodes($query_url) {
    $expression = '//body//a[@class="gs_nph"]/@onclick';

    return get_xpath($query_url)->query($expression);
}
/**
 * Extract the reference ID from a gs_ocit(...) onclick handler string.
 *
 * @param string $href Handler text, e.g. "return gs_ocit(event,'AbC-1','0')".
 * @return string|null The quoted argument that follows `event` (may be an
 *                     empty string), or null when the text does not match.
 */
function get_reference_id($href) {
    $pattern = "/return gs_ocit\(event,'([a-zA-Z0-9_\-]*)','[0-9]*'\)/";

    if (preg_match($pattern, $href, $captures) === 1) {
        return $captures[1];
    }

    return null;
}
/**
 * Convert onclick attribute nodes into a list of reference IDs.
 *
 * Nodes whose handler text does not yield a non-empty ID are skipped, so the
 * returned list contains only usable IDs.
 *
 * @param DOMNodeList|array $nodes Attribute nodes exposing the handler text
 *                                 through their `value` property.
 * @return array List of reference ID strings, in document order.
 */
function extract_reference_ids($nodes) {
    $ids = array();

    foreach ($nodes as $node) {
        $id = get_reference_id($node->value);

        // get_reference_id() returns null when the handler does not match;
        // an empty capture is equally useless as a reference ID.
        if ($id !== null && $id !== '') {
            $ids[] = $id;
        }
    }

    return $ids;
}
/**
 * Load the "cite" page of a reference and select the href attributes of its
 * links with class "gs_citi".
 *
 * The [1] predicate is evaluated per parent element; callers take the first
 * node of the result as the BibTeX import link.
 *
 * @param string $ref_id Reference ID taken from the initial cite link.
 * @return DOMNodeList|false The result of DOMXPath::query() for the expression.
 */
function get_bibtex_id($ref_id) {
    $cite_url = 'http://scholar.google.nl/scholar?q=info:' . $ref_id . ':scholar.google.com/&output=cite';
    $expression = '//body//a[@class="gs_citi"][1]/@href';

    return get_xpath($cite_url)->query($expression);
}
/**
 * Download the BibTeX entry for each reference ID.
 *
 * References are skipped when the ID is null or empty, or when the cite page
 * yields no import link, instead of dereferencing a missing node.
 *
 * @param array $ref_ids Reference IDs as produced by extract_reference_ids().
 * @return array One get_page() result per reference that had an import link
 *               (a string, or false when that download failed).
 */
function get_bibtex_imports($ref_ids) {
    $base_url = 'http://scholar.google.nl';
    $results = array();

    foreach ($ref_ids as $id) {
        if ($id === null || $id === '') {
            continue;
        }

        $nodes = get_bibtex_id($id);

        // The BibTeX entry is the first link on the cite page; the query can
        // return false or an empty list when the page holds no such link.
        $first = ($nodes instanceof DOMNodeList) ? $nodes->item(0) : null;
        if ($first === null) {
            continue;
        }

        // Only prefix the host when the href is not already an absolute URL.
        $href = $first->value;
        $link = preg_match('#^https?://#i', $href) ? $href : $base_url . $href;

        // Get the BibTeX entry.
        $results[] = get_page($link);
    }

    return $results;
}
// Initial page request
$query = "Virtualization"; // example query

// URL-encode the term so spaces and reserved characters (&, #, +) do not
// break the query string.
$query_url = 'http://scholar.google.nl/scholar?q=' . urlencode($query);

// Get the BibTeX entries: result page -> reference IDs -> downloaded entries.
$nodes = get_reference_nodes($query_url);
$ref_ids = extract_reference_ids($nodes);
$bib_ids = get_bibtex_imports($ref_ids);

// List of all BibTeX imports; could be exported to a file instead.
var_dump($bib_ids);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment