Skip to content

Instantly share code, notes, and snippets.

@fbennett
Created May 16, 2011 23:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fbennett/975594 to your computer and use it in GitHub Desktop.
Save fbennett/975594 to your computer and use it in GitHub Desktop.
Google Scholar with related item links
{
"translatorID":"57a00950-f0d1-4b41-b6ba-44ff0fc30289",
"label":"Google Scholar",
"creator":"Simon Kornblith, Frank Bennett",
"target":"http://scholar\\.google\\.(?:com|com?\\.[a-z]{2}|[a-z]{2}|co\\.[a-z]{2})/scholar(?:_case)*",
"minVersion":"1.0.0b3.r1",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2010-11-18 06:10:00"
}
/*
* Test pages
*
* Searches of Google Scholar with the following terms should yield a folder
* icon that works. Check that unlinked ([CITATION]) items that provide
* no BibTeX data (there is currently one under "Marbury v. Madison",
* and "clifford" seems to be a good source of garbage) are
* dropped from the listings:
*
* marbury v madison
* kelo
* smith
* view of the cathedral
* clifford
*
* "How cited" pages should NOT yield a page or folder icon. The
* URLs to these currently look like this:
*
* http://scholar.google.co.jp/scholar_case?about=1101424605047973909&q=kelo&hl=en&as_sdt=2002
*
* Case pages should present a document icon that works:
*
* http://scholar.google.co.jp/scholar_case?case=18273389148555376997&hl=en&as_sdt=2002&kqfp=13204897074208725174&kql=186&kqpfp=16170611681001262513#kq
*/
/*global Zotero: true */
/*global XPathResult: true */
/*
* ####################
* ### Item Factory ###
* ####################
*/
/*
 * Holds the raw strings scraped for one entry and the fields that the
 * parsing methods on the prototype derive from them.
 *
 * citeletString   - text of the citation line for the entry
 * attachmentLinks - array of URLs to attach as snapshots
 * titleString     - title text for the entry (may be omitted)
 * bibtexLink      - URL of the BibTeX export for the entry (may be omitted)
 */
var ItemFactory = function (citeletString, attachmentLinks, titleString, bibtexLink) {
    // Single-valued fields. saveItemCommonVars() copies every truthy one
    // onto the item being saved, so "false" means "nothing to save".
    this.v = {
        title: titleString,
        number: false,
        court: false,
        extra: false,
        // undefined (rather than false) marks the date as not yet parsed;
        // getDate() relies on the distinction.
        date: undefined,
        jurisdiction: false,
        docketNumber: false
    };
    // Multi-valued fields: one {volume, reporter, pages} object per cite.
    this.vv = { volRepPag: [] };
    // Portable array of snapshot URLs.
    this.attachmentLinks = attachmentLinks;
    // Working strings.
    this.citelet = citeletString;
    this.bibtexLink = bibtexLink;
    this.bibtexData = undefined;
    this.trailingInfo = false;
    // Plain arrays of strings, filled in by getDate() and getVolRepPag().
    this.hyphenSplit = false;
    this.commaSplit = false;
};
/*
 * If the citelet contains no whitespace-hyphen-whitespace separator,
 * turn the first ", Word:" (capital letter, lowercase letters, colon)
 * into " - Word:" so that the later hyphen split has something to
 * split on.
 */
ItemFactory.prototype.repairCitelet = function () {
    var hasSeparator = this.citelet.search(/\s+-\s+/) !== -1;
    if (hasSeparator) {
        return;
    }
    this.citelet = this.citelet.replace(/,\s+([A-Z][a-z]+:)/, " - $1");
};
/*
 * Re-case titles that contain a run of four or more capital letters:
 * lowercase the whole title, hand it to Zotero.Utilities.capitalizeTitle,
 * then lowercase the first stand-alone "V" in the result.
 * Titles without such a run are left untouched.
 */
ItemFactory.prototype.repairTitle = function () {
    var recased;
    if (this.v.title.search(/(?:[^a-z]|^)[A-Z]{4,}(?:[^a-z]|$)/) === -1) {
        return;
    }
    recased = Zotero.Utilities.capitalizeTitle(this.v.title.toLowerCase());
    this.v.title = recased.replace(/([^0-9a-z])V([^0-9a-z])/, "$1v$2");
};
/*
 * True when the citelet yields a date or, failing that, when
 * hasInitials() reports true.
 */
ItemFactory.prototype.hasUsefulData = function () {
    // getDate() goes first: it performs the hyphen split that
    // hasInitials() reads.
    return !!(this.getDate() || this.hasInitials());
};
/*
 * True when the first hyphen-separated segment contains a capital
 * letter immediately followed by a space. Returns false when the
 * citelet has not been split yet (hyphenSplit is still false) or the
 * split is empty.
 */
ItemFactory.prototype.hasInitials = function () {
    var segments = this.hyphenSplit;
    if (!segments.length) {
        return false;
    }
    return segments[0].search(/[A-Z] /) !== -1;
};
/*
 * True once getVolRepPag() has recorded at least one
 * volume/reporter/pages entry.
 */
ItemFactory.prototype.hasReporter = function () {
    return this.vv.volRepPag.length > 0;
};
/*
 * Citelet parsing, step (1).
 *
 * On first use, splits the citelet on whitespace-hyphen-whitespace into
 * this.hyphenSplit and remembers the last segment (as a one-element
 * array) in this.trailingInfo.
 *
 * Then, once only, scans the segments from last to first for one that
 * ends in four digits. Those digits become this.v.date, the segment is
 * cut back to the text in front of them, and every later segment is
 * discarded. If no segment qualifies, this.v.date is set to false so
 * the scan is not repeated.
 *
 * Returns the four-digit string, or false.
 */
ItemFactory.prototype.getDate = function () {
    var i, m, yearAtEnd;
    if (!this.hyphenSplit) {
        this.hyphenSplit = this.citelet.split(/\s+-\s+/);
        this.trailingInfo = this.hyphenSplit.slice(-1);
    }
    // v.date is undefined until the first scan and false after a failed
    // one, so "falsy but not false" means "not scanned yet".
    if (!this.v.date && this.v.date !== false) {
        this.v.date = false;
        // The earlier pattern, /(?:(.*)\s+)*([0-9]{4})$/, nested one
        // quantifier inside another. On a segment with many spaces that
        // does NOT end in four digits, the engine tried every way of
        // cutting the text at its spaces before giving up, which is
        // exponential in the number of spaces. This pattern has no
        // nested repetition. The optional prefix may only begin at the
        // start of the segment or right after a line break, which is
        // where the last repetition of the old group began, so the
        // captured text is unchanged.
        yearAtEnd = /(?:(?:^|[\n\r\u2028\u2029]\s*)(.*)\s+)?([0-9]{4})$/;
        for (i = this.hyphenSplit.length - 1; i > -1; i += -1) {
            m = this.hyphenSplit[i].match(yearAtEnd);
            if (m) {
                this.v.date = m[2];
                // m[1] is undefined when the digits are not preceded
                // by whitespace; the segment is then emptied.
                this.hyphenSplit[i] = m[1] || "";
                this.hyphenSplit = this.hyphenSplit.slice(0, i + 1);
                break;
            }
        }
    }
    return this.v.date;
};
/*
 * Citelet parsing, step (2).
 *
 * Removes the last remaining hyphen segment from this.hyphenSplit and
 * reads the court name from it: a trailing comma is dropped, a trailing
 * ellipsis character is replaced by the word "Court", and underscores
 * become spaces. A leading "Word:" prefix, when present, is stored in
 * this.v.extra as "{:jurisdiction: Word}".
 *
 * Expects getDate() to have run first so that this.hyphenSplit is an
 * array. Returns this.v.court.
 */
ItemFactory.prototype.getCourt = function () {
    var s, m;
    s = this.hyphenSplit.pop().replace(/,\s*$/, "").replace(/\u2026\s*$/, "Court");
    m = s.match(/(?:([a-zA-Z]+):\s*)*(.*)/);
    if (m) {
        // A global regex is required here: the three-argument form
        // replace("_", " ", "g") is a non-standard extension, and
        // standard engines ignore the flags argument and replace only
        // the first underscore.
        this.v.court = m[2].replace(/_/g, " ");
        if (m[1]) {
            this.v.extra = "{:jurisdiction: " + m[1] + "}";
        }
    }
    return this.v.court;
};
/*
 * Citelet parsing, step (3).
 *
 * Splits the last remaining hyphen segment on commas into
 * this.commaSplit, then works through the pieces from the end. Each
 * piece shaped "NUMBER text text" is removed from this.commaSplit and
 * read as volume / reporter / pages; the first piece that does not fit
 * that shape stops the walk.
 *
 * A piece whose pages do not end in a digit is discarded, unless it is
 * the very first piece and nothing has been recorded yet. Everything
 * else is appended to this.vv.volRepPag.
 */
ItemFactory.prototype.getVolRepPag = function () {
    var segments, pieces, recorded, last, hit, cite;
    segments = this.hyphenSplit;
    if (!segments.length) {
        return;
    }
    pieces = segments[segments.length - 1].split(/\s*,\s+/);
    this.commaSplit = pieces;
    recorded = false;
    while (pieces.length) {
        last = pieces.length - 1;
        hit = pieces[last].match(/^([0-9]+)\s+(.*)\s+(.*)/);
        if (!hit) {
            break;
        }
        pieces.pop();
        cite = {
            volume: hit[1],
            reporter: hit[2],
            pages: hit[3].replace(/\s*$/, "")
        };
        if (cite.pages.search(/[0-9]$/) !== -1 || (last === 0 && !recorded)) {
            recorded = true;
            this.vv.volRepPag.push(cite);
        }
    }
};
/*
 * Citelet parsing, step (4) [optional].
 *
 * Rebuilds the title from whatever pieces getVolRepPag() left in
 * this.commaSplit. Does nothing if the comma split was never made.
 */
ItemFactory.prototype.getTitle = function () {
    var pieces = this.commaSplit;
    if (!pieces) {
        return;
    }
    this.v.title = pieces.join(", ");
};
/*
 * Looks up the docket number in a case document and stores it in
 * this.v.docketNumber.
 *
 * doc - the case document, if the caller already has it. When omitted,
 *       the document behind the first attachment link is fetched with
 *       Zotero.Utilities.retrieveDocument.
 *
 * The number is taken from the <center> element that follows the
 * <center> holding the h3 with id "gsl_case_name"; a leading "No" marker
 * and a trailing full stop are stripped. If there is no link to fetch,
 * the fetch yields nothing, or the element is absent, the field is left
 * as it was.
 */
ItemFactory.prototype.getDocketNumber = function (doc) {
    var nsResolver, docNumFrag;
    if (!doc) {
        // Entries without a link have nothing to fetch.
        if (!this.attachmentLinks || !this.attachmentLinks.length) {
            return;
        }
        doc = Zotero.Utilities.retrieveDocument(this.attachmentLinks[0]);
    }
    // Check before touching the document: the resolver used to be built
    // ahead of this test, so a failed fetch raised a TypeError.
    if (!doc) {
        return;
    }
    nsResolver = doc.createNSResolver(doc.documentElement);
    docNumFrag = doc.evaluate('//center[preceding-sibling::center//h3[@id="gsl_case_name"]]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
    if (docNumFrag) {
        // NOTE(review): the "." after [Oo] is unescaped, so it matches any
        // character, not just a full stop -- confirm whether "\." was meant.
        this.v.docketNumber = docNumFrag.textContent.replace(/^\s*[Nn][Oo](?:.|\s+)\s*/, "").replace(/\.\s*$/, "");
    }
};
/*
 * Builds one HTML snapshot attachment descriptor per attachment link.
 *
 * doctype - word inserted into the attachment title, as in
 *           "Google Scholar <doctype> snapshot"
 *
 * Returns a new array of {title, type, url} objects.
 */
ItemFactory.prototype.getAttachments = function (doctype) {
    var label = "Google Scholar " + doctype + " snapshot";
    return this.attachmentLinks.map(function (link) {
        return {title: label, type: "text/html", url: link};
    });
};
/*
 * Replaces the attachments of the item currently being built with the
 * snapshots produced by getAttachments(doctype).
 */
ItemFactory.prototype.pushAttachments = function (doctype) {
    var snapshots = this.getAttachments(doctype);
    this.item.attachments = snapshots;
};
/*
 * Returns the BibTeX text for this entry, fetching it from
 * this.bibtexLink with Zotero.Utilities.retrieveSource when needed.
 *
 * A record whose title field is empty ("title={{}}") is treated as
 * having no data: false is cached and returned. A non-empty result or a
 * cached false is returned without fetching again.
 */
ItemFactory.prototype.getBibtexData = function () {
    var cached, fetched;
    cached = this.bibtexData;
    if (cached || cached === false) {
        return cached;
    }
    fetched = Zotero.Utilities.retrieveSource(this.bibtexLink);
    this.bibtexData = fetched.match(/title\=\{\{\}\}/) ? false : fetched;
    return this.bibtexData;
};
/*
 * Saves the parsed data as one or more "case" items. Does nothing when
 * there is no title.
 *
 * Without reporter citations a single item is saved, carrying the
 * common fields and the snapshot attachments.
 *
 * With reporter citations, one item is built per citation. Each gets
 * that citation's volume/reporter/pages plus the common fields and a
 * placeholder itemID ("1", "2", ...); only the last one carries the
 * attachments. Every item then lists the placeholder IDs of all the
 * others in seeAlso before it is completed.
 */
ItemFactory.prototype.saveItem = function () {
    var self, cites, siblings;
    if (!this.v.title) {
        return;
    }
    this.repairTitle();
    cites = this.vv.volRepPag;
    if (!cites.length) {
        this.item = new Zotero.Item("case");
        this.saveItemCommonVars();
        this.pushAttachments("judgement");
        this.item.complete();
        return;
    }
    self = this;
    siblings = [];
    cites.forEach(function (cite, position) {
        var field;
        self.item = new Zotero.Item("case");
        for (field in cite) {
            if (cite[field]) {
                self.item[field] = cite[field];
            }
        }
        self.saveItemCommonVars();
        if (position === cites.length - 1) {
            self.pushAttachments("judgement");
        }
        self.item.itemID = "" + (position + 1);
        siblings.push(self.item);
    });
    // All items must exist before any is completed, so that each can
    // name every other one in seeAlso.
    siblings.forEach(function (entry, position) {
        siblings.forEach(function (other, otherPosition) {
            if (otherPosition !== position) {
                entry.seeAlso.push(other.itemID);
            }
        });
        entry.complete();
    });
};
/*
 * Copies every truthy single-valued field from this.v onto the item
 * currently being built; false, undefined and empty values are skipped.
 */
ItemFactory.prototype.saveItemCommonVars = function () {
    var fields = this.v;
    var target = this.item;
    Object.keys(fields).forEach(function (name) {
        var value = fields[name];
        if (value) {
            target[name] = value;
        }
    });
};
/*
* #########################
* ### Scraper Functions ###
* #########################
*/
/*
 * Scrapes a search-results page.
 *
 * Reads the title, citelet and BibTeX link of every result, drops the
 * results whose citelet holds no useful data, and offers the rest to
 * the user through Zotero.selectItems. Each selected result is then
 * saved in one of three ways:
 *   - from the citelet alone, when it contains a reporter citation;
 *   - through the BibTeX import translator, when the BibTeX record has
 *     a title;
 *   - otherwise from the citelet, after a docket-number lookup in the
 *     target document.
 *
 * doc - the results document
 *
 * Returns false if the user selected nothing, true otherwise.
 */
function scrapeListing (doc) {
    var factory, i, titleFrag, citeletFrag, bibtexFrag, attachmentFrag;
    var titleString, attachmentLinks, items, res;
    var nsResolver = doc.createNSResolver(doc.documentElement);
    // XML fragment lists; the three are walked in step, one entry per result.
    var titleFrags = doc.evaluate('//div[@class="gs_r"]//h3', doc, nsResolver, XPathResult.ANY_TYPE, null);
    var citeletFrags = doc.evaluate('//span[@class="gs_a"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
    var bibtexFrags = doc.evaluate('//a[contains(@href, "scholar.bib")]',
        doc, nsResolver, XPathResult.ANY_TYPE, null);
    var labels = [];
    var factories = [];
    // Runs BibTeX text through the BibTeX import translator (the only
    // supplementary translator used). The attachments are bound per
    // call: the handler used to redeclare "attachments" locally, which
    // shadowed the caller's value and saved every item with undefined.
    var importBibtex = function (bibtex, attachments) {
        var translator = Zotero.loadTranslator("import");
        translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
        translator.setHandler("itemDone", function (obj, item) {
            item.attachments = attachments;
            item.complete();
        });
        translator.setString(bibtex);
        translator.translate();
    };
    while (true) {
        titleFrag = titleFrags.iterateNext();
        if (!titleFrag) {
            break;
        }
        citeletFrag = citeletFrags.iterateNext();
        bibtexFrag = bibtexFrags.iterateNext();
        if (!citeletFrag || !bibtexFrag) {
            // The lists are only meaningful while they stay in step.
            Zotero.debug("Google Scholar: citelet or BibTeX link missing, stopping at result " + factories.length);
            break;
        }
        // initialize argument values
        titleString = titleFrag.textContent;
        attachmentFrag = doc.evaluate('.//a',
            titleFrag, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
        attachmentLinks = attachmentFrag ? [attachmentFrag.href] : [];
        // Instantiate item factory with available data
        factory = new ItemFactory(citeletFrag.textContent, attachmentLinks, titleString, bibtexFrag.href);
        if (!factory.hasUsefulData()) {
            continue;
        }
        // (Feed the array used in the selection list)
        if (factory.hyphenSplit.length) {
            labels.push(titleString + " (" + factory.trailingInfo + ")");
        } else {
            labels.push(titleString);
        }
        factories.push(factory);
    }
    items = Zotero.selectItems(labels);
    if (!items) {
        return false;
    }
    // The keys of the selection are positions in the labels array, which
    // runs parallel to factories.
    for (i in items) {
        factory = factories[i];
        if (!factory) {
            continue;
        }
        factory.getCourt();
        factory.getVolRepPag();
        if (factory.hasReporter()) {
            // If we win here, we get by without fetching the BibTeX object at all.
            factory.saveItem();
            continue;
        }
        res = factory.getBibtexData();
        if (res) {
            // Has BibTeX data with title, pass it through to the BibTeX translator
            importBibtex(res, factory.getAttachments("page"));
        } else {
            // If BibTeX is empty, this is some kind of case, if anything.
            // Metadata from the citelet, supplemented by the target
            // document for the docket number, if possible.
            factory.getDocketNumber();
            factory.saveItem();
        }
    }
    return true;
}
/*
 * Scrapes a single case page.
 *
 * The citelet is the text of the element with id="gsl_reference" and
 * looks kind of like this:
 *
 *   Powell v. McCormack, 395 US 486 - Supreme Court 1969
 *
 * It is parsed for date, court, reporter citations and title; when no
 * reporter citation is found, the docket number is looked up in the
 * page itself. Nothing is saved if the reference element is absent.
 *
 * doc - the case document
 * url - its URL, used as the snapshot attachment link
 */
var scrapeCase = function (doc, url) {
    var nsResolver = doc.createNSResolver(doc.documentElement);
    var refFrag = doc.evaluate('//div[@id="gsl_reference"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
    if (!refFrag) {
        return;
    }
    // The factory creates the items it saves; the stray Zotero.Item that
    // used to be constructed here was never filled in or completed.
    var factory = new ItemFactory(refFrag.textContent, [url]);
    factory.repairCitelet();
    factory.getDate();
    factory.getCourt();
    factory.getVolRepPag();
    if (!factory.hasReporter()) {
        // Look for docket number in the current document
        factory.getDocketNumber(doc);
    }
    factory.getTitle();
    factory.saveItem();
};
/*
* #####################
* ### API Functions ###
* #####################
*/
/*
 * Decides which icon, if any, the page gets.
 *
 * Returns "multiple" for any URL without "scholar_case", false for a
 * scholar_case URL carrying an "about=" parameter (the "How cited"
 * pages), and "case" for every other scholar_case URL.
 */
function detectWeb (doc, url) {
    var isCasePage = url.search(/scholar_case/) !== -1;
    if (!isCasePage) {
        return "multiple";
    }
    var isHowCited = url.search(/about=/) !== -1;
    return isHowCited ? false : "case";
}
/*
 * Translator entry point: invokes the case or the listing scraper, as
 * appropriate.
 *
 * A listing page without BibTeX links is not scraped directly. Its URL
 * is rewritten into a "save preferences" request (English interface,
 * BibTeX links on), the "scisig" token is read from the page that
 * request returns, and the page fetched with that token appended is
 * scraped instead.
 *
 * doc - the current document
 * url - its URL
 *
 * Throws an Error when the preferences page cannot be fetched or holds
 * no scisig token, since a listing without BibTeX links cannot be
 * scraped.
 */
function doWeb(doc, url) {
    var haveBibTexLinks, nsResolver, scisigDoc, scisig;
    nsResolver = doc.createNSResolver(doc.documentElement);
    if (url.match(/scholar_case/)) {
        scrapeCase(doc, url);
        return;
    }
    haveBibTexLinks = doc.evaluate('//a[contains(@href, "scholar.bib")]',
        doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
    if (!haveBibTexLinks) {
        // Drop any interface-language parameter, then turn the search
        // URL into a preferences request that keeps the query.
        url = url.replace(/hl\=[^&]*&?/, "");
        url = url.replace("scholar?", "scholar_setprefs?hl=en&scis=yes&scisf=4&submit=Save+Preferences&");
        scisigDoc = Zotero.Utilities.retrieveDocument(url);
        // Query the fetched page with its own resolver, and check every
        // step: the token used to be dereferenced without a test.
        scisig = scisigDoc && scisigDoc.evaluate('//input[@name="scisig"]',
            scisigDoc, scisigDoc.createNSResolver(scisigDoc.documentElement),
            XPathResult.ANY_TYPE, null).iterateNext();
        if (!scisig) {
            throw new Error("Google Scholar: no scisig token found, cannot switch on BibTeX links");
        }
        url = url + "&scisig=" + encodeURIComponent(scisig.value);
        doc = Zotero.Utilities.retrieveDocument(url);
        Zotero.debug(url);
    }
    scrapeListing(doc);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment