Skip to content

Instantly share code, notes, and snippets.

@WilliamMayor
Created July 22, 2011 13:47
Show Gist options
  • Save WilliamMayor/1099485 to your computer and use it in GitHub Desktop.
Save WilliamMayor/1099485 to your computer and use it in GitHub Desktop.
CiteSeeing -> Zotero translators
{
"translatorID":"17AF2C40-AAA5-4837-AB16-FDC54298AB42",
"translatorType":1,
"label":"CiteLine",
"creator":"William Mayor",
"target":"citeline",
"minVersion":"1.0.0b3.r1",
"maxVersion":"",
"priority":100,
"browserSupport":"gcs",
"configOptions":{"dataMode":"block"},
"inRepository":false,
"lastUpdated":"2011-07-21 19:58:00"
}
/**
 * Decides whether the data being imported looks like a CiteLine file.
 *
 * A file is accepted when its first five characters spell "title" in any
 * letter case.
 *
 * @returns {Boolean} true if the input starts with "title"; false otherwise,
 *     including when nothing could be read
 */
function detectImport() {
	Zotero.debug("detectImport");
	var marker = "title";
	var head = Zotero.read(marker.length);
	// getNextPair in this translator treats a falsy read as end of input, so the
	// result is not guaranteed to be a string; reject instead of throwing.
	if (typeof head != "string") {
		return false;
	}
	return head.toLowerCase() == marker;
}
/**
 * Builds one Zotero item from the key/value pairs of a CiteLine file and
 * completes it.
 *
 * The pairs come from getDetails(); the item is created by makeItem() from the
 * "type" entry. Simple fields are copied through a CiteLine-key -> Zotero-field
 * table; date, creators and URLs are handled separately below.
 */
function doImport() {
	Zotero.debug("doImport");
	var details = getDetails();
	var item = makeItem(details['type']);

	// Adds every name in `names` (one string, or an array when the key was
	// repeated in the file) to the item as a creator with the given role.
	function addCreators(names, role) {
		if (!names) return;
		if (names.constructor != Array) {
			names = [names];
		}
		for (var n = 0; n < names.length; n++) {
			item.creators.push(Zotero.Utilities.cleanAuthor(names[n], role));
		}
	}

	// A CiteLine key maps to one Zotero field, or to several when the same
	// value is copied into each of them.
	var citeseeingToZotero = {"title":"title", "doi":"doi", "edition":"edition", "number":"issue", "pages":"pages", "report type":"reportType", "volume":"volume", "publisher": "publisher", "publication": ["place", "bookTitle", "publicationTitle"], "series":"series", "institution":"institution", "school":"university"};
	for (var where in citeseeingToZotero) {
		var value = details[where];
		// Keys missing from the file must not leave "undefined" in the item.
		if (!value) continue;
		var fields = citeseeingToZotero[where];
		if (fields.constructor != Array) {
			fields = [fields];
		}
		for (var f = 0; f < fields.length; f++) {
			item[fields[f]] = value;
		}
	}

	// The date is "year-month", or whichever of the two parts is present.
	var year = details["year"];
	if (year) item.date = year;
	var month = details["month"];
	if (month) {
		if (year) {
			item.date += "-" + month;
		}
		else {
			item.date = month;
		}
	}

	addCreators(details['author'], "author");
	addCreators(details['editor'], "editor");

	// The first URL becomes the item's URL; any further ones become attachments.
	var urls = details['url'];
	if (urls) {
		if (urls.constructor != Array) {
			item.url = urls;
		}
		else {
			item.url = urls[0];
			item.attachments = [];
			for (var u = 1; u < urls.length; u++) {
				item.attachments.push({url:urls[u], title:"Alternate Link " + u, mimeType:"application/pdf"});
			}
		}
	}

	item.notes.push({note:"This document's details were found using citeseeing.com"});
	item.complete();
}
/**
 * Reads key/value pairs with getNextPair() until it returns something that is
 * not a two-element pair, and collects them into an object.
 *
 * Keys are lower-cased. A key seen once maps to its value; a key seen several
 * times maps to an array of its values in file order.
 *
 * @returns {Object} the collected details
 */
function getDetails() {
	var collected = {};
	for (var entry = getNextPair(); entry.length == 2; entry = getNextPair()) {
		var name = entry[0].toLowerCase();
		var text = entry[1];
		var existing = collected[name];
		if (!existing) {
			// first (non-empty) occurrence of this key
			collected[name] = text;
		}
		else if (existing.constructor == Array) {
			// third or later occurrence
			existing.push(text);
		}
		else {
			// second occurrence: promote the single value to a list
			collected[name] = [existing, text];
		}
	}
	return collected;
}
/**
 * Reads the next line of input and splits it into a key and a value.
 *
 * The line is split at the FIRST ": " only, so values that themselves contain
 * ": " (e.g. "title: Foo: A Study") stay intact and still yield a two-element
 * pair. A trailing carriage return from CRLF line endings is dropped.
 *
 * @returns {String[]} [key, value] when the line contains ": ", otherwise a
 *     one-element array holding the whole line (getDetails stops on that)
 */
function getNextPair() {
	var line = "";
	var current;
	// Read one character at a time until a newline or a falsy read (end of input).
	while ((current = Zotero.read(1)) && current != "\n") {
		line += current;
	}
	if (line.charAt(line.length - 1) == "\r") {
		line = line.substring(0, line.length - 1);
	}
	var separator = ": ";
	var at = line.indexOf(separator);
	if (at == -1) {
		return [line];
	}
	return [line.substring(0, at), line.substring(at + separator.length)];
}
/**
 * Creates a new Zotero item whose type corresponds to a CiteSeeing document type.
 *
 * @param {String} type CiteSeeing type name, e.g. "Article" or "PhD Thesis"
 * @returns {Zotero.Item} a new item; any type not listed below yields a "book"
 */
function makeItem(type) {
	var itemTypes = {
		"Article": "journalArticle",
		"Document": "journalArticle",
		"In Collection": "bookSection",
		"Conference": "conferencePaper",
		"In Proceedings": "conferencePaper",
		"Masters Thesis": "thesis",
		"PhD Thesis": "thesis",
		"Tech Report": "report",
		"Unpublished": "manuscript"
	};
	var name = "book";
	// Only exact string matches on the table's own keys count.
	if (typeof type == "string" && Object.prototype.hasOwnProperty.call(itemTypes, type)) {
		name = itemTypes[type];
	}
	return new Zotero.Item(name);
}
{
"translatorID":"F72D8E0A-E99F-4EF5-8674-5D3FD190FF5D",
"translatorType":4,
"label":"CiteSeeing",
"creator":"William Mayor",
"target":"^http://localhost/document/view/[0-9]+",
"minVersion":"1.0.0b3.r1",
"maxVersion":"",
"priority":100,
"inRepository":false,
"lastUpdated":"2011-07-15 10:30:00"
}
/**
 * Reports the item type for a page matched by this translator's target.
 * Neither argument is inspected: every matching page is reported as a
 * journal article.
 *
 * @param {Document} doc the page document (unused)
 * @param {String} url the page URL (unused)
 * @returns {String} always "journalArticle"
 */
function detectWeb(doc, url) {
	var detectedType = "journalArticle";
	return detectedType;
}
/**
 * Scrapes a CiteSeeing document page into a single Zotero item and completes it.
 *
 * The item type is read from the row with id "type"; simple fields are looked
 * up by their table label with getValue(); name/address groups are read with
 * getCombined(). Only non-empty values are copied onto the item.
 *
 * @param {Document} doc the page document
 * @param {String} url the page URL, stored as the item's URL
 */
function doWeb(doc, url) {
	var typeResult = doc.evaluate("//tr[@id='type']/td/text()", doc, null, XPathResult.STRING_TYPE, null);
	var item = makeItem(typeResult.stringValue);

	// Pushes one creator with the given role for every node the XPath selects.
	function addCreators(xpath, role) {
		var people = doc.evaluate(xpath, doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
		for (var person = people.iterateNext(); person; person = people.iterateNext()) {
			item.creators.push(Zotero.Utilities.cleanAuthor(person.textContent, role));
		}
	}

	// Page label -> Zotero field
	var labelToField = {"Title":"title", "DOI":"doi", "Edition":"edition", "Number":"issue", "Pages":"pages", "Type":"reportType", "Volume":"volume"};
	for (var label in labelToField) {
		var text = getValue(doc, label);
		if (!text) continue;
		item[labelToField[label]] = text;
	}

	// The date is "year-month", or whichever of the two parts is present.
	var dateParts = [];
	var year = getValue(doc, "Year");
	if (year) dateParts.push(year);
	var month = getValue(doc, "Month");
	if (month) dateParts.push(month);
	if (dateParts.length) item.date = dateParts.join("-");

	addCreators("//tr[@class='author']/td/a/text()", "author");
	addCreators("//tr[@class='editor']/td/a/text()", "editor");

	item.url = url;
	var links = doc.evaluate("//tr[@class='url']/td/a/text()", doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
	var linkCount = 0;
	for (var link = links.iterateNext(); link; link = links.iterateNext()) {
		// It seems that Zotero will only accept one of these attachments
		if (linkCount == 0) item.attachments = [];
		linkCount++;
		item.attachments.push({url:link.textContent, title:"CiteSeeing Link " + linkCount, mimeType:"application/pdf"});
	}

	// Row id on the page -> Zotero field(s) receiving the combined text
	var combinedFields = [
		["publisher", ["publisher"]],
		["publication", ["place", "bookTitle", "publicationTitle"]],
		["series", ["series"]],
		["institution", ["institution"]],
		["school", ["university"]]
	];
	for (var c = 0; c < combinedFields.length; c++) {
		var combined = getCombined(doc, combinedFields[c][0]);
		if (!combined) continue;
		var targets = combinedFields[c][1];
		for (var t = 0; t < targets.length; t++) {
			item[targets[t]] = combined;
		}
	}

	item.notes.push({note:"This document's details were found using citeseeing.com"});
	item.complete();
}
/**
 * Joins the text of the "name" and "address" rows that follow the row with
 * the given id (only the next two sibling rows are considered).
 *
 * @param {Document} doc the page document
 * @param {String} id id of the table row that introduces the group
 * @returns {String|null} the texts joined with ", ", or null when no row matched
 */
function getCombined(doc, id) {
	var xpath = "//tr[@id='" + id + "']/following-sibling::tr[position() < 3 and (@class='name' or @class='address')]/td[2]/text()";
	var matches = doc.evaluate(xpath, doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
	var texts = [];
	for (var node = matches.iterateNext(); node; node = matches.iterateNext()) {
		texts.push(node.textContent);
	}
	if (texts.length == 0) {
		return null;
	}
	return texts.join(", ");
}
/**
 * Creates a new Zotero item whose type corresponds to the document type shown
 * on a CiteSeeing page.
 *
 * @param {String} type CiteSeeing type name, e.g. "Article" or "Tech Report"
 * @returns {Zotero.Item} a new item; any type not listed below yields a "book"
 */
function makeItem(type) {
	// Each entry pairs the CiteSeeing names with the Zotero item type they share.
	var groups = [
		[["Article", "Document"], "journalArticle"],
		[["In Collection"], "bookSection"],
		[["Conference", "In Proceedings"], "conferencePaper"],
		[["Masters Thesis", "PhD Thesis"], "thesis"],
		[["Tech Report"], "report"],
		[["Unpublished"], "manuscript"]
	];
	for (var g = 0; g < groups.length; g++) {
		// indexOf compares with strict equality, so only exact strings match
		if (groups[g][0].indexOf(type) != -1) {
			return new Zotero.Item(groups[g][1]);
		}
	}
	return new Zotero.Item("book");
}
/**
 * Looks up the value shown next to a label in the page's details table.
 *
 * @param {Document} doc the page document
 * @param {String} type exact text of the label cell, e.g. "Title"
 * @returns {String} string value of the text in the cells following the label
 *     cell, as produced by an XPath STRING_TYPE evaluation
 */
function getValue(doc, type) {
	var xpath = "//td[text()='" + type + "']/following-sibling::td/text()";
	return doc.evaluate(xpath, doc, null, XPathResult.STRING_TYPE, null).stringValue;
}
/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2009 Center for History and New Media
George Mason University, Fairfax, Virginia, USA
http://zotero.org
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/
/**
* @fileOverview Tools for automatically retrieving a citation for the given PDF
*/
// Icons set as the "src" of an item's icon cell in the progress window.
// Success: set by ItemRecognizer._callback when a new item was produced.
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
// Failure: set by ItemRecognizer._callback when no item was produced, and for
// all remaining items once the query limit has been hit.
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
// Loading: set by ItemRecognizer._recognizeItem when work on an item starts.
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://global/skin/icons/loading_16.png";
/**
 * Front end for recognizing PDFs
 * @namespace
 */
var Zotero_RecognizePDF = new function() {
	/**
	 * Checks whether a given PDF could theoretically be recognized: it must be
	 * a PDF attachment for which getSource() returns nothing truthy.
	 * @param {Zotero.Item} item
	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
	 */
	this.canRecognize = function(/**Zotero.Item*/ item) {
		// Both operands are real booleans, so callers get true/false rather than
		// undefined or "" when the MIME type is missing.
		return item.attachmentMIMEType == "application/pdf" && !item.getSource();
	};

	/**
	 * Retrieves metadata for the PDF(s) selected in the Zotero Pane, placing the PDFs as children
	 * of the new items. Does nothing if the PDF converter is not installed or nothing is selected.
	 */
	this.recognizeSelected = function() {
		var installed = ZoteroPane_Local.checkPDFConverter();
		if (!installed) {
			return;
		}

		var items = ZoteroPane_Local.getSelectedItems();
		// An empty selection would only open a progress dialog with nothing to do
		if (!items || !items.length) return;

		var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer();
		itemRecognizer.recognizeItems(items);
	};
};
/**
 * @class Handles UI, etc. for recognizing multiple items
 */
Zotero_RecognizePDF.ItemRecognizer = function () {
	// Flipped to true by stop(); _callback checks it before handling a result
	this._stopped = false;
};
/**
 * Retrieves metadata for the PDF items passed, displaying a progress dialog during conversion
 * and placing the PDFs as children of the new items
 * @param {Zotero.Item[]} items
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.recognizeItems = function(items) {
	var recognizer = this;

	// Keep a private copy of the list: _recognizeItem shifts entries off it
	recognizer._items = items.slice();
	recognizer._itemTotal = items.length;

	var progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
	recognizer._progressWindow = progressWindow;
	// Recognition starts from _onWindowLoaded once the dialog is shown
	progressWindow.addEventListener("pageshow", function() { recognizer._onWindowLoaded(); }, false);
};
/**
 * Halts recognition of PDFs: marks this recognizer as stopped so that _callback
 * ignores any result that arrives afterwards and does not start the next item
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.stop = function() {
	this._stopped = true;
};
/**
 * Called when the progress window has been opened; adds one tree row per queued item, wires up
 * the cancel and close handlers, and begins recognizing
 * @private
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._onWindowLoaded = function() {
	var me = this;
	var doc = this._progressWindow.document;

	// populate progress window
	var treechildren = doc.getElementById("treechildren");
	// Indexed loop rather than for...in, which would also visit any enumerable
	// property added to the array or its prototype
	for(var i=0; i<this._items.length; i++) {
		var item = this._items[i];
		var treeitem = doc.createElement('treeitem');
		var treerow = doc.createElement('treerow');

		// status icon; its "src" is set by _recognizeItem and _callback
		var treecell = doc.createElement('treecell');
		treecell.setAttribute("id", "item-"+item.id+"-icon");
		treerow.appendChild(treecell);

		// title of the item being recognized
		treecell = doc.createElement('treecell');
		treecell.setAttribute("label", item.getField("title"));
		treerow.appendChild(treecell);

		// result; _callback fills in the new item's title or an error message
		treecell = doc.createElement('treecell');
		treecell.setAttribute("id", "item-"+item.id+"-title");
		treerow.appendChild(treecell);

		treeitem.appendChild(treerow);
		treechildren.appendChild(treeitem);
	}

	this._progressIndicator = doc.getElementById("progress-indicator");
	doc.getElementById("cancel-button").addEventListener("command", function() {
		me.stop();
		me._progressWindow.close();
	}, false);
	this._progressWindow.addEventListener("close", function() { me.stop(); }, false);
	this._recognizeItem();
};
/**
 * Takes the next item off this._items and starts recognizing it; _callback receives the
 * outcome. When the queue is empty, finishes through _done instead.
 * @private
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._recognizeItem = function() {
	var remaining = this._items.length;
	if(remaining == 0) {
		this._done();
		return;
	}

	// progress = share of items already handled, in percent
	var handled = this._itemTotal - remaining;
	this._progressIndicator.value = handled/this._itemTotal*100;

	var current = this._items.shift();
	this._item = current;

	var iconCell = this._progressWindow.document.getElementById("item-"+current.id+"-icon");
	iconCell.setAttribute("src", Zotero_RecognizePDF_LOADING_IMAGE);

	var file = current.getFile();
	if(!file) {
		this._callback(false, "recognizePDF.fileNotFound");
		return;
	}

	var me = this;
	var recognizer = new Zotero_RecognizePDF.Recognizer();
	recognizer.recognize(file, current.libraryID, function(newItem, error) { me._callback(newItem, error); });
};
/**
 * Cleans up after items are recognized: fills the progress bar, relabels the cancel button as a
 * close button, shows the completion message, and makes the progress window close two seconds
 * after it loses focus
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._done = function() {
	var me = this;
	var progressWindow = this._progressWindow;

	// Closes the window 2000 ms after a blur event
	function closeSoon() {
		me._progressWindow.setTimeout(function() { me._progressWindow.close(); }, 2000);
	}

	this._progressIndicator.value = 100;
	progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");
	progressWindow.addEventListener("blur", closeSoon, false);
	progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
};
/**
 * Callback function to be executed upon recognition completion. Updates the progress window row
 * for the current item, then either continues with the next item or, when the query limit was
 * hit, marks all remaining items as failed and finishes.
 * @param {Zotero.Item|Boolean} newItem The new item created from translation, or false if
 *	recognition was unsuccessful
 * @param {String} [error] The error name, if recognition was unsuccessful.
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._callback = function(newItem, error) {
	if(this._stopped) {
		// the user cancelled: discard an item that was created in the meantime
		if(newItem) Zotero.Items.erase(newItem.id);
		return;
	}

	var doc = this._progressWindow.document;

	if(newItem) {
		// put new item in same collections as the old one
		var itemCollections = this._item.getCollections();
		for(var j=0; j<itemCollections.length; j++) {
			var collection = Zotero.Collections.get(itemCollections[j]);
			collection.addItem(newItem.id);
		}

		// put old item as a child of the new item
		this._item.setSource(newItem.id);
		this._item.save();
	}

	// add name
	doc.getElementById("item-"+this._item.id+"-title").
		setAttribute("label", (newItem ? newItem.getField("title") : Zotero.getString(error)));
	// update icon
	doc.getElementById("item-"+this._item.id+"-icon").
		setAttribute("src", (newItem ? Zotero_RecognizePDF_SUCCESS_IMAGE : Zotero_RecognizePDF_FAILURE_IMAGE));

	if(error == "recognizePDF.limit") {
		// now done, since we hit the query limit
		// (kept in its own variable so the `error` parameter is not overwritten)
		var limitMessage = Zotero.getString(error);
		// Indexed loop rather than for...in, which would also visit any enumerable
		// property added to the array or its prototype
		for(var i=0; i<this._items.length; i++) {
			var pending = this._items[i];
			doc.getElementById("item-"+pending.id+"-title").
				setAttribute("label", limitMessage);
			doc.getElementById("item-"+pending.id+"-icon").
				setAttribute("src", Zotero_RecognizePDF_FAILURE_IMAGE);
		}
		this._done();
	} else {
		// scroll to this item
		doc.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, this._itemTotal-this._items.length-5));

		// continue recognizing
		this._recognizeItem();
	}
};
/*Zotero_RecognizePDF.ItemRecognizer.prototype._captchaCallback = function(img) {
var io = {dataIn:img};
Zotero.debug(img);
this._progressWindow.openDialog("chrome://zotero/content/pdfCaptcha.xul", "", "chrome,modal,resizable=no", io);
if(io.dataOut) return io.dataOut;
this.stop();
this._progressWindow.close();
return false;
}*/
/**
 * @class PDF recognizer backend
 *
 * Each metadata source is a bit flag; this._mode holds the flags of the sources that have not
 * been tried yet and starts with all three set.
 */
Zotero_RecognizePDF.Recognizer = function () {
	this._citeseeing = 1 << 0;
	this._crossref = 1 << 1;
	this._googlescholar = 1 << 2;
	this._mode = this._citeseeing | this._crossref | this._googlescholar;
};
/**
 * Retrieves metadata for a PDF and saves it as an item
 *
 * Each call tries the next source whose flag is still set in this._mode: citeseeing.com first,
 * then a DOI lookup through CrossRef, then Google Scholar. The query methods call recognize()
 * again without arguments to fall through to the next source, which is why the arguments are
 * remembered on the object. When no source is left, the callback receives
 * (false, "recognizePDF.noMatches").
 *
 * @param {nsIFile} file The PDF file to retrieve metadata for
 * @param {Integer} libraryID Passed on to the translators when saving the item
 * @param {Function} callback The function to be executed when recognition is complete
 * @param {Function} [captchaCallback] Accepted for compatibility; not used by this function
 */
Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) {
	// Save arguments to object so later recursive calls don't have to provide them.
	if (file) this._file = file;
	if (libraryID) this._libraryID = libraryID;
	if (callback) this._callback = callback;
	Zotero.debug("Trying to recognize " + this._file.path);

	if (this._mode & this._citeseeing) {
		Zotero.debug("Checking citeseeing.com for PDF details.");
		this._mode &= ~this._citeseeing;
		this._queryCiteSeeing(this._file, this._libraryID);
		return;
	}

	this._getLines(this._file);
	if (!this._lines || !this._lineLengths) {
		// _getLines could not read the PDF and has already reported that through
		// the callback; there is no text to work with.
		return;
	}

	if (this._mode & this._crossref) {
		Zotero.debug("Checking for DOI then searching CrossRef for PDF details.");
		this._mode &= ~this._crossref;
		var allText = this._lines.join("\n");
		Zotero.debug(allText);
		// NOTE(review): assumes cleanDOI returns an array-like match whose first
		// element is the DOI — confirm against the Zotero version in use
		var m = Zotero.Utilities.cleanDOI(allText);
		if(m) {
			this._queryCrossRef(m[0]);
		}
		else {
			this.recognize();
		}
		return;
	}

	if (this._mode & this._googlescholar) {
		Zotero.debug("Searching GoogleScholar for PDF details.");
		this._mode &= ~this._googlescholar;

		var lineLengthsLength = this._lineLengths.length;
		if(lineLengthsLength < 20) {
			this._callback(false, "recognizePDF.noOCR");
			return;
		}

		// get (not quite) median length
		// Sort a copy, numerically: sorting this._lineLengths itself would break
		// its index pairing with this._lines, and the default sort compares the
		// numbers as strings.
		var sortedLengths = this._lineLengths.slice().sort(function(a, b) { return a - b; });
		var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];

		// pick lines within 4 chars of the median (this is completely arbitrary)
		this._goodLines = [];
		var uBound = medianLength + 4;
		var lBound = medianLength - 4;
		for (var i=0; i<lineLengthsLength; i++) {
			if(this._lineLengths[i] > lBound && this._lineLengths[i] < uBound) {
				// Strip all quotation marks so they don't mess up search query quoting
				this._goodLines.push(this._lines[i].replace(/"/g, ''));
			}
		}

		this._startLine = this._iteration = 0;
		this._queryGoogle();
		return;
	}

	this._callback(false, "recognizePDF.noMatches");
};
/**
 * Extracts text lines from the first pages of the PDF with the PDF converter and stores them in
 * this._lines, with their lengths at the same indexes in this._lineLengths. Only lines that
 * contain at least two space-separated words are kept, with leading whitespace removed.
 *
 * Does nothing if the lines were already extracted. If the converter produces no output, the
 * callback receives (false, "recognizePDF.couldNotRead") and this._lines stays unset.
 *
 * @param {nsIFile} file The PDF file to read
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._getLines = function(file) {
	if (this._lines && this._lineLengths) {
		return;
	}

	Zotero.debug("Running pdf2text and saving lines in first three pages.");
	const MAX_PAGES = 3;
	const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;

	var cacheFile = Zotero.getZoteroDirectory();
	cacheFile.append("recognizePDFcache.txt");
	if(cacheFile.exists()) {
		cacheFile.remove(false);
	}

	var options = ['-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', String(MAX_PAGES)];
	// Log the options that are actually passed to the converter
	Zotero.debug('Running pdftotext ' + options.join(' ')
		+ ' "' + file.path + '" "' + cacheFile.path + '"');

	var proc = Components.classes["@mozilla.org/process/util;1"].
			createInstance(Components.interfaces.nsIProcess);
	var exec = Zotero.getZoteroDirectory();
	exec.append(Zotero.Fulltext.pdfConverterFileName);
	proc.init(exec);

	var args = options.concat([file.path, cacheFile.path]);
	proc.run(true, args, args.length);

	if(!cacheFile.exists()) {
		this._callback(false, "recognizePDF.couldNotRead");
		return;
	}

	var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
		.createInstance(Components.interfaces.nsIFileInputStream);
	// Permission bits 0664, written without a legacy octal literal (a syntax
	// error in strict-mode code)
	inputStream.init(cacheFile, 0x01, parseInt("664", 8), 0);

	var lines = [];
	var lineLengths = [];
	try {
		var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
			.createInstance(Components.interfaces.nsIConverterInputStream);
		intlStream.init(inputStream, "UTF-8", 65535,
			Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
		intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);

		// get the lines in this sample
		var str = {};
		var hasMore;
		do {
			// readLine returns false on the LAST line, which still carries data,
			// so the line is processed before the return value is tested
			hasMore = intlStream.readLine(str);
			var line = lineRe.exec(str.value);
			if(line) {
				lines.push(line[1]);
				lineLengths.push(line[1].length);
			}
		} while(hasMore);
	} finally {
		// release the stream and the temporary file even if reading failed
		inputStream.close();
		cacheFile.remove(false);
	}

	this._lines = lines;
	this._lineLengths = lineLengths;
};
/**
 * Tries to fetch an nsIFile pointer to the system's java binary by looking in every directory
 * listed in the PATH environment variable ("java.exe" on Windows, "java" elsewhere).
 * Returns null if java cannot be found.
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._getJava = function() {
	Zotero.debug("Getting java");
	var env = Components.classes["@mozilla.org/process/environment;1"].
		getService(Components.interfaces.nsIEnvironment);
	if (!env.exists("PATH")) {
		return null;
	}

	var separator = Zotero.isWin ? ";" : ":";
	var binaryName = Zotero.isWin ? "java.exe" : "java";
	var path = env.get("PATH").split(separator);

	for (var i = 0; i < path.length; i++) {
		if (!path[i]) continue;
		try {
			var file = Components.classes["@mozilla.org/file/local;1"].
				createInstance(Components.interfaces.nsILocalFile);
			// initWithPath throws for entries that are not absolute paths; such an
			// entry must not abort the search through the remaining ones
			file.initWithPath(path[i]);
			file.append(binaryName);
			// isFile() keeps a directory that happens to be called "java" from matching
			if (file.exists() && file.isFile()) return file;
		} catch (e) {
			Zotero.debug("Skipping PATH entry '" + path[i] + "': " + e);
		}
	}
	return null;
};
/**
 * Queries citeseeing.com for details on the given pdf by running citeline.jar (expected in the
 * Zotero directory) with the system's java binary.
 * If details are found then they are stored in a file and imported using
 * the CiteLine translator. This file is then deleted.
 *
 * The outcome is reported asynchronously: a recognized item goes to this._callback; every
 * failure (no java, no jar, no details file, failed translation) falls through to the next
 * source by calling this.recognize().
 *
 * @param {nsIFile} pdf The PDF file to look up
 * @param libraryID Unused here; this._libraryID is passed to the translator instead
 * @returns {null|undefined} null when the query could not be started; otherwise nothing
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryCiteSeeing = function(pdf, libraryID) {
	var java = this._getJava();
	if (null == java) {
		// Without this the recognition would stall: no callback would ever fire
		Zotero.debug("No java");
		this.recognize();
		return null;
	}
	Zotero.debug(java.path);

	var detailFile = Zotero.getZoteroDirectory();
	detailFile.append("pdfDetails.citeline");
	if(detailFile.exists()) {
		detailFile.remove(false);
	}

	var jarFile = Zotero.getZoteroDirectory();
	jarFile.append("citeline.jar");
	if (!jarFile.exists()) {
		Zotero.debug("No citeline.jar");
		this.recognize();
		return null;
	}

	Zotero.debug("Running java -jar " + jarFile.path + " " + pdf.path + " " + detailFile.path);
	var proc = Components.classes["@mozilla.org/process/util;1"].
			createInstance(Components.interfaces.nsIProcess);
	proc.init(java);
	var args = ["-jar", jarFile.path, pdf.path, detailFile.path];
	// blocking run: the details file is either there afterwards or it is not
	proc.run(true, args, args.length);
	Zotero.debug("Run");

	if (!detailFile.exists()) {
		Zotero.debug("No details");
		this.recognize();
		return null;
	}
	Zotero.debug("Got details");

	// Read the whole details file as UTF-8
	var data = "";
	var fstream = Components.classes["@mozilla.org/network/file-input-stream;1"].
		createInstance(Components.interfaces.nsIFileInputStream);
	var cstream = Components.classes["@mozilla.org/intl/converter-input-stream;1"].
		createInstance(Components.interfaces.nsIConverterInputStream);
	fstream.init(detailFile, -1, 0, 0);
	cstream.init(fstream, "UTF-8", 0, 0);
	try {
		var str = {};
		var read = 0;
		do {
			read = cstream.readString(0xffffffff, str);
			data += str.value;
		} while (read != 0);
	} finally {
		// close the stream even if reading failed
		cstream.close();
	}
	Zotero.debug(data);

	var me = this;
	var translate = new Zotero.Translate.Import;
	// the CiteLine import translator
	translate.setTranslator("17AF2C40-AAA5-4837-AB16-FDC54298AB42");
	var location = {"path":detailFile.path};
	translate.setLocation(location);
	translate.setString(data);
	translate.setHandler("itemDone", function(translate, item) {
		Zotero.debug("Item done");
		me._callback(item);
	});
	translate.setHandler("select", function(translate, items, callback) {
		return me._selectItems(translate, items, callback);
	});
	translate.setHandler("done", function(translate, success) {
		if(!success) me.recognize();
	});
	translate.translate(this._libraryID, false);

	detailFile.remove(false);
};
/**
 * Looks a DOI up with the search translator configured below. A found item is handed to
 * this._callback; if the search does not succeed, recognition falls through to the next source
 * via this.recognize().
 * @param {String} doi The DOI to search for
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryCrossRef = function(doi) {
	var recognizer = this;

	// use CrossRef to look for DOI
	var search = new Zotero.Translate("search");
	search.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
	search.setSearch({"itemType":"journalArticle", "DOI":doi});

	search.setHandler("itemDone", function(translate, item) {
		recognizer._callback(item);
	});
	search.setHandler("select", function(translate, items, callback) {
		return recognizer._selectItems(translate, items, callback);
	});
	search.setHandler("done", function(translate, success) {
		Zotero.debug("Finished search");
		if(!success) {
			recognizer.recognize();
		}
	});

	Zotero.debug("Searching crossref for " + doi);
	search.translate(this._libraryID, false);
};
/**
 * Queries Google Scholar for metadata for this PDF
 *
 * Builds a query of quoted phrases from this._goodLines, starting at this._startLine, loads the
 * result page in a hidden browser and lets _scrape run the web translator on it. A failed
 * translation retries with the following lines; after more than three attempts, or when the
 * lines run out, the hidden browser is deleted and recognition falls through to this.recognize().
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
	var me = this;

	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
		try {
			// best effort: a failure to clean up must not stop the fallback below
			if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(this._hiddenBrowser);
		} catch(e) {
			Zotero.debug("RecognizePDF: could not delete hidden browser: " + e);
		}
		this.recognize();
		return;
	}
	this._iteration++;

	var queryString = "";
	// take the relevant parts of some lines (exclude hyphenated word)
	var queryStringWords = 0;
	while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
		var words = this._goodLines[this._startLine].split(/\s+/);
		// get rid of first and last words
		words.shift();
		words.pop();
		// make sure there are no long words (probably OCR mistakes)
		var skipLine = false;
		for(var i=0; i<words.length; i++) {
			if(words[i].length > 20) {
				skipLine = true;
				break;
			}
		}
		// add words to query
		if(!skipLine && words.length) {
			queryStringWords += words.length;
			queryString += '"'+words.join(" ")+'" ';
		}
		this._startLine++;
	}
	Zotero.debug("RecognizePDF: Query string "+queryString);

	// pass query string to Google Scholar and translate
	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
	if(!this._hiddenBrowser) {
		this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
		this._hiddenBrowser.docShell.allowImages = false;
	}

	var translate = new Zotero.Translate("web");
	translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
	translate.setHandler("itemDone", function(translate, item) {
		Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
		me._callback(item);
	});
	translate.setHandler("select", function(translate, items, callback) {
		me._selectItems(translate, items, callback);
	});
	translate.setHandler("done", function(translate, success) {
		if(!success) me._queryGoogle();
	});

	this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);

	this._hiddenBrowser.loadURIWithFlags(url,
		Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
};
/**
 * To be executed when Google Scholar is loaded: runs the given translator on the document shown
 * in the hidden browser.
 *
 * Returns without doing anything while the browser still shows about:blank. If the page title
 * is "403 Forbidden", reports (false, "recognizePDF.limit") through the callback instead of
 * translating. On both of these early-return paths the "pageshow" listener stays registered.
 * @param {Zotero.Translate} translate The web translator instance prepared by the caller
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
// ignore the initial blank page
if(this._hiddenBrowser.contentDocument.location.href == "about:blank") return;
if(this._hiddenBrowser.contentDocument.title == "403 Forbidden") {
// hit the captcha
// (disabled CAPTCHA handling kept below for reference)
/*
var forms = this._hiddenBrowser.contentDocument.getElementsByTagName("form");
if(forms.length && forms[0].getAttribute("action") == "Captcha") {
var captchaImage = forms[0].getElementsByTagName("img");
var captchaBox = this._hiddenBrowser.contentDocument.getElementsByName("captcha");
if(captchaImage.length && captchaBox.length && this._captchaCallback) {
var text = this._captchaCallback(captchaImage[0].src);
if(text) {
captchaBox[0].value = text;
forms[0].submit();
return;
}
}
}*/
this._callback(false, "recognizePDF.limit");
return;
}
// NOTE(review): the listener to remove is identified through the non-standard
// Function.caller property, i.e. whichever function called _scrape — presumably
// the anonymous "pageshow" listener that wraps this method. This depends on the
// call stack and is not available to strict-mode functions; confirm before
// restructuring how _scrape is invoked.
this._hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true);
translate.setDocument(this._hiddenBrowser.contentDocument);
translate.translate(this._libraryID, false);
}
/**
 * "select" handler that picks the first item in the list offered by a translator
 *
 * Calls `callback` with an object holding only the first enumerated entry of `items`
 * (same key, same value). If `items` has no entries, `callback` is not called.
 * @param {Zotero.Translate} translate The translator asking for a selection (unused)
 * @param {Object} items The selectable items, keyed by identifier
 * @param {Function} callback Receives the selection
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Translate*/ translate,
		/**Object*/ items, /**Function**/ callback) {
	for(var i in items) {
		var obj = {};
		// the entry's own value, not the whole list
		obj[i] = items[i];
		callback(obj);
		return;
	}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment