Created
July 22, 2011 13:47
-
-
Save WilliamMayor/1099485 to your computer and use it in GitHub Desktop.
CiteSeeing -> Zotero translators
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
	"translatorID":"17AF2C40-AAA5-4837-AB16-FDC54298AB42",
	"translatorType":1,
	"label":"CiteLine",
	"creator":"William Mayor",
	"target":"citeline",
	"minVersion":"1.0.0b3.r1",
	"maxVersion":"",
	"priority":100,
	"browserSupport":"gcs",
	"configOptions":{"dataMode":"block"},
	"inRepository":false,
	"lastUpdated":"2011-07-21 19:58:00"
}
/**
 * Import detection hook: reports whether the incoming data looks like a
 * CiteLine record, i.e. whether it starts with the field name "title"
 * (compared case-insensitively).
 *
 * @returns {Boolean} true if the first five characters read "title"
 */
function detectImport() {
	Zotero.debug("detectImport");
	var expected = "title";
	var head = Zotero.read(expected.length);
	// getNextPair() treats a falsy Zotero.read() result as end of input, so
	// an empty file can hand us a non-string here; that is simply "no match".
	if (typeof head != "string") {
		return false;
	}
	return head.toLowerCase() == expected;
}
/**
 * Import hook: reads one CiteLine record via getDetails(), builds a Zotero
 * item of the matching type via makeItem() and completes it.
 *
 * Fields that are absent from the record are left untouched on the item
 * (previously they were overwritten with undefined).
 */
function doImport() {
	Zotero.debug("doImport");
	var details = getDetails();
	var item = makeItem(details['type']);

	// CiteLine field name -> Zotero field name(s). "publication" is copied to
	// several fields; which of them is kept depends on the item type.
	var citeseeingToZotero = {
		"title": "title",
		"doi": "doi",
		"edition": "edition",
		"number": "issue",
		"pages": "pages",
		"report type": "reportType",
		"volume": "volume",
		"publisher": "publisher",
		"publication": ["place", "bookTitle", "publicationTitle"],
		"series": "series",
		"institution": "institution",
		"school": "university"
	};
	for (var where in citeseeingToZotero) {
		var value = details[where];
		if (!value) continue; // field not present in this record
		var targets = citeseeingToZotero[where];
		if (!(targets instanceof Array)) targets = [targets];
		for (var t = 0; t < targets.length; t++) {
			item[targets[t]] = value;
		}
	}

	// Date is "year", "year-month" or just "month", depending on what exists
	var year = details["year"];
	if (year) item.date = year;
	var month = details["month"];
	if (month) {
		item.date = year ? item.date + "-" + month : month;
	}

	// A field that occurs once is a plain string, one that repeats is an array
	function asList(entry) {
		if (!entry) return [];
		return (entry instanceof Array) ? entry : [entry];
	}

	var authors = asList(details['author']);
	for (var a = 0; a < authors.length; a++) {
		item.creators.push(Zotero.Utilities.cleanAuthor(authors[a], "author"));
	}
	var editors = asList(details['editor']);
	for (var e = 0; e < editors.length; e++) {
		item.creators.push(Zotero.Utilities.cleanAuthor(editors[e], "editor"));
	}

	// The first URL becomes the item URL; any further ones become attachments
	var urls = details['url'];
	if (urls) {
		if (!(urls instanceof Array)) {
			item.url = urls;
		}
		else {
			item.url = urls[0];
			item.attachments = [];
			for (var u = 1; u < urls.length; u++) {
				item.attachments.push({url:urls[u], title:"Alternate Link " + u, mimeType:"application/pdf"});
			}
		}
	}

	item.notes.push({note:"This document's details were found using citeseeing.com"});
	item.complete();
}
/**
 * Reads "Key: value" pairs with getNextPair() until a line that is not a
 * pair is reached, and collects them into an object keyed by the
 * lower-cased field name.
 *
 * @returns {Object} field name -> string, or -> array of strings (in input
 *          order) when the field occurs more than once
 */
function getDetails() {
	var details = {};
	var hasOwn = Object.prototype.hasOwnProperty;
	for (var pair = getNextPair(); pair.length == 2; pair = getNextPair()) {
		var key = pair[0].toLowerCase();
		var value = pair[1];
		// Only look at fields we stored ourselves: a plain details[key] would
		// also find inherited properties such as "constructor".
		var previous = hasOwn.call(details, key) ? details[key] : undefined;
		if (!previous) {
			details[key] = value;
		}
		else if (previous instanceof Array) {
			previous.push(value);
		}
		else {
			details[key] = [previous, value];
		}
	}
	return details;
}
/**
 * Reads one line from the import stream and splits it into key and value at
 * the FIRST ": " only, so values that themselves contain ": " (for example
 * "Title: Foo: a study") stay intact.
 *
 * @returns {String[]} [key, value] for a "key: value" line, otherwise a
 *          one-element array holding the whole line (getDetails() treats
 *          that as the end of the record)
 */
function getNextPair() {
	var line = "";
	var current;
	while ((current = Zotero.read(1)) && current != "\n") {
		line += current;
	}
	// Tolerate CRLF line endings
	if (line.charAt(line.length - 1) == "\r") {
		line = line.substring(0, line.length - 1);
	}
	var separator = line.indexOf(": ");
	if (separator == -1) {
		return [line];
	}
	return [line.substring(0, separator), line.substring(separator + 2)];
}
/**
 * Creates an empty Zotero item whose type corresponds to a CiteSeeing
 * document type. Anything unrecognised becomes a "book".
 *
 * @param {String} type CiteSeeing document type, e.g. "Article"
 * @returns {Zotero.Item} new item of the mapped type
 */
function makeItem(type) {
	var itemType;
	if (type === "Article" || type === "Document") {
		itemType = "journalArticle";
	}
	else if (type === "In Collection") {
		itemType = "bookSection";
	}
	else if (type === "Conference" || type === "In Proceedings") {
		itemType = "conferencePaper";
	}
	else if (type === "Masters Thesis" || type === "PhD Thesis") {
		itemType = "thesis";
	}
	else if (type === "Tech Report") {
		itemType = "report";
	}
	else if (type === "Unpublished") {
		itemType = "manuscript";
	}
	else {
		itemType = "book";
	}
	return new Zotero.Item(itemType);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
	"translatorID":"F72D8E0A-E99F-4EF5-8674-5D3FD190FF5D",
	"translatorType":4,
	"label":"CiteSeeing",
	"creator":"William Mayor",
	"target":"^http://localhost/document/view/[0-9]+",
	"minVersion":"1.0.0b3.r1",
	"maxVersion":"",
	"priority":100,
	"inRepository":false,
	"lastUpdated":"2011-07-15 10:30:00"
}
/**
 * Web detection hook. Every page matched by the translator's target is
 * reported as a journal article, regardless of its content.
 *
 * @param {Document} doc the loaded page (not inspected)
 * @param {String} url the page URL (not inspected)
 * @returns {String} always "journalArticle"
 */
function detectWeb(doc, url) {
	var detectedType = "journalArticle";
	return detectedType;
}
/**
 * Web translation hook: scrapes a CiteSeeing document page into one Zotero
 * item and completes it.
 *
 * @param {Document} doc the document page
 * @param {String} url the page URL; stored as the item's URL
 */
function doWeb(doc, url) {
	var typeResult = doc.evaluate("//tr[@id='type']/td/text()", doc, null, XPathResult.STRING_TYPE, null);
	var item = makeItem(typeResult.stringValue);

	// Row label on the page -> Zotero field
	var simpleFields = [
		["Title", "title"],
		["DOI", "doi"],
		["Edition", "edition"],
		["Number", "issue"],
		["Pages", "pages"],
		["Type", "reportType"],
		["Volume", "volume"]
	];
	for (var f = 0; f < simpleFields.length; f++) {
		var fieldValue = getValue(doc, simpleFields[f][0]);
		if (fieldValue) item[simpleFields[f][1]] = fieldValue;
	}

	// Date is "year", "year-month" or just "month"
	var year = getValue(doc, "Year");
	if (year) item.date = year;
	var month = getValue(doc, "Month");
	if (month) {
		item.date = year ? item.date + "-" + month : month;
	}

	// Authors first, then editors, each in iterator order
	var creatorQueries = [
		["//tr[@class='author']/td/a/text()", "author"],
		["//tr[@class='editor']/td/a/text()", "editor"]
	];
	for (var q = 0; q < creatorQueries.length; q++) {
		var people = doc.evaluate(creatorQueries[q][0], doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
		for (var person = people.iterateNext(); person; person = people.iterateNext()) {
			item.creators.push(Zotero.Utilities.cleanAuthor(person.textContent, creatorQueries[q][1]));
		}
	}

	item.url = url;
	var links = doc.evaluate("//tr[@class='url']/td/a/text()", doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
	var linkCount = 0;
	for (var link = links.iterateNext(); link; link = links.iterateNext()) {
		// It seems that Zotero will only accept one of these attachments
		if (linkCount == 0) item.attachments = [];
		linkCount++;
		item.attachments.push({url:link.textContent, title:"CiteSeeing Link " + linkCount, mimeType:"application/pdf"});
	}

	// Row id on the page -> Zotero field(s) receiving the combined text
	var combinedFields = [
		["publisher", ["publisher"]],
		["publication", ["place", "bookTitle", "publicationTitle"]],
		["series", ["series"]],
		["institution", ["institution"]],
		["school", ["university"]]
	];
	for (var c = 0; c < combinedFields.length; c++) {
		var combined = getCombined(doc, combinedFields[c][0]);
		if (!combined) continue;
		var targets = combinedFields[c][1];
		for (var t = 0; t < targets.length; t++) {
			item[targets[t]] = combined;
		}
	}

	item.notes.push({note:"This document's details were found using citeseeing.com"});
	item.complete();
}
/**
 * Collects the text of the "name"/"address" rows that directly follow the
 * row with the given id (at most the next two rows) and joins them with ", ".
 *
 * @param {Document} doc the document page
 * @param {String} id id attribute of the heading row, e.g. "publisher"
 * @returns {String|null} the joined text, or null if no such rows exist
 */
function getCombined(doc, id) {
	var xpath = "//tr[@id='" + id + "']/following-sibling::tr[position() < 3 and (@class='name' or @class='address')]/td[2]/text()";
	var matches = doc.evaluate(xpath, doc, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
	var parts = [];
	var node;
	while ((node = matches.iterateNext())) {
		parts.push(node.textContent);
	}
	return parts.length ? parts.join(", ") : null;
}
/**
 * Creates an empty Zotero item whose type corresponds to a CiteSeeing
 * document type. Anything unrecognised becomes a "book".
 *
 * @param {String} type CiteSeeing document type, e.g. "Article"
 * @returns {Zotero.Item} new item of the mapped type
 */
function makeItem(type) {
	var itemTypes = {
		"Article": "journalArticle",
		"Document": "journalArticle",
		"In Collection": "bookSection",
		"Conference": "conferencePaper",
		"In Proceedings": "conferencePaper",
		"Masters Thesis": "thesis",
		"PhD Thesis": "thesis",
		"Tech Report": "report",
		"Unpublished": "manuscript"
	};
	// Only exact string matches count; the own-property check keeps inherited
	// names such as "constructor" from being mistaken for a document type.
	var isKnown = typeof type == "string"
		&& Object.prototype.hasOwnProperty.call(itemTypes, type);
	return new Zotero.Item(isKnown ? itemTypes[type] : "book");
}
/**
 * Returns the text of the table cell(s) following the cell whose text is
 * exactly the given label.
 *
 * @param {Document} doc the document page
 * @param {String} type the row label to look up, e.g. "Title"
 * @returns {String} the string value of the XPath result
 */
function getValue(doc, type) {
	var xpath = "//td[text()='" + type + "']/following-sibling::td/text()";
	return doc.evaluate(xpath, doc, null, XPathResult.STRING_TYPE, null).stringValue;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
***** BEGIN LICENSE BLOCK ***** | |
Copyright © 2009 Center for History and New Media | |
George Mason University, Fairfax, Virginia, USA | |
http://zotero.org | |
This file is part of Zotero. | |
Zotero is free software: you can redistribute it and/or modify | |
it under the terms of the GNU Affero General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
Zotero is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU Affero General Public License for more details. | |
You should have received a copy of the GNU Affero General Public License | |
along with Zotero. If not, see <http://www.gnu.org/licenses/>. | |
***** END LICENSE BLOCK ***** | |
*/ | |
/** | |
* @fileOverview Tools for automatically retrieving a citation for the given PDF | |
*/ | |
/** Icon set on a progress row once its PDF has been recognized */
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
/** Icon set on a progress row whose PDF could not be recognized */
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
/** Icon set on a progress row while its PDF is being processed */
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://global/skin/icons/loading_16.png";

/**
 * Front end for recognizing PDFs
 * @namespace
 */
var Zotero_RecognizePDF = new function() {
	/**
	 * Checks whether a given PDF could theoretically be recognized: it must
	 * be a PDF attachment that has no source (parent) item.
	 * @param {Zotero.Item} item
	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
	 */
	this.canRecognize = function(/**Zotero.Item*/ item) {
		// Coerce explicitly so that callers get a real Boolean, not
		// undefined or "" for items without a MIME type
		return !!(item.attachmentMIMEType &&
			item.attachmentMIMEType == "application/pdf" && !item.getSource());
	}

	/**
	 * Retrieves metadata for the PDF(s) selected in the Zotero Pane, placing the PDFs as
	 * children of the new items. Does nothing if the PDF converter check fails
	 * or nothing is selected.
	 */
	this.recognizeSelected = function() {
		var installed = ZoteroPane_Local.checkPDFConverter();
		if (!installed) {
			return;
		}

		var items = ZoteroPane_Local.getSelectedItems();
		if (!items) return;

		var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer();
		itemRecognizer.recognizeItems(items);
	}
}
/**
 * @class Drives recognition of a batch of items, together with the progress
 * window that reports on it
 */
Zotero_RecognizePDF.ItemRecognizer = function () {
	// Flipped to true by stop(); _callback checks it to abandon the batch
	this._stopped = false;
};
/**
 * Retrieves metadata for the PDF items passed, displaying a progress dialog during conversion
 * and placing the PDFs as children of the new items. Work starts once the
 * progress dialog fires "pageshow".
 * @param {Zotero.Item[]} items
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.recognizeItems = function(items) {
	var self = this;
	// Work on a copy: _recognizeItem shifts entries off this list
	self._items = items.slice();
	self._itemTotal = items.length;
	var dialog = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
	self._progressWindow = dialog;
	dialog.addEventListener("pageshow", function() { self._onWindowLoaded(); }, false);
};
/**
 * Halts recognition of PDFs: once set, _callback discards the result of the
 * item in flight and does not start another one
 */
Zotero_RecognizePDF.ItemRecognizer.prototype.stop = function() {
	this._stopped = true;
};
/**
 * Called when the progress window has been opened; adds one row per item to
 * the tree, wires up cancel/close handling and begins recognizing
 * @private
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._onWindowLoaded = function() {
	var me = this;
	var doc = this._progressWindow.document;

	// populate progress window: each row is [icon cell, item title, result cell]
	var treechildren = doc.getElementById("treechildren");
	// index loop rather than for...in, which would also visit any enumerable
	// properties added to the array or to Array.prototype
	for(var i=0; i<this._items.length; i++) {
		var item = this._items[i];
		var treeitem = doc.createElement('treeitem');
		var treerow = doc.createElement('treerow');

		// icon cell, updated by _recognizeItem and _callback
		var treecell = doc.createElement('treecell');
		treecell.setAttribute("id", "item-"+item.id+"-icon");
		treerow.appendChild(treecell);

		// current title of the item
		treecell = doc.createElement('treecell');
		treecell.setAttribute("label", item.getField("title"));
		treerow.appendChild(treecell);

		// result cell, filled in by _callback with the new title or the error
		treecell = doc.createElement('treecell');
		treecell.setAttribute("id", "item-"+item.id+"-title");
		treerow.appendChild(treecell);

		treeitem.appendChild(treerow);
		treechildren.appendChild(treeitem);
	}

	this._progressIndicator = doc.getElementById("progress-indicator");
	doc.getElementById("cancel-button").addEventListener("command", function() {
		me.stop();
		me._progressWindow.close();
	}, false);
	this._progressWindow.addEventListener("close", function() { me.stop() }, false);
	this._recognizeItem();
}
/**
 * Takes the next item off this._items and starts recognizing it; _callback
 * invokes this again until the list is empty, at which point _done runs
 * @private
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._recognizeItem = function() {
	var remaining = this._items.length;
	if(remaining == 0) {
		this._done();
		return;
	}

	// progress reflects the items finished so far
	this._progressIndicator.value = (this._itemTotal-remaining)/this._itemTotal*100;

	var current = this._items.shift();
	this._item = current;
	var iconCell = this._progressWindow.document.getElementById("item-"+current.id+"-icon");
	iconCell.setAttribute("src", Zotero_RecognizePDF_LOADING_IMAGE);

	var file = current.getFile();
	if(!file) {
		this._callback(false, "recognizePDF.fileNotFound");
		return;
	}

	var self = this;
	var recognizer = new Zotero_RecognizePDF.Recognizer();
	recognizer.recognize(file, current.libraryID, function(newItem, error) {
		self._callback(newItem, error);
	});
};
/**
 * Cleans up after items are recognized: fills the progress bar, relabels the
 * cancel button as a close button, shows the completion label and makes the
 * progress window close two seconds after it loses focus
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._done = function() {
	var self = this;
	var doc = this._progressWindow.document;

	this._progressIndicator.value = 100;
	doc.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");

	this._progressWindow.addEventListener("blur", function() {
		self._progressWindow.setTimeout(function() {
			self._progressWindow.close();
		}, 2000);
	}, false);

	doc.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
};
/**
 * Callback function to be executed upon recognition completion
 * @param {Zotero.Item|Boolean} newItem The new item created from translation, or false if
 * recognition was unsuccessful
 * @param {String} [error] The error name, if recognition was unsuccessful.
 */
Zotero_RecognizePDF.ItemRecognizer.prototype._callback = function(newItem, error) {
	if(this._stopped) {
		// the user cancelled while this item was in flight: discard the result
		if(newItem) Zotero.Items.erase(newItem.id);
		return;
	}

	var doc = this._progressWindow.document;

	if(newItem) {
		// put new item in same collections as the old one
		var itemCollections = this._item.getCollections();
		for(var j=0; j<itemCollections.length; j++) {
			var collection = Zotero.Collections.get(itemCollections[j]);
			collection.addItem(newItem.id);
		}

		// put old item as a child of the new item
		this._item.setSource(newItem.id);
		this._item.save();
	}

	// add name
	doc.getElementById("item-"+this._item.id+"-title").
		setAttribute("label", (newItem ? newItem.getField("title") : Zotero.getString(error)));
	// update icon
	doc.getElementById("item-"+this._item.id+"-icon").
		setAttribute("src", (newItem ? Zotero_RecognizePDF_SUCCESS_IMAGE : Zotero_RecognizePDF_FAILURE_IMAGE));

	if(error == "recognizePDF.limit") {
		// now done, since we hit the query limit: mark every item still
		// waiting as failed with the same message
		var limitMessage = Zotero.getString(error);
		for(var i=0; i<this._items.length; i++) {
			var pending = this._items[i];
			doc.getElementById("item-"+pending.id+"-title").
				setAttribute("label", limitMessage);
			doc.getElementById("item-"+pending.id+"-icon").
				setAttribute("src", Zotero_RecognizePDF_FAILURE_IMAGE);
		}
		this._done();
	} else {
		// scroll to this item
		doc.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, this._itemTotal-this._items.length-5));
		// continue recognizing
		this._recognizeItem();
	}
}
/*Zotero_RecognizePDF.ItemRecognizer.prototype._captchaCallback = function(img) { | |
var io = {dataIn:img}; | |
Zotero.debug(img); | |
this._progressWindow.openDialog("chrome://zotero/content/pdfCaptcha.xul", "", "chrome,modal,resizable=no", io); | |
if(io.dataOut) return io.dataOut; | |
this.stop(); | |
this._progressWindow.close(); | |
return false; | |
}*/ | |
/**
 * @class PDF recognizer backend
 *
 * Each lookup strategy is one bit; _mode holds the strategies that have not
 * been tried yet, and recognize() clears a bit as it tries that strategy.
 */
Zotero_RecognizePDF.Recognizer = function () {
	this._citeseeing = 1;
	this._crossref = 2;
	this._googlescholar = 4;
	// start with every strategy enabled (1 | 2 | 4 == 7)
	this._mode = this._citeseeing | this._crossref | this._googlescholar;
};
/**
 * Retrieves metadata for a PDF and saves it as an item.
 *
 * Each call tries the next untried strategy in this._mode: citeseeing.com,
 * then a DOI lookup through CrossRef, then Google Scholar. The strategies
 * call recognize() again without arguments when they fail, so the arguments
 * are remembered on the object.
 *
 * @param {nsIFile} [file] The PDF file to retrieve metadata for
 * @param {Integer} [libraryID] Library the new item is saved to
 * @param {Function} [callback] The function to be executed when recognition is complete;
 *     called as callback(newItem) or callback(false, errorName)
 * @param {Function} [captchaCallback] The function to be executed if a CAPTCHA is encountered
 *     (function will be passed image as URL and must return text of CAPTCHA); currently unused
 */
Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) {
	// Save arguments to object so later recursive calls don't have to provide them.
	if (file) this._file = file;
	if (libraryID) this._libraryID = libraryID;
	if (callback) this._callback = callback;

	Zotero.debug("Trying to recognize " + this._file.path);

	if (this._mode & this._citeseeing) {
		Zotero.debug("Checking citeseeing.com for PDF details.");
		this._mode -= this._citeseeing;
		this._queryCiteSeeing(this._file, this._libraryID);
		return;
	}

	this._getLines(this._file);
	if (!this._lines || !this._lineLengths) {
		// _getLines could not read the PDF and has already reported
		// "recognizePDF.couldNotRead"; there is no text to work with.
		return;
	}

	if (this._mode & this._crossref) {
		Zotero.debug("Checking for DOI then searching CrossRef for PDF details.");
		this._mode -= this._crossref;
		var allText = this._lines.join("\n");
		Zotero.debug(allText);
		var m = Zotero.Utilities.cleanDOI(allText);
		if(m) {
			this._queryCrossRef(m[0]);
		}
		else {
			this.recognize();
		}
		return;
	}

	if (this._mode & this._googlescholar) {
		Zotero.debug("Searching GoogleScholar for PDF details.");
		this._mode -= this._googlescholar;

		var lineLengthsLength = this._lineLengths.length;
		if(lineLengthsLength < 20) {
			this._callback(false, "recognizePDF.noOCR");
			return;
		}

		// get (not quite) median length. Sort a COPY, numerically: sort()
		// works in place and compares as strings by default, which would both
		// give a wrong median and break the pairing of _lineLengths[i] with
		// _lines[i] used below.
		var sortedLengths = this._lineLengths.slice().sort(function(a, b) { return a - b; });
		var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];

		// pick lines within 4 chars of the median (this is completely arbitrary)
		this._goodLines = [];
		var uBound = medianLength + 4;
		var lBound = medianLength - 4;
		for (var i=0; i<lineLengthsLength; i++) {
			if(this._lineLengths[i] > lBound && this._lineLengths[i] < uBound) {
				// Strip ALL quotation marks so they don't mess up search query quoting
				var line = this._lines[i].replace(/"/g, '');
				this._goodLines.push(line);
			}
		}

		this._startLine = this._iteration = 0;
		this._queryGoogle();
		return;
	}

	// every strategy has been tried
	this._callback(false, "recognizePDF.noMatches");
}
/**
 * Extracts the text of the first pages of the PDF with the PDF converter and
 * stores its multi-word lines in this._lines, with their lengths in
 * this._lineLengths (same indices). The result is cached: later calls return
 * immediately. If the converter produces no output, the callback is invoked
 * with "recognizePDF.couldNotRead" and both properties stay unset.
 * @param {nsIFile} file The PDF file to read
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._getLines = function(file) {
	if (this._lines && this._lineLengths) {
		return;
	}

	Zotero.debug("Running pdf2text and saving lines in first three pages.");

	const MAX_PAGES = 3;
	// leading whitespace is skipped; the line must contain at least two
	// space-separated words
	const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;

	var cacheFile = Zotero.getZoteroDirectory();
	cacheFile.append("recognizePDFcache.txt");
	if(cacheFile.exists()) {
		cacheFile.remove(false);
	}

	Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
		+ '-l ' + MAX_PAGES + ' "' + file.path + '" "'
		+ cacheFile.path + '"');

	var proc = Components.classes["@mozilla.org/process/util;1"].
		createInstance(Components.interfaces.nsIProcess);
	var exec = Zotero.getZoteroDirectory();
	exec.append(Zotero.Fulltext.pdfConverterFileName);

	proc.init(exec);
	var args = ['-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', MAX_PAGES];
	args.push(file.path, cacheFile.path);
	// blocking run: the output file is complete when this returns
	proc.run(true, args, args.length);

	if(!cacheFile.exists()) {
		this._callback(false, "recognizePDF.couldNotRead");
		return;
	}

	var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
		.createInstance(Components.interfaces.nsIFileInputStream);
	// 0x01 = read-only; permissions 0664, written via parseInt because legacy
	// octal literals are rejected in strict-mode code
	inputStream.init(cacheFile, 0x01, parseInt("0664", 8), 0);
	var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
		.createInstance(Components.interfaces.nsIConverterInputStream);
	intlStream.init(inputStream, "UTF-8", 65535,
		Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
	intlStream.QueryInterface(Components.interfaces.nsIUnicharLineInputStream);

	// get the lines in this sample
	var lines = [];
	var lineLengths = [];
	try {
		var str = {};
		var more;
		do {
			// readLine() returns false on the LAST line while still filling
			// in str.value, so the line must be processed before the check
			more = intlStream.readLine(str);
			var line = lineRe.exec(str.value);
			if(line) {
				lines.push(line[1]);
				lineLengths.push(line[1].length);
			}
		} while(more);
	}
	finally {
		// release the stream and the temporary file even if reading fails
		inputStream.close();
		cacheFile.remove(false);
	}

	this._lines = lines;
	this._lineLengths = lineLengths;
}
/**
 * Tries to fetch an nsIFile pointer to the system's java binary by looking
 * in each directory listed in the PATH environment variable.
 * Returns null if java cannot be found.
 * @returns {nsIFile|null}
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._getJava = function() {
	Zotero.debug("Getting java");
	var env = Components.classes["@mozilla.org/process/environment;1"].
		getService(Components.interfaces.nsIEnvironment);
	if (!env.exists("PATH")) {
		return null;
	}

	// PATH separator and executable name both differ on Windows
	var separator = Zotero.isWin ? ";" : ":";
	var binaryName = Zotero.isWin ? "java.exe" : "java";

	var path = env.get("PATH").split(separator);
	for (var i = 0; i < path.length; i++) {
		if (!path[i]) continue; // empty entry, e.g. from a trailing separator
		try {
			var file = Components.classes["@mozilla.org/file/local;1"].
				createInstance(Components.interfaces.nsILocalFile);
			file.initWithPath(path[i]);
			file.append(binaryName);
			if (file.exists()) return file;
		}
		catch (e) {
			// initWithPath rejects relative or malformed entries; one bad
			// PATH entry should not abort the whole search
			Zotero.debug("Skipping unusable PATH entry '" + path[i] + "': " + e);
		}
	}
	return null;
}
/**
 * Queries citeseeing.com for details on the given pdf by running
 * citeline.jar (expected in the Zotero directory) with the system's java.
 * If details are found they are written to a file, imported using the
 * CiteLine translator, and the file is then deleted.
 *
 * The outcome is reported through this._callback (on success) or by calling
 * this.recognize() again so that the next strategy is tried; the return
 * value carries no information.
 *
 * @param {nsIFile} pdf The PDF file to look up
 * @param {Integer} libraryID Library the new item is saved to
 * @returns {null|undefined}
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryCiteSeeing = function(pdf, libraryID) {
	var java = this._getJava();
	if (null == java) {
		// Without java this strategy cannot run. Fall through to the next
		// strategy like every other failure path below does; returning
		// silently would leave the callback uncalled forever.
		Zotero.debug("No java");
		this.recognize();
		return null;
	}
	Zotero.debug(java.path);

	var detailFile = Zotero.getZoteroDirectory();
	detailFile.append("pdfDetails.citeline");
	// remove leftovers from an earlier run so that existence means success
	if(detailFile.exists()) {
		detailFile.remove(false);
	}

	var jarFile = Zotero.getZoteroDirectory();
	jarFile.append("citeline.jar");
	if (!jarFile.exists()) {
		Zotero.debug("No citeline.jar");
		this.recognize();
		return null;
	}

	Zotero.debug("Running java -jar " + jarFile.path + " " + pdf.path + " " + detailFile.path);
	var proc = Components.classes["@mozilla.org/process/util;1"].
		createInstance(Components.interfaces.nsIProcess);
	proc.init(java);
	var args = ["-jar", jarFile.path, pdf.path, detailFile.path];
	// blocking run: the detail file is complete (or absent) when this returns
	proc.run(true, args, args.length);
	Zotero.debug("Run");

	if (!detailFile.exists()) {
		Zotero.debug("No details");
		this.recognize();
		return null;
	}
	Zotero.debug("Got details");

	// read the whole detail file as UTF-8
	var data = "";
	var fstream = Components.classes["@mozilla.org/network/file-input-stream;1"].
		createInstance(Components.interfaces.nsIFileInputStream);
	var cstream = Components.classes["@mozilla.org/intl/converter-input-stream;1"].
		createInstance(Components.interfaces.nsIConverterInputStream);
	fstream.init(detailFile, -1, 0, 0);
	cstream.init(fstream, "UTF-8", 0, 0);
	var str = {};
	var read = 0;
	do {
		read = cstream.readString(0xffffffff, str);
		data += str.value;
	} while (read != 0);
	cstream.close();
	Zotero.debug(data);

	var me = this;
	var translate = new Zotero.Translate.Import;
	// the CiteLine import translator
	translate.setTranslator("17AF2C40-AAA5-4837-AB16-FDC54298AB42");
	var location = {"path":detailFile.path};
	translate.setLocation(location);
	translate.setString(data);
	translate.setHandler("itemDone", function(translate, item) {
		Zotero.debug("Item done");
		me._callback(item);
	});
	translate.setHandler("select", function(translate, items, callback) {
		return me._selectItems(translate, items, callback);
	});
	translate.setHandler("done", function(translate, success) {
		// import failed: try the next strategy
		if(!success) me.recognize();
	});
	translate.translate(this._libraryID, false);
	detailFile.remove(false);
}
/**
 * Looks the given DOI up with a search translator. A found item is passed to
 * this._callback; if the search fails, recognize() is called again so the
 * next strategy is tried.
 * @param {String} doi The DOI found in the PDF text
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryCrossRef = function(doi) {
	// use CrossRef to look for DOI
	var self = this;
	var search = new Zotero.Translate("search");
	search.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
	search.setSearch({"itemType":"journalArticle", "DOI":doi});

	search.setHandler("itemDone", function(translate, item) {
		self._callback(item);
	});
	search.setHandler("select", function(translate, items, callback) {
		return self._selectItems(translate, items, callback);
	});
	search.setHandler("done", function(translate, success) {
		Zotero.debug("Finished search");
		if(!success) self.recognize();
	});

	Zotero.debug("Searching crossref for " + doi);
	search.translate(this._libraryID, false);
};
/**
 * Queries Google Scholar for metadata for this PDF.
 *
 * Each call builds a query from the next run of this._goodLines (about 25
 * words) and loads the result page in a hidden browser; _scrape takes over
 * once the page is shown. A failed translation calls this function again
 * with the following lines. After more than three attempts, or when the
 * lines run out, the hidden browser is removed and recognize() is called to
 * move on.
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
	var me = this;

	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
		try {
			// best-effort cleanup; failing to remove the browser must not
			// prevent us from reporting the result
			if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(this._hiddenBrowser);
		} catch(e) {
			Zotero.debug("RecognizePDF: could not delete hidden browser: " + e);
		}
		this.recognize();
		return;
	}
	this._iteration++;

	var queryString = "";
	// take the relevant parts of some lines (exclude hyphenated word)
	var queryStringWords = 0;
	while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
		var words = this._goodLines[this._startLine].split(/\s+/);
		// get rid of first and last words
		words.shift();
		words.pop();
		// make sure there are no long words (probably OCR mistakes)
		var skipLine = false;
		for(var i=0; i<words.length; i++) {
			if(words[i].length > 20) {
				skipLine = true;
				break;
			}
		}
		// add words to query, each line as one quoted phrase
		if(!skipLine && words.length) {
			queryStringWords += words.length;
			queryString += '"'+words.join(" ")+'" ';
		}
		this._startLine++;
	}
	Zotero.debug("RecognizePDF: Query string "+queryString);

	// pass query string to Google Scholar and translate
	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
	if(!this._hiddenBrowser) {
		this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
		this._hiddenBrowser.docShell.allowImages = false;
	}

	var translate = new Zotero.Translate("web");
	translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
	translate.setHandler("itemDone", function(translate, item) {
		Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
		me._callback(item);
	});
	translate.setHandler("select", function(translate, items, callback) {
		me._selectItems(translate, items, callback);
	});
	translate.setHandler("done", function(translate, success) {
		// nothing usable on this result page: try the next lines
		if(!success) me._queryGoogle();
	});

	// _scrape removes this listener again (through its caller reference)
	this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);

	this._hiddenBrowser.loadURIWithFlags(url,
		Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
}
/**
 * To be executed when Google Scholar is loaded: ignores the initial blank
 * page, reports "recognizePDF.limit" when the page is a 403, and otherwise
 * hands the loaded document to the translator
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
	var browser = this._hiddenBrowser;
	var page = browser.contentDocument;

	if(page.location.href == "about:blank") return;

	if(page.title == "403 Forbidden") {
		// hit the captcha
		/*
		var forms = this._hiddenBrowser.contentDocument.getElementsByTagName("form");
		if(forms.length && forms[0].getAttribute("action") == "Captcha") {
			var captchaImage = forms[0].getElementsByTagName("img");
			var captchaBox = this._hiddenBrowser.contentDocument.getElementsByName("captcha");
			if(captchaImage.length && captchaBox.length && this._captchaCallback) {
				var text = this._captchaCallback(captchaImage[0].src);
				if(text) {
					captchaBox[0].value = text;
					forms[0].submit();
					return;
				}
			}
		}*/
		this._callback(false, "recognizePDF.limit");
		return;
	}

	// The listener to remove is the anonymous function that called us.
	// NOTE(review): Function.caller is non-standard and unavailable in
	// strict-mode code - confirm this file is never loaded as strict.
	browser.removeEventListener("pageshow", this._scrape.caller, true);
	translate.setDocument(browser.contentDocument);
	translate.translate(this._libraryID, false);
};
/**
 * "select" handler that picks the first item in the list offered by the
 * translator and passes that single entry to the callback. If the list is
 * empty the callback is not invoked.
 * @param {Zotero.Translate} translate The translate instance (unused)
 * @param {Object} items The selectable items, keyed by identifier
 * @param {Function} callback Receives an object holding only the first entry
 * @private
 */
Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Translate*/ translate,
		/**Object*/ items, /**Function**/ callback) {
	for(var i in items) {
		var obj = {};
		// keep the entry's own value (previously the whole items object was
		// stored under the key)
		obj[i] = items[i];
		callback(obj);
		return;
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment