-
-
Save emmareisz/a5f3e371d4fee523a3e147c9e452dde8 to your computer and use it in GitHub Desktop.
{ | |
"translatorID": "a304870e-c4f3-45e3-ab75-e7afef13dff0", | |
"label": "British Newspaper Archive", | |
"creator": "Emma Reisz", | |
"target": "^https?://www\\.britishnewspaperarchive\\.co\\.uk/(search/results|viewer)", | |
"minVersion": "3.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": true, | |
"translatorType": 4, | |
"browserSupport": "gcsibv", | |
"lastUpdated": "2017-11-22 17:49:30" | |
} | |
/*
	***** BEGIN LICENSE BLOCK *****

	Copyright © 2017 Emma Reisz

	This file is part of Zotero.

	Zotero is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	Zotero is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with Zotero. If not, see <http://www.gnu.org/licenses/>.

	***** END LICENSE BLOCK *****
*/
/**
 * Determine the Zotero item type for the current page.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL
 * @returns {String|undefined} "multiple" on a search-results page with hits,
 *   "newspaperArticle" on a viewer page, otherwise undefined
 */
function detectWeb(doc, url) {
	if (url.includes("/search/results")) {
		// This <h1> only appears on the "no results found" page, so its
		// absence means the search returned at least one hit.
		var noResultsPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/div[3]/h1';
		var noResultsHeading = ZU.xpath(doc, noResultsPath)[0];
		if (noResultsHeading === undefined) {
			return "multiple";
		}
	}
	else if (url.includes("/viewer")) {
		return "newspaperArticle";
	}
}
/**
 * Entry point: dispatch to the appropriate scraper for this page.
 *
 * Non-subscribers only have access to search, so multiples are scraped
 * from the (free) search-results view rather than the article pages.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL
 */
function doWeb(doc, url) {
	var mode = detectWeb(doc, url);
	if (mode === 'multiple') {
		scrapeSearch(doc, url);
	}
	else {
		scrapeImage(doc, url);
	}
}
/**
 * Scrape one or more items from the search-results view.
 *
 * Metadata is taken from the result list itself because non-subscribers
 * (and pay-as-you-go users) cannot open the paywalled article pages.
 *
 * @param {Document} doc - the search-results page
 * @param {String} url - the page URL (unused; kept for translator convention)
 */
function scrapeSearch(doc, url) {
	var titlesPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/article/div[3]/header/h4/a';
	var titles = ZU.xpath(doc, titlesPath);
	// DOM property of the result link to read for each Zotero field
	var mapping = {
		title: "innerText",
		url: "href"
	};
	var results = {};
	for (var i = 0; i < titles.length; i++) {
		var result = {};
		for (var j in mapping) {
			if (mapping.hasOwnProperty(j)) {
				result[j] = titles[i][mapping[j]];
			}
		}
		// Result titles are displayed in all caps; convert to title case
		result.title = ZU.capitalizeTitle(result.title.toLowerCase(), true);
		results[i] = result;
	}
	Z.selectItems(results, function (selected) {
		if (!selected) return true;
		// 1-based <span> index within each result's footer -> Zotero field
		var scrape = {
			1: "date",
			2: "publicationTitle",
			3: "place",
			6: "pages",
			7: "tags"
		};
		for (var i in selected) {
			if (selected.hasOwnProperty(i)) {
				var item = new Z.Item("newspaperArticle");
				for (var j in results[i]) {
					if (results[i].hasOwnProperty(j)) {
						item[j] = results[i][j];
					}
				}
				// XPath article[] positions are 1-based
				var index = Number(i) + 1;
				var detailPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/article[' + index + ']/div[3]/footer/div/small';
				for (var k in scrape) {
					if (scrape.hasOwnProperty(k)) {
						var scrapePath = detailPath + '/span[' + k + ']';
						var text = ZU.xpathText(doc, scrapePath);
						if (text) {
							// Each span reads "Label: value"; keep only the value.
							// (The label prefix is removed here, so no further
							// "Page:" stripping is needed.)
							item[scrape[k]] = ZU.trimInternal(text).split(/: (.+)/)[1];
						}
					}
				}
				if (item.tags) {
					item.tags = item.tags.split(", ");
					// BNA shows "Tags: none" for untagged articles
					if (item.tags.length === 1 && item.tags[0] === "none") {
						item.tags = undefined; // quicker than delete
					}
				}
				item.complete();
			}
		}
	});
}
/**
 * Scrape a single article from the image-viewer page.
 *
 * @param {Document} doc - the viewer page
 * @param {String} url - the page URL, saved on the item
 */
function scrapeImage(doc, url) {
	var item = new Z.Item("newspaperArticle");
	// Zotero field -> id of the viewer element that holds its value
	var fieldIds = {
		publicationTitle: "newspaperTitle",
		date: "newspaperDate",
		rights: "newspaperCopy"
	};
	for (var field in fieldIds) {
		if (fieldIds.hasOwnProperty(field)) {
			item[field] = ZU.xpathText(doc, '//*[@id="' + fieldIds[field] + '"]');
		}
	}
	// Page title reads "<article title> | <site name>"; keep the article
	// title and drop any trailing punctuation.
	item.title = doc.title.split("|")[0].trim().replace(/[.,;]?$/, '');
	item.url = url;
	item.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
	{
		"type": "web",
		"url": "https://www.britishnewspaperarchive.co.uk/viewer/bl/0001578/18901206/021/0007",
		"items": [
			{
				"itemType": "newspaperArticle",
				"title": "Testimonial To Sir Robert Hart, G.c..m.g., Inspector-General Op Customs In China",
				"creators": [],
				"libraryCatalog": "British Newspaper Archive",
				"publicationTitle": "Loading publication",
				"url": "https://www.britishnewspaperarchive.co.uk/viewer/bl/0001578/18901206/021/0007",
				"attachments": [],
				"tags": [],
				"notes": [],
				"seeAlso": []
			}
		]
	},
	{
		"type": "web",
		"url": "https://www.britishnewspaperarchive.co.uk/search/results/1700-01-01/1749-12-31?basicsearch=robert%20hart%2020%20november%201890%20plate&exactsearch=true&retrievecountrycounts=false",
		"items": "multiple"
	}
];
/** END TEST CASES **/
@adam3smith We could but I'm not sure we should.
As I mentioned in our original forum discussion, one of BNA's payment options is 'Pay as you go' bundles of 40 pages, so users on that tariff would not I think want Zotero to use up their page credits when scraping multiples from the (free) search page. So we should I think be talking not about identifying users with access, but whether we want to identify users with unlimited paywall access.
I think it would be possible for Zotero to differentiate between types of BNA subscription based on the 'pages remaining' notification, though there are a couple of issues here. I wonder if it might be alarming to users for us to differentiate in that way; it would also require the coder to have one of each type of subscription in order to test it.
Before considering that, we should also consider the usefulness to Zotero of the paywalled data. The scan is the truffle behind the paywall(*), and can easily be attached (my personal copy of the translator attaches it). But I omitted the pdf attachment from the translator draft above because the scan pdfs are typically 1MB in size, and in relation to CONTENTdm the general view was that Zotero should not automatically download large scan files. I thought we might similarly feel that BNA scans should not be automatically downloaded.
In summary, if we want to attach the scan, then it might be worth identifying subscribers on an unlimited tariff so we can route their multiples through the single item scraper [ETA: actually we wouldn't route multiples to the single item scraper because I have noticed that the metadata is very slow to load on the pages behind the paywall; instead we would scrape from the search page as currently, and additionally download the scan]. But if we don't want to attach the scan, then I don't think there is much benefit to identifying unlimited-tariff subscribers.
(*) Pages behind the paywall also offer machine-generated OCR, but this does not (currently) differentiate effectively between columns, meaning the OCR is often from an unrelated article on the same page, or runs straight across multiple columns, concatenating a few words from each article. I considered adding the OCR as a Note, but did not since it is too often gibberish and would, I think, often be perceived as cruft.
I think I have fixed the three code niggles you mentioned above.
The /Pages?:/ regex is redundant, however.
how can i use this file
@emmareisz -- I have some code nits (e.g., escape target regex, use includes instead of match for literal string matches, handle /Pages?/ replace with a regex), but the larger question I have is whether we can differentiate between people with and without access when accessing multiples -- that would seem to be a useful thing to do, so subscribers get the full benefit of their subscription when importing multiples.