Skip to content

Instantly share code, notes, and snippets.

@emmareisz
Last active January 26, 2021 21:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emmareisz/a5f3e371d4fee523a3e147c9e452dde8 to your computer and use it in GitHub Desktop.
Save emmareisz/a5f3e371d4fee523a3e147c9e452dde8 to your computer and use it in GitHub Desktop.
Zotero translator for British Newspaper Archive
{
"translatorID": "a304870e-c4f3-45e3-ab75-e7afef13dff0",
"label": "British Newspaper Archive",
"creator": "Emma Reisz",
"target": "^https?://www\\.britishnewspaperarchive\\.co\\.uk/(search/results|viewer)",
"minVersion": "3.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2017-11-22 17:49:30"
}
/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2017 Emma Reisz
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/
/**
 * Tell Zotero what kind of page is being viewed.
 *
 * @param {Document} doc - The loaded page.
 * @param {string} url - The page URL.
 * @returns {string|undefined} "multiple" for a search-results page that does
 *   not contain the heading matched by the XPath below, "newspaperArticle"
 *   for a viewer page, otherwise undefined.
 */
function detectWeb(doc, url) {
	if (url.includes("/search/results")) {
		// The listing is only offered when this heading is absent.
		const headingPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/div[3]/h1';
		const heading = ZU.xpath(doc, headingPath)[0];
		return heading === undefined ? "multiple" : undefined;
	}
	if (url.includes("/viewer")) {
		return "newspaperArticle";
	}
	return undefined;
}
/**
 * Translator entry point: dispatch to the scraper that matches the page type
 * reported by detectWeb.
 *
 * @param {Document} doc - The loaded page.
 * @param {string} url - The page URL.
 */
function doWeb(doc, url) {
	const isSearchListing = detectWeb(doc, url) === 'multiple';
	if (!isSearchListing) {
		scrapeImage(doc, url);
		return;
	}
	// Non-subscribers have access to search only, so we scrape from the search results view.
	scrapeSearch(doc, url);
}
/**
 * Read one "Label: value" detail from a search-result footer.
 *
 * @param {Document} doc - The search results page.
 * @param {string} path - XPath of the span holding the detail.
 * @returns {string|undefined} The text after the first ": ", or undefined
 *   when the span is missing or has no value part.
 */
function searchDetailValue(doc, path) {
	const raw = ZU.xpathText(doc, path);
	if (typeof raw !== "string") {
		// No such span on this result; do not hand a non-string to trimInternal.
		return undefined;
	}
	// The capture group makes split() return [label, value, rest].
	return ZU.trimInternal(raw).split(/: (.+)/)[1];
}

/**
 * Offer the search results for selection and save each chosen result as a
 * newspaperArticle item built from the listing itself.
 *
 * Title and URL come from the result's heading link; date, publication
 * title, place, pages and tags come from numbered spans in the footer of the
 * article at the same position in the listing. Details that are absent from
 * the page are left unset rather than aborting the whole save, and tags are
 * always left as an array (empty when the page says "none" or has no tags).
 *
 * @param {Document} doc - The search results page.
 * @param {string} url - The page URL (unused; kept for a uniform signature).
 */
function scrapeSearch(doc, url) {
	const titlesPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/article/div[3]/header/h4/a';
	const titles = ZU.xpath(doc, titlesPath);
	const results = {};
	for (let i = 0; i < titles.length; i++) {
		// Fall back to textContent for DOMs that do not provide innerText.
		const rawTitle = titles[i].innerText || titles[i].textContent || "";
		results[i] = {
			// Converts to title case.
			title: ZU.capitalizeTitle(rawTitle.toLowerCase(), true),
			url: titles[i].href
		};
	}
	Z.selectItems(results, function (selected) {
		if (!selected) return true;
		// Footer span position -> item field.
		const detailFields = {
			1: "date",
			2: "publicationTitle",
			3: "place",
			6: "pages",
			7: "tags"
		};
		const positions = Object.keys(detailFields);
		const keys = Object.keys(selected);
		for (let s = 0; s < keys.length; s++) {
			const key = keys[s];
			const result = results[key];
			if (!result) continue;
			const item = new Z.Item("newspaperArticle");
			item.title = result.title;
			item.url = result.url;
			// XPath positions are 1-based; result keys are 0-based.
			const articleIndex = Number(key) + 1;
			const detailPath = '//*[@id="ajaxcontainer"]/div[3]/div/div[2]/article[' + articleIndex + ']/div[3]/footer/div/small';
			const details = {};
			for (let p = 0; p < positions.length; p++) {
				const position = positions[p];
				details[detailFields[position]] = searchDetailValue(doc, detailPath + '/span[' + position + ']');
			}
			["date", "publicationTitle", "place"].forEach(function (field) {
				if (details[field] !== undefined) item[field] = details[field];
			});
			if (details.pages !== undefined) {
				// Defensive: strips a leftover "Page(s):" label if one survived extraction.
				item.pages = details.pages.replace(/^Pages?: ?/, '');
			}
			if (details.tags === undefined || details.tags === "none") {
				item.tags = [];
			} else {
				item.tags = details.tags.split(", ");
			}
			item.complete();
		}
	});
}
/**
 * Save the article shown on a viewer page as a newspaperArticle item.
 *
 * Publication title, date and rights are read from the elements with ids
 * "newspaperTitle", "newspaperDate" and "newspaperCopy"; the item title is
 * the part of the document title before the first "|", trimmed and with one
 * trailing ".", "," or ";" removed.
 *
 * @param {Document} doc - The viewer page.
 * @param {string} url - The page URL, stored on the item.
 */
function scrapeImage(doc, url) {
	const article = new Z.Item("newspaperArticle");
	// Item field -> id of the element that holds its text.
	const elementIds = {
		publicationTitle: "newspaperTitle",
		date: "newspaperDate",
		rights: "newspaperCopy"
	};
	Object.keys(elementIds).forEach(function (field) {
		const idPath = "//*[@id=\"" + elementIds[field] + "\"]";
		article[field] = ZU.xpathText(doc, idPath);
	});
	const headline = doc.title.split("|")[0].trim();
	article.title = headline.replace(/[.,;]?$/, '');
	article.url = url;
	article.complete();
}
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "https://www.britishnewspaperarchive.co.uk/viewer/bl/0001578/18901206/021/0007",
"items": [
{
"itemType": "newspaperArticle",
"title": "Testimonial To Sir Robert Hart, G.c..m.g., Inspector-General Op Customs In China",
"creators": [],
"libraryCatalog": "British Newspaper Archive",
"publicationTitle": "Loading publication",
"url": "https://www.britishnewspaperarchive.co.uk/viewer/bl/0001578/18901206/021/0007",
"attachments": [],
"tags": [],
"notes": [],
"seeAlso": []
}
]
},
{
"type": "web",
"url": "https://www.britishnewspaperarchive.co.uk/search/results/1700-01-01/1749-12-31?basicsearch=robert%20hart%2020%20november%201890%20plate&exactsearch=true&retrievecountrycounts=false",
"items": "multiple"
}
]
/** END TEST CASES **/
@emmareisz
Copy link
Author

emmareisz commented Nov 22, 2017

I think I have fixed the three code niggles you mentioned above.

The /Pages?:/ regex is redundant, however.

@medouuii
Copy link

How can I use this file?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment