Skip to content

Instantly share code, notes, and snippets.

@avram
Forked from anonymous/New Zealand Herald.js
Created July 29, 2010 21:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avram/499358 to your computer and use it in GitHub Desktop.
Save avram/499358 to your computer and use it in GitHub Desktop.
{"translatorID":"207f4aad-b604-43ef-a7f5-3e6229aade9f",
"label":"New Zealand Herald",
"creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
"target":"www.nzherald.co.nz",
"minVersion":"1.0",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2010-07-30 09:26:09"}
/**
 * Decide what kind of New Zealand Herald page is being viewed.
 *
 * @param {Document} doc - The loaded page; its title and location.href are inspected.
 * @param {string} url - The page address (not consulted; doc.location.href is used instead).
 * @returns {string|undefined} "multiple" when the page title contains
 *     "Search Result", "newspaperArticle" when the address contains "/news",
 *     otherwise undefined.
 */
function detectWeb(doc, url) {
	// The title check comes first, so a search page whose address also
	// contains "/news" is still reported as "multiple".
	if (doc.title.indexOf("Search Result") !== -1) {
		return "multiple";
	}
	/* If the address bar has /news in it then it's a newspaper article */
	if (doc.location.href.indexOf("/news") !== -1) {
		return "newspaperArticle";
	}
}
/**
 * Copy one scraped value onto a Zotero item.
 *
 * Nothing is written when items[field] is missing or otherwise falsy.
 *
 * @param {Object} newItem - Object that receives the value.
 * @param {Object} items - Lookup of scraped values keyed by field label.
 * @param {string} field - Key to read from items.
 * @param {string} zoteroField - Property name to set on newItem.
 */
function associateData (newItem, items, field, zoteroField) {
	var value = items[field];
	if (!value) {
		return;
	}
	newItem[zoteroField] = value;
}
/**
 * Scrape one New Zealand Herald article page into a Zotero
 * "newspaperArticle" item and complete it.
 *
 * The page is fetched again with doGet so that the raw HTML text is
 * available for the byline check and the meta-description abstract; all
 * other fields are read from doc with XPath. Any field whose node is
 * missing is skipped (the title falls back to "No Title Found") instead of
 * aborting the whole item.
 *
 * @param {Document} doc - The loaded article page.
 * @param {string} url - The page address (not consulted; doc.location.href is used instead).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Text content of the first node matching xpath, or null when nothing matches.
	var firstText = function (xpath) {
		var node = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : null;
	};

	// Drop a leading "h:mm AM|PM Weekday " stamp so only the calendar date remains.
	var stripTime = function (value) {
		return value.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
	};

	var newItem = new Zotero.Item('newspaperArticle');
	newItem.url = doc.location.href;

	Zotero.Utilities.HTTP.doGet(newItem.url, function (text) {
		newItem.publicationTitle = "New Zealand Herald";
		newItem.language = "English";

		// Get title of the news via xpath
		var title = firstText('//h1');
		newItem.title = title !== null ? title : "No Title Found";

		/*
		Get authors of the article.
		Remove "By ", then turn " and " into ", " and split on ", " so that an
		article with more than one author yields one creator per name.
		A single author has any trailing " of <organisation>" removed.
		*/
		var credits = null;
		if (/<span class=\"credits\">(.*)/.test(text)) {
			credits = firstText('//span[@class="credits"]');
		}
		if (credits !== null) {
			var byline = credits.replace(/\bBy\W+/g, '');
			var names;
			if (/\W\band\W+/.test(byline)) {
				names = byline.replace(/\W\band\W+/g, ', ').split(", ");
			} else if (/\W\bof\W+/.test(byline)) {
				names = [byline.replace(/\W\bof\W(.*)/g, '')];
			} else {
				names = [byline];
			}
			for (var i = 0; i < names.length; i++) {
				newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
			}
		} else {
			newItem.creators.push(Zotero.Utilities.cleanAuthor("Unknown ", "author"));
		}

		// Date: the first span may hold only an "Updated"/"New" label, in
		// which case the date itself is in the second span.
		var dateText = firstText('//div[@class="tools"]/span');
		if (dateText !== null) {
			dateText = stripTime(dateText);
			if (dateText == "Updated" || dateText == "New") {
				var secondSpan = firstText('//div[@class="tools"]/span[2]');
				dateText = secondSpan !== null ? stripTime(secondSpan) : null;
			}
		}
		if (dateText !== null) {
			newItem.date = dateText;
		}

		// Get section of the news
		var section = firstText('//div[@class="sectionHeader"]/span/a[1]');
		if (section !== null) {
			newItem.section = section;
		}

		// Grab the abstract from the meta description in the raw HTML. The
		// capture stops at the closing quote as well as at the first "&", so
		// it can no longer run on into the markup that follows the attribute.
		var description = /meta name=\"description\" content=\"([^"&]*)/.exec(text);
		if (description) {
			newItem.abstractNote = description[1];
		}

		newItem.complete();
		// NOTE(review): done() is signalled once per scraped article here, and
		// doWeb's processDocuments callback signals it as well — confirm this
		// is intended when several articles are selected.
		Zotero.done();
	}, function () {});
}
/**
 * Translator entry point: collect the article addresses to import and hand
 * each loaded article document to scrape.
 *
 * On a search-result page every link matching //p[@class="g"]/a is offered
 * through Zotero.selectItems and the chosen addresses are imported; on an
 * article page only url is imported.
 *
 * @param {Document} doc - The page the translator was invoked on.
 * @param {string} url - Address of that page.
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var articles = [];
	var pageType = detectWeb(doc, url);

	if (pageType == "multiple") {
		var items = {};
		var titles = doc.evaluate('//p[@class="g"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
		var nextTitle;
		while ((nextTitle = titles.iterateNext())) {
			items[nextTitle.href] = nextTitle.textContent;
		}
		items = Zotero.selectItems(items);
		// The keys of the returned selection are the article addresses.
		for (var i in items) {
			articles.push(i);
		}
	} else if (pageType == "newspaperArticle") {
		articles = [url];
	}

	Zotero.debug(articles);
	// scrape needs a loaded document, not an address, so it is only ever
	// invoked through processDocuments.
	Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	Zotero.wait();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment