avram/New Zealand Herald.js

## gistfile2.txt

      
    Raw
  

              gistfile2.txt
            
          
## New Zealand Herald.js
{"translatorID":"207f4aad-b604-43ef-a7f5-3e6229aade9f",
"label":"New Zealand Herald",
"creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
"target":"www.nzherald.co.nz",
"minVersion":"1.0",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2010-07-30 09:26:09"}


/**
 * Tell Zotero what kind of page is being viewed.
 *
 * @param {Document} doc Page being inspected; only doc.title and
 *     doc.location.href are read.
 * @param {String} url Address of the page (not used by the checks below).
 * @return {String|undefined} "multiple" for a search result page,
 *     "newspaperArticle" when the address contains "/news", otherwise
 *     undefined.
 */
function detectWeb(doc, url) {
	// A search result page lists several articles to choose from.
	if (doc.title.indexOf("Search Result") !== -1) {
		return "multiple";
	}

	// If the address bar has /news in it then it is a newspaper article.
	if (doc.location.href.indexOf("/news") !== -1) {
		return "newspaperArticle";
	}
}

/**
 * Copy one scraped value onto a Zotero item.
 *
 * Nothing is written when items[field] is missing or falsy.
 *
 * @param {Object} newItem Item that receives the value.
 * @param {Object} items Scraped values keyed by their label.
 * @param {String} field Key to look up in items.
 * @param {String} zoteroField Property to set on newItem.
 */
function associateData (newItem, items, field, zoteroField) {
	var scraped = items[field];
	if (!scraped) {
		return;
	}
	newItem[zoteroField] = scraped;
}


/**
 * Build and save a Zotero newspaperArticle item for one article page.
 *
 * Title, authors, date and section are read from the loaded document with
 * XPath. The abstract is read from the description meta tag in the page
 * source, which is requested again through Zotero.Utilities.HTTP.doGet; the
 * item is completed inside that callback. Page elements that cannot be found
 * are skipped so that the item is still saved with whatever was available.
 *
 * @param {Document} doc Loaded article page.
 * @param {String} url Address of the page (unused; doc.location.href is read
 *     instead).
 */
function scrape(doc, url){
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Text of the first node matching the XPath, or null when nothing matches.
	function firstText(xpath) {
		var node = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : null;
	}

	// Remove a time stamp such as "4:00 AM Friday " from a date label.
	function stripTime(label) {
		return label.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
	}

	var newItem = new Zotero.Item('newspaperArticle');
	newItem.url = doc.location.href;

	Zotero.Utilities.HTTP.doGet(newItem.url, function(text) {
		newItem.publicationTitle = "New Zealand Herald";
		newItem.language = "English";

		// Title of the article; keep the placeholder when the page has no <h1>.
		var headline = firstText('//h1');
		newItem.title = headline !== null ? headline : "No Title Found";

		/*
			Authors: remove the leading "By ", and when the credit line names
			several people joined with "and", turn it into a comma separated
			list and split it. A trailing " of <affiliation>" is dropped from
			every name.
		*/
		var names = [];
		var credits = null;
		if (/<span class=\"credits\">(.*)/.test(text)) {
			credits = firstText('//span[@class="credits"]');
		}
		if (credits !== null) {
			credits = credits.replace(/\bBy\W+/g, '');
			var parts;
			if (/\W\band\W+/.test(credits)) {
				parts = credits.replace(/\W\band\W+/g, ', ').split(", ");
			} else {
				// A lone credit is kept whole so commas inside it do not split it.
				parts = [credits];
			}
			for (var i = 0; i < parts.length; i++) {
				var name = parts[i].replace(/\W\bof\W(.*)/g, '');
				if (/\S/.test(name)) {
					names.push(name);
				}
			}
		}
		if (names.length === 0) {
			// No usable credit line on the page.
			names.push("Unknown ");
		}
		for (var j = 0; j < names.length; j++) {
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[j], "author"));
		}

		// Date: the first span may only hold an "Updated"/"New" flag, in which
		// case the date is in the second span.
		var dateLabel = firstText('//div[@class="tools"]/span');
		if (dateLabel !== null) {
			dateLabel = stripTime(dateLabel);
			if (dateLabel == "Updated" || dateLabel == "New") {
				var secondLabel = firstText('//div[@class="tools"]/span[2]');
				dateLabel = secondLabel !== null ? stripTime(secondLabel) : null;
			}
		}
		if (dateLabel !== null) {
			newItem.date = dateLabel;
		}

		// Section of the news
		var section = firstText('//div[@class="sectionHeader"]/span/a[1]');
		if (section !== null) {
			newItem.section = section;
		}

		// Abstract from the description meta tag; stop at the closing quote
		// (or at the first entity) so the match cannot run on into the page.
		var description = /meta name=\"description\" content=\"([^\"&]*)/.exec(text);
		if (description) {
			newItem.abstractNote = description[1];
		}

		newItem.complete();
		// NOTE(review): signals completion after every item, as before; confirm
		// this is intended when several search results are being saved.
		Zotero.done();
	}, function() {});
}


/**
 * Entry point called to save the current page.
 *
 * On a search result page the links matched by //p[@class="g"]/a are offered
 * through Zotero.selectItems and every chosen address is scraped; on an
 * article page only the page itself is scraped. The addresses are loaded with
 * Zotero.Utilities.processDocuments, which hands each document to scrape.
 *
 * @param {Document} doc Page being saved.
 * @param {String} url Address of that page.
 */
function doWeb(doc, url){
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix){
		if (prefix == 'x') return namespace; else return null;
	} : null;

	var articles = [];
	var pageType = detectWeb(doc, url);

	if (pageType == "multiple"){
		var items = {};
		var nextTitle;
		var titles = doc.evaluate('//p[@class="g"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
		while ((nextTitle = titles.iterateNext())){
			items[nextTitle.href] = nextTitle.textContent;
		}
		items = Zotero.selectItems(items);
		if (!items){
			// Nothing was selected, so there is nothing to load.
			return;
		}
		for (var i in items){
			articles.push(i);
		}
	} else if (pageType == "newspaperArticle"){
		articles = [url];
	}
	Zotero.debug(articles);

	// scrape needs a loaded document, so the addresses are only passed to
	// processDocuments and never to scrape directly.
	Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});

	Zotero.wait();
}
	{"translatorID":"207f4aad-b604-43ef-a7f5-3e6229aade9f",
	"label":"New Zealand Herald",
	"creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
	"target":"www.nzherald.co.nz",
	"minVersion":"1.0",
	"maxVersion":"",
	"priority":100,
	"inRepository":"1",
	"translatorType":4,
	"lastUpdated":"2010-07-30 09:26:09"}



	/**
	 * Tell Zotero what kind of page is being viewed.
	 *
	 * @param {Document} doc Page being inspected; only doc.title and
	 *     doc.location.href are read.
	 * @param {String} url Address of the page (not used by the checks below).
	 * @return {String|undefined} "multiple" for a search result page,
	 *     "newspaperArticle" when the address contains "/news", otherwise
	 *     undefined.
	 */
	function detectWeb(doc, url) {
		// A search result page lists several articles to choose from.
		if (doc.title.indexOf("Search Result") !== -1) {
			return "multiple";
		}

		// If the address bar has /news in it then it is a newspaper article.
		if (doc.location.href.indexOf("/news") !== -1) {
			return "newspaperArticle";
		}
	}

	/**
	 * Copy one scraped value onto a Zotero item.
	 *
	 * Nothing is written when items[field] is missing or falsy.
	 *
	 * @param {Object} newItem Item that receives the value.
	 * @param {Object} items Scraped values keyed by their label.
	 * @param {String} field Key to look up in items.
	 * @param {String} zoteroField Property to set on newItem.
	 */
	function associateData (newItem, items, field, zoteroField) {
		var scraped = items[field];
		if (!scraped) {
			return;
		}
		newItem[zoteroField] = scraped;
	}



	/**
	 * Build and save a Zotero newspaperArticle item for one article page.
	 *
	 * Title, authors, date and section are read from the loaded document with
	 * XPath. The abstract is read from the description meta tag in the page
	 * source, which is requested again through Zotero.Utilities.HTTP.doGet; the
	 * item is completed inside that callback. Page elements that cannot be found
	 * are skipped so that the item is still saved with whatever was available.
	 *
	 * @param {Document} doc Loaded article page.
	 * @param {String} url Address of the page (unused; doc.location.href is read
	 *     instead).
	 */
	function scrape(doc, url){
		var namespace = doc.documentElement.namespaceURI;
		var nsResolver = namespace ? function(prefix) {
			if (prefix == 'x') return namespace; else return null;
		} : null;

		// Text of the first node matching the XPath, or null when nothing matches.
		function firstText(xpath) {
			var node = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
			return node ? node.textContent : null;
		}

		// Remove a time stamp such as "4:00 AM Friday " from a date label.
		function stripTime(label) {
			return label.replace(/\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g, '');
		}

		var newItem = new Zotero.Item('newspaperArticle');
		newItem.url = doc.location.href;

		Zotero.Utilities.HTTP.doGet(newItem.url, function(text) {
			newItem.publicationTitle = "New Zealand Herald";
			newItem.language = "English";

			// Title of the article; keep the placeholder when the page has no <h1>.
			var headline = firstText('//h1');
			newItem.title = headline !== null ? headline : "No Title Found";

			/*
				Authors: remove the leading "By ", and when the credit line names
				several people joined with "and", turn it into a comma separated
				list and split it. A trailing " of <affiliation>" is dropped from
				every name.
			*/
			var names = [];
			var credits = null;
			if (/<span class=\"credits\">(.*)/.test(text)) {
				credits = firstText('//span[@class="credits"]');
			}
			if (credits !== null) {
				credits = credits.replace(/\bBy\W+/g, '');
				var parts;
				if (/\W\band\W+/.test(credits)) {
					parts = credits.replace(/\W\band\W+/g, ', ').split(", ");
				} else {
					// A lone credit is kept whole so commas inside it do not split it.
					parts = [credits];
				}
				for (var i = 0; i < parts.length; i++) {
					var name = parts[i].replace(/\W\bof\W(.*)/g, '');
					if (/\S/.test(name)) {
						names.push(name);
					}
				}
			}
			if (names.length === 0) {
				// No usable credit line on the page.
				names.push("Unknown ");
			}
			for (var j = 0; j < names.length; j++) {
				newItem.creators.push(Zotero.Utilities.cleanAuthor(names[j], "author"));
			}

			// Date: the first span may only hold an "Updated"/"New" flag, in which
			// case the date is in the second span.
			var dateLabel = firstText('//div[@class="tools"]/span');
			if (dateLabel !== null) {
				dateLabel = stripTime(dateLabel);
				if (dateLabel == "Updated" || dateLabel == "New") {
					var secondLabel = firstText('//div[@class="tools"]/span[2]');
					dateLabel = secondLabel !== null ? stripTime(secondLabel) : null;
				}
			}
			if (dateLabel !== null) {
				newItem.date = dateLabel;
			}

			// Section of the news
			var section = firstText('//div[@class="sectionHeader"]/span/a[1]');
			if (section !== null) {
				newItem.section = section;
			}

			// Abstract from the description meta tag; stop at the closing quote
			// (or at the first entity) so the match cannot run on into the page.
			var description = /meta name=\"description\" content=\"([^\"&]*)/.exec(text);
			if (description) {
				newItem.abstractNote = description[1];
			}

			newItem.complete();
			// NOTE(review): signals completion after every item, as before; confirm
			// this is intended when several search results are being saved.
			Zotero.done();
		}, function() {});
	}




	/**
	 * Entry point called to save the current page.
	 *
	 * On a search result page the links matched by //p[@class="g"]/a are offered
	 * through Zotero.selectItems and every chosen address is scraped; on an
	 * article page only the page itself is scraped. The addresses are loaded with
	 * Zotero.Utilities.processDocuments, which hands each document to scrape.
	 *
	 * @param {Document} doc Page being saved.
	 * @param {String} url Address of that page.
	 */
	function doWeb(doc, url){
		var namespace = doc.documentElement.namespaceURI;
		var nsResolver = namespace ? function(prefix){
			if (prefix == 'x') return namespace; else return null;
		} : null;

		var articles = [];
		var pageType = detectWeb(doc, url);

		if (pageType == "multiple"){
			var items = {};
			var nextTitle;
			var titles = doc.evaluate('//p[@class="g"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
			while ((nextTitle = titles.iterateNext())){
				items[nextTitle.href] = nextTitle.textContent;
			}
			items = Zotero.selectItems(items);
			if (!items){
				// Nothing was selected, so there is nothing to load.
				return;
			}
			for (var i in items){
				articles.push(i);
			}
		} else if (pageType == "newspaperArticle"){
			articles = [url];
		}
		Zotero.debug(articles);

		// scrape needs a loaded document, so the addresses are only passed to
		// processDocuments and never to scrape directly.
		Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});

		Zotero.wait();
	}