My Interpretation of Ania Kubów's YT - Build and sell your own API $$$ - https://youtu.be/GK4Pl-GmPHk
/*
Please note, credit needs to go to Ania Kubów's YT - Build and sell your own API $$$ -
without which, this wouldn't have existed --> https://youtu.be/GK4Pl-GmPHk

My take addresses a thought I had when watching the original: at what point do the scrapes get updated?
This version scrapes each time the /news page is called. It also carries on if there is a problem with a source
(an example is given with a 'bad source' entry, to show what happens if a scraped source goes offline or 'bad').

It also makes use of the URL() function to build the full URL when a 'base' needs to be supplied. If a site doesn't
need a base, don't specify the 'base' element in the array. (Another point raised in the comments.)
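
For example (illustrative article paths, not real links):
  new URL('/climate-change/some-story/', 'https://www.telegraph.co.uk').href
    resolves to 'https://www.telegraph.co.uk/climate-change/some-story/'
  new URL('https://www.theguardian.com/environment/some-story').href
    is already absolute, so no base is needed.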

A couple of other additions.

Select a specific source with a passed URL parameter, in the format of (a fetch example follows this list):
http://localhost:8000/news?source=The Times

Request all the source names that are known to the API via:
http://localhost:8000/news/sources

Dump the complete 'newspapers' array with:
http://localhost:8000/dumplist
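
For example, from Node 18+ (an illustrative call, not part of the server itself) - the space in the source
name can be percent-encoded with encodeURIComponent, and Express decodes it again before the filter runs:
  const response = await fetch('http://localhost:8000/news?source=' + encodeURIComponent('The Times'))
  const articles = await response.json()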

Note - The list of newspapers is not the complete list Ania used in the final version, but just enough to prove the concept.

Also note that this is my interpretation and it has flaws! I don't like the way that 'areWeDone()' is used to be able to
return the results; at this point of my learning, I couldn't figure out how async functions are used correctly to chain
results together. It's a quick-and-dirty fix (a possible cleaner sketch is left in a comment after the '/news' handler below).

Suggestions welcomed.

Cheers
pW.
*/
const PORT = 8000

const express = require('express')
const axios = require('axios')
const cheerio = require('cheerio')

const app = express()

const newspapers = [
    {
        name: 'The Times',
        address: 'https://www.thetimes.co.uk/environment/climate-change',
    },
    {
        name: 'The Guardian',
        address: 'https://www.theguardian.com/environment/climate-crisis',
    },
    {
        name: 'Daily Telegraph',
        address: 'https://www.telegraph.co.uk/climate-change',
        base: 'https://www.telegraph.co.uk'
    },
    {
        name: 'Broken',
        address: 'https://www.dsdfoesnotexist.com'
    }
]
app.get('/', (req, res) => {
    res.json('Welcome to my Climate Change News API')
})
// Just return a list of all the names in the newspapers list.
app.get('/news/sources', (req, res) => {
    res.json(newspapers.map(obj => ({ Name: obj.name })))
})
// Dump the newspapers array as a JSON result.
app.get('/dumplist', (req, res) => res.json(newspapers))
// This version scrapes the sources each time that '/news' is called.
// You can also now pass a URL parameter with the source you want to use
// eg. http://localhost:8000/news?source=The Times
app.get('/news', async (req, res) => {
    // Initialise the articles array each time the '/news' page is called so we don't just 'add' to the list each time.
    const articles = []
    // A simple counter used below to see if we have gone through all the newspapers.
    let count = 0
    console.log("Query Sent", req.query.source)
    // If we provided a parameter (?source=something) on the URL, then filter the list down to the ones that match.
    const papers = (req.query.source ? newspapers.filter(newspaper => newspaper.name === req.query.source) : newspapers)
    // If the filter matched nothing (i.e. the source name wasn't recognised), return something to say that and stop here.
    if (papers.length === 0) {
        return res.json("No Source Selected")
    }
    // Now go through the scraping
    papers.forEach(newspaper => {
        axios.get(newspaper.address)
            .then((response) => {
                const html = response.data
                const $ = cheerio.load(html)
                const artCount = articles.length
                $('a:contains("climate")', html).each(function () {
                    const Title = $(this).text().trim()
                    // Using the URL function to add in the base URL if specified.
                    const fullURL = new URL($(this).attr('href'), newspaper.base)
                    articles.push({
                        Source: newspaper.name,
                        Title,
                        URL: fullURL
                    })
                })
                console.log("Scraped", articles.length - artCount, "Articles from", newspaper.name)
                // Dirty Fix
                areWeDone()
            })
            .catch((err) => {
                console.log("Could not scrape", err.hostname)
                articles.push({
                    Source: newspaper.name,
                    Title: "Problem Scraping Source",
                    URL: err.config.url
                })
                areWeDone()
            })
    })
    // Dirty way to see if we have gone through the whole list of newspapers so we can send the result back to the client.
    // The counter tracks how many times '.then' or '.catch' has been called; once it has been called as many times as
    // there are entries in the papers array, every site has been visited and we can return the articles via res.json().
    function areWeDone() {
        count++
        if (count === papers.length) {
            res.json(articles)
        }
    }
})
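
/*
One possible cleaner alternative to the areWeDone() counter, left here as an untested sketch rather than a
drop-in replacement: run every scrape in parallel with Promise.allSettled (Node 12.9+) and respond once they
have all settled. It assumes the same 'newspapers' array and the same response shape as the route above.

app.get('/news', async (req, res) => {
    const papers = (req.query.source ? newspapers.filter(newspaper => newspaper.name === req.query.source) : newspapers)
    if (papers.length === 0) {
        return res.json("No Source Selected")
    }

    // Start every scrape at once; allSettled waits for all of them, whether they succeed or fail,
    // so one bad source does not stop the rest.
    const results = await Promise.allSettled(papers.map(async (newspaper) => {
        const response = await axios.get(newspaper.address)
        const $ = cheerio.load(response.data)
        const found = []
        $('a:contains("climate")').each(function () {
            found.push({
                Source: newspaper.name,
                Title: $(this).text().trim(),
                URL: new URL($(this).attr('href'), newspaper.base)
            })
        })
        return found
    }))

    // Keep the articles from the sources that worked, and a placeholder entry for the ones that didn't.
    const articles = results.flatMap((result, i) =>
        result.status === 'fulfilled'
            ? result.value
            : [{ Source: papers[i].name, Title: "Problem Scraping Source", URL: papers[i].address }]
    )
    res.json(articles)
})
*/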
app.listen(PORT, () => console.log(`Server Running on port ${PORT}`))
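
// To try it out (assuming the file is saved as, say, index.js):
//   npm install express axios cheerio
//   node index.js
// then open http://localhost:8000/news in a browser, or use the fetch example from the header comment.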