My Interpretation of Ania Kubów's YT - Build and sell your own API $$$ - https://youtu.be/GK4Pl-GmPHk
/*
Please note, credit needs to go to Ania Kubów's YT - Build and sell your own API $$$ -
without which, this wouldn't have existed --> https://youtu.be/GK4Pl-GmPHk

My take addresses a thought I had when watching the original: at what point do the scrapes get updated?
This version scrapes each time the /news page is called. It also carries on if there is a problem with a source
(an example is given with a 'bad source' entry, to show what happens if a scraped source goes offline or 'bad').

It also makes use of the URL() function to build the full URL when a 'base' needs to be supplied. If a site doesn't
need a base, don't specify the 'base' element in the array. (Another point raised in the comments.)
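
For example (illustrative article paths, not real links):
  new URL('/climate-change/some-story/', 'https://www.telegraph.co.uk').href
    resolves to 'https://www.telegraph.co.uk/climate-change/some-story/'
  new URL('https://www.theguardian.com/environment/some-story').href
    is already absolute, so no base is needed.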

A couple of other additions.

Select a specific source with a passed URL parameter, in the format of (a fetch example follows this list):
http://localhost:8000/news?source=The Times

Request all the source names that are known to the API via:
http://localhost:8000/news/sources

Dump the complete 'newspapers' array with:
http://localhost:8000/dumplist
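
For example, from Node 18+ (an illustrative call, not part of the server itself) - the space in the source
name can be percent-encoded with encodeURIComponent, and Express decodes it again before the filter runs:
  const response = await fetch('http://localhost:8000/news?source=' + encodeURIComponent('The Times'))
  const articles = await response.json()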

Note - The list of newspapers is not the complete list Ania used in the final version, but just enough to prove the concept.

Also note that this is my interpretation and it has flaws! I don't like the way that 'areWeDone()' is used to be able to
return the results; at this point of my learning, I couldn't figure out how async functions are used correctly to chain
results together. It's a quick-and-dirty fix (a possible cleaner sketch is left in a comment after the '/news' handler below).

Suggestions welcomed.

Cheers
pW.
*/
const PORT = 8000

const express = require('express')
const axios = require('axios')
const cheerio = require('cheerio')

const app = express()

const newspapers = [
    {
        name: 'The Times',
        address: 'https://www.thetimes.co.uk/environment/climate-change',
    },
    {
        name: 'The Guardian',
        address: 'https://www.theguardian.com/environment/climate-crisis',
    },
    {
        name: 'Daily Telegraph',
        address: 'https://www.telegraph.co.uk/climate-change',
        base: 'https://www.telegraph.co.uk'
    },
    {
        name: 'Broken',
        address: 'https://www.dsdfoesnotexist.com'
    }
]
app.get('/', (req, res) => {
    res.json('Welcome to my Climate Change News API')
})
// Just return a list of all the names in the newspapers list.
app.get('/news/sources', (req, res) => {
    res.json(newspapers.map(obj => ({ Name: obj.name })))
})
// Dump the newspapers array as a JSON result.
app.get('/dumplist', (req, res) => res.json(newspapers))
// This version scrapes the sources each time that '/news' is called.
// You can also now pass a URL parameter with the source you want to use
// eg. http://localhost:8000/news?source=The Times
app.get('/news', async (req, res) => {
    // Initialise the articles array each time the '/news' page is called so we don't just 'add' to the list each time.
    const articles = []
    // A simple counter used below to see if we have gone through all the newspapers.
    let count = 0
    console.log("Query Sent", req.query.source)
    // If we provided a parameter (?source=something) on the URL, then filter the list down to the ones that match.
    const papers = (req.query.source ? newspapers.filter(newspaper => newspaper.name === req.query.source) : newspapers)
    // If the filter matched nothing (i.e. the source name wasn't recognised), return something to say that and stop here.
    if (papers.length === 0) {
        return res.json("No Source Selected")
    }
    // Now go through the scraping
    papers.forEach(newspaper => {
        axios.get(newspaper.address)
            .then((response) => {
                const html = response.data
                const $ = cheerio.load(html)
                const artCount = articles.length
                $('a:contains("climate")', html).each(function () {
                    const Title = $(this).text().trim()
                    // Using the URL function to add in the base URL if specified.
                    const fullURL = new URL($(this).attr('href'), newspaper.base)
                    articles.push({
                        Source: newspaper.name,
                        Title,
                        URL: fullURL
                    })
                })
                console.log("Scraped", articles.length - artCount, "Articles from", newspaper.name)
                // Dirty Fix
                areWeDone()
            })
            .catch((err) => {
                console.log("Could not scrape", err.hostname)
                articles.push({
                    Source: newspaper.name,
                    Title: "Problem Scraping Source",
                    URL: err.config.url
                })
                areWeDone()
            })
    })
    // Dirty way to see if we have gone through the whole list of newspapers so we can send the result back to the client.
    // The counter tracks how many times '.then' or '.catch' has been called; once it has been called as many times as
    // there are entries in the papers array, every site has been visited and we can return the articles via res.json().
    function areWeDone() {
        count++
        if (count === papers.length) {
            res.json(articles)
        }
    }
})
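
/*
One possible cleaner alternative to the areWeDone() counter, left here as an untested sketch rather than a
drop-in replacement: run every scrape in parallel with Promise.allSettled (Node 12.9+) and respond once they
have all settled. It assumes the same 'newspapers' array and the same response shape as the route above.

app.get('/news', async (req, res) => {
    const papers = (req.query.source ? newspapers.filter(newspaper => newspaper.name === req.query.source) : newspapers)
    if (papers.length === 0) {
        return res.json("No Source Selected")
    }

    // Start every scrape at once; allSettled waits for all of them, whether they succeed or fail,
    // so one bad source does not stop the rest.
    const results = await Promise.allSettled(papers.map(async (newspaper) => {
        const response = await axios.get(newspaper.address)
        const $ = cheerio.load(response.data)
        const found = []
        $('a:contains("climate")').each(function () {
            found.push({
                Source: newspaper.name,
                Title: $(this).text().trim(),
                URL: new URL($(this).attr('href'), newspaper.base)
            })
        })
        return found
    }))

    // Keep the articles from the sources that worked, and a placeholder entry for the ones that didn't.
    const articles = results.flatMap((result, i) =>
        result.status === 'fulfilled'
            ? result.value
            : [{ Source: papers[i].name, Title: "Problem Scraping Source", URL: papers[i].address }]
    )
    res.json(articles)
})
*/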
app.listen(PORT, () => console.log(`Server Running on port ${PORT}`))
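
// To try it out (assuming the file is saved as, say, index.js):
//   npm install express axios cheerio
//   node index.js
// then open http://localhost:8000/news in a browser, or use the fetch example from the header comment.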