Last active
October 27, 2019 18:38
-
-
Save Arquetipo28/03adb486e9431bdd1f2cfa56f732a66d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Application entry point: builds the Express app, wires middleware and
// routes, exports the app, and starts the HTTP server.

// Require npm modules
const express = require('express');
const logger = require('morgan');
const bodyParser = require('body-parser');

// Create the Express application and expose it as this module's export
const app = module.exports = express();
const routes = require('./src/config/routes');

// Use morgan to log connections, status, and errors in console
app.use(logger('dev'));

// Use body-parser to read params sent as url-encoded and JSON bodies
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// Mount the routes at the root path
app.use('', routes);

// Start the server on the port given by the PORT environment variable,
// falling back to 3000 when it is unset or not a usable number
const port = Number(process.env.PORT) || 3000;
app.listen(port, () => {
  console.log(`Server listening on port ${port}`);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
/*
  News sources to crawl, in the order their posts appear in the result.
  Each entry holds the page URL, the selector to wait for before scraping,
  and the selectors/provider label handed to the in-page extraction code.
  Selectors written as [class='x'] only match elements whose class attribute
  is exactly 'x'.
*/
const NEWS_SOURCES = [
  {
    url: 'https://www.milenio.com/ultima-hora',
    readySelector: "[class='content']",
    extraction: {
      provider: 'MILENIO',
      itemSelector: "[class='lr-row-news']",
      titleSelector: '.title-container > .title',
      contentSelector: '.title-container > .summary > span',
      dateSelector: "[class='hour']",
      linkSelector: '.title-container > .title > a'
    }
  },
  {
    url: 'https://elpais.com/tag/mexico/a',
    readySelector: "[class='articulos articulos_cuerpo']",
    extraction: {
      provider: 'EL PAÍS',
      itemSelector: "div[class='articulo__interior']",
      titleSelector: '.articulo-titulo',
      contentSelector: '.articulo-entradilla',
      dateSelector: '.articulo-metadatos > time',
      linkSelector: '.articulo-titulo > a'
    }
  }
];

/**
 * Navigate `page` to one news source and extract its posts.
 *
 * @param {object} page - Puppeteer page used for navigation and evaluation.
 * @param {object} source - One entry of NEWS_SOURCES.
 * @returns {Promise<Array<object>>} Posts shaped as
 *   { title, content, created_at, link, provider }.
 */
async function scrapeSource (page, source) {
  await page.goto(source.url);
  await page.waitForSelector(source.readySelector);

  // The callback runs inside the browser page, so it cannot close over Node
  // variables: everything it needs is passed in through `cfg`.
  return page.evaluate(cfg => {
    window.scrollTo(0, 500);

    // Text of an element flattened to one line, or `fallback` when the
    // element is missing or has no text
    const textOf = (el, fallback) =>
      (el && el.innerText) ? el.innerText.replace(/\n/g, ' ') : fallback;

    return Array.from(document.querySelectorAll(cfg.itemSelector)).map(item => {
      const linkEl = item.querySelector(cfg.linkSelector);
      return {
        title: textOf(item.querySelector(cfg.titleSelector), 'Not found'),
        content: textOf(item.querySelector(cfg.contentSelector), 'No content found'),
        created_at: textOf(item.querySelector(cfg.dateSelector), 'No content found'),
        link: (linkEl && linkEl.href) ? linkEl.href : 'Not link',
        provider: cfg.provider
      };
    });
  }, source.extraction);
}

/**
 * Crawl every configured news source and return all posts in one array
 * (sources are visited one after another on the same page).
 *
 * The browser is always closed before returning or throwing, so a call does
 * not leave a browser process behind.
 *
 * @returns {Promise<Array<object>>} Posts from all sources, in source order.
 * @throws Rejects with whatever error launching, navigating or scraping raised.
 */
async function posts () {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    const wallData = [];
    for (const source of NEWS_SOURCES) {
      wallData.push(...(await scrapeSource(page, source)));
    }
    return wallData;
  } finally {
    await browser.close();
  }
}
module.exports = { | |
Crawler: function (app) { | |
constructor() | |
{ | |
} | |
this.posts = posts | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Pull the Crawler constructor out of the Crawler model module
const { Crawler } = require('../models/Crawler');

// Module-level crawler instance used by the controller
const principalCrawler = new Crawler();
class NewsController {
  /**
   * Asynchronous route handler that takes the parameters of the router.get
   * callback and responds with the crawler's posts serialized as JSON.
   *
   * If fetching or serializing the posts fails, the error is passed to
   * `next` when one is provided; otherwise a 500 JSON response is sent.
   * Without this, a failed crawl would leave the request unanswered.
   *
   * @param {object} _req - Request object (unused).
   * @param {object} res - Response object used to send the reply.
   * @param {Function} [next] - Optional error-forwarding callback.
   * @returns {Promise<void>}
   */
  async all (_req, res, next) {
    let payload;
    try {
      // Call posts method from the crawler and serialize its value
      payload = JSON.stringify(await principalCrawler.posts());
    } catch (err) {
      if (typeof next === 'function') {
        next(err);
        return;
      }
      res.status(500);
      res.setHeader('Content-Type', 'application/json');
      res.send(JSON.stringify({ error: 'Unable to fetch news' }));
      return;
    }
    // Set Content-Type header to application/json only once the data is ready
    res.setHeader('Content-Type', 'application/json');
    res.send(payload);
  }
}
// Expose the controller class as this module's export
module.exports = NewsController;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
  NewsController: contains the response that will be returned.
  NewsInstance: instance of NewsController
  express: require express to access Router
  router: Router instance exported by this module
*/
const NewsController = require('../controllers/news');
const NewsInstance = new NewsController();
const express = require('express');
const router = express.Router();

// Catch root path
router.get('/', (_req, res) => {
  res.send('Root page');
});

// Catch /news path and execute method all of NewsController.
// The call goes through a wrapper so `all` runs as a method of NewsInstance
// (keeping its `this`), and a rejected promise is handed to `next` instead
// of being left unhandled.
router.get('/news', (req, res, next) => {
  Promise.resolve(NewsInstance.all(req, res, next)).catch(next);
});

// Returns router
module.exports = router;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment