Skip to content

Instantly share code, notes, and snippets.

@Arquetipo28
Last active October 27, 2019 18:38
Show Gist options
  • Save Arquetipo28/03adb486e9431bdd1f2cfa56f732a66d to your computer and use it in GitHub Desktop.
Save Arquetipo28/03adb486e9431bdd1f2cfa56f732a66d to your computer and use it in GitHub Desktop.
// Load the third-party modules this entry point depends on.
const express = require('express');
const logger = require('morgan');
const bodyParser = require('body-parser');

// Create the Express application and expose it as this module's export.
// The export happens before the routes are required, matching the original order.
const app = express();
module.exports = app;

const routes = require('./src/config/routes');

// Log requests to the console using morgan's 'dev' format.
app.use(logger('dev'));

// Parse request bodies sent as URL-encoded form data or as JSON.
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());

// Mount the application routes at the root path.
app.use('', routes);

// Start the server on port 3000 (the listen callback intentionally does nothing).
app.listen(3000, () => {});
const puppeteer = require('puppeteer');
/*
  Per-source scraping configuration used by posts().
  Each entry holds the page URL, the selector to wait for before scraping,
  the selector of one news item, and the selectors (relative to an item) of
  its title, summary, timestamp and link. `provider` is copied into every
  returned post.
*/
const NEWS_SOURCES = [
  {
    provider: 'MILENIO',
    url: 'https://www.milenio.com/ultima-hora',
    readySelector: "[class='content']",
    itemSelector: "[class='lr-row-news']",
    titleSelector: '.title-container > .title',
    contentSelector: '.title-container > .summary > span',
    dateSelector: "[class='hour']",
    linkSelector: '.title-container > .title > a'
  },
  {
    provider: 'EL PAÍS',
    url: 'https://elpais.com/tag/mexico/a',
    readySelector: "[class='articulos articulos_cuerpo']",
    itemSelector: "div[class='articulo__interior']",
    titleSelector: '.articulo-titulo',
    contentSelector: '.articulo-entradilla',
    dateSelector: '.articulo-metadatos > time',
    linkSelector: '.articulo-titulo > a'
  }
];

/*
  Page function handed to page.evaluate(): it runs inside the browser page,
  so it must be self-contained and may only use its argument plus the page's
  own globals (window, document).
  Returns one plain object per element matching source.itemSelector.
*/
function extractPosts (source) {
  window.scrollTo(0, 500);
  // Flatten every line break (not just the first one) into a space.
  const textOf = (el, fallback) =>
    (el && el.innerText) ? el.innerText.replace(/\n/g, ' ') : fallback;
  const items = Array.from(document.querySelectorAll(source.itemSelector));
  return items.map(item => {
    const linkEl = item.querySelector(source.linkSelector);
    return {
      title: textOf(item.querySelector(source.titleSelector), 'Not found'),
      content: textOf(item.querySelector(source.contentSelector), 'No content found'),
      created_at: textOf(item.querySelector(source.dateSelector), 'No content found'),
      link: (linkEl && linkEl.href) ? linkEl.href : 'Not link',
      provider: source.provider
    };
  });
}

/*
  Scrapes every configured news source and returns all posts in one array,
  in the order the sources are listed.

  Returns: Promise resolving to an array of
    { title, content, created_at, link, provider } objects.
  Rejects: with whatever error Puppeteer raises (navigation failure,
    selector timeout, ...). The browser is closed in that case too.
*/
async function posts () {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    let wallData = [];
    // Sources are visited one after another because they share a single page.
    for (const source of NEWS_SOURCES) {
      await page.goto(source.url);
      await page.waitForSelector(source.readySelector);
      const found = await page.evaluate(extractPosts, source);
      wallData = [...wallData, ...found];
    }
    return wallData;
  } finally {
    // Always release the browser process; otherwise each call leaks one.
    await browser.close();
  }
}
module.exports = {
Crawler: function (app) {
constructor()
{
}
this.posts = posts
}
}
// Pull the Crawler constructor out of the model module's exports.
const { Crawler } = require('../models/Crawler');
// Single crawler instance shared by every request handled in this module.
const principalCrawler = new Crawler();
class NewsController {
  /*
    Express-style handler (request, response) that replies with every post
    returned by principalCrawler.posts(), serialized as JSON.

    On success: Content-Type application/json and the JSON array as body.
    On failure: the error is logged and a 500 response with a JSON error
    body is sent, so the request never hangs on a rejected promise.

    The method does not use `this`, so it can be passed to a router unbound.
  */
  async all (_req, res) {
    // Set Content-Type header to application/json for both outcomes
    res.setHeader('Content-Type', 'application/json');
    try {
      const posts = await principalCrawler.posts();
      res.send(JSON.stringify(posts));
    } catch (err) {
      // Do not swallow the failure: record it and tell the client.
      console.error(err);
      res.status(500).send(JSON.stringify({ error: 'Failed to fetch news' }));
    }
  }
}
module.exports = NewsController
/*
  Route table for the application.
  NewsController: controller class whose `all` method answers GET /news.
  NewsInstance: the controller instance whose handler is registered below.
  express: required here only to build the Router.
*/
const NewsController = require('../controllers/news');
const NewsInstance = new NewsController();
const express = require('express');
const router = express.Router();

// Handler for the root path: replies with a fixed placeholder text.
const sendRootPage = (_req, res) => {
  res.send('Root page');
};

// GET / -> placeholder page
router.get('/', sendRootPage);

// GET /news -> NewsController#all
router.get('/news', NewsInstance.all);

// Expose the configured router
module.exports = router;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment