'use strict';
const express = require('express');
const req_pro = require('request-promise');
const cheerio = require('cheerio');
const mongoose = require('mongoose');
const helmet = require('helmet');
const config = require('./configurations/config');
const URL = require('url');
const schema = require('./data/models');
const chalk = require('chalk');
const _log = console.log;
const bunyan = require('bunyan');
const log = bunyan.createLogger({ name: 'RentoMojo' });
const app = express();
const HTTP_SUCCESS = 200;
const HTTP_ERROR = 500;
app.use(helmet());
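// helmet() sets common security-related HTTP headers (e.g. X-Content-Type-Options).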
let urlModel;
// Connect to the remote MongoDB instance (e.g. mLab) and build the Url model.
const init = async function () {
  try {
    const conn = await mongoose.createConnection(config.MongoURI, { useNewUrlParser: true });
    urlModel = conn.model('Url', schema);
  } catch (err) {
    log.error({ err }, 'Could not connect to MongoDB');
    process.exit(1);
  }
};
const scrap_the_web = async (request, response) => {
  let MongoDbObjects;
  let neighbourhood_links;
  try {
    log.info('Fetching required links from URI');
    // Use the website URL passed in the query params; the default is the Medium URL from config.
    let uri = request.query.url ? request.query.url : config.URLtoScrap;
    let requestObjectFromURI = await req_pro.get(uri);
    // neighbourhood_links collects every hyperlink's URL and its query-param keys.
    neighbourhood_links = [];
    let $ = cheerio.load(requestObjectFromURI);
    // Use the url module to split each hyperlink into its URL and query params.
    $('a').each((i, link) => {
      let href = $(link).attr('href');
      if (!href) return; // skip anchors without an href attribute
      let u = URL.parse(href, true);
      let params = Object.keys(u.query);
      let url = u.href.split('?')[0];
      neighbourhood_links.push({ url, params });
    });
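    // Illustrative example (hypothetical link): URL.parse('https://medium.com/tag/js?source=nav', true)
    // yields href 'https://medium.com/tag/js?source=nav' and query { source: 'nav' },
    // so url becomes 'https://medium.com/tag/js' and params becomes ['source'].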
    // Deduplicate: for each unique URL, count how often it appeared and keep the
    // params from its first occurrence. This is synchronous, so no await is needed.
    MongoDbObjects = [...new Set(neighbourhood_links.map(x => x.url))].map(
      x => ({
        url: x,
        reference_count: neighbourhood_links.filter(y => y.url === x).length,
        params: neighbourhood_links.find(neighbour => neighbour.url === x).params,
      }));
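    // Illustrative document shape (not actual output):
    //   { url: 'https://medium.com/about', reference_count: 3, params: ['source'] }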
    let Object_Terminal = JSON.stringify(MongoDbObjects);
    _log(chalk.blue(Object_Terminal));
    let dbResponse = await urlModel.collection.insertMany(MongoDbObjects);
    response.status(HTTP_SUCCESS).json(dbResponse.ops);
  } catch (err) {
    log.warn({ err }, 'Cannot fetch required links from URI');
    response.status(HTTP_ERROR).json({ status: 'FAIL', Error: err });
  }
};
// scrap_the_web route
app.get('/scrap_the_web', scrap_the_web);
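// Example request (assuming config.port is 3000 — adjust to your config):
//   curl 'http://localhost:3000/scrap_the_web?url=https://medium.com'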
// Server listens and calls init to make the DB connection live; the log now fires
// only once the server is actually listening.
app.listen(config.port, () => {
  _log('Express Server listening on port ' + config.port);
  init();
});
exports = module.exports = app;
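// To run locally (assuming this file is saved as index.js and ./configurations/config
// exports the MongoURI, port, and URLtoScrap properties referenced above):
//   node index.js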