'use strict';
const express = require('express');
const req_pro = require('request-promise');
const cheerio = require('cheerio');
const mongoose = require('mongoose');
const helmet = require('helmet');
const config = require('./configurations/config');
const URL = require('url');
const schema = require('./data/models');
const chalk = require('chalk');
const _log = console.log;
const bunyan = require('bunyan');
const log = bunyan.createLogger({ name: 'RentoMojo' });
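// NOTE: the two local modules required above are not part of this gist. A minimal
// sketch of what they are assumed to export, inferred from how they are used below:
//
//   // configurations/config.js (assumed)
//   module.exports = {
//     port: 3000,
//     MongoURI: 'mongodb://<user>:<password>@<host>/<database>',
//     URLtoScrap: 'https://medium.com/',
//   };
//
//   // data/models.js (assumed)
//   const { Schema } = require('mongoose');
//   module.exports = new Schema({
//     url: String,
//     params: [String],
//     reference_count: Number,
//   });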
const app = express();
const HTTP_SUCCESS = 200;
const HTTP_ERROR = 500;
app.use(helmet());
let urlModel;

// Connect to the remote MongoDB (mLab) instance and bind the Url model.
const init = async function () {
  try {
    const conn = await mongoose.createConnection(config.MongoURI, { useNewUrlParser: true });
    urlModel = conn.model('Url', schema);
  } catch (err) {
    console.log('err', err);
    process.exit(1);
  }
};
// Route handler: scrapes the target page and collects its hyperlinks.
const scrap_the_web = async (request, response) => {
  let MongoDbObjects;
  let neighbourhood_links;
  try {
    log.info('Fetching required links from URI');
    // Takes the website URL passed in the query params; defaults to the Medium URL from config.
    const uri = request.query.url ? request.query.url : config.URLtoScrap;
    const requestObjectFromURI = await req_pro.get(uri);
    // neighbourhood_links collects every hyperlink's URL and its query-param keys.
    neighbourhood_links = [];
    const $ = cheerio.load(requestObjectFromURI.toString());
    // Use the url module to split each hyperlink into its base URL and query params.
    $('a').each((i, link) => {
      const href = $(link).attr('href');
      if (!href) return; // skip anchors without an href attribute
      const u = URL.parse(href, true);
      const params = Object.keys(u.query);
      const url = u.href.split('?')[0];
      neighbourhood_links.push({ url, params });
    });
    // Deduplicate neighbourhood_links: one object per unique URL, with how often it
    // was referenced and the params seen on its first occurrence.
    MongoDbObjects = [...new Set(neighbourhood_links.map(x => x.url))].map(
      x => ({
        url: x,
        reference_count: neighbourhood_links.filter(y => y.url === x).length,
        params: neighbourhood_links.find(neighbour => neighbour.url === x).params,
      }));
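    // For example (assumed sample data), if neighbourhood_links held
    //   [{ url: 'https://medium.com/topics', params: ['source'] },
    //    { url: 'https://medium.com/topics', params: [] }]
    // then MongoDbObjects would be
    //   [{ url: 'https://medium.com/topics', reference_count: 2, params: ['source'] }]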
    const Object_Terminal = JSON.stringify(MongoDbObjects);
    _log(chalk.blue(Object_Terminal));
    // Persist the scraped link objects and echo the inserted documents back to the client.
    const dbResponse = await urlModel.collection.insertMany(MongoDbObjects);
    response.status(HTTP_SUCCESS).json(dbResponse.ops);
  } catch (err) {
    console.log('err', err);
    log.warn('Cannot fetch required links from URI');
    response.status(HTTP_ERROR).json({ status: 'FAIL', Error: err });
  }
};
// scrap_the_web route
app.get('/scrap_the_web', scrap_the_web);

// Start the server; once it is listening, open the DB connection via init().
app.listen(config.port, () => {
  init();
  console.log('Express server listening on port ' + config.port);
});

exports = module.exports = app;
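
// Usage sketch (assumed local setup; the port comes from config.port, shown here as 3000):
//   curl "http://localhost:3000/scrap_the_web?url=https://medium.com/"
// Omitting ?url falls back to config.URLtoScrap; the response body is the array of
// documents inserted into MongoDB.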