Created
April 12, 2019 16:19
-
-
Save Ayushverma8/fe2cbc1f3805cbe02a3a61a777c39ef4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const request = require('request'); | |
const DatabaseMongoAtlasSchema = require('./Models/model'); | |
const express = require('express'); | |
const _URL = require('url'); | |
var querystring = require('querystring'); | |
let concurrency = require('./Utilities/concurrent_engine') | |
const config = require('./Configurations/config'); | |
const utility_url_methods = require('./Utilities/utils'); | |
const cheerio = require('cheerio'); | |
const mongoose = require('mongoose'); | |
const _ = require('lodash/core'); | |
const URL = require('url-parse') ; | |
// Express application that exposes the crawler over HTTP.
const app = express();
// Map used as a set of URLs that have already been requested (URL -> true).
const hyperlinks_to_be_harvested = {};
// Work queue of URLs that were discovered but not requested yet.
const arr_links = [];
// Parsed form of the configured start URL.
const url = new URL(config.UrlToScrap);
// Number of page requests after which the crawl stops.
// Declared explicitly: a bare assignment creates an implicit global and throws in strict mode.
const CONCURRENT_REQUESTS = 5;
// Kept as `let` because it is assigned again further down in this file.
let root_url_value = '';
let queryStringProcess = [];
/* MongoDB connection (URI taken from config.MongoURI) and the model registered as 'Url'. */
const conn = mongoose.createConnection(config.MongoURI, {useNewUrlParser: true});
const modelMongoForURLParser = conn.model('Url', DatabaseMongoAtlasSchema);
// Root of the site being crawled, built from the start URL's protocol and hostname
// (for example "https://example.com").
// Computed unconditionally: guarding on `utility_url_methods === true` compares a required
// module against a boolean, which is not expected to hold, so the value would stay empty.
root_url_value = `${url.protocol}//${url.hostname}`;
console.log(`${root_url_value}`);
// Number of pages requested so far; the crawl loop compares it with CONCURRENT_REQUESTS.
let count_of_already_visited_links = 0;
// Seed the work queue with the configured start URL.
arr_links.unshift(config.UrlToScrap);
// Entry point of the crawl: delegates to complete_unique_list and is invoked once at load time.
const globalURICall = function () {
  return complete_unique_list();
};
globalURICall();
// Exit Condition for Program | |
/**
 * Drives the crawl: takes the next queued URL and hands it to
 * url_searching_in_next_queue, passing itself as the continuation.
 *
 * Stops when CONCURRENT_REQUESTS pages have been requested or when the
 * queue is empty. Entries that are not non-empty strings, and URLs already
 * recorded in hyperlinks_to_be_harvested, are dropped without a request.
 */
function complete_unique_list() {
  // Loop instead of recursing so a long run of skipped entries cannot grow the call stack.
  while (count_of_already_visited_links < CONCURRENT_REQUESTS) {
    if (arr_links.length === 0) {
      // Without this guard pop() yields undefined, which would be passed on as a URL.
      console.log(`No more URLs left to visit. Exiting`);
      return;
    }
    const next_neighbourhood_link = arr_links.pop();
    if (typeof next_neighbourhood_link !== 'string' || next_neighbourhood_link === '') {
      continue;
    }
    // Own-property check: `in` would also match inherited keys such as "constructor".
    const already_visited = Object.prototype.hasOwnProperty.call(
      hyperlinks_to_be_harvested,
      next_neighbourhood_link
    );
    if (!already_visited) {
      url_searching_in_next_queue(next_neighbourhood_link, complete_unique_list);
      return;
    }
  }
  console.log(`Depth of Available URls Reached. Exiting`);
}
/**
 * Requests a page, feeds its links into the crawl queue, then continues the crawl.
 *
 * The URL is marked as visited and counted before the request is sent, so a
 * failed request still consumes one unit of the crawl budget.
 *
 * @param {string} url - Address of the page to request.
 * @param {Function} callback - Invoked with no arguments exactly once, after the
 *   page has been processed or the request has failed.
 */
function url_searching_in_next_queue(url, callback) {
  hyperlinks_to_be_harvested[url] = true;
  count_of_already_visited_links++;
  request(url, function (error, response, body) {
    // On transport failures `response` is undefined, so `error` must be checked
    // before statusCode is read; otherwise the callback throws and the crawl stops.
    if ( error || !response || response.statusCode !== 200 ) {
      if ( error ) {
        console.error(`Request to ${url} failed: ${error.message || error}`);
      }
      callback();
      return;
    }
    const $ = cheerio.load(body);
    spider_to_fetch_links($);
    callback();
  });
}
/**
 * Collects the links of a loaded page and appends them to the crawl queue.
 *
 * Root-relative links (href starting with "/") are resolved against the
 * configured start URL, queued as absolute URLs, and a one-element summary
 * array is logged for each of them. Links whose href starts with "http" are
 * queued unchanged.
 *
 * Only the start URL is available here as a base, so relative links found on
 * pages of another host are still resolved against the start URL.
 *
 * @param {Function} $ - Cheerio instance for the page, as returned by cheerio.load.
 */
const spider_to_fetch_links = ($) => {
  $("a[href^='/']").each(function () {
    const href = $(this).attr('href');
    let absolute;
    try {
      // Proper resolution keeps the host; joining protocol + '/' + href turned the path into the host.
      absolute = new _URL.URL(href, url.href);
    } catch (err) {
      console.error(`Skipping malformed link ${href}: ${err.message}`);
      return;
    }
    // Queue only the URL string; extra arguments to push() would be queued as links too.
    arr_links.push(absolute.href);
    const data = [{
      url: absolute.href,
      // Occurrences of this URL among the links currently waiting in the queue.
      reference_count: arr_links.filter((link) => link === absolute.href).length,
      // Distinct query-string parameter names, in order of first appearance.
      params: [...new Set(absolute.searchParams.keys())],
    }];
    console.log(data);
  });
  $("a[href^='http']").each(function () {
    arr_links.push($(this).attr('href'));
  });
};
// Root route: replies with a fixed plain-text greeting.
app.get('/', function(req, res){
  res.setHeader("Content-Type", "text/plain");
  res.send("Hello RentoMojo!");
});
// Reports the crawl state as JSON. spider_to_fetch_links expects a cheerio instance, so it
// cannot be registered directly as a handler: Express would pass it the request object.
app.get('/scrap_the_web', function (req, res) {
  res.json({
    visited: Object.keys(hyperlinks_to_be_harvested),
    queued: arr_links.filter((link) => typeof link === 'string'),
  });
});
app.listen(3000);
// Export the app so other modules can require it.
exports = module.exports = app;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment