Skip to content

Instantly share code, notes, and snippets.

@Ayushverma8
Created April 12, 2019 16:19
Show Gist options
  • Save Ayushverma8/fe2cbc1f3805cbe02a3a61a777c39ef4 to your computer and use it in GitHub Desktop.
Save Ayushverma8/fe2cbc1f3805cbe02a3a61a777c39ef4 to your computer and use it in GitHub Desktop.
const request = require('request');
const DatabaseMongoAtlasSchema = require('./Models/model');
const express = require('express');
const _URL = require('url');
var querystring = require('querystring');
let concurrency = require('./Utilities/concurrent_engine')
const config = require('./Configurations/config');
const utility_url_methods = require('./Utilities/utils');
const cheerio = require('cheerio');
const mongoose = require('mongoose');
const _ = require('lodash/core');
const URL = require('url-parse') ;
const app = express();
// ---------------------------------------------------------------------------
// Shared crawl state and start-up. Everything here runs once at module load.
// ---------------------------------------------------------------------------

// URLs that have already been requested, stored as { [url]: true }.
const hyperlinks_to_be_harvested = {};
// Links discovered so far that are still waiting to be visited.
const arr_links = [];
// Parsed form (url-parse) of the configured start URL.
const url = new URL(config.UrlToScrap);
// Declared explicitly: the previous bare assignments created implicit
// globals, which is a ReferenceError under strict mode / ES modules.
// Despite its name this value is compared against the number of pages
// requested so far (see complete_unique_list), i.e. it caps the crawl.
const CONCURRENT_REQUESTS = 5;
let root_url_value = '';
let queryStringProcess = [];

/* MongoDB connection; the server is whatever config.MongoURI points at. */
const conn = mongoose.createConnection(config.MongoURI, {useNewUrlParser: true});
const modelMongoForURLParser = conn.model('Url', DatabaseMongoAtlasSchema);

// Formatted URL must be with origin Protocol
// NOTE(review): utility_url_methods is whatever ./Utilities/utils exports, so
// this strict comparison with `true` presumably never holds - confirm intent.
if (utility_url_methods === true) {
  // Previously a block-bodied arrow without `return` whose template used
  // `{$...}` instead of `${...}`, so it always produced undefined.
  root_url_value = (url) => `${url.protocol}//${url.hostname}`;
} else {
  console.log(`${root_url_value}`);
}

// Number of pages requested so far.
let count_of_already_visited_links = 0;

// Seed the queue with the start URL and kick off the crawl.
arr_links.unshift(config.UrlToScrap);
const globalURICall = () => complete_unique_list();
globalURICall();
/**
 * Crawl driver: takes the next unvisited link off `arr_links` and hands it to
 * `url_searching_in_next_queue`, passing itself as the continuation.
 *
 * Stops when `count_of_already_visited_links` reaches `CONCURRENT_REQUESTS`
 * or when the queue holds no more usable links.
 *
 * @returns {void}
 */
function complete_unique_list() {
  // Exit condition: the page budget is used up.
  if (count_of_already_visited_links >= CONCURRENT_REQUESTS) {
    console.log(`Depth of Available URls Reached. Exiting`);
    return;
  }

  // Loop instead of recursing so a long run of duplicates cannot grow the
  // call stack.
  while (arr_links.length > 0) {
    const next_neighbourhood_link = arr_links.pop();

    // Ignore anything that is not a URL string; requesting `undefined` or a
    // stray boolean would only make the HTTP client throw.
    if (typeof next_neighbourhood_link !== 'string') {
      continue;
    }

    // Own-property check: the `in` operator would also match inherited keys
    // such as "constructor" or "toString".
    const already_visited = Object.prototype.hasOwnProperty.call(
      hyperlinks_to_be_harvested,
      next_neighbourhood_link,
    );
    if (already_visited) {
      continue;
    }

    url_searching_in_next_queue(next_neighbourhood_link, complete_unique_list);
    return;
  }

  // The queue ran dry before the page budget was reached.
  console.log('No more URLs left to crawl. Exiting');
}
/**
 * Marks `url` as visited, downloads it and, on a 200 response, hands the
 * parsed page to `spider_to_fetch_links`.
 *
 * `callback` is invoked exactly once per call, whether the page was parsed,
 * the server answered with a non-200 status, or the request itself failed.
 *
 * @param {string} url - Address of the page to download.
 * @param {Function} callback - Continuation called with no arguments.
 * @returns {void}
 */
function url_searching_in_next_queue(url, callback) {
  // Record the visit before the request so the same URL is never queued twice
  // while the download is still in flight.
  hyperlinks_to_be_harvested[url] = true;
  count_of_already_visited_links++;

  request(url, function (error, response, body) {
    // On a transport failure (DNS, refused connection, ...) `response` is
    // undefined; reading `response.statusCode` used to throw here.
    if (error || !response) {
      console.error(`Skipping ${url}: ${error ? error.message : 'no response received'}`);
      callback();
      return;
    }

    if (response.statusCode !== 200) {
      callback();
      return;
    }

    const $ = cheerio.load(body);
    // The page address is forwarded so the link collector can resolve
    // relative links against it; a collector taking only `$` ignores it.
    spider_to_fetch_links($, url);
    callback();
  });
}
/**
 * Collects the links of one fetched page.
 *
 * Every anchor whose href starts with "/" is resolved to an absolute URL
 * (fragment removed) and pushed onto `arr_links`; every anchor whose href
 * starts with "http" is pushed unchanged. For the "/" links a summary is
 * built, logged and returned: one record per distinct URL (query string and
 * fragment excluded) with the number of anchors on this page pointing at it
 * and the names of the query parameters seen on those anchors.
 *
 * @param {Function} $ - Cheerio instance loaded with the page's HTML.
 * @param {string} [page_url=config.UrlToScrap] - Address the "/" links are
 *   resolved against. Defaults to the configured start URL.
 * @returns {Array<{url: string, reference_count: number, params: string[]}>}
 *   Records for the "/" links, in order of first appearance.
 */
const spider_to_fetch_links = ($, page_url = config.UrlToScrap) => {
  const records_by_url = new Map();

  $("a[href^='/']").each((index, element) => {
    const href = $(element).attr('href');

    // WHATWG resolution handles both "/path" and protocol-relative "//host"
    // hrefs. String concatenation with the protocol alone dropped the host.
    let resolved;
    try {
      resolved = new _URL.URL(href, page_url);
    } catch (error) {
      // Unparsable href (or base): skip this anchor, keep the rest.
      return;
    }

    // "#fragment" variants address the same document; do not crawl them twice.
    resolved.hash = '';
    arr_links.push(resolved.href);

    const record_url = `${resolved.origin}${resolved.pathname}`;
    let record = records_by_url.get(record_url);
    if (!record) {
      record = {url: record_url, reference_count: 0, params: []};
      records_by_url.set(record_url, record);
    }
    record.reference_count += 1;
    for (const param_name of resolved.searchParams.keys()) {
      if (!record.params.includes(param_name)) {
        record.params.push(param_name);
      }
    }
  });

  $("a[href^='http']").each((index, element) => {
    arr_links.push($(element).attr('href'));
  });

  const data = [...records_by_url.values()];
  console.log(data);
  return data;
};
// ---------------------------------------------------------------------------
// HTTP interface
// ---------------------------------------------------------------------------

// Plain-text greeting on the root path.
app.get('/', function (req, res) {
  res.setHeader("Content-Type", "text/plain");
  res.send("Hello RentoMojo!");
});

// Reports the current crawl state.
// spider_to_fetch_links used to be registered here directly, but it expects a
// cheerio `$` as its first argument; Express calls handlers with
// (req, res, next), so `$(...)` on the request object threw on every hit.
app.get('/scrap_the_web', function (req, res) {
  res.json({
    // Pages requested so far.
    visited: Object.keys(hyperlinks_to_be_harvested),
    // Links discovered but not yet requested; non-string entries are left out.
    queued: arr_links.filter((link) => typeof link === 'string'),
  });
});

// Start the HTTP server on port 3000 as soon as the module is loaded.
app.listen(3000);

// Expose the Express app to other modules.
exports = module.exports = app;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment