@kayslay
Last active May 11, 2021 08:24
const crawler = require("web-crawljs");
/**
 * @description removes duplicate hotels from the array
 * @param {Array} arr - array of scraped hotel objects
 * @returns {Array} the array with duplicate hotels removed
 */
function makeUnique(arr) {
    "use strict";
    const key = {};
    return arr.filter(hotel => {
        if (key[`${hotel.hotel_name}_${hotel.city}_${hotel.state}`]) {
            return false;
        }
        key[`${hotel.hotel_name}_${hotel.city}_${hotel.state}`] = true;
        return true;
    });
}
const mongoose = require("mongoose");
require("dotenv").config();
const env = process.env;
mongoose.Promise = Promise;
mongoose.connect(env.MONGO_URI);
const hotelModel = require("./hotelModel");
let allData = [];
const config = {
    // CSS selectors for the fields to extract from each hotel listing
    fetchSelector: {
        hotel_name: "div.about-item > h2 > a",
        location: "div p.item-location span",
        city: "div p.item-location a:first-child",
        state: "div p.item-location a:nth-child(2)",
        present_price: "div.present-price span",
        features: ".item-features",
        hotelsNg_link: "div.about-item > h2 > a"
    },
    // how to read each selected element: its text content or an attribute
    fetchSelectBy: {
        hotel_name: "text",
        location: "text",
        city: "text",
        state: "text",
        present_price: "text",
        features: "text",
        hotelsNg_link: ["attr", "href"]
    },
    // pagination links to follow for the next pages to crawl
    nextSelector: {links: "ul.pagination a"},
    nextSelectBy: {links: ["attr", "href"]},
    // called with the scraped data each time a page is fetched
    fetchFn: (err, data, url) => {
        if (err) return console.error(err.message);
        const val = [];
        data.hotel_name.forEach((item, i) => {
            "use strict";
            // strip extra whitespace and newlines, then split the features block into an array
            let f = data.features[i].trim().replace(/(\s{2,}|\n)/g, ":");
            f = (f.length > 0) ? f.split(":") : [];
            let location = data.location[i] ? data.location[i] : " ";
            val.push({
                hotel_name: item.trim(),
                location: location.trim().replace(/(\n|\s\s)/g, ""),
                city: data.city[i].replace(/,$/, ""),
                state: data.state[i],
                present_price: data.present_price[i],
                features: f,
                hotelsNg_link: data.hotelsNg_link[i]
            });
        });
        allData = allData.concat(val);
        // console.log(val, val.length)
    },
    // called once crawling finishes: deduplicate and persist everything that was collected
    finalFn: function () {
        hotelModel.insertMany(makeUnique(allData))
            .then(model => {
                allData = [];
                console.log("model inserted", model.length);
            })
            .catch(err => {
                allData = [];
                console.error(err);
            });
    },
    // how many levels of pagination links to follow
    depth: process.env.DEPTH,
    // starting pages, one listing page per state on hotels.ng
    urls: [
        "https://hotels.ng/hotels-in-abia",
        "https://hotels.ng/hotels-in-abuja",
        "https://hotels.ng/hotels-in-lagos"
    ]
};
const Crawler = crawler(config);

// clear the collection before a fresh crawl if needed
// hotelModel.remove({}).then(p => console.log(p))
//     .catch(err => console.error(err))

// crawl all the start URLs
Crawler.CrawlAllUrl();
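
The script requires "./hotelModel", which is not included in the gist. Below is a minimal sketch of what that module might look like, assuming a plain Mongoose schema over the scraped fields; the "Hotel" model name and the loose String types are assumptions, not part of the original gist.

// hotelModel.js (hypothetical sketch, not from the original gist)
const mongoose = require("mongoose");

const hotelSchema = new mongoose.Schema({
    hotel_name: String,
    location: String,
    city: String,
    state: String,
    present_price: String, // prices are scraped as display text, so kept as a string
    features: [String],
    hotelsNg_link: String
});

module.exports = mongoose.model("Hotel", hotelSchema);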