Skip to content

Instantly share code, notes, and snippets.

@TechNinjaWeb
Last active September 19, 2017 15:39
Show Gist options
  • Save TechNinjaWeb/f4dfe60c6c713c2524280e72aa28e5f7 to your computer and use it in GitHub Desktop.
Save TechNinjaWeb/f4dfe60c6c713c2524280e72aa28e5f7 to your computer and use it in GitHub Desktop.
Recursive async function to grab all products and their properties
//////////////////////////////////////////////////////////////////////
//////// Parse item saver prototype //////////
//////////////////////////////////////////////////////////////////////
// Save `this` (any plain object) to Parse as a new instance of `className`.
// The same callback handles both the success and the error outcome.
// FIX: a plain assignment to Object.prototype creates an *enumerable*
// property that leaks into every for...in loop in the page; define it
// non-enumerably instead.
Object.defineProperty(Object.prototype, 'saveItemToParse', {
    enumerable: false,
    value: function (className, cb) {
        var Obj = Parse.Object.extend(className);
        var obj = new Obj(this);
        // Callback for both error and success are the same
        obj.save(null, { success: cb, error: cb });
    }
});
//////////////////////////////////////////////////////////////////////
//////// USE IT! //////////
//////////////////////////////////////////////////////////////////////
// Example configuration: the Parse table/class name plus a logging callback
// invoked once per saved item.
var ParseTableClassName = "My Parse Class Name";
var ParseSavedItemCallback = function (result) {
    console.log("Saved item to parse: ", result);
};
// Kick off the rip and announce completion with the final result set.
UltimateSiteRipper(ParseTableClassName, ParseSavedItemCallback)
    .then(res => console.warn("Completed Process", res));
//////////////////////////////////////////////////////////////////////
//////// The Function //////////
//////////////////////////////////////////////////////////////////////
// Crawl the CDO legacy search API, one paginated page at a time, collecting
// every product into a url-keyed map, then resolve with a de-duplicated array
// of products. `ClassName`/`Callback` are only used by the (commented-out)
// Parse-save step, matching the original.
function UltimateSiteRipper(ClassName, Callback) {
    // Products keyed by url so a product seen on two pages is kept once.
    var products = {};
    // The full queries list would use every vowel and number:
    // ['a', 'e', 'i', 'o', 'u', 1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    // It's too powerful to use in dev.. only use when ready to go production!!!!
    var crawls = ['power']
        // Map new urls with query inserted
        .map(query => "https://www.canadiandieselonline.ca/wc-api/wc_ps_legacy_api/?action=get_results&q=" + query + "&cat_in=all&search_in=product&ps_lang=&psp=")
        // Run the recursive crawl on each url starting from -1 (technically idx: 0)
        .map(url => collectJsonFromURL(url, -1));

    // FIX: wait for *every* query's crawl before itemizing; the original
    // called Resolve as soon as the first crawl hit its last page, and never
    // awaited the recursive calls it fired.
    return Promise.all(crawls).then(() => {
        return Object.keys(products).map(key => {
            var item = products[key]; // FIX: was an implicit global `item`
            // Save products to parse if you want
            // item.saveItemToParse(ClassName, Callback);
            return item;
        });
    });

    // Recursively fetch `url` + pageIndex until the API reports no results.
    function collectJsonFromURL(url, idx) {
        var pageUrl = url + (idx + 1);
        // FIX: chain the recursive promise instead of dropping it, and drop
        // the redundant `new Promise` wrapper around the jQuery thenable.
        return Promise.resolve($.get(pageUrl)).then(res => {
            // A non-positive total means we've walked past the last page.
            if (res.total <= 0) return null;
            // Key each item by its url to guard against duplicates.
            res.items.forEach(item => { products[item.url] = item; });
            return collectJsonFromURL(url, idx + 1);
        });
    }
}
<html>
<head>
<meta charset="UTF-8">
<title>Canadian Diesel Online Site Ripper</title>
<style>
.products {
display: flex;
flex-direction: column;
}
.product {
margin-bottom: 20px;
list-style: none;
border: 1px solid black;
padding: 10px 20px;
display: flex;
flex-direction: row;
}
.image {
max-height: 80px;
max-width: 80px;
margin-right: 20px;
}
.item-number {
margin-top: 10px;
float: left;
}
</style>
</head>
<body>
<h1>Enjoy your data!</h1>
<hr />
<h3>Products</h3>
<ul class="products">
<% products.forEach(function(product, index){ %>
<li class="product">
<div class="image">
<img src='<%= product.images.thumbnail %>' /> <br />
<span class="item-number"><b>No.</b> <%= index %></span>
</div>
<div class="data">
<h3><span class="title"><%= product.title %></span></h3>
<hr />
<div class="sku"><b>SKU:</b> <%= product.sku %></div>
<div class="price"><b>PRICE:</b> $<%= product.price.toFixed(2) || 0.00 %></div>
<div class="description"><b>DESCRIPTION:</b> <%= product.description %></div>
<div class="keyword"><b>KEYWORDS:</b> <%= product.keyword %></div>
</div>
</li>
<% })%>
</ul>
</body>
</html>
<html>
<head>
<meta charset="UTF-8">
<title>CDO Site Products Ripper</title>
<style>
.button {
border: 1px solid #70b35f;
padding: 5px 30px;
background-color: #95f97c;
text-decoration: none;
}
.button:hover {
background-color: #73d05c
}
</style>
</head>
<body>
<h1>CDO Site Products Ripper</h1>
<p>
Would you like to get a list of products from CanadianDieselOnline.com? <a class="button" href="./rip-cdo">Yes</a>
</p>
</body>
</html>
//////////////////////////////////////////////////////////////////////
//////// Required Dependencies //////////
//////////////////////////////////////////////////////////////////////
// One declaration per line (FIX: was a single comma-chained `var` list, which
// hides the boundary between declarations and reads poorly).
const port = process.env.PORT || 5000;
const express = require('express');
const fs = require('fs');
const path = require('path');
const app = express();
const compression = require('compression');
const server = require('http').createServer(app);
const request = require('request');
const $ = require('cheerio');
const bodyParser = require('body-parser');
const Parse = require('parse/node');
//////////////////////////////////////////////////////////////////////
//////// Parse item saver prototype //////////
//////////////////////////////////////////////////////////////////////
// Save `this` (any plain object) to Parse as a new instance of `className`.
// The same callback handles both the success and the error outcome.
function saveParseObject(className, cb) {
    var Obj = Parse.Object.extend(className);
    var obj = new Obj(this);
    // Callback for both error and success are the same
    obj.save(null, { success: cb, error: cb });
}
// FIX: the original first did a plain (enumerable, writable) assignment to
// Object.prototype and then immediately redefined the property below; the
// assignment was redundant — defineProperty alone establishes the final
// non-enumerable, non-writable descriptor.
Object.defineProperty(Object.prototype, 'saveParseObject', {
    enumerable: false,
    configurable: false,
    writable: false,
    value: saveParseObject
});
// Initialize parse
// SECURITY NOTE(review): the application ID and JavaScript key are hard-coded
// below in a public gist — rotate these credentials and load them from
// environment variables before reusing this file.
Parse.initialize('3ouzEPg1EC1gDGCcyGniMtOhzLItxNZpFm58ZFjL', 'uVKdIy4d7bYWjdkSHK8YRwJWNMpiQX4la8uYmj2u');
// Point the SDK at the Back4App-hosted Parse server.
Parse.serverURL = "https://bba.back4app.io/";
//////////////////////////////////////////////////////////////////////
//////// Express App Setup //////////
//////////////////////////////////////////////////////////////////////
// View engine: EJS templates rendered from the public folder.
// FIX: the views directory and the port were each configured twice (the later
// call simply overwrote the earlier one); configure each exactly once, keeping
// the effective values (path.join form for views).
app.set('views', path.join(__dirname, '/public'));
app.set('view engine', 'ejs');
app.engine('ejs', require('ejs').renderFile);
// Compress all resources for better performance
app.use(compression({
    filter: compressionFilter,
    level: -1
}));
// Get port from environment and store in Express.
app.set('port', port);
// Enable trust proxy (heroku fix)
app.enable('trust proxy');
// Set Body Parsing
app.use(bodyParser.json());
// Set aliases to node_modules folder for public folder refs
app.use('/css', express.static(__dirname + '/node_modules'));
app.use('/scripts', express.static(__dirname + '/node_modules'));
// create default route logic
app.get('/', function (req, res, next) {
    res.render('index');
});
//////////////////////////////////////////////////////////////////////
//////// Rip The Site //////////
//////////////////////////////////////////////////////////////////////
// GET /rip-cdo — run the full scrape, then render the product list template.
app.get('/rip-cdo', function (req, res, next) {
    var ParseTableClassName = "CDO_PRODUCTS";
    var ParseSavedItemCallback = function (result) { console.log("Saved item to parse: ", result); };
    // Here's where the magic starts!
    UltimateSiteRipper(
        ParseTableClassName,
        ParseSavedItemCallback
    ).then(results => {
        console.log("Finished Processing Site: ", !!results);
        // res.json(results);
        res.render('cdo', { products: results });
    })
    // FIX: forward scraper failures to Express error handling; the original
    // had no rejection handler, so a failed scrape left the request hanging.
    .catch(next);
});
// Use default index from public folder
// NOTE(review): registered *after* the explicit routes above, so static files
// are only served for paths no route matched.
app.use(express.static(__dirname + '/public'));
//////////////////////////////////////////////////////////////////////
//////// Server Listener //////////
//////////////////////////////////////////////////////////////////////
// Start the HTTP server (it wraps `app` via http.createServer(app) above).
server.listen(port, () => console.log(`App running on localhost:${port}`));
//////////////////////////////////////////////////////////////////////
//////// Hoisted Required Functions //////////
//////////////////////////////////////////////////////////////////////
// Decide whether a response should be compressed: honour the
// `x-no-compression` escape-hatch header, otherwise defer to the
// compression module's standard filter.
function compressionFilter(req, res) {
    if (!req.headers['x-no-compression']) {
        // fallback to standard filter function
        return compression.filter(req, res);
    }
    // don't compress responses with this request header
    return false;
}
// Crawl every paginated page of the CDO legacy search API, normalise each
// product (price, image urls), save it to Parse via `saveParseObject`, and
// resolve with a de-duplicated array of products.
function UltimateSiteRipper(ClassName, Callback) {
    if (!ClassName) ClassName = "Blank";
    if (!Callback) Callback = function () {};
    // Products keyed by url so a product seen on two pages is kept once.
    var products = {};
    // The full queries list would use every vowel and number:
    // var queries = ['a', 'e', 'i', 'o', 'u', 1, 2, 3, 4, 5, 6, 7, 8, 9, 0];
    // It's too powerful to use in dev.. only use when ready to go production!!!!
    var queries = ['power']; // Returns 17 Links each with about 16 products per response
    var crawls = queries
        // Map new urls with query inserted
        .map(query => "https://www.canadiandieselonline.ca/wc-api/wc_ps_legacy_api/?action=get_results&q=" + query + "&cat_in=all&search_in=product&ps_lang=&psp=")
        // Run the recursive crawl on each url starting from -1 (technically idx: 0)
        .map(url => collectJsonFromURL(url, -1));

    // FIX: wait for *all* queries before itemizing; the original resolved the
    // outer promise as soon as the first query's crawl hit its last page.
    return Promise.all(crawls).then(() => {
        return Object.keys(products).map(key => {
            var item = products[key]; // FIX: was an implicit global `item`
            // Persist to Parse; success and error share the same callback.
            item.saveParseObject(ClassName, Callback);
            return item;
        });
    });

    // Recursively fetch `url` + pageIndex until the API reports no results.
    function collectJsonFromURL(url, idx) {
        return new Promise((resolve, reject) => {
            var newIndex = idx + 1;
            var newUrl = url + newIndex;
            console.log("Processing URL: " + newUrl); // Leave this in here for console updates
            request(newUrl, (error, response, body) => {
                // FIX: reject *this* promise (was the outer Reject, which left
                // this promise forever pending inside Promise.all).
                if (error) return reject(error);
                var result;
                try { result = JSON.parse(body); } catch (e) { result = null; }
                // FIX: guard the null (unparseable body) case before reading
                // `.total` — the original threw a TypeError here.
                if (!result || result.total <= 0) return resolve(null);
                result.items.forEach(item => {
                    // Blank out placeholder thumbnails.
                    var thumbnail_url = item.image_url.search("placeholder") < 0
                        ? item.image_url : "";
                    delete item.image_url;
                    delete item.addtocart;
                    // Price arrives as an HTML fragment like "<span>$1,234.50</span>".
                    item.price = parseFloat($(item.price).text().replace('$', '').replace(',', ''));
                    item.images = {
                        thumbnail: thumbnail_url,
                        // Full-size image: strip the "-80x80" thumbnail suffix.
                        image: thumbnail_url
                            .split('/')
                            .map(seg => seg.replace('-80x80', ''))
                            .join('/')
                    };
                    // Key by url to guard against duplicates.
                    products[item.url] = item;
                });
                // More results on this page — chain into the next page.
                resolve(collectJsonFromURL(url, newIndex));
            });
        });
    }
}
{
"name": "cdo_site_product_ripper",
"version": "1.0.0",
"description": "Scrapes product data from canadiandieselonline.ca and stores it in Parse.",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "nodemon index.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"body-parser": "^1.17.2",
"cheerio": "^1.0.0-rc.2",
"compression": "^1.7.0",
"ejs": "^2.5.7",
"express": "^4.15.4",
"fs": "0.0.1-security",
"http": "0.0.0",
"nodemon": "^1.11.0",
"parse": "^1.10.0",
"path": "^0.12.7",
"request": "^2.81.0"
}
}
/*
Site Map Builder will build a list of links
on the same domain by recursively following all
unknown links
*/
var SiteMap = {};
// Gather and register all same-domain links found on `domain`'s page.
// NOTE(review): this routine is an unfinished draft — `html` is a free,
// undefined name (the page download is still a TODO) and the Promise
// executor was missing its `=>`. The syntax is repaired below; the fetch
// step still needs to be written.
function ProcessPageForLinks( domain ) {
    return new Promise((resolve, reject) => { // FIX: was `(resolve, reject){` — a syntax error
        // TODO: download the page for `domain`; `html` is assumed to be in
        // scope here — confirm the intended source.
        var links = ProcessAllLinks( GatherAllLinksOnPage( $(html) ) );
        links.map( AddLinkToSiteMap )
            .map( GetUnFollowedLinks );
        // FIX: the original `return ProcessAllLinks(links)` after the Promise
        // was unreachable; resolve with that value instead.
        resolve( ProcessAllLinks( links ) );
    });
}
// Collect the href of every anchor inside the given (cheerio/jQuery-style)
// page node, as a plain array.
// FIX: the original ignored its `page` argument entirely and queried the
// global document via `$('html')`; use the supplied node instead.
function GatherAllLinksOnPage(page) {
    return Array.prototype.slice.call(
        page.find('a').map((i, l) => l.href)
    );
}
// Register `link` in SiteMap (as not-yet-followed) when it is new and on the
// expected domain; return its stored value only when that value is truthy.
function AddLinkToSiteMap( link ) {
// Return Link if it hasn't been checked
// NOTE(review): `domain` is not defined in this scope (neither a parameter
// nor a visible module-level var) — calling this throws a ReferenceError.
// TODO confirm where `domain` was meant to come from.
var unique = !SiteMap.hasOwnProperty( link );
var valid_domain = VerifyDomainForLink( domain, link );
// Create ref if it's unique and on a valid domain
if ( unique && valid_domain ) SiteMap[ link ] = false;
// NOTE(review): freshly added links are stored as `false`, so this returns
// undefined for them; only links previously marked truthy are returned.
if( !!SiteMap[ link ] ) return SiteMap[ link ];
}
// Return only the links already registered in the SiteMap.
// FIX: the original used `.map` with a bare conditional, which produced an
// array with `undefined` holes for every non-matching link; `.filter`
// expresses the intent and returns only the matches.
function FollowAllLinks( links ) {
    return links.filter(link => CheckSiteMapForMatchingLink(link));
}
// NOTE(review): unimplemented stub — never called from the visible code.
function FollowLink( link ) {
}
// Keep only the links whose host matches `this.domain`.
// NOTE(review): relies on `this.domain`, but every visible call site invokes
// this as a plain function, where `this` is undefined (strict/module code) or
// the global object — so `domain` is likely undefined. TODO confirm the
// intended receiver.
function ProcessAllLinks ( links ) {
return links.map(link=>{
if( VerifyDomainForLink(this.domain, link) )
return link
}).filter(NotNull)
}
// True when `link` has already been registered as a key in the SiteMap.
function CheckSiteMapForMatchingLink( link ){
    return Object.keys( SiteMap ).includes( link );
}
// Return the registered links whose SiteMap value is truthy.
// FIX: `.map(Key=>if(...) Key)` was a syntax error — an `if` statement is not
// an expression inside a concise arrow body; `.filter` expresses the intent.
// NOTE(review): the name says *unfollowed*, yet unfollowed links are stored
// as `false`; the truthy test may need inverting — TODO confirm.
function GetUnFollowedLinks () {
    return Object.keys( SiteMap )
        .filter(Key => !!SiteMap[ Key ]);
}
// Compare the host segment of an absolute URL ("scheme://host/...") with the
// expected domain; segment 2 of a '/'-split is the host.
function VerifyDomainForLink( domain, link ) {
    const linkDomain = link.split('/')[2];
    return linkDomain === domain;
}
// Generic promise handlers.
// Log successful results and pass them through unchanged.
function success( results ) {
    console.log("Got results", results);
    return results;
}
// Log an error alongside any partial results; pass the results through.
function error( error, results ) {
    console.log("Got error", error, results);
    return results;
}
// Identity for truthy values, undefined otherwise (used as a filter predicate).
function NotNull(r) {
    if (r) return r;
}
// NOTE(review): orphaned fragment — `links` and `domain` are not defined at
// this top level, so executing this throws a ReferenceError. It duplicates
// the body of ProcessAllLinks; TODO remove it or wire it into a caller.
links.map(link=>{
if( VerifyDomainForLink(domain, link) )
return link
}).filter(NotNull)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment