Last active
September 19, 2017 15:39
-
-
Save TechNinjaWeb/f4dfe60c6c713c2524280e72aa28e5f7 to your computer and use it in GitHub Desktop.
Recursive async function to grab all products and their properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
////////////////////////////////////////////////////////////////////// | |
//////// Parse item saver prototype ////////// | |
////////////////////////////////////////////////////////////////////// | |
/**
 * Adds a `saveItemToParse(className, cb)` helper to every object.
 *
 * The helper wraps the receiver (`this`) in a Parse object of class
 * `className` and saves it. `cb` is used for both the success and the
 * error outcome.
 *
 * The property is defined as non-enumerable so that it does not show up
 * in `for...in` loops over ordinary objects.
 */
Object.defineProperty(Object.prototype, 'saveItemToParse', {
  enumerable: false,
  configurable: true,
  writable: true,
  value: function (className, cb) {
    var Obj = Parse.Object.extend(className);
    var obj = new Obj(this);
    // Callback for both error and success are the same
    obj.save(null, { success: cb, error: cb });
  }
});
////////////////////////////////////////////////////////////////////// | |
//////// USE IT! ////////// | |
////////////////////////////////////////////////////////////////////// | |
// Example invocation: rip the site and log the collected products.
var ParseTableClassName = "My Parse Class Name";
var ParseSavedItemCallback = function(result) { console.log("Saved item to parse: ", result); };
UltimateSiteRipper(
  ParseTableClassName,
  ParseSavedItemCallback
)
  .then(res => console.warn("Completed Process", res))
  // Report failures instead of leaving an unhandled promise rejection
  .catch(err => console.error("Site rip failed", err));
////////////////////////////////////////////////////////////////////// | |
//////// The Function ////////// | |
////////////////////////////////////////////////////////////////////// | |
/**
 * Collects every product returned by the site's search endpoint.
 *
 * For each search query the paginated endpoint is requested page by page
 * (page index appended to the `psp=` parameter, starting at 0) until a page
 * reports no results. Items are de-duplicated by their `url` property.
 *
 * Relies on a global `$` exposing `$.get(url)` that returns a thenable
 * yielding the parsed JSON response.
 *
 * @param {string} ClassName - Parse class name; only needed if saving is enabled below.
 * @param {Function} Callback - Parse save callback; only needed if saving is enabled below.
 * @returns {Promise<Object[]>} Resolves with the array of unique products once
 *   every query has been fully paginated; rejects if any request fails.
 */
function UltimateSiteRipper(ClassName, Callback) {
  // Products keyed by url so duplicates across pages/queries collapse
  const products = {};
  // Using every vowel and digit as a query hits the whole catalogue.
  // It's too powerful to use in dev.. only use when ready to go production!!!!
  // const queries = ['a', 'e', 'i', 'o', 'u', 1, 2, 3, 4, 5, 6, 7, 8, 9, 0];
  const queries = ['power'];

  // Recursive function: fetch page idx + 1, store its items, then move on
  function collectJsonFromURL(url, idx) {
    const pageIndex = idx + 1;
    // Promise.resolve adopts whatever thenable $.get returns, so a failed
    // request becomes a rejection of the returned promise
    return Promise.resolve($.get(url + pageIndex)).then(res => {
      const items = res && Array.isArray(res.items) ? res.items : [];
      // No total (or nothing on the page) means we are past the last page.
      // NOTE(review): the empty-page stop is a safety net against endless
      // pagination; confirm the endpoint never returns empty middle pages.
      if (!res || !(res.total > 0) || items.length === 0) return products;
      items.forEach(item => { products[item.url] = item; });
      return collectJsonFromURL(url, pageIndex);
    });
  }

  const crawls = queries
    // Map new urls with query inserted
    .map(query => "https://www.canadiandieselonline.ca/wc-api/wc_ps_legacy_api/?action=get_results&q=" + query + "&cat_in=all&search_in=product&ps_lang=&psp=")
    // Run recursive find on the url starting from -1 (technically idx: 0)
    .map(url => collectJsonFromURL(url, -1));

  // Only build the final list once every query has finished
  return Promise.all(crawls).then(() => {
    return Object.keys(products).map(key => {
      const item = products[key];
      // Save products to parse if you want
      // item.saveItemToParse(ClassName, Callback);
      return item;
    });
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="UTF-8"> | |
<title>Canadian Diesel Online Site Ripper</title> | |
<style> | |
.products { | |
display: flex; | |
flex-direction: column; | |
} | |
.product { | |
margin-bottom: 20px; | |
list-style: none; | |
border: 1px solid black; | |
padding: 10px 20px; | |
display: flex; | |
flex-direction: row; | |
} | |
.image { | |
max-height: 80px; | |
max-width: 80px; | |
margin-right: 20px; | |
} | |
.item-number { | |
margin-top: 10px; | |
float: left; | |
} | |
</style> | |
</head> | |
<body> | |
<h1>Enjoy your data!</h1> | |
<hr /> | |
<h3>Products</h3> | |
<ul class="products"> | |
<%# Renders one list entry per product. `products` must be an array supplied by the renderer. %>
<% products.forEach(function(product, index){ %>
<%
  // A price that is missing or NaN cannot be formatted; fall back to 0.00
  var hasPrice = typeof product.price === 'number' && isFinite(product.price);
  // Skip the <img> entirely when there is no thumbnail, so the browser
  // does not issue a request for an empty src
  var thumbnail = product.images && product.images.thumbnail;
%>
<li class="product">
  <div class="image">
    <% if (thumbnail) { %><img src='<%= thumbnail %>' /><% } %> <br />
    <span class="item-number"><b>No.</b> <%= index %></span>
  </div>
  <div class="data">
    <h3><span class="title"><%= product.title %></span></h3>
    <hr />
    <div class="sku"><b>SKU:</b> <%= product.sku %></div>
    <div class="price"><b>PRICE:</b> $<%= hasPrice ? product.price.toFixed(2) : '0.00' %></div>
    <div class="description"><b>DESCRIPTION:</b><%= product.description %></div>
    <div class="keyword"><b>KEYWORDS:</b> <%= product.keyword %></div>
  </div>
</li>
<% })%>
</ul> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<html> | |
<head> | |
<meta charset="UTF-8"> | |
<title>CDO Site Products Ripper</title> | |
<style> | |
.button { | |
border: 1px solid #70b35f; | |
padding: 5px 30px; | |
background-color: #95f97c; | |
text-decoration: none; | |
} | |
.button:hover { | |
background-color: #73d05c | |
} | |
</style> | |
</head> | |
<body> | |
<h1>CDO Site Products Ripper</h1> | |
<p> | |
Would you like to get a list of products from CanadianDieselOnline.com? <a class="button" href="./rip-cdo">Yes</a> | |
</p>
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
////////////////////////////////////////////////////////////////////// | |
//////// Required Dependencies ////////// | |
////////////////////////////////////////////////////////////////////// | |
var port = process.env.PORT || 5000, | |
express = require('express'), | |
fs = require('fs'), | |
path = require('path'), | |
app = express(), | |
compression = require('compression'), | |
server = require('http').createServer(app), | |
request = require('request'), | |
$ = require('cheerio'), | |
bodyParser = require('body-parser'), | |
Parse = require('parse/node'); | |
////////////////////////////////////////////////////////////////////// | |
//////// Parse item saver prototype ////////// | |
////////////////////////////////////////////////////////////////////// | |
/**
 * Saves the receiver (`this`) as a Parse object of class `className`.
 *
 * Intended to be called as a method (`item.saveParseObject(...)`); the
 * receiver's contents are passed to the Parse object constructor.
 *
 * @param {string} className - Name of the Parse class to save into.
 * @param {Function} cb - Invoked for both the success and the error outcome.
 */
function saveParseObject(className, cb) {
  var Obj = Parse.Object.extend(className);
  var obj = new Obj(this);
  // Callback for both error and success are the same
  obj.save(null, { success: cb, error: cb });
}
// Expose the helper on every object. It is defined directly as a
// non-enumerable, read-only property so it never appears in for...in loops
// or Object.keys(); no prior plain assignment is needed.
Object.defineProperty(Object.prototype, 'saveParseObject', {
  enumerable: false,
  configurable: false,
  writable: false,
  value: saveParseObject
});
// Initialize parse
// NOTE(review): these credentials were committed to source. They remain as
// fallbacks so existing deployments keep working, but they should be rotated
// and supplied through the environment instead.
Parse.initialize(
  process.env.PARSE_APP_ID || '3ouzEPg1EC1gDGCcyGniMtOhzLItxNZpFm58ZFjL',
  process.env.PARSE_JS_KEY || 'uVKdIy4d7bYWjdkSHK8YRwJWNMpiQX4la8uYmj2u'
);
Parse.serverURL = process.env.PARSE_SERVER_URL || "https://bba.back4app.io/";
////////////////////////////////////////////////////////////////////// | |
//////// Express App Setup ////////// | |
////////////////////////////////////////////////////////////////////// | |
// Render .ejs templates located in the public folder
app.engine('ejs', require('ejs').renderFile);
app.set('view engine', 'ejs');
app.set('views', path.join(__dirname, '/public'));
// Compress all resources for better performance
app.use(compression({
  filter: compressionFilter,
  level: -1
}));
// Get port from environment and store in Express.
app.set('port', port);
// Enable trust proxy (heroku fix)
app.enable('trust proxy');
// Set Body Parsing
app.use(bodyParser.json());
// Set aliases to node_modules folder for public folder refs
app.use('/css', express.static(__dirname + '/node_modules'));
app.use('/scripts', express.static(__dirname + '/node_modules'));
// create default route logic
app.get('/', function(req, res, next){
  res.render('index');
});
////////////////////////////////////////////////////////////////////// | |
//////// Rip The Site ////////// | |
////////////////////////////////////////////////////////////////////// | |
// GET /rip-cdo: crawl the site, then render the collected products with the
// 'cdo' view.
app.get('/rip-cdo', function(req, res, next){
  var ParseTableClassName = "CDO_PRODUCTS";
  var ParseSavedItemCallback = function(result) { console.log("Saved item to parse: ", result); };
  // Here's where the magic starts!
  UltimateSiteRipper(
    ParseTableClassName,
    ParseSavedItemCallback
  ).then(results => {
    console.log("Finished Processing Site: ", !!results);
    // res.json(results);
    res.render('cdo', { products: results });
  })
  // Hand crawl/render failures to Express so the request gets an error
  // response instead of hanging forever
  .catch(next);
});
// Requests not handled by a route above fall through to static files
// in the public folder
app.use(
  express.static(__dirname + '/public')
);
//////////////////////////////////////////////////////////////////////
////////                 Server Listener                    //////////
//////////////////////////////////////////////////////////////////////
// Start accepting connections and announce the port once bound
server.listen(port, function () {
  console.log(`App running on localhost:${port}`);
});
////////////////////////////////////////////////////////////////////// | |
//////// Hoisted Required Functions ////////// | |
////////////////////////////////////////////////////////////////////// | |
/**
 * Decides whether a response should be compressed.
 *
 * @param {Object} req - Incoming request; its `headers` are inspected.
 * @param {Object} res - Outgoing response, forwarded to the standard filter.
 * @returns {*} `false` when the request carries an `x-no-compression`
 *   header, otherwise whatever `compression.filter` returns.
 */
function compressionFilter(req, res) {
  const optedOut = Boolean(req.headers['x-no-compression']);
  // Clients opt out with the header; everyone else gets the stock filter
  return optedOut ? false : compression.filter(req, res);
}
/**
 * Collects every product returned by the site's search endpoint.
 *
 * For each search query the paginated endpoint is requested page by page
 * (page index appended to the `psp=` parameter, starting at 0) until a page
 * reports no results. Each item is normalised (numeric `price`, `images`
 * object, `image_url`/`addtocart` removed) and de-duplicated by `url`.
 * Once all queries are done, every product that exposes a
 * `saveParseObject` method is saved through it.
 *
 * Uses the module-level `request` (HTTP client) and `$` (HTML parser).
 *
 * @param {string} [ClassName="Blank"] - Parse class name used when saving.
 * @param {Function} [Callback] - Called with the outcome of each save.
 * @returns {Promise<Object[]>} Resolves with the array of unique products;
 *   rejects on a request error or an unparseable response body.
 */
function UltimateSiteRipper(ClassName, Callback) {
  const className = ClassName || "Blank";
  const onSaved = Callback || function(){};
  // Products keyed by url so duplicates across pages/queries collapse
  const products = {};
  // Using every vowel and digit as a query hits the whole catalogue.
  // It's too powerful to use in dev.. only use when ready to go production!!!!
  // const queries = ['a', 'e', 'i', 'o', 'u', 1, 2, 3, 4, 5, 6, 7, 8, 9, 0];
  const queries = ['power']; // Returns 17 Links each with about 16 products per response

  // Requests one page and yields its parsed JSON body
  function fetchPage(pageUrl) {
    return new Promise((resolve, reject) => {
      request(pageUrl, (error, response, body) => {
        if (error) return reject(error);
        let result;
        try {
          result = JSON.parse(body);
        } catch (e) {
          // A non-JSON body is a failure of this crawl, not a process crash
          return reject(new Error("Invalid JSON from " + pageUrl + ": " + e.message));
        }
        resolve(result);
      });
    });
  }

  // Rewrites a raw search item in place into the stored product shape
  function normalizeItem(item) {
    const imageUrl = typeof item.image_url === 'string' ? item.image_url : "";
    // Placeholder images are treated as "no image"
    const thumbnail = imageUrl.search("placeholder") < 0 ? imageUrl : "";
    delete item.image_url;
    delete item.addtocart;
    // The price arrives as markup; keep only the number and drop every
    // thousands separator, not just the first
    item.price = parseFloat($(item.price).text().replace('$', '').replace(/,/g, ''));
    item.images = {
      thumbnail: thumbnail,
      // Same path with the '-80x80' marker removed from each path segment
      image: thumbnail
        .split('/')
        .map(segment => segment.replace('-80x80', ''))
        .join('/')
    };
    return item;
  }

  // Recursive function: fetch page idx + 1, store its items, then move on.
  // Resolves with null once the last page has been passed.
  function collectJsonFromURL(url, idx) {
    const pageIndex = idx + 1;
    const pageUrl = url + pageIndex;
    console.log("Processing URL: " + pageUrl); // Leave this in here for console updates
    return fetchPage(pageUrl).then(result => {
      const items = result && Array.isArray(result.items) ? result.items : [];
      // No total (or nothing on the page) means we are past the last page.
      // NOTE(review): the empty-page stop is a safety net against endless
      // pagination; confirm the endpoint never returns empty middle pages.
      if (!result || !(result.total > 0) || items.length === 0) return null;
      items.forEach(item => {
        normalizeItem(item);
        // Key by URL to guard against duplicates
        products[item.url] = item;
      });
      return collectJsonFromURL(url, pageIndex);
    });
  }

  const crawls = queries
    // Map new urls with query inserted
    .map(query => "https://www.canadiandieselonline.ca/wc-api/wc_ps_legacy_api/?action=get_results&q=" + encodeURIComponent(query) + "&cat_in=all&search_in=product&ps_lang=&psp=")
    // Run recursive find on the url starting from -1 (technically idx: 0)
    .map(url => collectJsonFromURL(url, -1));

  // Only build (and save) the final list once every query has finished
  return Promise.all(crawls).then(() => {
    return Object.keys(products).map(key => {
      const item = products[key];
      // Save products to parse when the helper is available
      if (typeof item.saveParseObject === 'function') {
        item.saveParseObject(className, onSaved);
      }
      return item;
    });
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "cdo_site_product_ripper", | |
"version": "1.0.0", | |
"description": "", | |
"main": "index.js", | |
"scripts": { | |
"test": "echo \"Error: no test specified\" && exit 1", | |
"start": "nodemon index.js" | |
}, | |
"author": "", | |
"license": "ISC", | |
"dependencies": { | |
"body-parser": "^1.17.2", | |
"cheerio": "^1.0.0-rc.2", | |
"compression": "^1.7.0", | |
"ejs": "^2.5.7", | |
"express": "^4.15.4", | |
"fs": "0.0.1-security", | |
"http": "0.0.0", | |
"nodemon": "^1.11.0", | |
"parse": "^1.10.0", | |
"path": "^0.12.7", | |
"request": "^2.81.0" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Site Map Builder will build a list of links | |
on the same domain by recursively following all | |
unknown links | |
*/ | |
// Known same-domain links. Value is false until the link has been followed.
var SiteMap = {};
// Host currently being mapped; set by ProcessPageForLinks and read by the
// helpers below (replaces the undeclared global `domain`).
var SiteMapDomain = null;

/**
 * Scans a page for links on `domain` and records them in SiteMap.
 *
 * NOTE(review): fetching the page for `domain` is not implemented. Pass the
 * page markup/element as `html`; when omitted, the current document is
 * scanned.
 *
 * @param {string} domain - Host name that links must match (e.g. "example.com").
 * @param {*} [html] - Markup or element handed to `$()`.
 * @returns {Promise<string[]>} Resolves with the links not yet followed.
 */
function ProcessPageForLinks( domain, html ) {
  return new Promise((resolve, reject) => {
    try {
      SiteMapDomain = domain;
      var page = html === undefined ? undefined : $(html);
      var links = ProcessAllLinks( GatherAllLinksOnPage( page ) );
      links.forEach(link => { AddLinkToSiteMap( link ); });
      resolve( GetUnFollowedLinks() );
    } catch (err) {
      reject(err);
    }
  });
}

// Returns the href of every anchor in `page` (a `$` collection), or in the
// current document when no page is given.
function GatherAllLinksOnPage(page) {
  var root = page || $('html');
  return root.find('a').map((i,l)=>l.href).get();
}

// Registers `link` as not-yet-followed when it is new and on the mapped
// domain. Returns the link if it is registered and unfollowed, otherwise
// undefined.
function AddLinkToSiteMap( link ) {
  var unique = !Object.prototype.hasOwnProperty.call( SiteMap, link );
  var valid_domain = VerifyDomainForLink( SiteMapDomain, link );
  // Create ref if its unique and valid domain
  if ( unique && valid_domain ) SiteMap[ link ] = false;
  if ( SiteMap[ link ] === false ) return link;
}

// Returns the subset of `links` that are already recorded in SiteMap.
// NOTE(review): nothing is actually followed yet; see FollowLink.
function FollowAllLinks( links ) {
  return links.filter(link => CheckSiteMapForMatchingLink( link ));
}

// TODO: not implemented. Presumably should fetch `link` and mark it as
// followed in SiteMap; confirm intended behaviour.
function FollowLink( link ) {
}

// Keeps only the links whose host matches the mapped domain. A `domain` on
// the receiver (`this.domain`) takes precedence when present.
function ProcessAllLinks ( links ) {
  var domain = (this && this.domain) || SiteMapDomain;
  return links.filter(link => VerifyDomainForLink( domain, link ));
}

// True when `link` is a key of SiteMap.
function CheckSiteMapForMatchingLink( link ){
  return typeof link === 'string'
    && Object.prototype.hasOwnProperty.call( SiteMap, link );
}

// Returns every recorded link that has not been followed yet.
function GetUnFollowedLinks () {
  return Object.keys( SiteMap )
    .filter(Key => !SiteMap[ Key ]);
}

// True when the host part of an absolute `link` ("scheme://host/...")
// equals `domain`. Relative or non-string links never match.
function VerifyDomainForLink( domain, link ) {
  if (typeof link !== 'string') return false;
  var link_domain = link.split('/')[2];
  return link_domain !== undefined && domain === link_domain;
}

// Handlers
function success( results )
{ console.log("Got results", results); return results; }
function error( error, results )
{ console.log("Got error", error, results); return results; }
// Filter predicate: passes truthy values through, drops everything else
function NotNull(r){if(!!r) return r}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment