Skip to content

Instantly share code, notes, and snippets.

@adambyram
Created September 19, 2015 03:38
Show Gist options
  • Save adambyram/9f4e6eea2d983de83a16 to your computer and use it in GitHub Desktop.
Save adambyram/9f4e6eea2d983de83a16 to your computer and use it in GitHub Desktop.
Review Downloader
// AppExchange listing page whose reviews are downloaded.
var url = "https://appexchange.salesforce.com/listingDetail?listingId=a0N30000004fnkUEAQ";
// PhantomJS web page object used to open the listing and to run code inside it.
var page = require('webpage').create();
// Set to true when a response for the listingDetail URL is received; reset to false
// right before each click that requests a new batch of reviews.
var loaded = false;
// Number of the review page currently being collected (starts at 1).
var currentPage = 1;
// Set this to the highest page of reviews you want to load (e.g. if there are 1000 reviews, that's 100 pages to load)
var maximumPage = 1;
// It takes time to load each page of reviews. If this value is too low, you'll get errors because the data hasn't
// fully loaded. 15 seconds seems to always work, but 5 or 10 seconds may be enough.
// The value is in milliseconds and is the delay between successive checks of `loaded`.
var pageLoadDelay = 15000;
// Reviews gathered so far across all pages; serialized to reviews.json when collection ends.
var collectedReviews = [];
function scrapeAfterLoad() {
if(loaded) {
console.log('Collecting Reviews for Page ' + currentPage);
var results = page.evaluate(function () {
var reviewCollection = [];
var reviews = document.getElementsByClassName('feed-container')[0].getElementsByClassName('feed-item');
var reviewCount = reviews.length;
for (var reviewIndex = 0; reviewIndex < reviewCount; reviewIndex++) {
var reviewItem = reviews[reviewIndex];
// Grab the review text - if there is a "more" link, follow that and use that text instead
var reviewText = reviewItem.getElementsByClassName('feed-item-text-short-desc')[0].textContent;
var reviewMoreText = reviewItem.getElementsByClassName('feed-item-text-more-text');
if(reviewMoreText.length > 0) {
reviewText = reviewMoreText[0].textContent;
}
// Grab the rating from the CSS class "50" = 5 stars, "40" = 4 stars, etc.
var rating = reviewItem.getElementsByClassName('feed-item-rating')[0].getElementsByClassName('rating-stars')[0].className.split(' ')[1].split('-')[2];
// Grab the date - this is a relative date (so the more recent reviews may say "Today" or "Yesterday" etc)
var date = reviewItem.getElementsByClassName('feed-footer-link-secondary')[0].textContent;
reviewCollection.push({'rating': rating, 'date': date, 'text': reviewText});
}
return reviewCollection;
});
collectedReviews = collectedReviews.concat(results);
// Checks to see if the "next page" link is there - if it is, we can load more pages
var performAnotherLoad = page.evaluate(function() {
return document.getElementById('listingDetailPage:AppExchangeLayout:listingDetailForm:listingDetailReviewsTab:reviewsTabComponent:nextRvwCmdLink') != null;
});
if(performAnotherLoad && currentPage < maximumPage) {
nextReviewLoad();
} else {
// Write out the reviews to a json file
var fs = require('fs');
try {
fs.write("reviews.json", JSON.stringify(collectedReviews), 'w');
} catch(e) {
console.log(e);
}
console.log('Done.');
phantom.exit();
}
} else {
setTimeout(scrapeAfterLoad, pageLoadDelay);
}
}
/**
 * Dispatches a synthetic left mouse click on the page element with the given id.
 *
 * The callback runs inside the web page; the id is handed over as an
 * argument of page.evaluate because the callback cannot see this script's
 * variables.
 *
 * @param {string} elementId - id of the element to click.
 * @returns {boolean} true when the element was found and the click was
 *     dispatched, false otherwise.
 */
function clickElementById(elementId) {
    var clicked = page.evaluate(function (id) {
        var target = document.getElementById(id);
        if (!target) {
            return false;
        }
        var ev = document.createEvent("MouseEvent");
        ev.initMouseEvent(
            "click",
            true /* bubble */, true /* cancelable */,
            window, null,
            0, 0, 0, 0, /* coordinates */
            false, false, false, false, /* modifier keys */
            0 /*left*/, null
        );
        target.dispatchEvent(ev);
        return true;
    }, elementId);
    return clicked === true;
}

// To start loading reviews, we have to click the reviews tab to force the page to start loading reviews.
// If the tab cannot be found there is nothing to wait for, so the script exits with status 1.
function startReviewLoad() {
    console.log('Waiting for Page ' + currentPage + ' Reviews');
    loaded = false;
    if (!clickElementById('tab_content_reviews')) {
        console.log('Unable to find the reviews tab');
        phantom.exit(1);
        return;
    }
    scrapeAfterLoad();
}

// This clicks the "next page" link to force the load of another round of reviews
function nextReviewLoad() {
    currentPage++;
    loaded = false;
    console.log('Waiting for Page ' + currentPage + ' Reviews');
    if (!clickElementById('listingDetailPage:AppExchangeLayout:listingDetailForm:listingDetailReviewsTab:reviewsTabComponent:nextRvwCmdLink')) {
        // Only reported: the reviews collected so far are kept and the wait proceeds.
        console.log('Unable to find the next page link');
    }
    scrapeAfterLoad();
}
// Entry point: open the listing page, then start collecting reviews.
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
        // Non-zero status so the caller can tell the run failed.
        phantom.exit(1);
        return;
    }

    // We have to wait on the listingDetail url to load since that has the data we need.
    // The callback fires at the start and at the end of each response; only the
    // 'end' stage means the response has been received in full.
    page.onResourceReceived = function (response) {
        if (response.stage === 'end' && response.url === "https://appexchange.salesforce.com/listingDetail") {
            loaded = true;
        }
    };

    // Kick off the whole process
    startReviewLoad();
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment