Skip to content

Instantly share code, notes, and snippets.

@ekyfauzi
Forked from juanbrujo/react-app-scrapping.js
Created October 18, 2017 07:24
Show Gist options
  • Save ekyfauzi/1ca464ce0a94d79238be855c1de38f07 to your computer and use it in GitHub Desktop.
Save ekyfauzi/1ca464ce0a94d79238be855c1de38f07 to your computer and use it in GitHub Desktop.
Scraping a React App using PhantomJS and Cheerio
var phantom = require('phantom');
var Q = require('q');
var cheerio = require('cheerio');
// Handles to the PhantomJS process and its page. They live at module level
// because textPopulated() reads _page. _outObj is not used anywhere in the
// visible script and is kept only so the declaration stays unchanged.
var _ph, _page, _outObj;
var url = ABSOLUTE_URL; // change here for your React app site

/**
 * Closes the page (if one was created) and then stops the PhantomJS process
 * (if one was started). Failures while closing the page are logged so that
 * the process is still asked to exit.
 *
 * @returns {Promise} settles once the exit request has been issued.
 */
function shutdownPhantom() {
  // Promise.resolve() also covers the case where close() returns a non-promise.
  var closing = _page ? Promise.resolve(_page.close()) : Promise.resolve();
  return closing
    .catch(e => console.log(e))
    .then(() => (_ph ? _ph.exit() : undefined));
}

// Pipeline: start PhantomJS -> open the page -> wait until #app is rendered
// -> read the rendered HTML -> extract links with cheerio -> print them.
phantom.create().then(ph => {
  _ph = ph;
  return _ph.createPage();
}).then(page => {
  _page = page;
  return _page.open(url);
}).then(status => {
  console.log(status);
  // A failed load would otherwise be polled until the wait times out.
  if (status !== 'success') {
    throw new Error('Failed to open ' + url + ' (status: ' + status + ')');
  }
  return waitState(textPopulated, 3);
}).then(() => {
  return _page.property('content');
}).then(content => {
  var $ = cheerio.load(content);
  var resultados = [];
  $('.item.panel.panel-default').each(function() {
    var title = $(this).find('.title').text();
    var link = $(this).find('a').attr('href');
    // '<link|title>' looks like Slack-style link markup; confirm with the consumer.
    resultados.push('<' + link + '|' + title + '>');
  });
  // Emit the scraped entries; previously the list was built and discarded.
  console.log(resultados.join('\n'));
}).catch(e => console.log(e))
  // Runs after success and after a logged failure, so the PhantomJS child
  // process is never left running.
  .then(shutdownPhantom)
  .catch(e => console.log(e));
/**
 * Reports whether the element with id "app" has been rendered with content.
 *
 * Reads the module-level _page and evaluates a probe inside the loaded page.
 *
 * @returns {Promise<string>} resolves to the outerHTML of #app once that
 *   element exists and contains something other than whitespace; resolves to
 *   '' (falsy) otherwise, so callers can poll on truthiness.
 */
function textPopulated() {
  // The callback is handed to page.evaluate, which runs it in the page rather
  // than in Node, so it must be self-contained. Old-style syntax is kept on
  // purpose (assumes PhantomJS lacks ES2015 support - confirm for your build).
  return _page.evaluate(function() {
    var app = document.querySelector('#app');
    // Missing mount node, or an empty one that has not been rendered into yet.
    if (!app || !/\S/.test(app.innerHTML)) {
      return '';
    }
    return app.outerHTML;
  });
}
/**
 * Polls an asynchronous predicate until it yields a truthy value or the time
 * limit is exceeded.
 *
 * @param {Function} state - called with no arguments; must return a promise.
 *   A truthy resolution means the state has been reached. Its `name` is used
 *   in log and error messages.
 * @param {number} [timeout] - limit in seconds; when omitted, 0 or NaN the
 *   limit is 20 seconds.
 * @param {number} [interval] - pause between polls in milliseconds; defaults
 *   to 50 when omitted or not a positive number.
 * @returns {Promise<undefined>} resolves once the state is reached.
 * @throws {Error} (as a rejection) 'Timeout state: <name>' when the limit is
 *   exceeded; rejections from `state` itself propagate unchanged.
 */
function waitState(state, timeout, interval) { // timeout in seconds is optional
  console.log('Start waiting for state: ' + state.name);
  var limitTime = timeout * 1000 || 20000;
  var pollDelay = interval > 0 ? interval : 50;
  var startTime = Date.now();
  return wait();

  // One polling round; re-schedules itself until success or timeout.
  function wait() {
    return state().then(function(result) {
      if (result) {
        console.log('Reached state: ' + state.name);
        return;
      }
      // The limit is only checked after a poll completes, so at least one
      // poll always runs.
      if (Date.now() - startTime > limitTime) {
        throw new Error('Timeout state: ' + state.name);
      }
      return Q.delay(pollDelay).then(wait);
    });
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment