Skip to content

Instantly share code, notes, and snippets.

@ronanguilloux
Last active March 30, 2016 09:44
Show Gist options
  • Save ronanguilloux/b587281fdb3b524d2c04918ffa03e59c to your computer and use it in GitHub Desktop.
Save ronanguilloux/b587281fdb3b524d2c04918ffa03e59c to your computer and use it in GitHub Desktop.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var SimpleJson2Csv = require('simple-json2csv');
/*
package.json:
{
"name": "akeneo-scraper",
"version": "0.0.1",
"description": "Scrape le web.",
"main": "server.js",
"author": "Akeneo",
"dependencies": {
"cheerio": "latest",
"express": "latest",
"request": "latest",
"simple-json2csv": "0.0.5"
}
}
*/
// Page that lists the attendees to scrape.
var ATTENDEES_URL = 'http://imagine.magento.com/attendees';
// CSV file written (relative to the process working directory) on each scrape.
var OUTPUT_FILE = 'output.csv';
// TCP port the HTTP server listens on.
var PORT = 8081;

var app = express();

/**
 * Collect the text of every `.attendee` element found in the given markup.
 *
 * @param {string} html - Raw HTML of the attendees page.
 * @returns {Array<{name: string}>} One record per `.attendee` element, with
 *   surrounding whitespace trimmed and every newline removed.
 */
function extractAttendeeNames(html) {
  var $ = cheerio.load(html);
  var attendees = [];
  // `.each` is the iteration primitive; `.filter` would build a throwaway
  // selection just to run the callback.
  $('.attendee').each(function (index, element) {
    // A regex with the `g` flag is required: a string pattern only replaces
    // the first newline and leaves the rest inside the CSV cell.
    var name = $(element).text().trim().replace(/\n/g, '');
    attendees.push({ name: name });
  });
  return attendees;
}

/**
 * GET /scrape
 *
 * Downloads the attendees page, extracts the attendee names and writes them
 * to OUTPUT_FILE as a one-column CSV ("Name"). The HTTP response is sent only
 * once the outcome is known:
 *   - 200 "Check your console!" after the CSV has been fully written,
 *   - 502 when the upstream page could not be fetched,
 *   - 500 when the CSV file could not be written.
 */
app.get('/scrape', function (req, res) {
  request(ATTENDEES_URL, function (error, response, html) {
    // Bail out early instead of writing an empty CSV and reporting success.
    if (error || response.statusCode < 200 || response.statusCode >= 300) {
      var reason = error ? error.message : 'HTTP ' + response.statusCode;
      console.error('Scraping ' + ATTENDEES_URL + ' failed: ' + reason);
      res.status(502).send('Scraping failed: ' + reason);
      return;
    }

    // Input shape expected by simple-json2csv: column definitions plus rows.
    var json = {
      fields: [{ name: 'name', header: 'Name' }],
      data: extractAttendeeNames(html)
    };

    var output = fs.createWriteStream(OUTPUT_FILE);

    // Without an 'error' listener a failed write would crash the process.
    output.on('error', function (writeError) {
      console.error('Writing ' + OUTPUT_FILE + ' failed: ' + writeError.message);
      if (!res.headersSent) {
        res.status(500).send('Writing ' + OUTPUT_FILE + ' failed: ' + writeError.message);
      }
    });

    // 'finish' fires once all CSV data has been flushed, so success is only
    // reported when the file is actually complete.
    output.on('finish', function () {
      console.log('Data successfully scrapped and stored! - Check your project directory for the ' + OUTPUT_FILE + ' file');
      // This app has no UI; the browser only gets a pointer to the console.
      res.send('Check your console!');
    });

    new SimpleJson2Csv(json).pipe(output);
  });
});

// Announce the port only once the server is actually bound to it.
app.listen(PORT, function () {
  console.log('Magic happens on port ' + PORT);
});

exports = module.exports = app;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment