Created
July 1, 2015 21:52
-
-
Save techgaun/cf37344a758f377a313f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//required node modules
//untested
var request = require('request'); | |
var cheerio = require('cheerio'); | |
var fs = require('fs'); | |
var s3 = require('s3'); | |
var AWS = require('aws-sdk'); | |
var url = 'https://medium.com/@benjaminhardy/8-things-every-person-should-do-before-8-a-m-cc0233e15c8d'; //url of site
AWS.config.loadFromPath('./config.json');
var s3bucket = new AWS.S3({params: {Bucket: 'crawlerdemo'}});
// Fetch the page at `url`, download every <img> it references into a local
// file named after the last path segment of the image URL, then upload that
// file to the S3 bucket using the same name as the object key.
// A failure on one image is logged and does not stop the remaining images.
request(url, function(err, resp, body){
  if (err) {
    console.log("Error: ", err);
    return;
  }
  var $ = cheerio.load(body);
  $('img').each(function(i, link){ //go through every image tag on page
    var src = $(link).attr('src');
    if (!src) {
      return; //an <img> without a src has nothing to download
    }
    var image_url;
    try {
      image_url = new URL(src, url); //resolve relative srcs against the page url
    } catch (e) {
      console.log("Error: ", e);
      return;
    }
    if (image_url.protocol !== 'http:' && image_url.protocol !== 'https:') {
      return; //e.g. inline data: URIs cannot be fetched over HTTP
    }
    //take the name from the path only, so a query string never ends up in the file name
    var pathname = image_url.pathname;
    var filename = pathname.substring(pathname.lastIndexOf('/') + 1); //set file name
    if (!filename) {
      return; //src ends with '/', so there is no usable file name
    }
    //download the image itself (image_url), not the page that was crawled
    download(image_url.href, filename, function(err){
      if (err) {
        console.log("Error: ", err);
        return;
      }
      fs.readFile(filename, function(err, data) {
        if (err) {
          console.log("Error: ", err);
          return;
        }
        //readFile without an encoding already yields a Buffer, so it is passed as-is
        var params = {Key: filename, Body: data};
        s3bucket.upload(params, function(err, data) {
          if (err) {
            console.log("Error: ", err);
          }
        });
      });
    });
  });
});
//download method
/**
 * Streams the resource at `uri` into the local file `filename`.
 *
 * @param {string} uri - address of the resource to fetch
 * @param {string} filename - path of the local file to write
 * @param {function(Error=)} callback - invoked exactly once: with no
 *   arguments after the file has been written and closed, or with an Error
 *   when the request fails, the server answers with a non-2xx status, or the
 *   file cannot be written
 */
var download = function(uri, filename, callback){
  var settled = false;
  var settle = function(err){
    if (settled) {
      return;
    }
    settled = true;
    if (!err) {
      callback();
      return;
    }
    //best effort: drop the partial file so it is never mistaken for a finished download
    fs.unlink(filename, function(){
      callback(err);
    });
  };
  var out = fs.createWriteStream(filename);
  out.on('error', settle);
  out.on('close', function(){
    settle();
  });
  //no preliminary HEAD request: its result was never used, so a single GET is enough
  var req = request(uri);
  req.on('error', function(err){
    out.end();
    settle(err);
  });
  req.on('response', function(res){
    if (res.statusCode < 200 || res.statusCode >= 300) {
      //do not save an error page under the requested file name
      req.abort();
      out.end();
      settle(new Error('Download of ' + uri + ' failed with HTTP status ' + res.statusCode));
    }
  });
  req.pipe(out);
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@techgaun , This worked for me after 3 hours of scraping internet. Thanks for the gist!