Created
June 22, 2017 18:17
-
-
Save thomaswilburn/30a5df743145084eda2c72dae05b5215 to your computer and use it in GitHub Desktop.
Archive that blogspot feed that you really like
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party helpers: callback flow control, HTML querying, feed parsing,
// HTTP requests and shell-style file utilities.
var async = require("async");
var cheerio = require("cheerio");
var FeedParser = require("feedparser");
var request = require("request");
var shell = require("shelljs");
// Node core modules.
var fs = require("fs");
var path = require("path");
var url = require("url");
var zlib = require("zlib");
/**
 * Build the feed URL for one page of posts.
 *
 * @param {number} i - Value for the `start-index` query parameter.
 * @returns {string} Feed URL requesting up to 3000 results from that index.
 */
var makeURL = i => `http://fafblog.blogspot.com/feeds/posts/default?start-index=${i}&max-results=3000`;

// Feed position of the next chunk to request; starts at 1 and is advanced by
// getChunk by the number of items each chunk returned.
var index = 1;
/**
 * Escape a value for safe use in HTML text or a double-quoted attribute.
 * `null`/`undefined` become an empty string.
 *
 * @param {*} value - Value to render.
 * @returns {string} Escaped text.
 */
var escapeHTML = function(value) {
  if (value == null) return "";
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
};

/**
 * Write a post to disk as a standalone HTML page.
 *
 * The output path mirrors the pathname of `post.link`, relative to the
 * current working directory; missing parent directories are created.
 *
 * @param {Object} post - Feed item. Reads `link`, `origlink`, `title`,
 *   `author`, `pubdate` and `description`.
 * @param {function(Error=)} done - Called once the file is written, or with
 *   an error if the path is unusable or the write fails.
 */
var savePost = function(post, done) {
  var u = url.parse(post.link);
  var p = (u.pathname || "").replace(/^\//, "");
  // The link comes from the feed: refuse paths that are empty or that would
  // climb out of the working directory.
  if (!p || p.split("/").indexOf("..") !== -1) {
    return done(new Error(`Cannot derive a safe file path from ${post.link}`));
  }
  var dirname = path.dirname(p);
  shell.mkdir("-p", dirname);
  // Text fields are escaped; the description is inserted as-is because it is
  // already HTML markup.
  var title = escapeHTML(post.title);
  // Fall back to the post's own link when no origlink is set.
  var href = escapeHTML(post.origlink || post.link);
  var html = `
<!doctype html>
<html>
<head>
<title>${title}</title>
</head>
<body>
<h1>
<a href="${href}">
${title}
</a>
</h1>
<h2>${escapeHTML(post.author)}</h2>
<h3>${escapeHTML(post.pubdate)}</h3>
<main>
${post.description}
</main>
</body>
</html>
`;
  fs.writeFile(p, html, done);
};
/**
 * Fetch a post page and extract the inner HTML of its `#text-post` element.
 *
 * @param {string} url - Address of the post page. (Shadows the `url` module
 *   inside this function, which does not use it.)
 * @param {function(Error=, ?string=)} done - Called with an error when the
 *   request fails or returns a non-2xx status; otherwise with the element's
 *   inner HTML as returned by cheerio's `.html()`.
 */
var scrapePost = function(url, done) {
  request(url, function(err, response, body) {
    if (err) return done(err);
    // The request callback reports HTTP error statuses as a normal response,
    // so reject them here instead of scraping an error page.
    var status = response && response.statusCode;
    if (status < 200 || status >= 300) {
      return done(new Error(`Request for ${url} failed with status ${status}`));
    }
    var $ = cheerio.load(body);
    var html = $("#text-post").html();
    done(null, html);
  });
};
/**
 * Archive one feed item: scrape the full post body from its page, then save
 * the post to disk.
 *
 * Scraping is best-effort. If it fails, or the page yields no content, the
 * failure is logged where applicable and the description already on the feed
 * item is kept, so the post is still archived.
 *
 * @param {Object} post - Feed item; `description` is replaced with the
 *   scraped HTML when scraping succeeds.
 * @param {function(Error=)} done - Forwarded to savePost.
 */
var processPost = function(post, done) {
  scrapePost(post.link, function(err, html) {
    if (err) {
      console.log(err);
    } else if (html) {
      post.description = html;
    }
    savePost(post, done);
  });
};
/**
 * Download one chunk of the feed starting at `index`, archive every item in
 * it, then continue with the next chunk until a chunk comes back empty.
 *
 * @param {function(Error=)} [done] - Optional. Called with `null` once an
 *   empty chunk is reached, or with the error if the feed request fails.
 *   Non-function values are ignored.
 */
var getChunk = function(done) {
  var items = [];
  var settled = false;
  var finish = function(err) {
    if (settled) return;
    settled = true;
    if (typeof done === "function") done(err);
  };
  var r = request(makeURL(index));
  var parser = new FeedParser();
  parser.on("readable", function() {
    var item;
    while ((item = parser.read())) {
      items.push(item);
    }
  });
  // "end" fires only after every parsed item has been read; "finish" only
  // signals that the writable side is done and can fire earlier.
  parser.on("end", function() {
    if (settled) return;
    index += items.length;
    if (!items.length) return finish(null);
    // Archive up to 10 posts at a time.
    async.eachLimit(items, 10, processPost, function(err) {
      // Best-effort: report a failed post but keep going with the next chunk.
      if (err) console.log(err);
      getChunk(done);
    });
  });
  // Parser errors are only logged and do not stop the chunk here.
  parser.on("error", err => console.log(err));
  // Without a listener a failed request would throw as an unhandled "error"
  // event.
  r.on("error", function(err) {
    console.log(err);
    finish(err);
  });
  r.on("response", () => r.pipe(parser));
};
// Start archiving with the first chunk of the feed.
getChunk();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment