Skip to content

Instantly share code, notes, and snippets.

@thomaswilburn
Created June 22, 2017 18:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomaswilburn/30a5df743145084eda2c72dae05b5215 to your computer and use it in GitHub Desktop.
Save thomaswilburn/30a5df743145084eda2c72dae05b5215 to your computer and use it in GitHub Desktop.
Archive that blogspot feed that you really like
var async = require("async");
var cheerio = require("cheerio");
var FeedParser = require("feedparser");
var request = require("request");
var shell = require("shelljs");
var fs = require("fs");
var path = require("path");
var url = require("url");
var zlib = require("zlib");
/**
 * Builds the feed URL for one page of posts.
 *
 * @param {number} i - Value placed in the `start-index` query parameter.
 * @returns {string} Feed URL requesting up to 3000 results from that index.
 */
var makeURL = function(i) {
  return `http://fafblog.blogspot.com/feeds/posts/default?start-index=${i}&max-results=3000`;
};

// Start index for the next feed request; getChunk advances it by the number
// of items each request returned.
var index = 1;
/**
 * Escapes a value for safe use in HTML text or a double-quoted attribute.
 * `null`/`undefined` become an empty string instead of the literal text
 * "null"/"undefined".
 *
 * @param {*} value - Value to render.
 * @returns {string} HTML-escaped string.
 */
var escapeHTML = function(value) {
  if (value === null || value === undefined) return "";
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
};

/**
 * Writes one post to disk as a standalone HTML page.
 *
 * The output path mirrors the path component of `post.link` (leading slash
 * removed), relative to the current working directory; parent directories
 * are created as needed.
 *
 * @param {Object} post - Feed item. Reads `link`, `origlink`, `title`,
 *   `author`, `pubdate` and `description`.
 * @param {function(?Error)} done - Passed straight to `fs.writeFile` as its
 *   completion callback.
 */
var savePost = function(post, done) {
  var u = url.parse(post.link);
  var p = u.pathname.replace(/^\//, "");
  var dirname = path.dirname(p);
  shell.mkdir("-p", dirname);

  var title = escapeHTML(post.title);
  // `origlink` is only populated when the feed wraps links in a tracking URL;
  // fall back to the regular link so the heading never points at "null".
  var href = escapeHTML(post.origlink || post.link);

  // Joined with newlines to keep the same layout the page always had: a
  // leading blank line, one element per line, and a trailing newline.
  var html = [
    "",
    "<!doctype html>",
    "<html>",
    "<head>",
    "<title>" + title + "</title>",
    "</head>",
    "<body>",
    "<h1>",
    "<a href=\"" + href + "\">",
    title,
    "</a>",
    "</h1>",
    "<h2>" + escapeHTML(post.author) + "</h2>",
    "<h3>" + escapeHTML(post.pubdate) + "</h3>",
    "<main>",
    // The description is the post body markup itself, so it is intentionally
    // inserted without escaping.
    post.description,
    "</main>",
    "</body>",
    "</html>",
    ""
  ].join("\n");

  fs.writeFile(p, html, done);
};
/**
 * Downloads a post page and extracts the inner HTML of its `#text-post`
 * element.
 *
 * @param {string} url - Address of the post page. (Shadows the `url` module
 *   inside this function; the module is not needed here.)
 * @param {function(?Error, ?string=)} done - Called with an error when the
 *   request fails or the server answers with a non-2xx status; otherwise
 *   called with `null` and the extracted HTML, which is `null` when the page
 *   has no `#text-post` element.
 */
var scrapePost = function(url, done) {
  request(url, function(err, response, body) {
    if (err) return done(err);

    // An error page is not the post: report it instead of scraping it.
    var status = response && response.statusCode;
    if (status < 200 || status >= 300) {
      return done(new Error(`Unexpected HTTP status ${status} for ${url}`));
    }

    var $ = cheerio.load(body);
    done(null, $("#text-post").html());
  });
};
/**
 * Fetches the full body of a post from its page and saves the post to disk.
 *
 * When scraping fails or yields nothing, the description that came with the
 * feed item is kept, so the saved page still has content and one bad page
 * does not abort the rest of the batch.
 *
 * @param {Object} post - Feed item; its `description` is replaced in place
 *   with the scraped HTML when scraping succeeds.
 * @param {function(?Error)} done - Forwarded to `savePost`.
 */
var processPost = function(post, done) {
  scrapePost(post.link, function(err, html) {
    if (err) {
      // Best effort: report the failure and fall back to the feed's content.
      console.log(err);
    } else if (html) {
      post.description = html;
    }
    savePost(post, done);
  });
};
/**
 * Downloads one page of the feed starting at `index`, processes every item
 * in it (at most 10 at a time), then requests the next page. The crawl stops
 * when a page yields no items.
 *
 * @param {function(?Error)=} done - Optional. Called with `null` once a page
 *   comes back empty, or with the error when the feed request itself fails.
 *   Any non-function argument is ignored.
 */
var getChunk = function(done) {
  var finish = typeof done === "function" ? done : function() {};
  var items = [];
  var r = request(makeURL(index));
  var parser = new FeedParser();

  parser.on("readable", function() {
    var item;
    while ((item = parser.read())) {
      items.push(item);
    }
  });

  // "end" is the readable-side event: it fires only after every parsed item
  // has been handed to the "readable" handler above. The writable-side
  // "finish" event carries no such guarantee.
  parser.on("end", function() {
    index += items.length;
    if (!items.length) return finish(null);
    async.eachLimit(items, 10, processPost, function(err) {
      // A failed post must not stop the crawl: report it and move on.
      if (err) console.log(err);
      getChunk(finish);
    });
  });

  // Parser errors are only reported; parsing is left to continue.
  parser.on("error", err => console.log(err));

  // Without a listener, a network failure would be thrown as an uncaught
  // exception.
  r.on("error", function(err) {
    console.log(err);
    finish(err);
  });

  r.on("response", () => r.pipe(parser));
};
// Entry point: start the crawl with the first feed request (index starts at 1).
getChunk();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment