Skip to content

Instantly share code, notes, and snippets.

@sachac
Created January 6, 2015 23:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sachac/fa8d050998ffd161a041 to your computer and use it in GitHub Desktop.
Save sachac/fa8d050998ffd161a041 to your computer and use it in GitHub Desktop.
var fs = require('fs');
var cheerio = require('cheerio');
/**
 * Pulls the identifying metadata out of one post element.
 *
 * @param {Object} article - Selection wrapping a single post (parseFile passes
 *   a cheerio-wrapped <article>); only find(), attr() and text() are used.
 * @returns {{prettyLink: (string|undefined), postID: (string|null),
 *   title: string, date: (string|undefined)}} The heading link's href, the
 *   numeric ID taken from the permalink, the heading text, and the value of
 *   the date element's data-date attribute.
 */
function extractInfo(article) {
  var prettyLink = article.find("h2 a").attr("href");

  // The permalink is expected to contain "p/<digits>". An article without a
  // permalink anchor (or with an unexpected href) must not abort the whole
  // run, so postID falls back to null instead of throwing a TypeError.
  var permalink = article.find(".permalink a").attr("href");
  var matches = permalink ? permalink.match(/p\/([0-9]+)/) : null;
  var postID = matches ? matches[1] : null;

  var title = article.find("h2").text();
  var date = article.find(".date").attr("data-date");

  return {
    prettyLink: prettyLink,
    postID: postID,
    title: title,
    date: date
  };
}
/**
 * Gathers the links in a post body that are relative or mention "sachachua".
 *
 * Anchors without an href are skipped, as are absolute links (href starting
 * with "http") whose href does not contain "sachachua".
 *
 * Relies on the global `$` that parseFile assigns before calling this.
 *
 * @param {Object} article - Selection wrapping a single post; its ".body a"
 *   descendants are inspected.
 * @returns {Array<{href: string, text: string}>} Kept links in document order.
 */
function collectLinks(article) {
  var kept = [];

  article.find(".body a").each(function () {
    var anchor = $(this);
    var href = anchor.attr('href');

    if (!href) {
      return;
    }

    // Absolute URLs are only interesting when they mention "sachachua".
    var pointsElsewhere = /^http/.test(href) && !/sachachua/.test(href);
    if (pointsElsewhere) {
      return;
    }

    kept.push({ href: href, text: anchor.text() });
  });

  return kept;
}
/**
 * Builds one "imageName<TAB>prettyLink" line for every uploaded image a post
 * links to.
 *
 * A link counts as an uploaded image when its href contains
 * "uploads/<digits>/<digits>/<name>.png" or ".jpg"; <name> is the part that
 * is emitted.
 *
 * @param {{links: Array<{href: string}>, prettyLink: *}} data - Post record
 *   with the links collected from its body.
 * @returns {string} Matching lines joined by "\n", or "" when nothing matches.
 */
function formatEntry(data) {
  var imagePattern = /uploads\/[0-9]+\/[0-9]+\/(.*?)\.(?:png|jpg)/;
  var lines = [];

  for (var i = 0; i < data.links.length; i++) {
    var matches = data.links[i].href.match(imagePattern);
    if (matches) {
      lines.push(matches[1] + "\t" + data.prettyLink);
    }
  }

  // Joining only the matching lines keeps non-image links from leaving blank
  // lines or a trailing newline in the output.
  return lines.join("\n");
}
/**
 * Loads one HTML file and prints the formatted entry of every <article> in it
 * that yields any output.
 *
 * @param {string} filename - Path of the HTML file to read (synchronously).
 */
function parseFile(filename) {
  // collectLinks() resolves `$` as a global, so the loaded document has to be
  // published on the global object. Assigning through `global` does that
  // explicitly instead of relying on an undeclared-variable assignment, which
  // throws a ReferenceError in strict mode.
  global.$ = cheerio.load(fs.readFileSync(filename));

  $('article').each(function () {
    var article = $(this);
    var data = extractInfo(article);
    data.links = collectLinks(article);

    var output = formatEntry(data);
    if (output) {
      console.log(output);
    }
  });
}
// Every command-line argument after the interpreter and script path is
// treated as an HTML file to process, in the order given.
process.argv.slice(2).forEach(function (filename) {
  parseFile(filename);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment