Skip to content

Instantly share code, notes, and snippets.

@SteveMcArthur
Last active September 15, 2016 08:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SteveMcArthur/cdfa99264757a3efe5412b32789a4438 to your computer and use it in GitHub Desktop.
Save SteveMcArthur/cdfa99264757a3efe5412b32789a4438 to your computer and use it in GitHub Desktop.
Scrape and download Wikipedia's country flags from http://commons.wikimedia.org/wiki/Sovereign-state_flags (and other pages) using Node.js. Most of Wikipedia's country flags are stored as SVG, so they can be scaled. The rendered width is part of each flag's thumbnail URL, so it can be changed to download whatever size of flag you'd like.
/* global require */
/*This is basically a port from php of the gist: https://gist.github.com/nickbuddendotcom/5792710*/
var cheerio = require('cheerio');
var fs = require('fs');
var path = require('path');
var request = require('request');
// Page that is scraped for flag thumbnails.
var pageURL = 'http://commons.wikimedia.org/wiki/Sovereign-state_flags';
// Alternative source page (swap with the line above to use it):
//var pageURL = 'https://commons.wikimedia.org/wiki/Flags_of_active_autonomist_and_secessionist_movements';

// Directory the downloaded images are written to, resolved against the
// parent of the current working directory.
var outDir = path.resolve("..", 'flags');
// Alternative output directory for the alternative source page:
//var outDir = path.resolve("..", 'flags', 'secessionists');

// Regex matching the "/<width>px" segment of a thumbnail URL.
var sizeReg = /\/\d+px/;

// Regex extracting the file base name (capture group 1) from a thumbnail URL;
// it is used to build the output image name. The dot before the extension is
// matched literally, the extension is matched case-insensitively, and both
// ".jpg" and ".jpeg" are accepted.
var countryReg = /\/([^/]+)\.(?:svg|png|jpe?g|gif)\/\d+px/i;

// Replacement for the size segment: set this to whatever output size you want.
var outSize = "/30px";

// Thumbnail URLs collected from the page, in document order.
var flagURLs = [];
// Counter used to build unique "error<N>" names when no country can be parsed.
var e = 1;
// Index of the flag currently being downloaded; incremented before each use.
var flagCount = -1;
/**
 * Derive the output file base name for a flag from its thumbnail URL.
 *
 * The name is taken from capture group 1 of `countryReg`. When the URL does
 * not match, a placeholder "error<N>" is used instead (N comes from the
 * module-level counter `e`, which is then incremented) and the offending URL
 * is logged.
 *
 * @param {string} url - thumbnail URL of the flag image.
 * @returns {string} name with a leading "Flag_of_" and "the_" removed,
 *     percent-escapes decoded where possible, and every "/", "\" and ","
 *     stripped.
 */
function getCountryName(url) {
    var match = countryReg.exec(url);
    var country = match ? match[1] : "";
    if (!country) {
        country = "error" + e;
        e++;
        console.log("cannot determine country for:");
        console.log(url);
    }
    country = country.replace(/^Flag_of_/, '');
    country = country.replace(/^the_/, '');
    try {
        country = decodeURIComponent(country);
    } catch (err) {
        // Malformed percent-escape: decodeURIComponent throws URIError.
        // Keep the undecoded name rather than aborting the whole run.
    }
    // Remove ALL path separators and commas (a string pattern would only
    // replace the first occurrence), so the result is a single path segment.
    country = country.replace(/[\/\\,]/g, "");
    return country;
}
/**
 * Download one flag image into `outDir` as "<country>.png".
 *
 * `callback` is invoked exactly once, with no arguments, when the download
 * is over: after the file has been completely written, or after a request or
 * file-system error has been logged. Errors are logged, not propagated, so
 * that one failed flag does not stall the remaining downloads.
 *
 * @param {string} url - thumbnail URL to fetch.
 * @param {function(): void} callback - called once when this download is over.
 */
function doDownload(url, callback) {
    var country = getCountryName(url);
    var outfile = path.join(outDir, country + ".png");
    var finished = false;

    // Several events can signal completion; make sure the callback runs once.
    function done() {
        if (!finished) {
            finished = true;
            callback();
        }
    }

    var out = fs.createWriteStream(outfile);
    out.on('error', function (err) {
        // e.g. the output directory does not exist
        console.log(err);
        done();
    });
    // 'finish' fires once all data has been flushed to the file.
    out.on('finish', done);

    request(url)
        .on('response', function (response) {
            if (response.statusCode !== 200) {
                console.log("unexpected HTTP status " + response.statusCode + " for:");
                console.log(url);
            }
        })
        .on('error', function (err) {
            console.log(err);
            // The source will never end the file stream after an error,
            // so close it here and move on.
            out.end();
            done();
        })
        .pipe(out);
}
/**
 * Advance to the next entry of `flagURLs` and download it.
 *
 * The function passes itself to `doDownload` as the completion callback, so
 * the flags are fetched one after another; once the module-level index
 * `flagCount` has moved past the end of the list nothing more is started.
 */
function downloadFlags() {
    flagCount += 1;
    if (flagCount >= flagURLs.length) {
        return;
    }
    var nextURL = flagURLs[flagCount];
    doDownload(nextURL, downloadFlags);
}
/**
 * Collect the flag thumbnail URLs from the page and start downloading them.
 *
 * Every `img.thumbborder` element with an absolute http(s) `src` is used.
 * Protocol-relative sources ("//host/path") are accepted too and fetched
 * over https. The size segment of each URL is replaced with `outSize`, the
 * result is appended to `flagURLs`, and `downloadFlags` is then called.
 *
 * @param {string} body - HTML of the page to scan.
 */
function processPage(body) {
    var $ = cheerio.load(body);
    var images = $('img.thumbborder');
    for (var i = 0; i < images.length; i++) {
        var imgURL = images[i].attribs.src;
        if (typeof imgURL !== 'string') {
            // <img> without a src attribute
            continue;
        }
        if (/^\/\//.test(imgURL)) {
            // protocol-relative URL: give it an explicit scheme
            imgURL = "https:" + imgURL;
        }
        if (/^https?:\/\//.test(imgURL)) {
            flagURLs.push(imgURL.replace(sizeReg, outSize));
        }
    }
    downloadFlags();
}
// Entry point: fetch the page and hand its HTML to processPage. Failures are
// reported instead of being silently ignored.
request(pageURL, function (error, response, body) {
    if (error) {
        console.log("failed to fetch " + pageURL + ":");
        console.log(error);
        return;
    }
    if (response.statusCode !== 200) {
        console.log("unexpected HTTP status " + response.statusCode + " for " + pageURL);
        return;
    }
    processPage(body);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment