Skip to content

Instantly share code, notes, and snippets.

Created April 27, 2024 02:07
Show Gist options
  • Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
// Wikipedia dumps download script
// Setup: npm install axios cheerio fs-extra
// Import the necessary libraries
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs-extra');
const path = require('path');
// Base URL for Wikimedia dumps (the English Wikipedia dump index).
// NOTE(review): the literal was empty in the source as received, which makes
// every request target ''. This value is reconstructed from the 'enwiki/'
// suffix that getMultistreamLinks strips to derive the origin — confirm it
// matches the intended server before running.
const baseUrl = 'https://dumps.wikimedia.org/enwiki/';
/**
 * Download a file by streaming the HTTP response body to disk.
 *
 * @param {string} url - URL to request with GET.
 * @param {string} outputLocationPath - File path the response body is written to.
 * @returns {Promise<void>} Resolves on the write stream's 'finish' event;
 *   rejects if the request fails or either stream emits 'error'.
 */
const downloadFile = async (url, outputLocationPath) => {
  const response = await axios({
    method: 'GET',
    url,
    responseType: 'stream', // Lets us pipe the response body straight to the file
  });

  // Open the output file only after the request succeeded, so a failed
  // request does not leave an empty file and an open handle behind.
  const writer = fs.createWriteStream(outputLocationPath);

  return new Promise((resolve, reject) => {
    writer.on('finish', resolve);
    writer.on('error', reject);
    // pipe() does not forward source errors to the destination; without this
    // handler a broken download would leave the promise pending forever.
    response.data.on('error', (error) => {
      writer.destroy();
      reject(error);
    });
    response.data.pipe(writer);
  });
};
/**
 * Fetch the page at `baseUrl` and parse it with Cheerio.
 *
 * @returns {Promise<Function>} The Cheerio root function (`$`) for the page.
 */
const fetchMainPage = async () => {
  const response = await axios.get(baseUrl);
  return cheerio.load(response.data); // Load the HTML into Cheerio for parsing
};
/**
 * Return the `href` of the third `<a>` element on a parsed page.
 *
 * @param {Function} $ - Cheerio root function for the page.
 * @returns {string|undefined} The third link's `href`, as reported by
 *   `.attr('href')` (undefined when it has no such attribute).
 */
const getThirdPageLink = ($) => {
  const thirdAnchor = $('a').get(2); // 0-indexed, so index 2 is the third link
  return $(thirdAnchor).attr('href');
};
/**
 * Fetch the page at `baseUrl` + `link` and parse it with Cheerio.
 *
 * @param {string} link - Path appended verbatim to `baseUrl`.
 * @returns {Promise<Function>} The Cheerio root function (`$`) for the page.
 */
const fetchThirdPage = async (link) => {
  const response = await axios.get(`${baseUrl}${link}`);
  return cheerio.load(response.data); // Load the HTML into Cheerio for parsing
};
/**
 * Collect download URLs for every link whose `href` contains 'multistream'
 * but not 'index'.
 *
 * @param {Function} $ - Cheerio root function for the page to scan.
 * @returns {string[]} Matching hrefs, each prefixed with `baseUrl` minus its
 *   'enwiki/' segment.
 */
const getMultistreamLinks = ($) => {
  const origin = baseUrl.replace('enwiki/', ''); // Origin base URL

  // Cheerio's own .map/.filter call back with (index, element), so convert
  // to a plain array of href strings before using Array callbacks on it.
  const hrefs = $('a')
    .map((_, a) => $(a).attr('href'))
    .toArray();

  return hrefs
    .filter(
      (link) =>
        typeof link === 'string' &&
        link.includes('multistream') &&
        !link.includes('index'),
    )
    .map((link) => `${origin}${link}`); // Ensure full URL by prepending the origin
};
/**
 * Run the whole download: fetch the main page, follow its third link, collect
 * the multistream links found there, and download each one in sequence into
 * a `downloads` directory next to this script.
 *
 * @returns {Promise<void>} Resolves once every file has been downloaded.
 * @throws {Error} If the main page yields no third link; errors from the
 *   fetch and download helpers propagate unchanged.
 */
const main = async () => {
  console.log(`Fetching main page: ${baseUrl}`);
  const $mainPage = await fetchMainPage();

  const thirdPageLink = getThirdPageLink($mainPage);
  console.log(`Extracted index URL: ${thirdPageLink}`);
  if (!thirdPageLink) {
    // Without this guard the next request would target `${baseUrl}undefined`.
    throw new Error(`No third link found on main page: ${baseUrl}`);
  }

  const $thirdPage = await fetchThirdPage(thirdPageLink);
  const multistreamLinks = getMultistreamLinks($thirdPage);
  console.log(`Extracted ${multistreamLinks.length} download links`);

  // Resolve the download directory relative to this script, not the CWD.
  const tempDir = path.resolve(__dirname, 'downloads');
  await fs.ensureDir(tempDir); // Make sure the directory exists

  console.log('Downloading files...');
  // Sequential on purpose: each download is awaited before the next starts.
  for (const link of multistreamLinks) {
    const filename = path.basename(link); // Extract the filename from the link
    const outputLocation = path.join(tempDir, filename);
    console.log(`Downloading ${link} to ${outputLocation}`);
    await downloadFile(link, outputLocation); // Download the file using the full URL
  }

  console.log('Download complete.'); // Indicate that the downloads are finished
};
// Execute the main function; report any error and signal failure to the
// calling shell through a non-zero exit code.
main().catch((error) => {
  console.error('Error:', error); // Output any errors that occur
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment