Skip to content

Instantly share code, notes, and snippets.

@cheeseonamonkey
Created April 27, 2024 02:07
Show Gist options
  • Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
Save cheeseonamonkey/752b4f17ec0df6e80959d1526d1f9b21 to your computer and use it in GitHub Desktop.
Wikipedia dumps download script
/*
npm install axios cheerio fs-extra
*/
// Import the necessary libraries
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs-extra');
const path = require('path');
// Base URL for the Wikimedia dumps of enwiki. fetchMainPage requests it
// as-is, fetchThirdPage builds the dump page URL from it, and
// getMultistreamLinks derives the site root by removing the 'enwiki/' segment.
const baseUrl = 'https://dumps.wikimedia.org/enwiki/';
/**
 * Stream the resource at `url` into the file at `outputLocationPath`.
 *
 * The output file is opened only after the HTTP request has produced a
 * response, so a failed request does not leave an empty file or an open
 * write stream behind. Errors from either side of the pipe reject the
 * returned promise: `pipe()` does not forward errors from the source stream,
 * so without the explicit handler a failure in the middle of the transfer
 * would leave the promise pending forever.
 *
 * @param {string} url - URL of the file to download.
 * @param {string} outputLocationPath - Destination path on disk.
 * @returns {Promise<void>} Resolves when the write stream emits 'finish';
 *   rejects with the request error or with the first stream error.
 */
const downloadFile = async (url, outputLocationPath) => {
  const response = await axios({
    url,
    method: 'GET',
    responseType: 'stream', // This allows us to pipe the response to the file
  });
  const source = response.data;
  const writer = fs.createWriteStream(outputLocationPath);

  return new Promise((resolve, reject) => {
    // Tear down both streams on failure so no descriptor or socket lingers.
    const fail = (error) => {
      source.destroy();
      writer.destroy();
      reject(error);
    };
    source.on('error', fail);
    writer.on('error', fail);
    writer.on('finish', resolve);
    source.pipe(writer);
  });
};
/**
 * Request the page at `baseUrl` and hand its body to Cheerio.
 *
 * @returns {Promise<*>} Whatever `cheerio.load` returns for the response body.
 */
const fetchMainPage = async () => {
  const { data: indexHtml } = await axios.get(baseUrl);
  const $ = cheerio.load(indexHtml);
  return $;
};
/**
 * Read the `href` of the third `<a>` element matched on a parsed page.
 *
 * @param {Function} $ - Query function for the parsed page (called as `$('a')`).
 * @returns {*} The value `attr('href')` reports for the anchor at index 2.
 */
const getThirdPageLink = ($) => {
  const THIRD_ANCHOR_INDEX = 2; // positions are zero-based
  return $('a').eq(THIRD_ANCHOR_INDEX).attr('href');
};
/**
 * Fetch the page that `link` points to and parse it with Cheerio.
 *
 * `link` is resolved against `baseUrl` with the URL constructor rather than
 * by string concatenation, so a plain relative href ('20240420/'), a
 * root-relative href ('/enwiki/20240420/') and an absolute URL all yield a
 * well-formed request URL.
 *
 * @param {string} link - href of the page to fetch, relative to `baseUrl` or absolute.
 * @returns {Promise<*>} Whatever `cheerio.load` returns for the response body.
 * @throws {Error} If `link` is null or undefined (no link could be extracted).
 */
const fetchThirdPage = async (link) => {
  // Fail with a clear message instead of requesting ".../enwiki/undefined".
  if (link == null) {
    throw new Error('fetchThirdPage: no link was provided');
  }
  const pageUrl = new URL(link, baseUrl).href;
  const response = await axios.get(pageUrl);
  return cheerio.load(response.data); // Load the HTML into Cheerio for parsing
};
/**
 * Collect the download URLs of every link on the page whose href contains
 * 'multistream' but not 'index'.
 *
 * Each href is resolved against the site root (`baseUrl` without its
 * 'enwiki/' segment) using the URL constructor. A plain relative href gives
 * the same result as prefixing the root; a root-relative href ('/enwiki/...')
 * no longer produces a double slash after the host; an absolute URL is kept
 * as-is.
 *
 * @param {Function} $ - Query function for the parsed page (called as `$('a')`).
 * @returns {string[]} Absolute URLs, in the order the links appear on the page.
 */
const getMultistreamLinks = ($) => {
  const siteRoot = baseUrl.replace('enwiki/', ''); // Origin base URL
  return $('a')
    .map((_, anchor) => $(anchor).attr('href'))
    .get()
    .filter(
      (href) =>
        typeof href === 'string' &&
        href.includes('multistream') &&
        !href.includes('index'),
    )
    .map((href) => new URL(href, siteRoot).href);
};
/**
 * Run the whole script: load the page at `baseUrl`, follow its third link,
 * collect the multistream download URLs found there, and download each one
 * into a `downloads` directory next to this script, one file at a time.
 *
 * @returns {Promise<void>} Resolves after the last download has finished.
 */
const main = async () => {
  console.log(`Fetching main page: ${baseUrl}`);
  const $index = await fetchMainPage();
  const dumpPageLink = getThirdPageLink($index);
  console.log(`Extracted index URL: ${dumpPageLink}`);

  const $dumpPage = await fetchThirdPage(dumpPageLink);
  const downloadUrls = getMultistreamLinks($dumpPage);
  console.log(`Extracted ${downloadUrls.length} download links`);

  // Files land in <script directory>/downloads, created on demand.
  const downloadDir = path.resolve(__dirname, 'downloads');
  await fs.ensureDir(downloadDir);

  console.log('Downloading files...');
  // Pair every URL with its destination: the URL's last path segment inside downloadDir.
  const jobs = downloadUrls.map((url) => ({
    url,
    target: path.join(downloadDir, path.basename(url)),
  }));
  // Awaiting inside the loop keeps the downloads strictly sequential.
  for (const { url, target } of jobs) {
    console.log(`Downloading ${url} to ${target}`);
    await downloadFile(url, target);
  }
  console.log('Download complete.');
};
// Execute the main function. On failure, report the error and mark the
// process as failed so shells, cron jobs and CI see a non-zero exit status
// instead of a success code after an aborted download run.
main().catch((error) => {
  console.error('Error:', error); // Output any errors that occur
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment