Skip to content

Instantly share code, notes, and snippets.

@ehzawad
Created December 18, 2023 06:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ehzawad/a05766ffc7c862df893ce9d7b32addbf to your computer and use it in GitHub Desktop.
Save ehzawad/a05766ffc7c862df893ce9d7b32addbf to your computer and use it in GitHub Desktop.
// // const puppeteer = require('puppeteer');
// // async function downloadWikipediaPDF(url) {
// // // Launch a new browser instance in headless mode
// // const browser = await puppeteer.launch({ headless: true });
// // // Create a new page in the browser
// // const page = await browser.newPage();
// // // Navigate to the provided URL
// // await page.goto(url, { waitUntil: 'networkidle2' });
// // // Extract the title from the URL to use as the filename
// // const urlParts = url.split('/');
// // const pageTitle = urlParts[urlParts.length - 1];
// // const pdfPath = `${pageTitle}.pdf`;
// // // Using Puppeteer's page.pdf() method to generate a PDF directly
// // await page.pdf({ path: pdfPath, format: 'A4' });
// // console.log(`The PDF download for ${pageTitle} is completed.`);
// // // Close the browser
// // await browser.close();
// // }
// // // Replace with the Wikipedia page URL you want to download as PDF
// // downloadWikipediaPDF('https://en.wikipedia.org/wiki/Python_(programming_language)');
// const puppeteer = require('puppeteer');
// async function downloadWikipediaPDF(url, pageTitle) {
// const browser = await puppeteer.launch({ headless: true });
// const page = await browser.newPage();
// await page.goto(url, { waitUntil: 'networkidle2' });
// const pdfPath = `${pageTitle.replace(/\s/g, '_')}.pdf`;
// await page.pdf({ path: pdfPath, format: 'A4' });
// console.log(`The PDF download for ${pageTitle} is completed.`);
// await browser.close();
// }
// async function downloadAllStates() {
// const states = ["Alabama", "Alaska", "Arizona", "Arkansas", "California",
// "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
// "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
// "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
// "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri",
// "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
// "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
// "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
// "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
// "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"];
// for (const state of states) {
// const url = `https://en.wikipedia.org/wiki/${state.replace(/\s/g, '_')}`;
// await downloadWikipediaPDF(url, state);
// }
// }
// downloadAllStates();
////////
const puppeteer = require('puppeteer');
const { PDFDocument } = require('pdf-lib');
const fs = require('fs');
/**
 * Load a web page in headless Chromium and save it as an A4 PDF.
 *
 * The output path is `pageTitle` with every whitespace character replaced by
 * an underscore, followed by `.pdf`. It is a relative path, so the file lands
 * wherever the process is run from.
 *
 * @param {string} url - Address of the page to render.
 * @param {string} pageTitle - Title used for the file name and the log message.
 * @returns {Promise<void>} Settles after the browser has been closed.
 * @throws Propagates any error raised while launching, navigating or rendering.
 */
async function downloadWikipediaPDF(url, pageTitle) {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    const pdfPath = `${pageTitle.replace(/\s/g, '_')}.pdf`;
    await page.pdf({ path: pdfPath, format: 'A4' });
    console.log(`The PDF download for ${pageTitle} is completed.`);
  } finally {
    // Close the browser on every path; otherwise a failed navigation or
    // render would leave the launched browser process running.
    await browser.close();
  }
}
/**
 * Concatenate several PDF files into a single PDF and write it to disk.
 *
 * Files are processed in the order given; every page of each file is copied
 * into the combined document before the next file is read.
 *
 * @param {string[]} pdfFiles - Paths of the PDFs to combine.
 * @param {string} outputFileName - Path the combined PDF is written to.
 * @returns {Promise<void>} Settles once the combined file has been written.
 * @throws Propagates read, parse and write errors (e.g. a missing input file).
 */
async function mergePDFs(pdfFiles, outputFileName) {
  const combined = await PDFDocument.create();

  for (const sourcePath of pdfFiles) {
    const sourceDoc = await PDFDocument.load(fs.readFileSync(sourcePath));
    const pageIndices = sourceDoc.getPageIndices();
    // Pages must be copied into the target document before they can be added.
    const importedPages = await combined.copyPages(sourceDoc, pageIndices);
    for (const importedPage of importedPages) {
      combined.addPage(importedPage);
    }
  }

  const serialized = await combined.save();
  fs.writeFileSync(outputFileName, serialized);
  console.log(`Merged PDF saved as ${outputFileName}`);
}
// Names of the 50 U.S. states; each one doubles as a Wikipedia article title
// and, with whitespace replaced by underscores, as a PDF file name.
const states = [
  'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
  'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
  'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
  'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
  'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri',
  'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
  'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
  'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
  'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
  'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
];

// Sort in place so the grouping below does not depend on the literal's order.
states.sort();

/**
 * Build an acronym from the first character of every name, in order.
 *
 * @param {string[]} names - Names to abbreviate.
 * @returns {string} The concatenated first characters.
 */
const createAcronym = (names) => {
  const initials = names.map((name) => name[0]);
  return initials.join('');
};

// Consecutive runs of five sorted states, each paired with the acronym that
// is later used as the merged PDF's file name.
const groupedStates = Array.from(
  { length: Math.ceil(states.length / 5) },
  (_, groupIndex) => {
    const group = states.slice(groupIndex * 5, groupIndex * 5 + 5);
    return { group, acronym: createAcronym(group) };
  },
);
/**
 * Merge the per-state PDFs of every state group into one PDF per group.
 *
 * Input file names are the state names with whitespace replaced by
 * underscores plus `.pdf`; each output file is named `<acronym>.pdf`.
 *
 * @returns {Promise<void>} Settles after the last group has been merged.
 */
async function processStates() {
  const toFileName = (state) => `${state.replace(/\s/g, '_')}.pdf`;

  for (const { group, acronym } of groupedStates) {
    // Awaited one at a time, so a group finishes before the next one starts.
    await mergePDFs(group.map(toFileName), `${acronym}.pdf`);
  }
}
// Entry point: run the merge and report any failure explicitly instead of
// leaving the returned promise unhandled.
processStates().catch((error) => {
  console.error('Failed to merge state PDFs:', error);
  // Signal failure to the caller without cutting off pending output.
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment