@ejfox
Created December 25, 2023 02:58
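A Node.js script that takes a CSV of YouTube URLs, visits each one with Puppeteer, and fills in the video titles, skipping any URLs already scraped on previous runs.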
const puppeteer = require('puppeteer');
const fs = require('fs');

(async () => {
  try {
    // Read the input CSV of videos to look up
    const inputCsv = fs.readFileSync('output.csv', 'utf-8');
    const records = inputCsv.split('\n').map((row, index) => {
      if (index === 0) return null; // Skip the header row
      const [Index, Title, URL] = row.split(',');
      const match = URL ? URL.match(/"(.*?)"/) : null; // Pull the quoted long-form URL out of the third field
      return { Index, Title, URL: match ? match[1] : URL };
    }).filter(record => record !== null && record.URL); // Drop the header marker and blank/malformed rows
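    // Assumed input shape (output.csv isn't included in the gist): rows like
    //   1,untitled,"https://www.youtube.com/watch?v=VIDEO_ID"
    // split cleanly on commas, and the regex extracts the quoted third field.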
    // Load any previously scraped records so they can be skipped
    const scrapedCsvPath = 'updated_youtube_videos.csv';
    let scrapedCsv = '';
    if (fs.existsSync(scrapedCsvPath)) {
      scrapedCsv = fs.readFileSync(scrapedCsvPath, 'utf-8');
    }
    const scrapedRecords = scrapedCsv.split('\n').map((row, index) => {
      if (index === 0) return null; // Skip the header row
      const [Index, Title, URL] = row.split(',');
      return { Index, Title, URL };
    }).filter(record => record !== null);

    // Keep only the records whose URL has not been scraped yet
    const unscrapedRecords = records.filter(record =>
      !scrapedRecords.some(scrapedRecord => scrapedRecord.URL === record.URL)
    );
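    // Note: some() rescans scrapedRecords for every input record (O(n*m)).
    // For large files, a Set lookup does the same check in O(n + m) (sketch only):
    //   const scrapedUrls = new Set(scrapedRecords.map(r => r.URL));
    //   const unscrapedRecords = records.filter(r => !scrapedUrls.has(r.URL));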
    // Launch Puppeteer with a visible browser window (headless: false);
    // set headless: true to run without one
    const browser = await puppeteer.launch({
      headless: false
    });
    const page = await browser.newPage();

    // Visit each URL and read the document title
    for (const record of unscrapedRecords) {
      try {
        await page.goto(record.URL, { waitUntil: 'networkidle2', timeout: 30000 });
        const title = await page.title();
        record.Title = title.replace(' - YouTube', ''); // Strip the trailing " - YouTube" suffix
        console.log(`Updated record ${record.Index} with title ${record.Title}`);
      } catch (error) {
        // Log and move on to the next record
        console.error(`Error processing record ${record.Index}: ${error.message}`);
      }
    }
    // Close the browser
    await browser.close();

    // Serialize the newly scraped records
    const newCsvRows = unscrapedRecords
      .map(record => `${record.Index},"${record.Title}",${record.URL}`)
      .join('\n');

    // Append rather than overwrite, so records from earlier runs are preserved;
    // write a header first so the skip-first-row parse above stays correct
    if (!fs.existsSync(scrapedCsvPath)) {
      fs.writeFileSync(scrapedCsvPath, 'Index,Title,URL\n');
    }
    if (newCsvRows) {
      fs.appendFileSync(scrapedCsvPath, newCsvRows + '\n');
    }
  } catch (error) {
    console.error(`An error occurred: ${error.message}`);
  }
})();
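To run it (the gist doesn't name the file; scrape-titles.js is assumed): npm install puppeteer, place output.csv next to the script, then node scrape-titles.js. Results accumulate in updated_youtube_videos.csv.

One caveat: splitting rows on bare commas misparses titles that themselves contain commas. A minimal sketch using the csv-parse package (an extra dependency, not part of the gist) handles quoting correctly:

const { parse } = require('csv-parse/sync');
const fs = require('fs');

// Sketch only: assumes output.csv starts with a header row "Index,Title,URL"
const rows = parse(fs.readFileSync('output.csv', 'utf-8'), {
  columns: true,          // use the header row for property names
  skip_empty_lines: true, // ignore any trailing blank line
});
// Each row is now { Index: '...', Title: '...', URL: '...' },
// with quotes and embedded commas handled by the parser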