Skip to content

Instantly share code, notes, and snippets.

@reinvanoyen
Last active February 7, 2024 08:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reinvanoyen/3ce721d7e4c42aa3eabacab2a395e2ba to your computer and use it in GitHub Desktop.
Save reinvanoyen/3ce721d7e4c42aa3eabacab2a395e2ba to your computer and use it in GitHub Desktop.
scrape-text-from-sitemap
"use strict";
const https = require('https');
const Sitemapper = require('sitemapper');
const jsdom = require('jsdom');
const fs = require('fs');
const { JSDOM } = jsdom;
// Command-line arguments, in order: <sitemap-url> [element-selector] [remove-multiple-newlines]
// (process.argv[0] and [1] are the node binary and the script path).
const cliArgs = process.argv.slice(2);

// A missing sitemap URL is reported on stderr; execution is not stopped here.
if (!cliArgs[0]) {
  console.error('No path to sitemap given');
}

// URL of the sitemap whose pages are scraped.
const sitemapUrl = cliArgs[0];
// CSS selector of the element to extract text from; falls back to the whole body.
const elementSelector = cliArgs[1] ? cliArgs[1] : 'body';
// Any non-empty third argument enables collapsing of repeated newlines.
const removeMultipleNewlines = cliArgs[2] ? cliArgs[2] : false;
/**
 * Downloads a page over HTTPS and resolves with its body decoded as UTF-8.
 *
 * Redirect responses (3xx with a Location header) are followed, so the text
 * of the final page is returned instead of the redirect stub. Other status
 * codes are not inspected; their body is returned as-is.
 *
 * @param {string} url - Absolute https URL to fetch.
 * @param {number} [redirectsLeft=5] - How many further redirects may be followed.
 * @returns {Promise<string>} The response body.
 *   Rejects on request errors, response stream errors, an unparsable
 *   redirect target, or when the redirect budget is exhausted.
 */
const get = (url, redirectsLeft = 5) => {
  console.log(`Fetching html from webpage (${url})`);
  return new Promise((resolve, reject) => {
    https.get(url, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // Discard the redirect body so the socket is released.
        res.resume();
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects while fetching ${url}`));
          return;
        }
        let target;
        try {
          // Location may be relative; resolve it against the current URL.
          target = new URL(headers.location, url).href;
        } catch (err) {
          reject(err);
          return;
        }
        // Resolving with a promise adopts its eventual outcome.
        resolve(get(target, redirectsLeft - 1));
        return;
      }

      res.setEncoding('utf8');
      const chunks = [];
      res.on('data', (chunk) => { chunks.push(chunk); });
      res.on('end', () => { resolve(chunks.join('')); });
      // Without this handler a failing response stream would leave the promise pending.
      res.on('error', reject);
    }).on('error', reject);
  });
};
/**
 * Fetches a sitemap through a fresh Sitemapper instance.
 *
 * @param {string} sitemapUrl - Location of the sitemap.
 * @returns {Promise<*>} Whatever Sitemapper's `fetch` resolves with
 *   (the caller reads its `sites` property).
 */
const getUrlsFromSitemap = async (sitemapUrl) => new Sitemapper().fetch(sitemapUrl);
/**
 * Downloads one page and extracts its text.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<string>} Text produced by getTextFromHtml for that page.
 */
const scrapeUrl = async (url) => {
  console.log('Scraping webpage');
  const pageHtml = await get(url);
  const pageText = getTextFromHtml(pageHtml);
  return pageText;
};
// Elements whose content is code/styling rather than readable text.
const SKIPPED_TAGS = new Set(['STYLE', 'SCRIPT']);

// Markdown prefix emitted in front of the text of selected elements.
const MARKDOWN_PREFIXES = Object.freeze({
  H1: '# ',
  H2: '## ',
  H3: '### ',
  H4: '#### ',
  H5: '##### ',
  H6: '###### ',
  LI: '* ',
  OPTION: '* ',
});

/**
 * Recursively converts a DOM node into plain text with light Markdown markup.
 *
 * - Comment nodes and <style>/<script> elements produce no output.
 * - <br> produces a newline.
 * - Text nodes produce their trimmed text.
 * - Any other node produces a newline followed by the concatenated text of its
 *   children, prefixed with a Markdown marker for headings, list items and
 *   options. Nodes without any text produce an empty string.
 *
 * @param {Node} htmlNode - Node exposing `nodeType`, `tagName`, `textContent`
 *   and `childNodes`.
 * @returns {string} Extracted text.
 */
const getTextFromHtmlNode = (htmlNode) => {
  // Normalise the case: elements in foreign namespaces (e.g. an SVG <style>)
  // report a lowercase tagName.
  const tag = typeof htmlNode.tagName === 'string' ? htmlNode.tagName.toUpperCase() : '';

  // nodeType 8 is a comment node.
  if (htmlNode.nodeType === 8 || SKIPPED_TAGS.has(tag)) {
    return '';
  }
  if (tag === 'BR') {
    return '\n';
  }
  // nodeType 3 is a text node.
  if (htmlNode.nodeType === 3) {
    return htmlNode.textContent.trim();
  }

  const output = Array.from(htmlNode.childNodes, (child) => getTextFromHtmlNode(child)).join('');
  if (!output) {
    return '';
  }

  const prefix = MARKDOWN_PREFIXES[tag] || '';
  return `\n${prefix}${output.trim()}`;
};
/**
 * Parses an HTML document and extracts the text of the element matched by the
 * module-level `elementSelector`.
 *
 * @param {string} html - HTML source of a page.
 * @returns {string} Concatenated text of the matched element's child nodes,
 *   or an empty string when no element matches the selector.
 */
const getTextFromHtml = (html) => {
  console.log('Getting text from webpage');
  const dom = new JSDOM(html);
  try {
    const containerEl = dom.window.document.querySelector(elementSelector);
    if (!containerEl) {
      // querySelector returns null when nothing matches; skip this page
      // instead of failing on `.childNodes`.
      console.warn(`No element matches selector "${elementSelector}"; skipping page`);
      return '';
    }
    let output = '';
    for (let i = 0; i < containerEl.childNodes.length; i++) {
      output += getTextFromHtmlNode(containerEl.childNodes[i]);
    }
    return output;
  } finally {
    // One window is created per scraped page; close it once the text is extracted.
    dom.window.close();
  }
};
/**
 * Collapses every run of two or more CR/LF characters into a single "\n".
 *
 * @param {string} string - Text to normalise.
 * @returns {string} Text without consecutive line-break characters.
 */
const removeNewlines = (string) => string.replace(/[\r\n]{2,}/g, "\n");
/**
 * Scrapes the given pages one after another and joins the results.
 *
 * Each page contributes a newline, a "# <url>" heading line, and the text
 * returned by scrapeUrl for that page.
 *
 * @param {string[]} urls - Page addresses to scrape, in output order.
 * @returns {Promise<string>} Combined text of all pages ('' for an empty list).
 */
const scrapeUrls = async (urls) => {
  const sections = [];
  for (const pageUrl of urls) {
    const heading = "\n" + '# ' + pageUrl + "\n";
    // Pages are awaited inside the loop, so they are fetched sequentially.
    sections.push(heading + await scrapeUrl(pageUrl));
  }
  return sections.join('');
};
/**
 * Writes text to a file asynchronously and logs the outcome.
 *
 * The call returns immediately; success or failure is reported on the console
 * once the write finishes. A failed write also sets a non-zero exit code.
 *
 * @param {string} filename - Path of the file to write.
 * @param {string} content - Text to store in the file.
 * @returns {void}
 */
const writeFile = (filename, content) => {
  // Report the actual target instead of a hard-coded file name.
  console.log(`Writing all text to ${filename}`);
  fs.writeFile(filename, content, (err) => {
    if (err) {
      console.error(err);
      process.exitCode = 1;
      return;
    }
    console.log('The file was saved!');
  });
};
// Entry point: fetch the sitemap, scrape every listed page and write the
// combined text to output.md.
(async function() {
  // Without a sitemap URL there is nothing to fetch (the missing argument is
  // already reported where the arguments are read); stop with a failure code.
  if (!sitemapUrl) {
    process.exitCode = 1;
    return;
  }
  try {
    const urls = await getUrlsFromSitemap(sitemapUrl);
    const content = await scrapeUrls(urls.sites);
    writeFile('output.md', (removeMultipleNewlines ? removeNewlines(content) : content));
  } catch (err) {
    // Nothing awaits this function, so failures must be handled here rather
    // than surfacing as an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment