Skip to content

Instantly share code, notes, and snippets.

@jim80net
Last active December 14, 2023 23:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jim80net/0a6e372606026beb2b47d58b4f1d29fb to your computer and use it in GitHub Desktop.
Save jim80net/0a6e372606026beb2b47d58b4f1d29fb to your computer and use it in GitHub Desktop.
Extract text from a website - make sure you adjust the filter on line 20
import scrape from 'website-scraper';
import TurndownService from 'turndown';
import { JSDOM } from 'jsdom';
import path from 'path';
import fs from 'fs-extra';
// Single Turndown converter instance, shared by every `saveResource` call in MyPlugin below.
const turndownService = new TurndownService();
/**
 * website-scraper plugin that replaces the default save step: each scraped
 * resource is written as a `.txt` file containing the text of the page's
 * `.content-body` element (an empty file when that element is absent).
 */
class MyPlugin {
  #directory;

  /**
   * @param {string} [directory] - Directory to write into. When omitted, the
   *   module-level `options.directory` is read at save time, so the existing
   *   `new MyPlugin()` call keeps working unchanged.
   */
  constructor(directory) {
    this.#directory = directory;
  }

  /**
   * Build the absolute output path for a resource, swapping whatever
   * extension it has (or none) for `.txt`.
   *
   * @param {string} directory - Output directory, resolved against the cwd.
   * @param {string} resourceFilename - Filename reported by the resource.
   * @returns {string} Absolute path ending in `.txt`.
   */
  static toTextPath(directory, resourceFilename) {
    const absoluteDirectoryPath = path.resolve(process.cwd(), directory);
    // path.parse handles any extension length; slicing a fixed 4 characters
    // only worked for "html" and mangled names such as "style.css".
    const { dir, name } = path.parse(path.join(absoluteDirectoryPath, resourceFilename));
    return path.join(dir, `${name}.txt`);
  }

  /**
   * Register this plugin's handlers with the scraper.
   *
   * @param {Function} registerAction - Called as `registerAction(name, handler)`.
   */
  apply(registerAction) {
    registerAction('error', async ({ error }) => {
      console.error(error);
    });
    registerAction('onResourceSaved', ({ resource }) => console.log(`Resource ${resource.url} saved!`));
    registerAction('onResourceError', ({ resource, error }) => console.log(`Resource ${resource.url} has error ${error}`));
    registerAction('saveResource', async ({ resource }) => {
      const filename = MyPlugin.toTextPath(this.#directory ?? options.directory, resource.getFilename());
      const dom = new JSDOM(resource.getText());
      let text = '';
      try {
        const contentBody = dom.window.document.querySelector('.content-body');
        if (contentBody) {
          // NOTE(review): textContent strips all markup before Turndown sees
          // it, so the output is flat text rather than Markdown. Pass
          // `contentBody.innerHTML` instead if Markdown structure is wanted.
          text = turndownService.turndown(contentBody.textContent);
        }
      } finally {
        // Release the per-page DOM; a recursive crawl creates one per resource.
        dom.window.close();
      }
      console.log(`Saving resource ${filename} with content ${text.substring(0, 12)}...`);
      // Await the write so the scraper (and the merge step that runs after it)
      // only proceeds once the file exists, and so write errors propagate.
      await fs.outputFile(filename, text, { encoding: resource.getEncoding() });
    });
  }
}
// Scraper configuration. It is also read by MyPlugin (for `directory`) and by
// the merge step at the bottom of the file.
const options = {
  urls: ['http://localhost:3001/docs'],
  // Accept only URLs that begin with the docs prefix.
  urlFilter: (url) => url.startsWith('http://localhost:3001/docs'),
  // Millisecond timestamp suffix gives each run its own output directory.
  directory: `output${Date.now()}`,
  recursive: true,
  maxRecursiveDepth: 5,
  plugins: [new MyPlugin()],
  //filenameGenerator: 'bySiteStructure',
};
// Run the scrape and wait for it to finish.
const result = await scrape(options);
// Then merge the .txt files found directly inside the output directory into
// a single output.txt, separated by newlines.
const files = await fs.readdir(options.directory);
const textFiles = files.filter((file) => file.endsWith('.txt'));
const contents = [];
for (const file of textFiles) {
  const filePath = path.join(options.directory, file);
  contents.push(fs.readFileSync(filePath, { encoding: 'utf8' }));
}
const output = contents.join('\n');
await fs.outputFile(path.join(options.directory, 'output.txt'), output);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment