Skip to content

Instantly share code, notes, and snippets.

@freshyill
Created January 23, 2024 17:56
Show Gist options
  • Save freshyill/30a4581971edf6550f7595106ec37afc to your computer and use it in GitHub Desktop.
Quick and dirty scraper
{
  "name": "scrape",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^1.0.0-rc.12",
    "js-yaml": "^4.1.0",
    "node-html-markdown": "^1.3.0",
    "superagent": "^8.1.2"
  }
}
const fs = require("fs");
const superagent = require("superagent");
const cheerio = require("cheerio");
const yaml = require("js-yaml");
const { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } = require("node-html-markdown");
// Read the list of person URLs, one per line.
// Blank lines are dropped so a trailing newline in people.txt doesn't
// produce an empty "URL" that gets scraped (and written as ".md").
const urlList = fs
  .readFileSync("people.txt", "utf-8")
  .split("\n")
  .map((url) => url.trim())
  .filter((url) => url.length > 0);
// Here's a helper to get images (not used here currently, but easily could be)
// Downloads `url` and writes the raw bytes to `localPath`.
async function downloadImage(url, localPath) {
  // .buffer(true) forces superagent to buffer the response into
  // response.body even for binary/unrecognized content types (images);
  // without it, response.body can be undefined in Node.
  const response = await superagent.get(url).buffer(true);
  fs.writeFileSync(localPath, response.body);
}
// Separate function because it wouldn't work inline for some reason
// Replaces non-breaking spaces (U+00A0) with regular spaces.
// The original pattern was /\s/g, which matches ALL whitespace (tabs,
// newlines, etc.) — almost certainly a copy-paste mangling of the
// literal nbsp character. Restrict it to what the name promises.
function replaceNbsp(input) {
  return input.replace(/\u00a0/g, " ");
}
// Fetches one person page and extracts its fields.
// Returns { frontmatter, body } on success, or { url, error: true } on
// failure (callers must check `error` before using `frontmatter`).
async function scrapeUrl(url) {
  try {
    const response = await superagent.get(url);
    const $ = cheerio.load(response.text);

    // Slug is the URL path after /person/, capped at 100 chars.
    const slug = url
      .replace("https://original-site.domain/person/", "")
      .slice(0, 100);

    const title_scrape = $("title").text();
    const title = title_scrape.replace(
      " | Whatever from the page title",
      ""
    );

    const given_name = $(".field--name-field-given-name").text();
    const surname = $(".field--name-field-surname").text();
    const email = $(".field--name-field-email").text().toLowerCase();
    const role = $(".person__role").text();
    const job_title = $(".layout--fourcol-section .field--name-field-job-title").text();

    // Affiliation is derived from the org link's href, e.g. /org/foo -> foo.
    const affiliation_scrape = $(".block-field-blocknodepersonfield-organization h2 a").attr("href") || "";
    const affiliation = affiliation_scrape.replace("/org/", "");

    // Meta description, cleaned and truncated to 159 chars (SEO limit).
    const description_scrape = $('meta[name="description"]').attr("content") || "";
    const description_pass1 = description_scrape
      .replace(/"/g, "")
      .replace(/\n/g, " ")
      .slice(0, 159);
    const description = replaceNbsp(description_pass1);

    // Get the body HTML and convert it to Markdown.
    // .html() returns null when the element is missing, and
    // NodeHtmlMarkdown.translate(null) throws — default to "" so a
    // person without a body doesn't fail entirely.
    const body_scrape = $(".field--name-body").html() || "";
    const bodyMarkdown = NodeHtmlMarkdown.translate(body_scrape);

    const summary_scrape =
      $(".layout--fourcol-section .field--type-text-with-summary").text() || "";
    // NOTE: the original also had .replace(/ /g, " ") here — a no-op
    // (space to space, likely a mangled nbsp); replaceNbsp below covers it.
    const summary_pass1 = summary_scrape
      .replace(/(<([^> ]+)>)/gi, "")
      .trim();
    const summary = replaceNbsp(summary_pass1);

    let main_image_filename = $(".person__headshot img").attr("src") || "";
    // Keeping this step-by-step rather than making efficient regexes
    // Renaming things because the sources are a mess
    // I'm downloading images from a list separately, but it could be done here too
    let main_image = "";
    if (main_image_filename !== "") {
      main_image = main_image_filename
        .toLowerCase()
        .replace(/ /g, "-")
        .replace(/_/g, "-")
        .replace(/%20/g, "-")
        .replace("/sites/default/files/styles/headshot/public/", "/images")
        .replace(/\d{4}-\d{2}/, "")  // drop Drupal's YYYY-MM upload folder
        .replace(/\?.*/, "");        // drop cache-busting query string
    }

    // I want this to end up in an object
    const meta = {
      description,
    };

    return {
      frontmatter: {
        original_url: url,
        name: title,
        slug,
        given_name,
        surname,
        role,
        job_title,
        affiliation,
        email,
        main_image,
        summary,
        meta,
      },
      body: bodyMarkdown,
    };
  } catch (error) {
    console.error(`Error scraping ${url}: ${error.message}`);
    return { url, error: true };
  }
}
// Process each URL and save to a Markdown file with YAML frontmatter.
// A for...of loop inside an async IIFE is used instead of the original
// urlList.forEach(async ...) — `await` inside forEach is silently
// ignored, so every request fired at once and errors were unhandled.
(async () => {
  for (const url of urlList) {
    const result = await scrapeUrl(url);
    if (result.error) {
      // scrapeUrl already logged the failure; without this guard,
      // result.frontmatter is undefined and a broken file gets written.
      continue;
    }
    const frontmatter = yaml.dump(result.frontmatter);
    const markdownContent = `---\n${frontmatter}---\n\n${result.body}`;
    const fileName = `${url.replace(
      "https://original-site.domain/person/",
      ""
    )}.md`;
    fs.writeFileSync("../../src/people/" + fileName, markdownContent, "utf-8");
    console.log(`Scraped ${url} and saved data to ${fileName}`);
  }
})();
https://original-site.domain/person/person-name-1
https://original-site.domain/person/person-name-2
https://original-site.domain/person/person-name-3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment