Created
January 23, 2024 17:56
-
-
Save freshyill/30a4581971edf6550f7595106ec37afc to your computer and use it in GitHub Desktop.
Quick and dirty scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "scrape",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^1.0.0-rc.12",
    "js-yaml": "^4.1.0",
    "node-html-markdown": "^1.3.0",
    "superagent": "^8.1.2"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs"); | |
const superagent = require("superagent"); | |
const cheerio = require("cheerio"); | |
const yaml = require("js-yaml"); | |
const { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } = require("node-html-markdown"); | |
const urlList = fs | |
.readFileSync("people.txt", "utf-8") | |
.split("\n") | |
.map((url) => url.trim()); | |
// Helper to fetch a remote image and write it to disk. Not wired into the
// current flow, but kept as a convenience for pulling headshots later.
async function downloadImage(url, localPath) {
  const { body } = await superagent.get(url);
  fs.writeFileSync(localPath, body);
}
// Collapse non-breaking spaces (U+00A0) into regular spaces, as the name
// promises. The original used /\s/g, which also flattened tabs and newlines —
// much broader than "nbsp" and a silent source of text mangling.
// (Kept as a separate function because it reportedly wouldn't work inline.)
function replaceNbsp(input) {
  return input.replace(/\u00A0/g, " ");
}
/**
 * Scrape a single person page and shape it for a Markdown file with YAML
 * frontmatter.
 *
 * @param {string} url - Full URL of the person page.
 * @returns {Promise<object>} `{ frontmatter, body }` on success, or
 *   `{ url, error: true }` when the request or parsing fails.
 */
async function scrapeUrl(url) {
  try {
    const response = await superagent.get(url);
    const $ = cheerio.load(response.text);

    // Derive a filename-safe slug from the URL path, capped at 100 chars.
    const slug = url
      .replace("https://original-site.domain/person/", "")
      .slice(0, 100);

    // The <title> carries a site-wide suffix we don't want in frontmatter.
    const title_scrape = $("title").text();
    const title = title_scrape.replace(
      " | Whatever from the page title",
      ""
    );

    const given_name = $(".field--name-field-given-name").text();
    const surname = $(".field--name-field-surname").text();
    const email = $(".field--name-field-email").text().toLowerCase();
    const role = $(".person__role").text();
    const job_title = $(".layout--fourcol-section .field--name-field-job-title").text();

    // The affiliation is a link like "/org/<slug>"; strip the prefix.
    const affiliation_scrape = $(".block-field-blocknodepersonfield-organization h2 a").attr("href") || "";
    const affiliation = affiliation_scrape.replace("/org/", "");

    // Meta description: drop quotes (they break unquoted YAML), flatten
    // newlines, and cap at a typical meta-description length.
    const description_scrape = $('meta[name="description"]').attr("content") || "";
    const description_pass1 = description_scrape
      .replace(/"/g, "")
      .replace(/\n/g, " ")
      .slice(0, 159);
    const description = replaceNbsp(description_pass1);

    // Get the body HTML and convert it to Markdown.
    // cheerio's .html() returns null when the element is missing; fall back
    // to "" so NodeHtmlMarkdown.translate() doesn't throw on pages that have
    // no body field (previously the whole page became an error).
    const body_scrape = $(".field--name-body").html() || "";
    const bodyMarkdown = NodeHtmlMarkdown.translate(body_scrape);

    const summary_scrape =
      $(".layout--fourcol-section .field--type-text-with-summary").text() || "";
    // NOTE(review): the first replace below swaps a space for a space — a
    // no-op. It looks like a literal \u00A0 was lost in copy/paste; confirm
    // against the original source before "fixing" it.
    const summary_pass1 = summary_scrape
      .replace(/ /g, " ")
      .replace(/(<([^> ]+)>)/gi, "")
      .trim();
    const summary = replaceNbsp(summary_pass1);

    let main_image_filename = $(".person__headshot img").attr("src") || "";
    // Keeping this step-by-step rather than making efficient regexes.
    // Renaming things because the sources are a mess.
    // Images are downloaded from a list separately, but it could be done here too.
    let main_image = "";
    if (main_image_filename !== "") {
      main_image = main_image_filename
        .toLowerCase()
        .replace(/ /g, "-")
        .replace(/_/g, "-")
        .replace(/%20/g, "-")
        .replace("/sites/default/files/styles/headshot/public/", "/images")
        .replace(/\d{4}-\d{2}/, "")
        .replace(/\?.*/, "");
    }

    // Nest the description so it lands under a `meta:` key in the YAML.
    const meta = {
      description,
    };

    return {
      frontmatter: {
        original_url: url,
        name: title,
        slug,
        given_name,
        surname,
        role,
        job_title,
        affiliation,
        email,
        main_image,
        summary,
        meta,
      },
      body: bodyMarkdown,
    };
  } catch (error) {
    // Best-effort: log and return an error marker so one bad page doesn't
    // abort the whole run. Callers must check `result.error`.
    console.error(`Error scraping ${url}: ${error.message}`);
    return { url, error: true };
  }
}
// Process each URL and save to a Markdown file with YAML frontmatter.
// An async IIFE with for...of replaces the original `urlList.forEach(async ...)`:
// the awaits actually sequence the requests (politer to the origin server)
// and no rejected promise is left floating.
(async () => {
  for (const url of urlList) {
    const result = await scrapeUrl(url);
    // scrapeUrl signals failure with { url, error: true }; skip those so we
    // don't dump undefined frontmatter into a broken .md file.
    if (result.error) {
      console.error(`Skipping ${url} due to scrape error`);
      continue;
    }
    const frontmatter = yaml.dump(result.frontmatter);
    const markdownContent = `---\n${frontmatter}---\n\n${result.body}`;
    const fileName = `${url.replace(
      "https://original-site.domain/person/",
      ""
    )}.md`;
    fs.writeFileSync("../../src/people/" + fileName, markdownContent, "utf-8");
    console.log(`Scraped ${url} and saved data to ${fileName}`);
  }
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://original-site.domain/person/person-name-1
https://original-site.domain/person/person-name-2
https://original-site.domain/person/person-name-3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment