Created
January 23, 2024 17:56
-
-
Save freshyill/30a4581971edf6550f7595106ec37afc to your computer and use it in GitHub Desktop.
Quick and dirty scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "scrape",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^1.0.0-rc.12",
    "js-yaml": "^4.1.0",
    "node-html-markdown": "^1.3.0",
    "superagent": "^8.1.2"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs"); | |
const superagent = require("superagent"); | |
const cheerio = require("cheerio"); | |
const yaml = require("js-yaml"); | |
const { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } = require("node-html-markdown"); | |
const urlList = fs | |
.readFileSync("people.txt", "utf-8") | |
.split("\n") | |
.map((url) => url.trim()); | |
// Helper to fetch a remote image and write it to disk. Not wired into the
// current flow, but kept as a convenience for pulling headshots later.
async function downloadImage(url, localPath) {
  const { body } = await superagent.get(url);
  fs.writeFileSync(localPath, body);
}
// Collapse non-breaking spaces (U+00A0) into regular spaces, as the name
// promises. The original used /\s/g, which also flattened tabs and newlines —
// much broader than "nbsp" and a silent source of text mangling.
// (Kept as a separate function because it reportedly wouldn't work inline.)
function replaceNbsp(input) {
  return input.replace(/\u00A0/g, " ");
}
/**
 * Scrape a single person page and shape it for a Markdown file with YAML
 * frontmatter.
 *
 * @param {string} url - Full URL of the person page.
 * @returns {Promise<object>} `{ frontmatter, body }` on success, or
 *   `{ url, error: true }` when the request or parsing fails.
 */
async function scrapeUrl(url) {
  try {
    const response = await superagent.get(url);
    const $ = cheerio.load(response.text);

    // Derive a filename-safe slug from the URL path, capped at 100 chars.
    const slug = url
      .replace("https://original-site.domain/person/", "")
      .slice(0, 100);

    // The <title> carries a site-wide suffix we don't want in frontmatter.
    const title_scrape = $("title").text();
    const title = title_scrape.replace(
      " | Whatever from the page title",
      ""
    );

    const given_name = $(".field--name-field-given-name").text();
    const surname = $(".field--name-field-surname").text();
    const email = $(".field--name-field-email").text().toLowerCase();
    const role = $(".person__role").text();
    const job_title = $(".layout--fourcol-section .field--name-field-job-title").text();

    // The affiliation is a link like "/org/<slug>"; strip the prefix.
    const affiliation_scrape = $(".block-field-blocknodepersonfield-organization h2 a").attr("href") || "";
    const affiliation = affiliation_scrape.replace("/org/", "");

    // Meta description: drop quotes (they break unquoted YAML), flatten
    // newlines, and cap at a typical meta-description length.
    const description_scrape = $('meta[name="description"]').attr("content") || "";
    const description_pass1 = description_scrape
      .replace(/"/g, "")
      .replace(/\n/g, " ")
      .slice(0, 159);
    const description = replaceNbsp(description_pass1);

    // Get the body HTML and convert it to Markdown.
    // cheerio's .html() returns null when the element is missing; fall back
    // to "" so NodeHtmlMarkdown.translate() doesn't throw on pages that have
    // no body field (previously the whole page became an error).
    const body_scrape = $(".field--name-body").html() || "";
    const bodyMarkdown = NodeHtmlMarkdown.translate(body_scrape);

    const summary_scrape =
      $(".layout--fourcol-section .field--type-text-with-summary").text() || "";
    // NOTE(review): the first replace below swaps a space for a space — a
    // no-op. It looks like a literal \u00A0 was lost in copy/paste; confirm
    // against the original source before "fixing" it.
    const summary_pass1 = summary_scrape
      .replace(/ /g, " ")
      .replace(/(<([^> ]+)>)/gi, "")
      .trim();
    const summary = replaceNbsp(summary_pass1);

    let main_image_filename = $(".person__headshot img").attr("src") || "";
    // Keeping this step-by-step rather than making efficient regexes.
    // Renaming things because the sources are a mess.
    // Images are downloaded from a list separately, but it could be done here too.
    let main_image = "";
    if (main_image_filename !== "") {
      main_image = main_image_filename
        .toLowerCase()
        .replace(/ /g, "-")
        .replace(/_/g, "-")
        .replace(/%20/g, "-")
        .replace("/sites/default/files/styles/headshot/public/", "/images")
        .replace(/\d{4}-\d{2}/, "")
        .replace(/\?.*/, "");
    }

    // Nest the description so it lands under a `meta:` key in the YAML.
    const meta = {
      description,
    };

    return {
      frontmatter: {
        original_url: url,
        name: title,
        slug,
        given_name,
        surname,
        role,
        job_title,
        affiliation,
        email,
        main_image,
        summary,
        meta,
      },
      body: bodyMarkdown,
    };
  } catch (error) {
    // Best-effort: log and return an error marker so one bad page doesn't
    // abort the whole run. Callers must check `result.error`.
    console.error(`Error scraping ${url}: ${error.message}`);
    return { url, error: true };
  }
}
// Process each URL and save to a Markdown file with YAML frontmatter.
// An async IIFE with for...of replaces the original `urlList.forEach(async ...)`:
// the awaits actually sequence the requests (politer to the origin server)
// and no rejected promise is left floating.
(async () => {
  for (const url of urlList) {
    const result = await scrapeUrl(url);
    // scrapeUrl signals failure with { url, error: true }; skip those so we
    // don't dump undefined frontmatter into a broken .md file.
    if (result.error) {
      console.error(`Skipping ${url} due to scrape error`);
      continue;
    }
    const frontmatter = yaml.dump(result.frontmatter);
    const markdownContent = `---\n${frontmatter}---\n\n${result.body}`;
    const fileName = `${url.replace(
      "https://original-site.domain/person/",
      ""
    )}.md`;
    fs.writeFileSync("../../src/people/" + fileName, markdownContent, "utf-8");
    console.log(`Scraped ${url} and saved data to ${fileName}`);
  }
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://original-site.domain/person/person-name-1
https://original-site.domain/person/person-name-2
https://original-site.domain/person/person-name-3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment