Scrape Grace Gems Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import chalk from 'chalk';
import clear from 'clear';
import CLI from 'clui';
import figlet from 'figlet';
import fs from 'fs';
import jsonFormat from 'json-format';
import kebabCase from 'lodash.kebabcase';
import startcase from 'lodash.startcase';
import tolower from 'lodash.tolower';
import truncate from 'lodash.truncate';
import minimist from 'minimist';
import fetch from 'node-fetch';
import puppeteer from 'puppeteer';
import { createApi } from 'unsplash-js';
// Command-line options parsed from everything after `node <script>`.
// `argv.author` is the option read by getSermonLinks.
// minimist is loaded through an ES `import` like every other dependency of
// this file; `require` is not defined when the file runs as a native ES module.
const argv = minimist(process.argv.slice(2));

// Publish node-fetch as the global `fetch`.
// NOTE(review): presumably needed because unsplash-js calls a global fetch — confirm.
global.fetch = fetch;
// Handle on the original console.warn, used to forward every message that is
// not filtered out below.
const warn = console.warn;

/**
 * Replacement for console.warn that drops any message containing
 * 'waitFor is deprecated' and forwards everything else unchanged.
 * NOTE(review): presumably this silences Puppeteer's page.waitFor deprecation
 * notice — confirm.
 *
 * Only string messages are inspected. Errors, `undefined`, numbers and other
 * non-string first arguments are always forwarded instead of crashing on a
 * missing `includes` method.
 *
 * @param {*} msg - first argument given to console.warn.
 * @param {...*} args - remaining arguments, forwarded as-is.
 */
console.warn = (msg, ...args) => {
  const isSuppressed = typeof msg === 'string' && msg.includes('waitFor is deprecated');
  if (!isSuppressed) {
    warn.call(console, msg, ...args);
  }
};
/** Pause (in milliseconds) inserted after every page navigation. */
const DELAY = 100;

/** Index page the scrape starts from; opened by getSermonAuthorsLinks. */
const SERMONS = 'https://www.gracegems.org/SERMONS.htm';

/** Value passed as the `length` option to lodash.truncate in _truncate. */
const TRUNCATE = 250;

/** Exact hrefs that getLinks always discards. */
const blacklist = [
  'http://gracegems.org/photography.htm',
  'http://gracegems.org/',
];

/**
 * hrefs containing any of these alternatives are discarded by getLinks too.
 * It is applied with String.prototype.match there.
 */
const blacklistMatchers = /javascript:void\(0\)|mailto|caudillwebsolutions|sermonaudio/gm;
/**
 * Unsplash API client built with unsplash-js' `createApi`.
 *
 * The access key is read from the UNSPLASH_ACCESS_KEY environment variable
 * when it is set, so the credential can be supplied and rotated without
 * editing this file.
 *
 * FIXME: the fallback literal is a credential committed to source control. It
 * is kept only so existing invocations keep working; it should be revoked and
 * removed.
 */
const unsplash = createApi({
  accessKey: process.env.UNSPLASH_ACCESS_KEY ?? 'pLS36KfA5kmNERfHV1CyvaZgU4j_xGwdaSjhe3_ZduE',
});
/**
 * Clears the terminal, prints the "Sermonizer" banner and launches a
 * Puppeteer browser with one new page, showing a spinner while doing so.
 *
 * The spinner is stopped on every exit path. If the browser was launched but
 * the page could not be opened, the browser is closed again before the error
 * is rethrown, so no browser process is left behind.
 *
 * @returns {Promise<Array>} `[browser, page]` as returned by
 *   `puppeteer.launch()` and `browser.newPage()`.
 * @throws Rethrows whatever `puppeteer.launch()` or `browser.newPage()` rejects with.
 */
async function init() {
  clear();
  console.log(chalk.yellow(figlet.textSync('Sermonizer', { horizontalLayout: 'full' })));
  const status = newStatus('🚀 Launching browser...');
  status.start();
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    return [browser, page];
  } catch (err) {
    if (browser) {
      // Best-effort cleanup: a failure while closing must not mask the
      // original error, which is the one worth reporting.
      await browser.close().catch(() => {});
    }
    throw err;
  } finally {
    status.stop();
  }
}
/**
 * Collects the `href` of every `<a>` element on the page and drops the ones
 * that are blacklisted, either by exact match (`blacklist`) or by pattern
 * (`blacklistMatchers`).
 *
 * @param {object} page - Puppeteer page (anything exposing `$$eval`).
 * @returns {Promise<string[]>} the remaining hrefs, in the order `$$eval` returned them.
 */
async function getLinks(page) {
  const hrefs = await page.$$eval('a', (anchors) => anchors.map((anchor) => anchor.href));
  const kept = [];
  for (const href of hrefs) {
    if (blacklist.includes(href)) {
      continue;
    }
    if (href.match(blacklistMatchers)) {
      continue;
    }
    kept.push(href);
  }
  return kept;
}
/**
 * Lower-cases `str` with lodash.tolower, then passes the result through
 * lodash.startcase.
 *
 * @param {*} str - text to convert.
 * @returns {string} whatever `startcase` returns for the lower-cased input.
 */
function _titlecase(str) {
  const lowered = tolower(str);
  return startcase(lowered);
}
/**
 * Shortens `str` with lodash.truncate, using `TRUNCATE` as the `length` option.
 *
 * @param {*} str - text to shorten.
 * @returns {string} whatever `truncate` returns for that input.
 */
function _truncate(str) {
  const options = { length: TRUNCATE };
  return truncate(str, options);
}
/**
 * Extracts sermon metadata from the page that is currently loaded.
 *
 * - title: text of the first `p[align=center]` element.
 * - author: text of the second `p[align=center]` element, digits removed.
 * - description: text of the first `p[align=justify]` element whose text is
 *   longer than two characters.
 *
 * @param {object} page - Puppeteer page (anything exposing `$$eval`).
 * @returns {Promise<{id: string, author: string, title: string, description: string}>}
 *   `id` is the kebab-cased raw title, `author` and `title` go through
 *   `_titlecase`, and `description` goes through `_truncate`.
 */
async function getMeta(page) {
  const centeredSelector = 'p[align=center]';
  const justifiedSelector = 'p[align=justify]';

  // The callbacks given to $$eval are evaluated by Puppeteer in the page, so
  // they must stay self-contained and not close over local variables.
  const rawTitle = await page.$$eval(centeredSelector, (paragraphs) => paragraphs[0]?.innerText);
  const rawAuthor = await page.$$eval(centeredSelector, (paragraphs) => paragraphs[1]?.innerText);
  const rawDescription = await page.$$eval(justifiedSelector, (paragraphs) => {
    const firstWithText = paragraphs.find((paragraph) => paragraph?.innerText?.length > 2);
    return firstWithText?.innerText;
  });

  const id = kebabCase(rawTitle);
  const author = _titlecase(rawAuthor?.replace(/[0-9]/gm, ''));
  const title = _titlecase(rawTitle);
  const description = _truncate(rawDescription);
  return { id, author, title, description };
}
/**
 * Opens the sermon index page (`SERMONS`), pauses for `DELAY` milliseconds
 * and returns the links that `getLinks` finds on it.
 *
 * @param {object} page - Puppeteer page used for the navigation.
 * @returns {Promise<string[]>} result of `getLinks(page)`.
 */
async function getSermonAuthorsLinks(page) {
  await page.goto(SERMONS);
  // page.waitFor() is deprecated in Puppeteer; a plain timer gives the same
  // fixed pause without depending on that API.
  await new Promise((resolve) => setTimeout(resolve, DELAY));
  return getLinks(page);
}
/**
 * Opens the page of the author selected with `--author` and returns the links
 * that `getLinks` finds on it.
 *
 * The author page is the first entry of `sermonAuthorLinks` that contains the
 * `--author` value, compared case-insensitively through lodash.tolower. When
 * `--author` is omitted, `tolower` yields an empty string, which every link
 * contains, so the first link is used.
 *
 * @param {object} page - Puppeteer page used for the navigation.
 * @param {string[]} sermonAuthorLinks - candidate author page URLs.
 * @returns {Promise<string[]>} result of `getLinks(page)`.
 * @throws {Error} when no entry of `sermonAuthorLinks` matches, instead of
 *   letting `page.goto(undefined)` fail with an unrelated message.
 */
async function getSermonLinks(page, sermonAuthorLinks) {
  // Normalise the needle once rather than once per candidate link.
  const wantedAuthor = tolower(argv.author);
  const authorLink = sermonAuthorLinks.find((link) => tolower(link).includes(wantedAuthor));
  if (authorLink === undefined) {
    throw new Error(`No author page link matches --author "${argv.author}"`);
  }
  await page.goto(authorLink);
  // page.waitFor() is deprecated in Puppeteer; a plain timer gives the same
  // fixed pause without depending on that API.
  await new Promise((resolve) => setTimeout(resolve, DELAY));
  return getLinks(page);
}
/**
 * Visits every sermon link one after another and collects the metadata that
 * `getMeta` extracts from each page.
 *
 * The pages are loaded sequentially because they all share the single `page`.
 *
 * @param {object} page - Puppeteer page used for the navigation.
 * @param {Iterable<string>} sermonLinks - URLs of the sermon pages.
 * @returns {Promise<object[]>} one `getMeta` result per link, in link order.
 */
async function getSermonMetas(page, sermonLinks) {
  const metas = [];
  for (const link of sermonLinks) {
    await page.goto(link);
    // page.waitFor() is deprecated in Puppeteer; a plain timer gives the same
    // fixed pause without depending on that API.
    await new Promise((resolve) => setTimeout(resolve, DELAY));
    metas.push(await getMeta(page));
  }
  return metas;
}
/**
 * Builds a clui spinner whose message is `str` coloured with chalk.
 *
 * @param {string} str - message shown next to the spinner.
 * @param {string} [color='blueBright'] - name of the chalk function used to colour the message.
 * @returns {object} a new `CLI.Spinner`; the caller starts and stops it.
 */
function newStatus(str, color = 'blueBright') {
  const label = chalk[color](str);
  const spinner = new CLI.Spinner(label);
  return spinner;
}
/**
 * Runs `task` while a spinner labelled `label` is displayed. The spinner is
 * stopped whether the task resolves or rejects.
 *
 * @param {string} label - spinner message.
 * @param {function(): Promise<*>} task - work to perform while the spinner runs.
 * @returns {Promise<*>} whatever `task` resolves with.
 */
async function withStatus(label, task) {
  const status = newStatus(label);
  status.start();
  try {
    return await task();
  } finally {
    status.stop();
  }
}

/**
 * Script entry point.
 *
 * 1. Launches the browser (`init`).
 * 2. Collects the author links, then the sermon links of the selected author,
 *    then the metadata of every sermon.
 * 3. Writes the complete metadata records to `sermons.json`.
 * 4. Searches Unsplash for 'praying' photos and logs the first hit.
 *
 * The browser is closed on every exit path, so a failure midway does not
 * leave a browser process running.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [browser, page] = await init();
  try {
    const sermonAuthorLinks = await withStatus('Fetching author pages...', () =>
      getSermonAuthorsLinks(page)
    );
    const sermonLinks = await withStatus('Fetching sermon pages from authors...', () =>
      getSermonLinks(page, sermonAuthorLinks)
    );
    const scrapedMetas = await withStatus('Fetching sermon meta data...', () =>
      getSermonMetas(page, sermonLinks)
    );

    // Keep only the records in which every field has a truthy value.
    const sermonMetas = scrapedMetas.filter((meta) => Object.values(meta).every(Boolean));

    // Awaited so that a write failure rejects main() and the success message
    // below is only printed once the file is on disk.
    await fs.promises.writeFile('sermons.json', jsonFormat({ data: sermonMetas }));

    const photos = await unsplash.search.getPhotos({
      query: 'praying',
      page: 1,
      perPage: 10,
    });
    // unsplash-js reports the outcome in `type` on the returned object itself;
    // there is no nested `result` property.
    if (photos?.type === 'success') {
      const firstPhoto = photos.response?.results?.[0];
      console.log(firstPhoto);
    }

    console.log(chalk.greenBright('🌟 All sermon meta data uploaded.'));
  } finally {
    await browser.close();
  }
}

main().catch((err) => {
  // Report the failure and signal it through the exit code instead of
  // leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment