Skip to content

Instantly share code, notes, and snippets.

@michaelmang
Created December 9, 2020 21:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelmang/8eb18e26743d5ec9b6bd73f99c9dc3ff to your computer and use it in GitHub Desktop.
Save michaelmang/8eb18e26743d5ec9b6bd73f99c9dc3ff to your computer and use it in GitHub Desktop.
Scrape Grace Gems Script
import chalk from 'chalk';
import clear from 'clear';
import CLI from 'clui';
import figlet from 'figlet';
import fs from 'fs';
import jsonFormat from 'json-format';
import kebabCase from 'lodash.kebabcase';
import startcase from 'lodash.startcase';
import tolower from 'lodash.tolower';
import truncate from 'lodash.truncate';
import minimist from 'minimist';
import fetch from 'node-fetch';
import puppeteer from 'puppeteer';
import { createApi } from 'unsplash-js';
// Parsed command-line options (everything after the node binary and script path).
// Uses the ES-module import of minimist rather than `require`, which is not
// defined when this file runs as a native ES module.
const argv = minimist(process.argv.slice(2));
// Expose node-fetch as the global `fetch`.
// NOTE(review): presumably needed by the unsplash-js client — confirm.
global.fetch = fetch;
// Keep a handle on the original implementation so filtered calls can be forwarded.
const warn = console.warn;

/**
 * Replacement for console.warn that drops any warning whose first argument is
 * a string containing 'waitFor is deprecated' (presumably triggered by the
 * page.waitFor calls in this script) and forwards everything else unchanged.
 *
 * Non-string first arguments (objects, Errors, or no arguments at all) are
 * forwarded as-is instead of throwing on a missing `.includes` method.
 */
console.warn = (...args) => {
  const [msg] = args;
  if (typeof msg === 'string' && msg.includes('waitFor is deprecated')) {
    return;
  }
  warn.apply(console, args);
};
// Index page that getSermonAuthorsLinks navigates to.
const SERMONS = 'https://www.gracegems.org/SERMONS.htm';

// Value handed to page.waitFor after every navigation.
const DELAY = 100;

// `length` option passed to lodash truncate for sermon descriptions.
const TRUNCATE = 250;

// Exact hrefs that getLinks always discards.
const blacklist = [
  'http://gracegems.org/photography.htm',
  'http://gracegems.org/',
];

// Any href matching one of these alternatives is discarded by getLinks as well.
const blacklistMatchers = /javascript:void\(0\)|mailto|caudillwebsolutions|sermonaudio/gm;
// Unsplash API client used by the main routine to search for photos.
const unsplash = createApi({
  // Prefer a key supplied through the UNSPLASH_ACCESS_KEY environment variable
  // (an empty value counts as unset) so the credential does not have to live
  // in source control.
  // SECURITY NOTE(review): the literal fallback is a credential committed to
  // the repository; it is kept only for backward compatibility and should be
  // rotated and removed.
  accessKey: process.env.UNSPLASH_ACCESS_KEY || 'pLS36KfA5kmNERfHV1CyvaZgU4j_xGwdaSjhe3_ZduE',
});
/**
 * Clears the terminal, prints the 'Sermonizer' banner and launches a puppeteer
 * browser with a single new page, showing a spinner while doing so.
 *
 * @returns {Promise<Array>} A `[browser, page]` pair.
 * @throws Rethrows whatever `puppeteer.launch()` or `browser.newPage()` rejects with.
 */
async function init() {
  clear();
  console.log(chalk.yellow(figlet.textSync('Sermonizer', { horizontalLayout: 'full' })));
  const status = newStatus('🚀 Launching browser...');
  status.start();
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    return [browser, page];
  } catch (err) {
    // If the browser started but the page could not be opened, close the
    // browser so it is not left running. Cleanup is best-effort: the original
    // failure is the error worth reporting.
    await browser?.close().catch(() => {});
    throw err;
  } finally {
    // Stop the spinner on success and on failure alike.
    status.stop();
  }
}
/**
 * Collects the `href` of every anchor on the current page, leaving out hrefs
 * that are listed in `blacklist` or that match `blacklistMatchers`.
 *
 * @param page - Page object exposing `$$eval`.
 * @returns {Promise<string[]>} The remaining hrefs, in document order.
 */
async function getLinks(page) {
  const hrefs = await page.$$eval('a', (anchors) => anchors.map((anchor) => anchor.href));

  const isAllowed = (href) => {
    if (blacklist.includes(href)) {
      return false;
    }
    return !href.match(blacklistMatchers);
  };

  return hrefs.filter(isAllowed);
}
/**
 * Lower-cases `str` with lodash `tolower`, then applies lodash `startcase`
 * to the result.
 *
 * @param str - Text to convert.
 * @returns The converted text.
 */
function _titlecase(str) {
  const lowered = tolower(str);
  return startcase(lowered);
}
/**
 * Shortens `str` with lodash `truncate`, using `TRUNCATE` as the `length` option.
 *
 * @param str - Text to shorten.
 * @returns Whatever lodash `truncate` returns for that input.
 */
function _truncate(str) {
  const options = { length: TRUNCATE };
  return truncate(str, options);
}
/**
 * Extracts sermon metadata from the page that is currently loaded.
 *
 * The title is the text of the first `p[align=center]` element and the author
 * the text of the second; both are read with a single `$$eval` call rather
 * than querying the same selector twice. The description is the text of the
 * first `p[align=justify]` element whose text is longer than two characters.
 *
 * @param page - Page object exposing `$$eval`.
 * @returns {Promise<{id: string, author: string, title: string, description: string}>}
 *   `id` is the kebab-cased title, `author` has digits removed before
 *   title-casing, and `description` is passed through `_truncate`.
 */
async function getMeta(page) {
  const [title, author] = await page.$$eval('p[align=center]', (ps) =>
    ps.slice(0, 2).map((p) => p.innerText)
  );
  const description = await page.$$eval(
    'p[align=justify]',
    (ps) => ps.find((p) => p?.innerText?.length > 2)?.innerText
  );
  return {
    id: kebabCase(title),
    // Digits are removed from the author line before it is title-cased.
    author: _titlecase(author?.replace(/[0-9]/gm, '')),
    title: _titlecase(title),
    description: _truncate(description),
  };
}
/**
 * Navigates to the `SERMONS` index page, waits `DELAY`, and returns the links
 * that `getLinks` finds there.
 *
 * @param page - Page object exposing `goto` and `waitFor`.
 * @returns {Promise<string[]>} The filtered links from the index page.
 */
async function getSermonAuthorsLinks(page) {
  await page.goto(SERMONS);
  await page.waitFor(DELAY);

  const authorLinks = await getLinks(page);
  return authorLinks;
}
/**
 * Opens the author page selected by the `--author` command-line option and
 * returns the links found on it.
 *
 * The author page is the first entry of `sermonAuthorLinks` that contains the
 * requested author name, compared case-insensitively.
 *
 * @param page - Page object exposing `goto` and `waitFor`.
 * @param {string[]} sermonAuthorLinks - Candidate author page URLs.
 * @returns {Promise<string[]>} The filtered links from the matching author page.
 * @throws {Error} When no candidate link matches the requested author, instead
 *   of navigating to `undefined`.
 */
async function getSermonLinks(page, sermonAuthorLinks) {
  // Computed once rather than once per candidate link.
  // NOTE(review): when --author is omitted this is presumably an empty string,
  // which every link contains, so the first link is selected — confirm whether
  // that fallback is intended.
  const wantedAuthor = tolower(argv?.author);
  const authorLink = sermonAuthorLinks.find((link) => tolower(link).includes(wantedAuthor));
  if (!authorLink) {
    throw new Error(`No author page matching "${argv?.author}" was found.`);
  }
  await page.goto(authorLink);
  await page.waitFor(DELAY);
  return getLinks(page);
}
/**
 * Visits each sermon link in turn on the given page and gathers the metadata
 * that `getMeta` extracts from it.
 *
 * Links are processed strictly one after another: navigate, wait `DELAY`,
 * then read the metadata.
 *
 * @param page - Page object exposing `goto` and `waitFor`.
 * @param {string[]} sermonLinks - URLs of the sermon pages to visit.
 * @returns {Promise<object[]>} One metadata object per link, in input order.
 */
async function getSermonMetas(page, sermonLinks) {
  const metas = [];

  for (const url of sermonLinks) {
    await page.goto(url);
    await page.waitFor(DELAY);
    metas.push(await getMeta(page));
  }

  return metas;
}
/**
 * Builds a clui spinner whose label is `str` styled with the chalk style
 * named by `color`. The spinner is returned without being started.
 *
 * @param {string} str - Label text.
 * @param {string} [color='blueBright'] - Name of the chalk style to apply.
 * @returns A new `CLI.Spinner` instance.
 */
function newStatus(str, color = 'blueBright') {
  const label = chalk[color](str);
  const spinner = new CLI.Spinner(label);
  return spinner;
}
/**
 * Script entry point.
 *
 * Launches the browser, collects the author links, the sermon links for the
 * requested author and the metadata of every sermon, then writes the complete
 * entries to `sermons.json`. Afterwards it searches Unsplash for 'praying'
 * photos and logs the first result.
 *
 * The browser is closed whether or not a step fails, and any failure is
 * reported on stderr with a non-zero exit code instead of surfacing as an
 * unhandled rejection.
 */
(async () => {
  // Runs `task` behind a spinner labelled `message`; the spinner is stopped
  // even when the task rejects so a failure cannot leave it spinning.
  const withStatus = async (message, task) => {
    const status = newStatus(message);
    status.start();
    try {
      return await task();
    } finally {
      status.stop();
    }
  };

  const [browser, page] = await init();
  try {
    const sermonAuthorLinks = await withStatus('Fetching author pages...', () =>
      getSermonAuthorsLinks(page)
    );
    const sermonLinks = await withStatus('Fetching sermon pages from authors...', () =>
      getSermonLinks(page, sermonAuthorLinks)
    );
    const sermonMetas = await withStatus('Fetching sermon meta data...', async () => {
      const metas = await getSermonMetas(page, sermonLinks);
      // Keep only entries in which every field has a truthy value.
      return metas.filter((meta) => Object.values(meta).every((x) => !!x));
    });

    // Awaited so the success message below is only printed once the file is
    // written, and so a write error is handled with every other failure.
    await fs.promises.writeFile('sermons.json', jsonFormat({ data: sermonMetas }));

    const photos = await unsplash.search.getPhotos({
      query: 'praying',
      page: 1,
      perPage: 10,
    });
    // unsplash-js reports the outcome in the top-level `type` field of the
    // returned object; there is no nested `result` property.
    if (photos?.type === 'success') {
      const firstPhoto = photos.response?.results?.[0];
      console.log(firstPhoto);
    }
    console.log(chalk.greenBright('🌟 All sermon meta data uploaded.'));
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment