Last active
April 6, 2021 05:39
-
-
Save crazy4groovy/7a85404b14e3ae47785965d80a48e65f to your computer and use it in GitHub Desktop.
Simple web page HTML parser that converts selected elements into Markdown or JSON (Node.js)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fetch = require('node-fetch') | |
module.exports = async ({q, url}) => | |
fetch( | |
url | |
? decodeURIComponent(url) | |
: `https://en.wikipedia.org/wiki/${decodeURIComponent(q)}` | |
).then(res => res.text()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const cheerio = require('cheerio') | |
// const html2json = require('html2json').html2json; | |
const {parse} = require('himalaya') | |
const TurndownService = require('turndown') | |
const cleanHtml = require('clean-html') | |
const fetcher = require('./fetcher') | |
// Module-level Turndown converter; the request handler below calls `t.turndown(...)` on it.
const t = new TurndownService() | |
/**
 * Promise wrapper around the callback-style `cleanHtml.clean`.
 *
 * The callback receives only the result, so there is no error argument to
 * forward; the promise rejects only if `cleanHtml.clean` throws synchronously
 * (the Promise constructor turns that throw into a rejection).
 *
 * @param {string} html - Markup to hand to `cleanHtml.clean`.
 * @returns {Promise<*>} Resolves with whatever `cleanHtml.clean` passes to its callback.
 */
const clean = html =>
  new Promise(resolve => {
    cleanHtml.clean(html, resolve)
  })
// Parser flags, kept as a named constant so the wrapper object stays a one-liner.
const xmlParserOptions = {
  recognizeSelfClosing: true,
  normalizeWhitespace: true,
  lowerCaseTags: true,
  lowerCaseAttributeNames: true
}

// Options object passed as the second argument to `cheerio.load`;
// the parser flags are nested under the `xml` key.
const cheerioConfig = {xml: xmlParserOptions}
module.exports = async function(req) { | |
const {query} = req.url | |
const replyHtml = await fetcher(query) | |
const $ = cheerio.load(replyHtml, cheerioConfig) | |
let data = $(query.selector || (query.q && '#mw-content-text')) | |
// .map((i, el) => ($(el).html() || $(el).parent().html()).trim().replace(/\s+/g, ' ')) | |
.map((i, el) => | |
$(el) | |
.toString() | |
.trim() | |
.replace(/\s+/g, ' ') | |
) | |
.get() | |
if (query.json != null) { | |
// data = data.map(p => html2json(p)) | |
data = await Promise.all(data.map(async p => parse(await clean(p)))) | |
} else if (query.markdown != null || query.q) { | |
data = await Promise.all(data.map(async p => t.turndown(await clean(p)))) | |
} | |
return { | |
selector: query.selector, | |
url: query.url, | |
q: query.q, | |
json: query.json != null, | |
markdown: query.markdown != null, | |
data | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const dotenv = require('dotenv') | |
const dotenvExpand = require('dotenv-expand') | |
// Load the dotenv configuration and pass the result straight to dotenv-expand.
// The trailing semicolon is required: the next statement starts with "(".
dotenvExpand(dotenv.config());
// const { parse, stringify } = require('flatted/cjs'); | |
// Server bootstrap: builds the Fastify instance, registers plugins and
// routes, then starts listening. Any startup failure is logged and ends the
// process with a non-zero exit code.
(async () => {
  const fastify = require('fastify')({
    logger: true
  })

  // `middie` provides `fastify.use`; `point-of-view` provides `reply.view`.
  await fastify.register(require('middie'))
  await fastify.register(require('point-of-view'), {
    engine: { marko: require('marko') }
  })

  const cheerioHandler = require('./handlers-cheerio')

  fastify.use(require('cors')())

  // Landing page with the scrape form.
  fastify.get('/', (req, reply) => {
    reply.view('/index.marko', {text: 'text'})
  })

  // Describes the query parameters accepted by /cheerio.
  fastify.get('/help', async (req, res) => {
    res.type('application/json').code(200)
    return {
      '/cheerio': {
        url: 'REQUIRED: HTTP endpoint (overrides q)',
        q: 'REQUIRED: wikipedia search query (if no url)',
        selector: 'REQUIRED: CSS selector(s) for url HTML elements',
        json: 'parse selector elements as json',
        markdown: 'parse selector elements as markdown'
      }
    }
  })

  // Scrape endpoint: the handler expects the query under `url.query`.
  fastify.get('/cheerio', async (req, res) => {
    const {query} = req
    res.type('application/json').code(200)
    return cheerioHandler({url: {query}})
  })

  fastify.listen(Number(process.env.PORT) || 3000, '0.0.0.0', (err, address) => {
    if (err) {
      // Log and exit explicitly rather than throwing from inside a callback.
      fastify.log.error(err)
      process.exit(1)
    }
    console.log(`server listening on ${address}`)
  })
})().catch(err => {
  // A failed plugin registration would otherwise be an unhandled rejection.
  console.error(err)
  process.exit(1)
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<script> | |
// Click handler for the form button: reads the four inputs and navigates to
// ./cheerio with the values as query parameters. The json / markdown flags
// are only added when their checkbox is ticked.
// NOTE(review): plain string concatenation is kept on purpose; this script
// sits inside a Marko template, where template-literal placeholders may be
// interpreted by the template engine - confirm before switching.
function onSumbit() {
  var urlValue = document.querySelector('#url').value
  var selectorValue = document.querySelector('#selector').value
  var jsonChecked = document.querySelector('#json').checked
  var markdownChecked = document.querySelector('#markdown').checked

  var params = [
    'url=' + encodeURIComponent(urlValue),
    'selector=' + encodeURIComponent(selectorValue)
  ]
  if (jsonChecked) {
    params.push('json=' + encodeURIComponent(jsonChecked))
  }
  if (markdownChecked) {
    params.push('markdown=' + encodeURIComponent(markdownChecked))
  }

  document.location = './cheerio?' + params.join('&')
}
</script> | |
<style> | |
label { | |
display: inline-block; | |
width: 5em; | |
} | |
</style> | |
div.field | |
label -- URL: | |
input#url | |
br | |
label -- Selector: | |
input#selector | |
br | |
label -- JSON: | |
input#json type="checkbox" | |
br | |
label -- Markdown | |
input#markdown type="checkbox" | |
br | |
button.example-button onclick='onSumbit()' -- Scrape! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment