Skip to content

Instantly share code, notes, and snippets.

@crazy4groovy
Last active April 6, 2021 05:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save crazy4groovy/7a85404b14e3ae47785965d80a48e65f to your computer and use it in GitHub Desktop.
Save crazy4groovy/7a85404b14e3ae47785965d80a48e65f to your computer and use it in GitHub Desktop.
Simple web page HTML parser that outputs Markdown or JSON (Node.js)
const fetch = require('node-fetch')
module.exports = async ({q, url}) =>
fetch(
url
? decodeURIComponent(url)
: `https://en.wikipedia.org/wiki/${decodeURIComponent(q)}`
).then(res => res.text())
const cheerio = require('cheerio')
// const html2json = require('html2json').html2json;
const {parse} = require('himalaya')
const TurndownService = require('turndown')
const cleanHtml = require('clean-html')
const fetcher = require('./fetcher')
// Shared Turndown instance (constructed with no options) used for HTML -> Markdown.
const t = new TurndownService()

/**
 * Promise adapter around the callback-style `cleanHtml.clean`.
 *
 * The callback is used directly as `resolve`, so the promise fulfils with
 * whatever `cleanHtml.clean` passes to it. No error argument is handled here;
 * the only rejection path is a synchronous throw inside the executor, which
 * the Promise constructor turns into a rejection.
 *
 * @param {string} html - Markup to clean.
 * @returns {Promise<string>} Resolves with the callback's first argument
 *   (presumably the cleaned HTML string — confirm against clean-html docs).
 */
const clean = html =>
  new Promise(resolve => {
    cleanHtml.clean(html, resolve)
  })
// Options passed as the second argument to `cheerio.load` in the handler below.
// All four parser flags are enabled under the `xml` key.
// NOTE(review): the flag names look like htmlparser2 parser options — confirm
// they are still honoured by the installed cheerio version.
const cheerioConfig = {
xml: {
lowerCaseAttributeNames: true,
lowerCaseTags: true,
normalizeWhitespace: true,
recognizeSelfClosing: true
}
}
module.exports = async function(req) {
const {query} = req.url
const replyHtml = await fetcher(query)
const $ = cheerio.load(replyHtml, cheerioConfig)
let data = $(query.selector || (query.q && '#mw-content-text'))
// .map((i, el) => ($(el).html() || $(el).parent().html()).trim().replace(/\s+/g, ' '))
.map((i, el) =>
$(el)
.toString()
.trim()
.replace(/\s+/g, ' ')
)
.get()
if (query.json != null) {
// data = data.map(p => html2json(p))
data = await Promise.all(data.map(async p => parse(await clean(p))))
} else if (query.markdown != null || query.q) {
data = await Promise.all(data.map(async p => t.turndown(await clean(p))))
}
return {
selector: query.selector,
url: query.url,
q: query.q,
json: query.json != null,
markdown: query.markdown != null,
data
}
}
const dotenv = require('dotenv')
const dotenvExpand = require('dotenv-expand')
// Run dotenv's config() and hand its result to dotenv-expand before the server
// starts (presumably so variable references inside .env values are expanded —
// confirm against the dotenv-expand version in use).
const myEnv = dotenv.config();
dotenvExpand(myEnv);
// const { parse, stringify } = require('flatted/cjs');
/**
 * Server bootstrap: creates the Fastify instance, registers middleware and the
 * Marko view engine, declares the routes and starts listening on
 * `process.env.PORT` (default 3000) on all interfaces.
 *
 * Routes:
 *   GET /         renders index.marko
 *   GET /help     JSON description of the /cheerio query parameters
 *   GET /cheerio  runs the scrape handler with the request's query string
 */
(async () => {
  const fastify = require('fastify')({
    logger: true
  })

  // `middie` provides `fastify.use`, needed for the Express-style cors middleware.
  await fastify.register(require('middie'))
  await fastify.register(require('point-of-view'), {
    engine: { marko: require('marko') }
  })

  const cheerioHandler = require('./handlers-cheerio')

  fastify.use(require('cors')())

  fastify.get('/', (req, reply) => {
    reply.view('/index.marko', {text: 'text'})
  })

  fastify.get('/help', async (req, res) => {
    res.type('application/json').code(200)
    return {
      '/cheerio': {
        url: 'REQUIRED: HTTP endpoint (overrides q)',
        q: 'REQUIRED: wikipedia search query (if no url)',
        selector: 'REQUIRED: CSS selector(s) for url HTML elements',
        json: 'parse selector elements as json',
        markdown: 'parse selector elements as markdown'
      }
    }
  })

  fastify.get('/cheerio', async (req, res) => {
    const {query} = req
    res.type('application/json').code(200)
    // The handler reads `req.url.query`, so wrap the parsed query accordingly.
    const resp = await cheerioHandler({url: {query}})
    return resp
  })

  fastify.listen(Number(process.env.PORT) || 3000, '0.0.0.0', (err, address) => {
    if (err) throw err
    console.log(`server listening on ${address}`)
  })
})().catch(err => {
  // Without this handler a failure during plugin registration or module
  // loading would surface only as an unhandled promise rejection.
  console.error(err)
  process.exit(1)
})
<script>
/**
 * Click handler for the "Scrape!" button: reads the form inputs and navigates
 * to ./cheerio with them as query parameters. `url` and `selector` are always
 * sent; `json` and `markdown` are sent only when their checkbox is ticked.
 */
function onSumbit() {
  // String concatenation is used on purpose: this script lives inside a Marko
  // template, where "${...}" may be treated as a template placeholder.
  const field = function (id) {
    return document.querySelector(id)
  }

  const parts = [
    'url=' + encodeURIComponent(field('#url').value),
    'selector=' + encodeURIComponent(field('#selector').value)
  ]

  const flags = ['json', 'markdown']
  for (const flag of flags) {
    // An unchecked box yields '' here, which is falsy and therefore omitted.
    const encoded = encodeURIComponent(field('#' + flag).checked || '')
    if (encoded) {
      parts.push(flag + '=' + encoded)
    }
  }

  document.location = './cheerio?' + parts.join('&')
}
</script>
<style>
label {
display: inline-block;
width: 5em;
}
</style>
div.field
label -- URL:
input#url
br
label -- Selector:
input#selector
br
label -- JSON:
input#json type="checkbox"
br
label -- Markdown:
input#markdown type="checkbox"
br
button.example-button onclick='onSumbit()' -- Scrape!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment