Skip to content

Instantly share code, notes, and snippets.

@Grawl
Created March 5, 2020 07:35
Show Gist options
  • Save Grawl/485ceafb7256920043350fc64e4fe8d4 to your computer and use it in GitHub Desktop.
Save Grawl/485ceafb7256920043350fc64e4fe8d4 to your computer and use it in GitHub Desktop.
/**
* Return array of URLs found in HTML string
*
* If `<img>` wrapped into `<a>` uses `[href]` of `<a>` if it's an image
*
* @param {string} html
* @returns {string[]}
*/
async ImageURLsFromHTML(html) {
try {
const allowedImageTypes = [
'image/jpeg',
'image/jpg',
'image/png',
'image/webp',
]
const urls = []
await posthtml()
.use(async tree => {
tree.walk(node => {
// TODO don't return img[src] if it's child of a[href] having image content-type
if (node.tag === 'a') {
const a = node
const src = a.attrs.href
// Node has <img> child
if (node.content.find(node => (
typeof node === 'object' &&
node.tag === 'img' &&
node.attrs.src
))) {
// TODO ensure it works
(async function() {
const request = await Promise.resolve(fetch(src, {
method: 'HEAD',
}))
const headers = request.headers
const contentType = headers.get('Content-Type')
if (allowedImageTypes.includes(contentType)) {
urls.push(src)
}
}())
}
} else if (
node.tag === 'img' &&
node.attrs.src
) {
urls.push(node.attrs.src)
}
return node
})
return tree
})
.process(html)
return urls
} catch (error) {
console.error(error)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment