Created
May 7, 2025 03:41
-
-
Save blueorionn/156d8bfcedf84efd703c7800fd87bb4c to your computer and use it in GitHub Desktop.
Extract External URLs from single HTML page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) 2025-present, Swadhin
// biome-ignore lint/complexity/useArrowFunction: Use function keyword for legacy support.
(function () {
  'use strict';

  // Scans the current page's serialized HTML for http(s) URLs, keeps the ones
  // whose host differs from the page's own host, de-duplicates them, and
  // offers the result as a downloadable `external-urls.json` file of the form
  // `{ "externalUrls": [...] }`. Intended to be pasted into a browser console
  // or run as a bookmarklet; it needs `window`, `document`, `Blob` and `URL`.

  const currentHost = window.location.host;
  const html = document.documentElement.innerHTML;

  // Candidate URLs: "http://" or "https://" up to the next whitespace, quote
  // or angle bracket.
  const urlRegex = /https?:\/\/[^\s"'<>]+/g;

  // `innerHTML` is *serialized* markup, so quotes, angle brackets and
  // non-breaking spaces inside attribute values / text show up as entities.
  // They mark the end of a URL just like their raw counterparts do above.
  const delimiterEntity = /&(?:quot|lt|gt|nbsp);/;

  // True when `text` contains more ")" than "(" — i.e. a trailing ")" most
  // likely belongs to surrounding syntax such as CSS `url(...)`.
  const hasUnbalancedClosingParen = (text) => {
    let depth = 0;
    for (const ch of text) {
      if (ch === '(') {
        depth += 1;
      } else if (ch === ')') {
        depth -= 1;
      }
    }
    return depth < 0;
  };

  // Turns a raw regex match into the URL the author actually wrote:
  // cuts at entity-encoded delimiters, decodes `&amp;` back to `&`, and drops
  // trailing sentence punctuation / unbalanced closing parentheses.
  const cleanCandidate = (raw) => {
    const cut = raw.search(delimiterEntity);
    let text = (cut === -1 ? raw : raw.slice(0, cut)).replace(/&amp;/g, '&');
    for (;;) {
      const last = text.charAt(text.length - 1);
      const isPunctuation = last !== '' && '.,;:!?'.includes(last);
      if (isPunctuation || (last === ')' && hasUnbalancedClosingParen(text))) {
        text = text.slice(0, -1);
      } else {
        return text;
      }
    }
  };

  // A candidate is external when it parses as an http(s) URL whose host is
  // not the current page's host. Unparseable candidates are simply skipped:
  // the scan is best-effort and stray text that merely looks like a URL is
  // expected.
  const isExternal = (href) => {
    try {
      const url = new URL(href);
      return (url.protocol === 'http:' || url.protocol === 'https:')
        && url.host !== currentHost;
    } catch {
      return false;
    }
  };

  // Clean first, then filter, then dedupe (a Set keeps first-seen order).
  const candidates = Array.from(html.matchAll(urlRegex), (m) => cleanCandidate(m[0]));
  const externalUrls = [...new Set(candidates.filter(isExternal))];

  // Prepare JSON blob
  const dataStr = JSON.stringify({ externalUrls }, null, 2);
  const blob = new Blob([dataStr], { type: 'application/json' });
  const blobUrl = URL.createObjectURL(blob);

  // Trigger download via a temporary anchor. Fall back to the root element
  // for documents that have no <body>.
  const host = document.body ?? document.documentElement;
  const dl = document.createElement('a');
  dl.href = blobUrl;
  dl.download = 'external-urls.json';
  host.appendChild(dl);
  try {
    dl.click();
  } finally {
    host.removeChild(dl);
    // Revoke on a later task rather than synchronously: some browsers start
    // the download asynchronously, and revoking the object URL right after
    // click() can cancel it.
    setTimeout(() => URL.revokeObjectURL(blobUrl), 1000);
  }

  console.log(`Found ${externalUrls.length} external URLs.`);
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment