Skip to content

Instantly share code, notes, and snippets.

@notsobad
Last active August 16, 2023 06:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save notsobad/5ffcd24adc47c65a5c81bc11065deffa to your computer and use it in GitHub Desktop.
Save notsobad/5ffcd24adc47c65a5c81bc11065deffa to your computer and use it in GitHub Desktop.
使用 chromedp,加载一个网页,获取网页加载的所有资源地址,获取页面里的所有链接地址
// This program is a chromedp example that loads a web page and prints the URLs
// of the resources the page loaded and of the links found in the page.
package main
import (
"context"
"fmt"
"log"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
)
// main runs the example and exits with a non-zero status if it fails.
//
// The work lives in run so that its deferred cancel functions execute before
// the process exits: log.Fatal calls os.Exit, which skips any deferred calls
// in the function that invokes it.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run loads http://127.0.0.1:8888/hi.html in a chromedp browser context and
// prints, in order: the inner HTML of the <html> element, the href values
// extracted from that HTML with getLinksFromHtml, and a JSON string produced
// by JavaScript evaluated in the page (resource URLs and link URLs).
//
// It returns the error reported by chromedp.Run, if any. The whole operation
// is bounded by a 15 second timeout.
func run() error {
	// Create the chrome instance.
	ctx, cancel := chromedp.NewContext(
		context.Background(),
		//chromedp.WithDebugf(log.Printf),
	)
	defer cancel()

	// Bound the whole run with a timeout.
	ctx, cancel = context.WithTimeout(ctx, 15*time.Second)
	defer cancel()

	// Evaluated in the page: collects the names of the "resource" performance
	// entries and the non-empty href of every <a> element, and returns both
	// as one JSON string.
	jsCode := `(() => {
let ret = {};
ret['resource'] = performance.getEntriesByType("resource").map((r) => r.name);
ret['links'] = Array.from(document.getElementsByTagName('a')).map(link => link.href).filter(href => href);
return JSON.stringify(ret); })();`

	var (
		urls        string // JSON string returned by jsCode
		htmlContent string // inner HTML of the <html> element
	)

	// Navigate to the target page and wait for the body element to be ready.
	err := chromedp.Run(ctx,
		chromedp.Navigate("http://127.0.0.1:8888/hi.html"),
		chromedp.WaitReady("body", chromedp.ByQuery),
		// Store the inner HTML of the <html> element in htmlContent.
		chromedp.InnerHTML(`html`, &htmlContent, chromedp.NodeVisible),
		chromedp.EvaluateAsDevTools(jsCode, &urls),
	)
	if err != nil {
		return err
	}

	fmt.Println(htmlContent)

	// Extracting link addresses from the rendered DOM: not recommended.
	urls1 := getLinksFromHtml(htmlContent)
	fmt.Println(urls1)

	// Injecting JS to collect link addresses and loaded resources: recommended.
	fmt.Println("js get all page loaded urls:")
	fmt.Println(urls)
	return nil
}
// getLinksFromHtml parses html and returns the non-empty href attribute value
// of every <a> element found in it. The values are returned exactly as they
// appear in the attribute.
//
// If the markup cannot be parsed, the error is logged and nil is returned.
// The result is also nil when no <a> element carries a non-empty href.
func getLinksFromHtml(html string) []string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		// Without this check doc would be nil here and doc.Find would panic.
		log.Printf("parsing html: %v", err)
		return nil
	}

	var urls []string
	doc.Find("a").Each(func(_ int, link *goquery.Selection) {
		// A missing href and an empty href are both skipped.
		if href, ok := link.Attr("href"); ok && href != "" {
			urls = append(urls, href)
		}
	})
	return urls
}
<dir>
<a href="/abc.html">assss</a>
</dir>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="/ssssss0.js"></script>
<script crossorigin="anonymous" defer="defer" type="application/javascript" src="/test.js"></script>
<img src="/xxxx.jpg" />
<iframe src="/iframelink"></iframe>
<script>
// Generate a random address of the form https://<label>.com, where <label> is
// 10 characters drawn from lowercase letters and digits.
function randomUrl() {
  const alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789';
  const label = Array.from({ length: 10 }, () =>
    alphabet.charAt(Math.floor(Math.random() * alphabet.length))
  ).join('');
  return `https://${label}.com`;
}
// Create an <a> element with a fixed href and link text.
const aTag = Object.assign(document.createElement('a'), {
  href: "/randomlink",
  innerText: "test url",
});
// Create an <img> element with a fixed src.
const imgTag = Object.assign(document.createElement('img'), { src: "/randomimg" });
// Create an <iframe> element with a fixed src.
const iframeTag = Object.assign(document.createElement('iframe'), { src: "/randomiframe" });
// Append the three elements to <body>, in this order.
[aTag, imgTag, iframeTag].forEach((el) => document.body.appendChild(el));
</script>
// Create an <a> and an <img> element with fixed URLs and append both to <body>.
// Wrapped in an immediately-invoked function so the locals stay out of the
// global scope.
(() => {
  const link = Object.assign(document.createElement('a'), {
    href: "/randomlink222222",
    innerText: "test url222",
  });
  const image = Object.assign(document.createElement('img'), { src: "/randomimg2222" });
  // Append the link first, then the image.
  [link, image].forEach((el) => document.body.appendChild(el));
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment