Skip to content

Instantly share code, notes, and snippets.

@akkuman
Created December 8, 2021 04:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akkuman/8eaffdaec34263649471be375b62ace6 to your computer and use it in GitHub Desktop.
Save akkuman/8eaffdaec34263649471be375b62ace6 to your computer and use it in GitHub Desktop.
[提取html标题] 猜测html编码提取标题 #go
package main
import (
	"bytes"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"time"

	"github.com/gogs/chardet"
	"golang.org/x/net/html/charset"
)
// pattern matches one complete <title>...</title> element, tags included.
//
// Flags: (?i) accepts any tag casing such as <TITLE>; (?s) lets the title text
// span line breaks. The opening tag may carry attributes, and the lazy .*?
// stops at the first closing tag so that several titles in one document are
// reported as separate matches instead of being merged into one.
var pattern = regexp.MustCompile(`(?is)<title(?:\s[^>]*)?>.*?</title>`)
// getMostlikeyEncoding returns the detector's best guess for the character
// encoding of body.
//
// It runs chardet's HTML detector over body and hands back the result of
// DetectBest. If detection fails, the result is nil and the detector's error
// is returned unchanged.
func getMostlikeyEncoding(body []byte) (encoding *chardet.Result, err error) {
	encoding, err = chardet.NewHtmlDetector().DetectBest(body)
	if err != nil {
		return nil, err
	}
	return encoding, nil
}
// convrtToUTF8 decodes data from the charset labelled origEncoding and returns
// the text as a UTF-8 string.
//
// The conversion is best-effort because the signature has no error return:
// when origEncoding is not a label that charset.NewReaderLabel accepts, or
// when decoding fails, the input bytes are returned unconverted.
func convrtToUTF8(data []byte, origEncoding string) string {
	reader, err := charset.NewReaderLabel(origEncoding, bytes.NewReader(data))
	if err != nil {
		// An unsupported label yields a nil reader; reading from it would
		// panic, so fall back to the raw bytes instead.
		return string(data)
	}

	decoded, err := ioutil.ReadAll(reader)
	if err != nil {
		return string(data)
	}
	return string(decoded)
}
// main fetches the page whose URL is given as the first command-line argument
// and prints every <title> element it contains, both as raw bytes and
// converted to UTF-8 using the detected encoding.
//
// Exit status is 2 when the URL argument is missing and 1 when fetching or
// encoding detection fails.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: pass the page URL as the first argument")
		os.Exit(2)
	}
	// The work lives in run so that its deferred body close has already
	// executed by the time os.Exit is called here.
	if err := run(os.Args[1]); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run downloads rawURL, prints the detected encoding, then prints each title
// match twice: once as-is and once converted to UTF-8.
func run(rawURL string) error {
	// http.DefaultClient never times out; bound the request so an
	// unresponsive server cannot hang the program forever.
	client := &http.Client{Timeout: 30 * time.Second}

	resp, err := client.Get(rawURL)
	if err != nil {
		return fmt.Errorf("fetching %q: %w", rawURL, err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("reading response body: %w", err)
	}

	encoding, err := getMostlikeyEncoding(body)
	if err != nil {
		return fmt.Errorf("detecting encoding: %w", err)
	}
	fmt.Println(encoding)

	for _, match := range pattern.FindAll(body, -1) {
		fmt.Println(string(match))
		fmt.Println(convrtToUTF8(match, encoding.Charset))
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment