Skip to content

Instantly share code, notes, and snippets.

@conanca
Last active May 15, 2019 07:37
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save conanca/7348222 to your computer and use it in GitHub Desktop.
Save conanca/7348222 to your computer and use it in GitHub Desktop.
爬bootstrap主题的脚本,仅供学习交流golang之用
package main
import (
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"os"
	"path"
	"regexp"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
// ThemesUrl is the base URL of the theme preview site. Every page and asset
// path handled by this program is appended to it before being fetched.
const ThemesUrl = "http://responsiweb.com/themes/preview/ace/1.3/"

// Index is the entry page, relative to ThemesUrl, from which the crawl starts.
const Index = "index.html"
// pError panics with the error's message when err is non-nil.
// A nil err is a no-op.
func pError(err error) {
	if err == nil {
		return
	}
	panic(err.Error())
}
// Exist reports whether filename can be stat'ed. When os.Stat fails, the
// path is still reported as existing only if os.IsExist classifies the
// error that way.
func Exist(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return os.IsExist(statErr)
	}
	return true
}
// content2File saves content to the file named fileName, creating any
// missing parent directories first.
//
// Before writing, every `"//` in content is rewritten to `"http://` so that
// quoted protocol-relative references carry an explicit scheme in the saved
// copy.
//
// Any failure to create the directories or to write the file panics via
// pError.
func content2File(fileName string, content string) {
	// i > 0 skips both "no directory part" and a leading "/" whose directory
	// part would be the empty string.
	if i := strings.LastIndex(fileName, "/"); i > 0 {
		pError(os.MkdirAll(fileName[:i], 0775))
	}
	content = strings.Replace(content, "\"//", "\"http://", -1)
	// WriteFile creates/truncates the file and reports write and close
	// errors, which were previously dropped.
	pError(ioutil.WriteFile(fileName, []byte(content), 0666))
}
// url2Html fetches the HTML page at ThemesUrl+url, saves it locally under
// the path url via content2File, and returns the parsed document together
// with its serialized HTML.
//
// The returned content is the HTML as serialized by goquery; the rewriting
// that content2File applies affects only the saved file.
//
// A failure to fetch, parse or serialize the page panics via pError.
func url2Html(url string) (doc *goquery.Document, content string) {
	doc, err := goquery.NewDocument(ThemesUrl + url)
	pError(err)
	content, err = doc.Html()
	pError(err)
	content2File(url, content)
	return doc, content
}
// url2File downloads the resource (image, js, css, ...) at ThemesUrl+url and
// stores it under the local path url, creating parent directories as needed.
//
// It returns true only when the file was freshly downloaded. It returns
// false, without writing anything, when:
//   - url is empty;
//   - url is absolute (http://, https://), protocol-relative (//) or a
//     data: URI, none of which can be resolved against ThemesUrl;
//   - a local file named url already exists;
//   - the server answers with a status other than 200 OK.
//
// Network and filesystem errors panic via pError.
func url2File(url string) (download bool) {
	switch {
	case url == "",
		strings.HasPrefix(url, "http://"),
		strings.HasPrefix(url, "https://"),
		strings.HasPrefix(url, "//"),
		strings.HasPrefix(url, "data:"),
		Exist(url):
		return false
	}

	fmt.Println(url + " downloading......")
	resp, err := http.Get(ThemesUrl + url)
	pError(err)
	defer resp.Body.Close()

	// Do not store an error page under the asset's name.
	if resp.StatusCode != http.StatusOK {
		fmt.Println(url + " skipped: " + resp.Status)
		return false
	}

	if i := strings.LastIndex(url, "/"); i > 0 {
		pError(os.MkdirAll(url[:i], 0775))
	}
	file, err := os.Create(url)
	pError(err)

	_, err = io.Copy(file, resp.Body)
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		// Remove the partial file so a later run does not mistake it for a
		// completed download (see the Exist check above).
		os.Remove(url)
	}
	pError(err)
	return true
}
// cssURLPattern matches url(...) references inside a stylesheet. The first
// submatch is the raw, possibly quoted, reference.
var cssURLPattern = regexp.MustCompile(`url\((.*?)\)`)

// saveHtmlDoc downloads the resources referenced by doc:
//   - every <link href>, and, for each one that was freshly downloaded, the
//     files it references through url(...);
//   - every <script src>;
//   - every <img src>.
//
// content is accepted for call compatibility (callers pass the results of
// url2Html straight through) but is not used.
func saveHtmlDoc(doc *goquery.Document, content string) {
	// Linked files (stylesheets and anything else referenced via <link>).
	doc.Find("link").Each(func(i int, s *goquery.Selection) {
		// A missing href yields "", which url2File skips.
		href, _ := s.Attr("href")
		if !url2File(href) {
			// Not downloaded now (skipped or already present): its
			// url(...) references are not scanned either.
			return
		}
		saveCSSAssets(href)
	})
	// Scripts.
	doc.Find("script[src]").Each(func(i int, s *goquery.Selection) {
		src, _ := s.Attr("src")
		url2File(src)
	})
	// Images.
	doc.Find("img").Each(func(i int, s *goquery.Selection) {
		src, _ := s.Attr("src")
		url2File(src)
	})
}

// saveCSSAssets reads the already-downloaded stylesheet stored at cssURL and
// downloads every file it references through url(...).
func saveCSSAssets(cssURL string) {
	cssContent, err := ioutil.ReadFile(cssURL)
	pError(err)
	for _, m := range cssURLPattern.FindAllStringSubmatch(string(cssContent), -1) {
		if asset, ok := resolveCSSRef(cssURL, m[1]); ok {
			url2File(asset)
		}
	}
}

// resolveCSSRef turns a raw url(...) reference found in the stylesheet at
// cssURL into a path relative to the theme root. The boolean is false when
// the reference should not be downloaded.
func resolveCSSRef(cssURL, ref string) (string, bool) {
	ref = strings.Trim(strings.TrimSpace(ref), `'"`)
	// References without a dot are not treated as files.
	if !strings.Contains(ref, ".") {
		return "", false
	}
	// Inline data and references to other hosts cannot be resolved against
	// the theme root.
	if strings.HasPrefix(ref, "data:") || strings.HasPrefix(ref, "//") || strings.Contains(ref, "://") {
		return "", false
	}
	// Drop the query string and fragment; they are not part of the file name.
	if cut := strings.IndexAny(ref, "?#"); cut >= 0 {
		ref = ref[:cut]
	}
	if ref == "" {
		return "", false
	}
	// Relative references in CSS are relative to the stylesheet itself.
	resolved := path.Join(path.Dir(cssURL), ref)
	// Never follow a reference that climbs above the theme root.
	if resolved == ".." || strings.HasPrefix(resolved, "../") {
		return "", false
	}
	return resolved, true
}
// main mirrors the theme: it saves the Index page and its resources, then
// does the same for every other local .html page linked from Index.
//
// Each page is processed once even if it is linked several times. Link
// fragments ("page.html#section") are dropped before fetching, and links to
// other hosts are ignored because they cannot be resolved against ThemesUrl.
func main() {
	fmt.Println("start!")

	// Entry page.
	fmt.Println("==== Page " + Index + "====")
	indexHtmlDoc, content := url2Html(Index)
	saveHtmlDoc(indexHtmlDoc, content)

	// Pages linked from the entry page.
	visited := map[string]bool{Index: true}
	indexHtmlDoc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		if cut := strings.Index(href, "#"); cut >= 0 {
			href = href[:cut]
		}
		switch {
		case !strings.Contains(href, ".html"),
			strings.Contains(href, "://"),
			strings.HasPrefix(href, "//"),
			visited[href]:
			return
		}
		visited[href] = true

		fmt.Println("==== Page " + href + "====")
		saveHtmlDoc(url2Html(href))
	})

	fmt.Println("finish!")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment