Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
colly (a Golang web crawler framework) with automatic charset detection example
package main
import (
"fmt"
"github.com/gocolly/colly"
"golang.org/x/net/html/charset"
"bytes"
"io/ioutil"
"github.com/saintfish/chardet"
"strings"
"regexp"
"github.com/astaxie/beego/logs"
"github.com/gocolly/colly/queue"
)
// headCharsetRe captures the value of the charset parameter of a
// Content-Type header. The value stops at whitespace, a semicolon, or a
// quote, so trailing parameters and surrounding quotes are not captured.
// It is matched against an already lower-cased header value.
var headCharsetRe = regexp.MustCompile(`\bcharset\s*=\s*["']?([^\s;"']+)`)

// get_head_encoding returns the charset named in the response's Content-Type
// header, lower-cased (for example "utf-8" for "text/html; charset=UTF-8").
// It returns "" when the header is absent or carries no charset parameter.
func get_head_encoding(response *colly.Response) string {
	contentType := strings.ToLower(response.Headers.Get("content-type"))
	match := headCharsetRe.FindStringSubmatch(contentType)
	if match == nil {
		return ""
	}
	return match[1]
}
// metaCharsetRe captures the charset value declared in an HTML <meta> tag.
// It is compiled once at package scope instead of on every call.
var metaCharsetRe = regexp.MustCompile(`(?i)<meta.*?charset=["']*(?P<charset>.+?)["'>]`)

// bodySniffLen is the number of leading body bytes searched for a <meta>
// charset declaration.
const bodySniffLen = 1024

// get_body_encoding returns the charset declared by a <meta ... charset=...>
// tag found within the first bodySniffLen bytes of the response body, or ""
// when no such declaration is found there.
func get_body_encoding(response *colly.Response) string {
	head := response.Body
	if len(head) > bodySniffLen {
		// Slice from index 0: starting at 1 would drop the first byte of the
		// document from the search window.
		head = head[:bodySniffLen]
	}
	if match := metaCharsetRe.FindSubmatch(head); len(match) == 2 {
		return string(match[1])
	}
	return ""
}
// convertToUTF8 decodes str, whose bytes are encoded in the charset named by
// origEncoding, and returns the text as UTF-8.
//
// If origEncoding is not a label the charset package recognises, or decoding
// fails, str is returned unchanged rather than panicking on a nil reader or
// returning silently truncated output.
func convertToUTF8(str string, origEncoding string) string {
	reader, err := charset.NewReaderLabel(origEncoding, bytes.NewReader([]byte(str)))
	if err != nil {
		// Unsupported label: no reader is available, so decoding is impossible.
		return str
	}
	decoded, err := ioutil.ReadAll(reader)
	if err != nil {
		return str
	}
	return string(decoded)
}
// test runs chardet's text detector over some_text, prints the detected
// charset and language to stdout, and returns the detected charset.
// It returns "" when detection fails.
func test(some_text []byte) string {
	result, err := chardet.NewTextDetector().DetectBest(some_text)
	if err != nil {
		// On failure result must not be dereferenced; report "unknown".
		return ""
	}
	fmt.Printf(
		"Detected charset is %s, language is %s\n",
		result.Charset,
		result.Language)
	return result.Charset
}
// batch_test reads host names (one per line) from the file "top-10000" in the
// working directory, requests "http://<host>" for each through a colly queue,
// and logs the host, HTTP status code, header-declared charset and
// <meta>-declared charset of every response.
//
// Failures to read the file, create the queue, enqueue a URL or run the queue
// are logged; the first two abort the run.
func batch_test() {
	c := colly.NewCollector()

	// Report both charset declarations for every response received.
	c.OnResponse(func(response *colly.Response) {
		logs.Info("%s,%d,%s,%s\n", response.Request.URL.Host, response.StatusCode, get_head_encoding(response), get_body_encoding(response))
	})

	// Log each URL just before it is requested.
	c.OnRequest(func(r *colly.Request) {
		logs.Debug("Visiting", r.URL.String())
	})

	data, err := ioutil.ReadFile("top-10000")
	if err != nil {
		logs.Error("reading host list: %v", err)
		return
	}

	q, err := queue.New(
		2, // Number of consumer threads
		&queue.InMemoryQueueStorage{MaxSize: 10000},
	)
	if err != nil {
		logs.Error("creating request queue: %v", err)
		return
	}

	for _, line := range strings.Split(string(data), "\n") {
		// Trim stray whitespace (including "\r" from CRLF files) and skip
		// blank lines so that a bare "http://" is never enqueued.
		host := strings.TrimSpace(line)
		if host == "" {
			continue
		}
		if err := q.AddURL("http://" + host); err != nil {
			logs.Error("queueing %s: %v", host, err)
		}
	}

	c.Async = true
	if err := q.Run(c); err != nil {
		logs.Error("running request queue: %v", err)
	}
	c.Wait()
}
// main is the program entry point; it runs the batch charset survey.
func main() {
	batch_test()
}
@imfht

This comment has been minimized.

Copy link
Owner Author

commented May 10, 2018

The top-10000 input file looks like this (one host name per line):

www.sdu.edu.cn
www.baidu.com
www.sjtu.edu.cn
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.