Skip to content

Instantly share code, notes, and snippets.

@imfht
Created May 10, 2018 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imfht/4b5c062cfdcf2fd3537264f3dd8b9007 to your computer and use it in GitHub Desktop.
Save imfht/4b5c062cfdcf2fd3537264f3dd8b9007 to your computer and use it in GitHub Desktop.
colly (a Golang web crawler framework) with auto-decode example
package main
import (
"fmt"
"github.com/gocolly/colly"
"golang.org/x/net/html/charset"
"bytes"
"io/ioutil"
"github.com/saintfish/chardet"
"strings"
"regexp"
"github.com/astaxie/beego/logs"
"github.com/gocolly/colly/queue"
)
// headerCharsetRe pulls the charset parameter out of a lower-cased
// Content-Type header value. The capture stops at quotes, separators and
// whitespace so that trailing parameters (e.g. "; boundary=...") and
// surrounding quotes are not returned as part of the charset name.
var headerCharsetRe = regexp.MustCompile(`charset\s*=\s*["']?([^"';,\s]+)`)

// get_head_encoding returns the charset declared in the response's
// Content-Type header, lower-cased. It returns "" when the response or its
// headers are missing, or when the header carries no charset parameter.
func get_head_encoding(response *colly.Response) string {
	if response == nil || response.Headers == nil {
		return ""
	}
	contentType := strings.ToLower(response.Headers.Get("content-type"))
	match := headerCharsetRe.FindStringSubmatch(contentType)
	if match == nil {
		return ""
	}
	return match[1]
}
// metaSniffLen is how many leading bytes of the body are searched for a
// <meta> charset declaration.
const metaSniffLen = 1024

// metaCharsetRe matches a charset declaration inside a single <meta> tag,
// covering both <meta charset="..."> and
// <meta http-equiv="Content-Type" content="...; charset=...">.
// [^>] keeps the match inside the tag (and lets it span line breaks); the
// capture stops at quotes, whitespace, '/', ';' or '>'.
var metaCharsetRe = regexp.MustCompile(`(?i)<meta[^>]*?charset\s*=\s*["']?([^"'>\s/;]+)`)

// get_body_encoding returns the charset declared by a <meta> tag within the
// first metaSniffLen bytes of the response body, in the case it was written
// in the document. It returns "" when no declaration is found.
func get_body_encoding(response *colly.Response) string {
	if response == nil {
		return ""
	}
	head := response.Body
	if len(head) > metaSniffLen {
		// Slice from 0: starting at index 1 would drop the first byte.
		head = head[:metaSniffLen]
	}
	match := metaCharsetRe.FindSubmatch(head)
	if match == nil {
		return ""
	}
	return string(match[1])
}
// convertToUTF8 decodes str from the character encoding named by
// origEncoding and returns the result as a UTF-8 string.
//
// If origEncoding is not a label known to the charset package, or reading
// from the decoder fails, str is returned unchanged rather than panicking
// or returning partial output.
func convertToUTF8(str string, origEncoding string) string {
	reader, err := charset.NewReaderLabel(origEncoding, bytes.NewReader([]byte(str)))
	if err != nil {
		// Unknown label: reader is nil here, so it must not be read from.
		return str
	}
	decoded, err := ioutil.ReadAll(reader)
	if err != nil {
		return str
	}
	return string(decoded)
}
// test runs chardet's text detector over some_text, prints the best guess
// for charset and language to stdout, and returns the detected charset.
// It returns "" (and prints nothing) when detection fails.
func test(some_text []byte) string {
	detector := chardet.NewTextDetector()
	result, err := detector.DetectBest(some_text)
	if err != nil || result == nil {
		// result is not usable on failure; dereferencing it would panic.
		return ""
	}
	fmt.Printf(
		"Detected charset is %s, language is %s\n",
		result.Charset,
		result.Language)
	return result.Charset
}
// batch_test reads hostnames (one per line) from the file "top-10000" in the
// working directory, fetches "http://<host>" for each through a colly queue,
// and logs the host, status code, header-declared charset and <meta>-declared
// charset of every response.
//
// Failures to read the file, create the queue, enqueue a URL or run the
// queue are logged; the first two abort the run.
func batch_test() {
	// Instantiate a default collector with no domain restrictions.
	c := colly.NewCollector()
	c.OnResponse(func(response *colly.Response) {
		logs.Info("%s,%d,%s,%s\n", response.Request.URL.Host, response.StatusCode, get_head_encoding(response), get_body_encoding(response))
	})
	// Log every URL just before it is requested.
	c.OnRequest(func(r *colly.Request) {
		logs.Debug("Visiting", r.URL.String())
	})

	data, err := ioutil.ReadFile("top-10000")
	if err != nil {
		logs.Error("reading host list %q: %v", "top-10000", err)
		return
	}

	q, err := queue.New(
		2, // Number of consumer threads
		&queue.InMemoryQueueStorage{MaxSize: 10000}, // In-memory storage capped at 10000 entries
	)
	if err != nil {
		logs.Error("creating request queue: %v", err)
		return
	}

	for _, line := range strings.Split(string(data), "\n") {
		// TrimSpace drops the "\r" of CRLF files; blank lines (including the
		// one after a trailing newline) would otherwise be queued as "http://".
		host := strings.TrimSpace(line)
		if host == "" {
			continue
		}
		if err := q.AddURL("http://" + host); err != nil {
			logs.Error("queueing %q: %v", host, err)
		}
	}

	// NOTE(review): with Async enabled the collector may dispatch requests
	// without blocking the queue's consumers, so the consumer count above
	// might not bound concurrency — confirm whether a limit rule is wanted.
	c.Async = true
	if err := q.Run(c); err != nil {
		logs.Error("running request queue: %v", err)
	}
	c.Wait()
}
// main is the program entry point; it runs the batch charset survey
// implemented by batch_test.
func main() {
batch_test()
}
@imfht
Copy link
Author

imfht commented May 10, 2018

The top-10000 input file looks like this (one hostname per line):

www.sdu.edu.cn
www.baidu.com
www.sjtu.edu.cn

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment