Skip to content

Instantly share code, notes, and snippets.

@zhangskills
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zhangskills/a8204ce1df93db8e8700 to your computer and use it in GitHub Desktop.
Save zhangskills/a8204ce1df93db8e8700 to your computer and use it in GitHub Desktop.
golang抓取百度收录数
package utils
import (
"bytes"
"code.google.com/p/go.text/encoding/simplifiedchinese"
"code.google.com/p/go.text/transform"
"compress/gzip"
"errors"
"io"
"io/ioutil"
"net"
"net/http"
"regexp"
"strconv"
"strings"
"time"
)
// client is the HTTP client used for every request made by this package.
//
// Establishing a TCP connection is limited to 5 seconds, and each request as a
// whole (connecting, redirects and reading the body) is limited to 10 seconds.
// The overall limit is set with Client.Timeout rather than with an absolute
// deadline on the connection: a deadline fixed at dial time also applies to
// later requests that reuse the same keep-alive connection, leaving them only
// the remainder of the first request's budget.
var client = http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		// Connection (dial) timeout.
		Dial: (&net.Dialer{Timeout: 5 * time.Second}).Dial,
	},
}
// getRespReader returns a reader that yields the decoded body of resp.
//
// If the Content-Encoding header is gzip, resp.Body is wrapped in a gzip
// reader. The header value is compared case-insensitively because HTTP
// content-coding tokens are case-insensitive. An error (and a nil reader) is
// returned when the gzip header cannot be read. For any other encoding
// resp.Body is returned unchanged.
//
// The caller remains responsible for closing resp.Body.
func getRespReader(resp *http.Response) (io.Reader, error) {
	encoding := strings.TrimSpace(resp.Header.Get("Content-Encoding"))
	if !strings.EqualFold(encoding, "gzip") {
		return resp.Body, nil
	}

	zr, err := gzip.NewReader(resp.Body)
	if err != nil {
		// Return a literal nil so the io.Reader interface is truly nil
		// instead of wrapping a nil *gzip.Reader.
		return nil, err
	}
	return zr, nil
}
// charsetPattern captures the value of the first charset=... declaration in a
// document. It is compiled once at package initialization instead of on every
// getDecoder call.
var charsetPattern = regexp.MustCompile(`charset=["']?(.+?)["'/>]`)

// getDecoder inspects b for a charset declaration and returns a transformer
// that decodes the declared encoding.
//
// A GBK decoder is returned when the declared charset name, lower-cased,
// contains "gb". nil is returned when no charset declaration is found or the
// declared charset does not contain "gb", in which case the caller should use
// the bytes as they are.
func getDecoder(b []byte) transform.Transformer {
	match := charsetPattern.FindSubmatch(b)
	if match == nil {
		return nil
	}

	charset := strings.ToLower(string(match[1]))
	if strings.Contains(charset, "gb") {
		return simplifiedchinese.GBK.NewDecoder()
	}
	return nil
}
// GetHtmlByte fetches uri with an HTTP GET request and returns the response
// body.
//
// The request is sent with a desktop browser User-Agent and asks for a gzip
// response; a gzip-encoded body is decompressed before it is returned. If the
// body declares a charset for which getDecoder provides a decoder, the body is
// run through that decoder, otherwise the bytes are returned as received.
//
// An error is returned if the request cannot be built or sent, or if the body
// cannot be read, decompressed or decoded. The HTTP status code is not
// inspected.
func GetHtmlByte(uri string) ([]byte, error) {
	req, err := http.NewRequest("GET", uri, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36")
	// Enable gzip. Requesting it explicitly means net/http does not
	// decompress the body for us, so getRespReader handles that below.
	req.Header.Set("Accept-Encoding", "gzip")

	resp, err := client.Do(req)
	if err != nil {
		// resp is nil when Do fails, so the deferred Close must only be
		// registered after this check.
		return nil, err
	}
	defer resp.Body.Close()

	body, err := getRespReader(resp)
	if err != nil {
		return nil, err
	}
	raw, err := ioutil.ReadAll(body)
	if err != nil {
		return nil, err
	}

	// Decode the body when it declares a charset we know how to decode.
	decoder := getDecoder(raw)
	if decoder == nil {
		return raw, nil
	}
	return ioutil.ReadAll(transform.NewReader(bytes.NewReader(raw), decoder))
}
// getHtmlAndFirstRegex downloads uri, applies regex to the page and returns the
// first capture group parsed as a base-10 integer. Commas in the captured text
// are removed before parsing.
//
// regex must contain at least one capture group. An error is returned when
// regex does not compile, when the page cannot be fetched, when the pattern
// (or its first capture group) does not match, or when the captured text is
// not a valid 64-bit integer.
func getHtmlAndFirstRegex(uri, regex string) (int64, error) {
	// Compile first so an invalid pattern fails without a network round trip.
	reg, err := regexp.Compile(regex)
	if err != nil {
		return 0, err
	}

	htmlByte, err := GetHtmlByte(uri)
	if err != nil {
		return 0, err
	}

	// match[0] is the whole match and match[1] the first group, so fewer
	// than two elements means there is nothing to parse.
	match := reg.FindSubmatch(htmlByte)
	if len(match) < 2 {
		return 0, errors.New("未匹配")
	}

	numStr := strings.Replace(string(match[1]), ",", "", -1)
	// Parse as 64-bit to match the int64 return type on every platform.
	return strconv.ParseInt(numStr, 10, 64)
}
// GetBaiduIndexNum fetches the Baidu search results page for the query
// "site:baidu.com" and returns the page count captured from the result text
// that precedes "个网页被百度收录". It returns whatever value and error
// getHtmlAndFirstRegex produces for that page and pattern.
func GetBaiduIndexNum() (int64, error) {
	const (
		searchURL    = "http://www.baidu.com/s?wd=site%3Abaidu.com"
		countPattern = `>([\d,]+)</b>个网页被百度收录`
	)
	return getHtmlAndFirstRegex(searchURL, countPattern)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment