Last active
August 29, 2015 14:02
-
-
Save zhangskills/a8204ce1df93db8e8700 to your computer and use it in GitHub Desktop.
golang抓取百度收录数
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils | |
import ( | |
"bytes" | |
"code.google.com/p/go.text/encoding/simplifiedchinese" | |
"code.google.com/p/go.text/transform" | |
"compress/gzip" | |
"errors" | |
"io" | |
"io/ioutil" | |
"net" | |
"net/http" | |
"regexp" | |
"strconv" | |
"strings" | |
"time" | |
) | |
// client is the shared HTTP client used for every fetch in this file.
//
// Timeout bounds each request as a whole (connecting, sending the request,
// and reading the response headers and body). No absolute deadline is set on
// the connection itself, so a keep-alive connection that the transport puts
// back in its idle pool is still usable by a later request instead of
// carrying a deadline that has already expired.
var client = http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		Dial: func(netw, addr string) (net.Conn, error) {
			// Connection timeout.
			return net.DialTimeout(netw, addr, 5*time.Second)
		},
	},
}
// getRespReader returns a reader over the decoded body of resp.
//
// If the response declares a gzip Content-Encoding, the body is wrapped in a
// gzip reader; otherwise resp.Body is returned unchanged. The content-coding
// token is compared case-insensitively and surrounding whitespace is ignored.
//
// When the gzip header cannot be read, the error is returned together with a
// nil reader.
func getRespReader(resp *http.Response) (io.Reader, error) {
	encoding := strings.TrimSpace(resp.Header.Get("Content-Encoding"))
	if !strings.EqualFold(encoding, "gzip") {
		return resp.Body, nil
	}

	zr, err := gzip.NewReader(resp.Body)
	if err != nil {
		// Return an untyped nil: passing the nil *gzip.Reader straight
		// through would yield a non-nil io.Reader wrapping a nil pointer.
		return nil, err
	}
	return zr, nil
}
func getDecoder(b []byte) transform.Transformer { | |
reg, _ := regexp.Compile(`charset=["']?(.+?)["'/>]`) | |
match := reg.FindSubmatch(b) | |
if len(match) > 0 { | |
charset := strings.ToLower(string(match[1])) | |
if strings.Contains(charset, "gb") { | |
return simplifiedchinese.GBK.NewDecoder() | |
} | |
} | |
return nil | |
} | |
func GetHtmlByte(uri string) ([]byte, error) { | |
req, err := http.NewRequest("GET", uri, nil) | |
if err != nil { | |
return nil, err | |
} | |
//启用gzip | |
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36") | |
req.Header.Set("Accept-Encoding", "gzip") | |
resp, err := client.Do(req) | |
defer resp.Body.Close() | |
if err != nil { | |
return nil, err | |
} | |
resReader, err := getRespReader(resp) | |
if err != nil { | |
return nil, err | |
} | |
b, err := ioutil.ReadAll(resReader) | |
if err != nil { | |
return nil, err | |
} | |
//解码 | |
decoder := getDecoder(b) | |
if decoder != nil { | |
tr := transform.NewReader(bytes.NewReader(b), decoder) | |
return ioutil.ReadAll(tr) | |
} else { | |
return b, nil | |
} | |
} | |
func getHtmlAndFirstRegex(uri, regex string) (int64, error) { | |
htmlByte, _ := GetHtmlByte(uri) | |
reg, _ := regexp.Compile(regex) | |
match := reg.FindSubmatch(htmlByte) | |
if len(match) < 1 { | |
return 0, errors.New("未匹配") | |
} else { | |
numStr := string(match[1]) | |
return strconv.ParseInt(strings.Replace(numStr, ",", "", -1), 10, 0) | |
} | |
} | |
func GetBaiduIndexNum() (int64, error) { | |
uri := "http://www.baidu.com/s?wd=site%3Abaidu.com" | |
regex := `>([\d,]+)</b>个网页被百度收录` | |
return getHtmlAndFirstRegex(uri, regex) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment