Created
August 31, 2012 08:00
-
-
Save fxsjy/3550052 to your computer and use it in GitHub Desktop.
Backup Weibo to Disk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
	//"net/http/httputil"
)
// myjar is a minimal in-memory http.CookieJar that keeps cookies per host.
//
// It ignores cookie attributes such as Domain, Path and Expires: every cookie
// ever received from a host is kept and replayed on later requests to that
// host. The zero value is ready to use, and all methods are safe for
// concurrent use, which http.CookieJar requires and which matters here because
// one jar is shared by all downloader goroutines.
type myjar struct {
	mu  sync.Mutex
	jar map[string][]*http.Cookie
}

// SetCookies appends cookies to the list stored for u.Host.
func (p *myjar) SetCookies(u *url.URL, cookies []*http.Cookie) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.jar == nil {
		// Allow a zero-value jar; writing to a nil map would panic.
		p.jar = make(map[string][]*http.Cookie)
	}
	p.jar[u.Host] = append(p.jar[u.Host], cookies...)
}

// Cookies returns the cookies stored for u.Host, or nil when there are none.
// The returned slice is a copy, so callers never alias the jar's storage.
func (p *myjar) Cookies(u *url.URL) []*http.Cookie {
	p.mu.Lock()
	defer p.mu.Unlock()
	stored := p.jar[u.Host]
	if len(stored) == 0 {
		return nil
	}
	out := make([]*http.Cookie, len(stored))
	copy(out, stored)
	return out
}
// Command-line options and the robot account list shared across the program.
var (
	// weibo_address is the -user flag: the name or numeric id of the account
	// whose pages are downloaded.
	weibo_address = flag.String("user", "", "the name/id of weibo account that you want to download, i.e renzhiqiang")

	// sleep_span is the -span flag: the pause, in milliseconds, between two
	// page requests being handed out.
	sleep_span = flag.Int("span", 500, "the time span between two requests, default is 500 milliseconds.")

	// save_path is the -save flag: the directory the pages are written to.
	save_path = flag.String("save", "weibo", "the local directory which you want save the weibo page in there.")

	// start_page is the -start flag: the first page number to download.
	start_page = flag.Int("start", 1, "the page number whichi is the start page number for downloading")

	// robot_config holds the lines of robot.txt, one "username<TAB>password"
	// login per line.
	robot_config []string
)
func getAuthCookie(username string, password string) (cookies []*http.Cookie, err error){ | |
log.Println("rob_user:",username,"; rob_pass:", password) | |
client := &http.Client{} | |
jar := &myjar{} | |
jar.jar = make(map[string] []*http.Cookie) | |
client.Jar = jar | |
auth_url := "http://3g.sina.com.cn/prog/wapsite/sso/login.php" | |
auth_url_URL, _ := url.Parse(auth_url) | |
req, _ := http.NewRequest("GET",auth_url,nil) | |
req.Header.Add("User-Agent","Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1") | |
req.Header.Add("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") | |
req.Header.Add("Accept-Charset","GBK,utf-8;q=0.7,*;q=0.3") | |
//req.Header.Add("Accept-Encoding","gzip,deflate,sdch") | |
req.Header.Add("Accept-Language","zh-CN,zh;q=0.8") | |
req.Header.Add("Cache-Control","max-age=0") | |
req.Header.Add("Connection","keep-alive") | |
//req_bytes,_ := httputil.DumpRequest(req,false) | |
//log.Println(string(req_bytes)) | |
rsps1, err1 := client.Do(req) | |
defer rsps1.Body.Close() | |
if err1!=nil{ | |
return nil,err1 | |
} | |
login_page, err_read:= ioutil.ReadAll(rsps1.Body) | |
if err_read!=nil{ | |
return nil,err_read | |
} | |
//ioutil.WriteFile("c:/logint.html",login_page,777) | |
str_login_page := string(login_page) | |
//log.Println(str_login_page) | |
re_pass,_ := regexp.Compile("type=\"password\" name=\"(.+?)\"") | |
re_vk,_ := regexp.Compile(`name="vk" value="(.+?)"`) | |
passbox_name := re_pass.FindStringSubmatch(str_login_page)[1] | |
vk_value := re_vk.FindStringSubmatch(str_login_page)[1] | |
//log.Println(passbox_name,vk_value) | |
rsps2,err2 := client.PostForm("http://3g.sina.com.cn/prog/wapsite/sso/login_submit.php",url.Values{"mobile":{username}, passbox_name:{password}, "remember":{"off"}, "vk":{vk_value}, "submit":{"登录"}}) | |
if err2!=nil{ | |
return nil,err2 | |
} | |
//log.Println(rsps2) | |
defer rsps2.Body.Close() | |
return client.Jar.Cookies(auth_url_URL),nil | |
} | |
func download(target_url string, jar *myjar) (page string, err error){ | |
client := &http.Client{} | |
client.Jar = jar | |
rsps, err_get := client.Get(target_url) | |
if err_get!=nil{ | |
return "",err | |
} | |
defer rsps.Body.Close() | |
body, err := ioutil.ReadAll(rsps.Body) | |
page = string(body) | |
return page,nil | |
} | |
func getAuthJar(retry_count int) *myjar{ | |
var username,password string | |
line := robot_config[retry_count % 2] | |
tup := strings.Split(line,"\t") | |
username = tup[0] | |
password = tup[1] | |
cookies, err := getAuthCookie(username, password) | |
if err!=nil{ | |
panic(err) | |
} | |
//log.Println(cookies) | |
//log.Println(cookies[0].Name,",",cookies[0].Value) | |
if len(cookies)<1 || strings.Contains(cookies[0].Value,"deleted"){ | |
log.Println("[ERROR] wrong username or password") | |
os.Exit(1) | |
} | |
common_jar := &myjar{} | |
common_jar.jar = make(map[string] []*http.Cookie) | |
common_jar.jar["weibo.cn"] = cookies | |
return common_jar | |
} | |
func main(){ | |
flag.Parse() | |
var name_flag = "" | |
if *weibo_address == ""{ | |
flag.PrintDefaults() | |
fmt.Println(` | |
usage exampe: | |
downloader.exe -user treapdb -save ./bak/treapdb | |
`) | |
return | |
}else{ | |
name_flag = *weibo_address | |
_,err_number := strconv.Atoi(*weibo_address) | |
if err_number!=nil{ | |
*weibo_address = "http://weibo.cn/"+*weibo_address | |
}else{ | |
*weibo_address = "http://weibo.cn/u/"+*weibo_address | |
} | |
os.MkdirAll(*save_path,777) | |
} | |
log.Println("> weibo url: ",*weibo_address) | |
defer func(){ | |
if x := recover(); x != nil { | |
log.Println("[FATAL] some unknown error happen when downloading !!", *weibo_address) | |
//panic(x) | |
} | |
}() | |
func(){ | |
content, err := ioutil.ReadFile("robot.txt") | |
if err!=nil{ | |
log.Println("need robot.txt in current directory") | |
os.Exit(1) | |
} | |
robot_config = strings.Split(string(content),"\n") | |
if len(robot_config)<1{ | |
log.Println("please put weibo account/password in robot.txt in current directory, seperated by tab") | |
os.Exit(1) | |
} | |
}() | |
var lstart_page_no = *start_page | |
common_jar := getAuthJar(0) | |
page, err_download := download(fmt.Sprintf("%s?page=%d",*weibo_address,1),common_jar) | |
if err_download!=nil{ | |
panic(err_download) | |
} | |
re_page_no,_ := regexp.Compile(`name="mp" type="hidden" value="(.+?)"`) | |
mc_page_no := re_page_no.FindStringSubmatch(page) | |
retry_no := 0 | |
wait_span := time.Duration(30 / len(robot_config) ) * time.Minute | |
for len(mc_page_no)<1 { | |
log.Printf("[ERROR] the spider's account may be forbidden by sina, it will sleep for %v and then retry\n",wait_span) | |
time.Sleep(wait_span) | |
retry_no++ | |
common_jar = getAuthJar(retry_no) | |
page, err_download = download(fmt.Sprintf("%s?page=%d",*weibo_address,1),common_jar) | |
if err_download!=nil{ | |
panic(err_download) | |
} | |
mc_page_no = re_page_no.FindStringSubmatch(page) | |
} | |
total_page,_ := strconv.Atoi(mc_page_no[1]) | |
log.Println("total page should be grabbed:",total_page) | |
grabed_page := lstart_page_no - 1 | |
spider_count := 15 | |
quit_chan := make(chan bool) | |
url_chan := make(chan int) | |
feedback_chan := make(chan bool) | |
for i:=0;i<spider_count;i++{ | |
go func(){ | |
for { | |
select{ | |
case <-quit_chan: | |
break | |
case p_no:= <- url_chan: | |
retry_no := 0 | |
for{ | |
d_page, d_err := download(fmt.Sprintf("%s?page=%d",*weibo_address,p_no),common_jar) | |
if !strings.Contains(d_page,name_flag) || d_err!=nil { | |
log.Printf("[ERROR] the downloader may be forbidden by sina, it will sleep for %v and then retry",wait_span) | |
time.Sleep(wait_span) | |
retry_no ++ | |
common_jar = getAuthJar(retry_no) | |
}else{ | |
ioutil.WriteFile(fmt.Sprintf("%s/page_%d.html",*save_path,p_no),[]byte(d_page),777) | |
feedback_chan <- true | |
break | |
} | |
} | |
} | |
} | |
}() | |
} | |
go func(){ | |
for i:=lstart_page_no;i<=total_page;i++ { | |
url_chan <- i | |
wait_span := time.Duration(*sleep_span) | |
time.Sleep(wait_span* time.Millisecond) | |
} | |
}() | |
for{ | |
<- feedback_chan | |
grabed_page++ | |
log.Printf("grabbed %d/%d", grabed_page, total_page ) | |
if grabed_page == total_page{ | |
break | |
} | |
} | |
for i:=0;i<spider_count;i++{ | |
quit_chan <- true | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment