Skip to content

Instantly share code, notes, and snippets.

@fxsjy
Created August 31, 2012 08:00
Show Gist options
  • Save fxsjy/3550052 to your computer and use it in GitHub Desktop.
Save fxsjy/3550052 to your computer and use it in GitHub Desktop.
Backup Weibo to Disk
package main
import (
"net/http"
"net/url"
"log"
"io/ioutil"
"regexp"
"fmt"
//"net/http/httputil"
"strconv"
"time"
"flag"
"os"
"strings"
)
type myjar struct {
jar map[string] []*http.Cookie
}
func (p* myjar) SetCookies(u *url.URL, cookies []*http.Cookie) {
//log.Println(cookies)
p.jar[u.Host] = append(p.jar[u.Host],cookies...)
}
func (p *myjar) Cookies(u *url.URL) []*http.Cookie {
return p.jar[u.Host]
}
// Command-line flags controlling the backup run.
var weibo_address = flag.String("user", "", "the name/id of weibo account that you want to download, i.e renzhiqiang")
var sleep_span = flag.Int("span", 500, "the time span between two requests, default is 500 milliseconds.")
var save_path = flag.String("save", "weibo", "the local directory which you want save the weibo page in there.")
var start_page = flag.Int("start", 1, "the page number which is the start page number for downloading")

// robot_config holds the lines of robot.txt, one "user<TAB>password" account
// per line; spiders rotate through these accounts when retrying.
var robot_config []string
// getAuthCookie logs in to the sina WAP SSO endpoint with the given
// credentials and returns the session cookies accumulated during the
// exchange (keyed by the login URL's host).
//
// It performs two requests: a GET to fetch the login form — whose password
// field name and "vk" token are generated per request and must be scraped
// from the page — followed by a POST of the credentials.
func getAuthCookie(username string, password string) (cookies []*http.Cookie, err error) {
	log.Println("rob_user:", username, "; rob_pass:", password)

	jar := &myjar{}
	jar.jar = make(map[string][]*http.Cookie)
	client := &http.Client{}
	client.Jar = jar

	auth_url := "http://3g.sina.com.cn/prog/wapsite/sso/login.php"
	auth_url_URL, err := url.Parse(auth_url)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequest("GET", auth_url, nil)
	if err != nil {
		return nil, err
	}
	// Mimic a desktop browser so the site serves the expected login form.
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1")
	req.Header.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	req.Header.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	req.Header.Add("Accept-Language", "zh-CN,zh;q=0.8")
	req.Header.Add("Cache-Control", "max-age=0")
	req.Header.Add("Connection", "keep-alive")

	rsps1, err1 := client.Do(req)
	if err1 != nil {
		// BUG FIX: the original deferred rsps1.Body.Close() BEFORE this
		// check, which panics on a nil response when the request fails.
		return nil, err1
	}
	defer rsps1.Body.Close()

	login_page, err_read := ioutil.ReadAll(rsps1.Body)
	if err_read != nil {
		return nil, err_read
	}
	str_login_page := string(login_page)

	// Scrape the per-request password field name and anti-forgery token.
	re_pass := regexp.MustCompile(`type="password" name="(.+?)"`)
	re_vk := regexp.MustCompile(`name="vk" value="(.+?)"`)
	m_pass := re_pass.FindStringSubmatch(str_login_page)
	m_vk := re_vk.FindStringSubmatch(str_login_page)
	if m_pass == nil || m_vk == nil {
		// BUG FIX: the original indexed [1] unconditionally and panicked
		// when sina served an unexpected page (captcha, block page, ...).
		return nil, fmt.Errorf("login form not found in page from %s", auth_url)
	}
	passbox_name := m_pass[1]
	vk_value := m_vk[1]

	rsps2, err2 := client.PostForm("http://3g.sina.com.cn/prog/wapsite/sso/login_submit.php",
		url.Values{"mobile": {username}, passbox_name: {password}, "remember": {"off"}, "vk": {vk_value}, "submit": {"登录"}})
	if err2 != nil {
		return nil, err2
	}
	defer rsps2.Body.Close()

	return client.Jar.Cookies(auth_url_URL), nil
}
// download fetches target_url with the session cookies held in jar and
// returns the response body as a string. The response body is always
// closed before returning.
func download(target_url string, jar *myjar) (page string, err error) {
	client := &http.Client{}
	client.Jar = jar
	rsps, err := client.Get(target_url)
	if err != nil {
		// BUG FIX: the original returned the (still nil) named `err`
		// instead of the actual Get error, silently discarding failures.
		return "", err
	}
	defer rsps.Body.Close()
	body, err := ioutil.ReadAll(rsps.Body)
	if err != nil {
		// BUG FIX: the original ignored the read error and returned nil.
		return "", err
	}
	return string(body), nil
}
// getAuthJar picks a robot account from robot_config — rotating through the
// accounts based on retry_count — logs it in via getAuthCookie, and returns
// a cookie jar pre-loaded for weibo.cn.
//
// It panics on a network-level login failure (caught by main's recover) and
// exits the process when credentials are rejected or robot.txt is malformed.
func getAuthJar(retry_count int) *myjar {
	var username, password string
	// BUG FIX: the original used `retry_count % 2`, hard-coding exactly two
	// accounts; rotate over however many lines robot.txt actually contains.
	line := robot_config[retry_count%len(robot_config)]
	tup := strings.Split(line, "\t")
	if len(tup) < 2 {
		log.Println("[ERROR] robot.txt line is not tab-separated (expected user<TAB>password)")
		os.Exit(1)
	}
	username = tup[0]
	password = tup[1]
	cookies, err := getAuthCookie(username, password)
	if err != nil {
		panic(err)
	}
	// A rejected login shows up as no cookies, or a cookie whose value
	// contains "deleted".
	if len(cookies) < 1 || strings.Contains(cookies[0].Value, "deleted") {
		log.Println("[ERROR] wrong username or password")
		os.Exit(1)
	}
	common_jar := &myjar{}
	common_jar.jar = make(map[string][]*http.Cookie)
	common_jar.jar["weibo.cn"] = cookies
	return common_jar
}
// main drives the backup: parse flags, load robot accounts from robot.txt,
// log in, scrape the total page count from the first profile page, then
// download every page with a pool of 15 spider goroutines, saving each as
// page_<n>.html under *save_path.
func main(){
flag.Parse()
var name_flag = ""
if *weibo_address == ""{
flag.PrintDefaults()
fmt.Println(`
usage exampe:
downloader.exe -user treapdb -save ./bak/treapdb
`)
return
}else{
name_flag = *weibo_address
// Numeric input is treated as a uid (http://weibo.cn/u/<id>);
// anything else as a vanity name (http://weibo.cn/<name>).
_,err_number := strconv.Atoi(*weibo_address)
if err_number!=nil{
*weibo_address = "http://weibo.cn/"+*weibo_address
}else{
*weibo_address = "http://weibo.cn/u/"+*weibo_address
}
// NOTE(review): 777 here is decimal, not the octal permission literal
// 0777 — the created directory gets odd mode bits; likely meant 0755.
os.MkdirAll(*save_path,777)
}
log.Println("> weibo url: ",*weibo_address)
// Catch panics (e.g. from download/getAuthJar) so the program logs a
// fatal message instead of dumping a stack trace.
defer func(){
if x := recover(); x != nil {
log.Println("[FATAL] some unknown error happen when downloading !!", *weibo_address)
//panic(x)
}
}()
// Load robot accounts: one "user<TAB>password" per line of robot.txt.
// NOTE(review): splitting on "\n" leaves a trailing "\r" on CRLF files.
func(){
content, err := ioutil.ReadFile("robot.txt")
if err!=nil{
log.Println("need robot.txt in current directory")
os.Exit(1)
}
robot_config = strings.Split(string(content),"\n")
if len(robot_config)<1{
log.Println("please put weibo account/password in robot.txt in current directory, seperated by tab")
os.Exit(1)
}
}()
var lstart_page_no = *start_page
common_jar := getAuthJar(0)
page, err_download := download(fmt.Sprintf("%s?page=%d",*weibo_address,1),common_jar)
if err_download!=nil{
panic(err_download)
}
// The hidden "mp" form input on the first page carries the total page
// count; its absence means the account was blocked and we must re-login.
re_page_no,_ := regexp.Compile(`name="mp" type="hidden" value="(.+?)"`)
mc_page_no := re_page_no.FindStringSubmatch(page)
retry_no := 0
// Spread the 30-minute cool-down across the available accounts.
wait_span := time.Duration(30 / len(robot_config) ) * time.Minute
// Keep rotating accounts (sleeping between attempts) until a page with
// the "mp" marker comes back.
for len(mc_page_no)<1 {
log.Printf("[ERROR] the spider's account may be forbidden by sina, it will sleep for %v and then retry\n",wait_span)
time.Sleep(wait_span)
retry_no++
common_jar = getAuthJar(retry_no)
page, err_download = download(fmt.Sprintf("%s?page=%d",*weibo_address,1),common_jar)
if err_download!=nil{
panic(err_download)
}
mc_page_no = re_page_no.FindStringSubmatch(page)
}
total_page,_ := strconv.Atoi(mc_page_no[1])
log.Println("total page should be grabbed:",total_page)
grabed_page := lstart_page_no - 1
spider_count := 15
quit_chan := make(chan bool)
url_chan := make(chan int)
feedback_chan := make(chan bool)
// Spider pool: each worker pulls a page number from url_chan, downloads
// it (re-logging-in on failure), writes it to disk, and signals
// feedback_chan.
// NOTE(review): `break` inside the select only exits the select, not the
// for loop, so quit_chan never actually stops a worker; the program still
// terminates because main returns after draining quit_chan sends.
// NOTE(review): common_jar is reassigned inside these goroutines and read
// by all of them without synchronization — a data race under -race.
for i:=0;i<spider_count;i++{
go func(){
for {
select{
case <-quit_chan:
break
case p_no:= <- url_chan:
retry_no := 0
for{
d_page, d_err := download(fmt.Sprintf("%s?page=%d",*weibo_address,p_no),common_jar)
// A page that no longer contains the account name means we were
// served a block/login page instead of real content.
if !strings.Contains(d_page,name_flag) || d_err!=nil {
log.Printf("[ERROR] the downloader may be forbidden by sina, it will sleep for %v and then retry",wait_span)
time.Sleep(wait_span)
retry_no ++
common_jar = getAuthJar(retry_no)
}else{
// NOTE(review): 777 is decimal here too — likely meant 0644/0777.
ioutil.WriteFile(fmt.Sprintf("%s/page_%d.html",*save_path,p_no),[]byte(d_page),777)
feedback_chan <- true
break
}
}
}
}
}()
}
// Feeder: hand out page numbers, pacing requests by -span milliseconds.
go func(){
for i:=lstart_page_no;i<=total_page;i++ {
url_chan <- i
wait_span := time.Duration(*sleep_span)
time.Sleep(wait_span* time.Millisecond)
}
}()
// Progress accounting: block until every page from the start page through
// total_page has been reported downloaded.
for{
<- feedback_chan
grabed_page++
log.Printf("grabbed %d/%d", grabed_page, total_page )
if grabed_page == total_page{
break
}
}
for i:=0;i<spider_count;i++{
quit_chan <- true
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment