Last active
May 9, 2020 09:01
-
-
Save zhnxin/c14ce8a7ec6771e8ff1b2f1e2822796a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io" | |
"os" | |
"strings" | |
"sync" | |
"time" | |
"github.com/sirupsen/logrus" | |
"github.com/BurntSushi/toml" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/parnurzeal/gorequest" | |
"gopkg.in/alecthomas/kingpin.v2" | |
) | |
// CssSelector holds the CSS selectors used to pick the relevant parts of a page.
type CssSelector struct {
	Title   string // selector for the title text; the title is skipped when empty
	Content string // selector for the body text
	Next    string // selector for the element whose href points at the next page
}

// ValidNext describes the checks a "next page" link must pass to be followed.
type ValidNext struct {
	EndWith     string // when non-empty, the link must end with this suffix
	NotContains string // when non-empty, a link containing this text is rejected
}

// Config describes one scraping job.
type Config struct {
	Base      string       // prefix prepended to every page link before requesting it
	Start     string       // link (relative to Base) of the page the job starts from
	IsNext    bool         // when true, Start is only used to look up the page after it
	Output    string       // path of the file the page text is appended to
	ValidNext *ValidNext   // link checks; nil is replaced by the package default in InjectDefault
	Selector  *CssSelector // page selectors; nil is replaced by the package default in InjectDefault
}

// ConfigPackage is the top-level layout of the config file: defaults shared
// by all jobs plus the list of jobs themselves.
type ConfigPackage struct {
	Base      string      // default Base for jobs that leave it empty
	ValidNext ValidNext   // default link checks
	Selector  CssSelector // default page selectors
	Config    []Config    // the individual jobs
}
var ( | |
ConfigPath = kingpin.Flag("config", "config file for multi").Default("config.toml").Short('c').String() | |
IsDebug = kingpin.Flag("debug", "is print debug log").Bool() | |
IsInit = kingpin.Flag("init", "init the config file").Bool() | |
ReqAgent = gorequest.New() | |
) | |
func (c *Config) InjectDefault(cp *ConfigPackage) { | |
if c.Base == "" { | |
c.Base = cp.Base | |
} | |
if c.ValidNext == nil { | |
c.ValidNext = &cp.ValidNext | |
} | |
if c.Selector == nil { | |
c.Selector = &cp.Selector | |
} | |
} | |
func (c *Config) reqeust(url string, writer io.Writer) (next string, err error) { | |
res, _, errs := ReqAgent.Get(url).End() | |
if len(errs) > 0 { | |
return "", fmt.Errorf("request error:%v", errs) | |
} | |
if res.StatusCode != 200 { | |
return "", fmt.Errorf("status code:%d", res.StatusCode) | |
} | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
if err != nil { | |
return "", err | |
} | |
if writer != nil { | |
if c.Selector.Title != "" { | |
doc.Find(c.Selector.Title).Each(func(i int, s *goquery.Selection) { | |
_, err = writer.Write([]byte(s.Text())) | |
}) | |
if err != nil { | |
return "", err | |
} | |
} | |
doc.Find(c.Selector.Content).Each(func(i int, s *goquery.Selection) { | |
_, err = writer.Write([]byte(s.Text())) | |
}) | |
if err != nil { | |
return "", err | |
} | |
} | |
next, _ = doc.Find(c.Selector.Next).Attr("href") | |
logrus.Debug(c.Selector.Next, ": ", next) | |
if c.ValidNext.NotContains != "" && | |
strings.Contains(next, c.ValidNext.NotContains) { | |
next = "" | |
} | |
if c.ValidNext.EndWith != "" && | |
!strings.HasSuffix(next, c.ValidNext.EndWith) { | |
next = "" | |
} | |
return | |
} | |
func (c *Config) Process() (err error) { | |
url := c.Start | |
if c.IsNext { | |
url, err = c.reqeust(c.Base+url, nil) | |
if err != nil { | |
return | |
} | |
if url == "" { | |
logrus.Infof("there is no next page for %s\n", c.Start) | |
return | |
} | |
} | |
output, err := os.OpenFile(c.Output, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0755) | |
if err != nil { | |
return fmt.Errorf("open output file %s:%v", c.Output, err) | |
} | |
defer output.Close() | |
count := 0 | |
for { | |
url, err = c.reqeust(c.Base+url, output) | |
if err != nil { | |
return err | |
} | |
if url == "" { | |
return nil | |
} | |
count++ | |
logrus.Infof("next: %d %s\n", count, url) | |
c.Start = url | |
c.IsNext = true | |
} | |
} | |
func (cp *ConfigPackage) ReOrg() { | |
for i := 0; i < len(cp.Config); i++ { | |
cp.Config[i].InjectDefault(cp) | |
} | |
} | |
// createConfigFile writes a starter configuration to "config.toml" in the
// current directory, replacing any existing file of that name.
//
// The table names in the template match the fields of ConfigPackage
// (Selector, ValidNext, Config) so that the generated file is actually
// picked up when it is loaded again.
//
// It returns an error when the file cannot be created, written or closed.
func createConfigFile() error {
	const defaultConfig = `
base=""
[Selector]
Title=''
Content='div.box_box'
Next='#keyright'
[ValidNext]
EndWith=''
NotContains=''
[[Config]]
start=''
isnext = false
output="output.txt"
`
	// O_TRUNC: without it a longer existing file would keep its old tail
	// after the template and no longer be valid TOML.
	f, err := os.OpenFile("config.toml", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("create config.toml: %w", err)
	}
	if _, err = f.WriteString(defaultConfig); err != nil {
		f.Close() // best effort; the write error is the one worth reporting
		return fmt.Errorf("write config.toml: %w", err)
	}
	if err = f.Close(); err != nil {
		return fmt.Errorf("close config.toml: %w", err)
	}
	return nil
}
func main() { | |
kingpin.Parse() | |
if *IsInit { | |
if err := createConfigFile(); err != nil { | |
logrus.Fatalln(err) | |
} | |
fmt.Println("config.toml creation completed") | |
return | |
} | |
CONFIG := ConfigPackage{} | |
_, err := toml.DecodeFile(*ConfigPath, CONFIG) | |
if err != nil { | |
logrus.Fatalln(err) | |
} | |
CONFIG.ReOrg() | |
logrus.SetFormatter(&logrus.TextFormatter{ | |
TimestampFormat: time.RFC3339, | |
FullTimestamp: true, | |
}) | |
if *IsDebug { | |
logrus.SetLevel(logrus.DebugLevel) | |
} | |
logrus.Debugf("config: %+v", CONFIG) | |
ReqAgent.SetDoNotClearSuperAgent(true) | |
wait := new(sync.WaitGroup) | |
configChan := make(chan Config) | |
for _, c := range CONFIG.Config { | |
wait.Add(1) | |
go func(con Config) { | |
defer func() { | |
wait.Done() | |
configChan <- con | |
}() | |
if err := con.Process(); err != nil { | |
logrus.Errorln(err) | |
} else { | |
logrus.Info("complete: ", con.Output) | |
} | |
}(c) | |
} | |
go func() { | |
wait.Wait() | |
close(configChan) | |
}() | |
CONFIG.Config = []Config{} | |
for c := range configChan { | |
CONFIG.Config = append(CONFIG.Config, c) | |
} | |
confFile, err := os.OpenFile(*ConfigPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) | |
if err != nil { | |
logrus.Fatal("update config file: ", err) | |
} | |
err = toml.NewEncoder(confFile).Encode(CONFIG) | |
if err != nil { | |
logrus.Fatal("update config file: ", err) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment