Skip to content

Instantly share code, notes, and snippets.

@zhnxin
Last active May 9, 2020 09:01
Show Gist options
  • Save zhnxin/c14ce8a7ec6771e8ff1b2f1e2822796a to your computer and use it in GitHub Desktop.
Save zhnxin/c14ce8a7ec6771e8ff1b2f1e2822796a to your computer and use it in GitHub Desktop.
package main
import (
"fmt"
"io"
"os"
"strings"
"sync"
"time"
"github.com/sirupsen/logrus"
"github.com/BurntSushi/toml"
"github.com/PuerkitoBio/goquery"
"github.com/parnurzeal/gorequest"
"gopkg.in/alecthomas/kingpin.v2"
)
// Config describes a single scraping job: where to start, where to write the
// extracted text, and which selectors/filters to apply on every page.
type Config struct {
	// Base is prepended to every page address before it is requested.
	Base string
	// Start is the address (relative to Base) of the page to begin with.
	Start string
	// IsNext, when true, means Start is only fetched to discover its "next"
	// link; writing begins with the page that link points at.
	IsNext bool
	// Output is the path of the file the extracted text is appended to.
	Output string
	// ValidNext filters the discovered "next" link; nil until defaulted.
	ValidNext *ValidNext
	// Selector holds the CSS selectors used on each page; nil until defaulted.
	Selector *CssSelector
}

// CssSelector groups the CSS selectors applied to every fetched page.
type CssSelector struct {
	// Title selects the elements whose text is written first (optional).
	Title string
	// Content selects the elements whose text is written as the page body.
	Content string
	// Next selects the element whose href attribute names the following page.
	Next string
}

// ValidNext holds the rules a "next" link must satisfy to be followed.
type ValidNext struct {
	// EndWith, when non-empty, is a suffix the link must end with.
	EndWith string
	// NotContains, when non-empty, is a substring the link must not contain.
	NotContains string
}

// ConfigPackage is the top-level shape of the configuration file: shared
// defaults plus the list of individual jobs.
type ConfigPackage struct {
	// Base is the default Config.Base for jobs that do not set their own.
	Base string
	// ValidNext is the default link filter for jobs that do not set their own.
	ValidNext ValidNext
	// Selector is the default selector set for jobs that do not set their own.
	Selector CssSelector
	// Config lists the jobs to run.
	Config []Config
}
// Command-line flags and the HTTP agent used for page requests.
var (
// ConfigPath is the --config / -c flag: the TOML file to read jobs from and
// to write updated progress back to. Defaults to "config.toml".
ConfigPath = kingpin.Flag("config", "config file for multi").Default("config.toml").Short('c').String()
// IsDebug is the --debug flag; main switches logrus to debug level when set.
IsDebug = kingpin.Flag("debug", "is print debug log").Bool()
// IsInit is the --init flag; main writes a template config file and exits
// when set.
IsInit = kingpin.Flag("init", "init the config file").Bool()
// ReqAgent is the single gorequest agent used by every page request.
// NOTE(review): main runs jobs in parallel goroutines that all use this
// agent — confirm gorequest's agent is safe for concurrent use.
ReqAgent = gorequest.New()
)
// InjectDefault fills every setting this job leaves unset with the shared
// value from cp. Pointer fields are aimed at cp's own fields (not copies), so
// the job afterwards refers to the package-level defaults.
func (c *Config) InjectDefault(cp *ConfigPackage) {
	// Selectors and link filters fall back to the package-wide ones.
	if c.Selector == nil {
		c.Selector = &cp.Selector
	}
	if c.ValidNext == nil {
		c.ValidNext = &cp.ValidNext
	}
	// An empty base address inherits the package-wide base.
	if len(c.Base) == 0 {
		c.Base = cp.Base
	}
}
// reqeust fetches url, optionally writes the text selected by the job's Title
// and Content selectors to writer, and returns the href of the element matched
// by the Next selector.
//
// When writer is nil nothing is written and the page is fetched only to
// discover the next link. The returned link is reset to "" when it contains
// ValidNext.NotContains or does not end with ValidNext.EndWith (each rule is
// applied only when its pattern is non-empty). An empty result therefore means
// "no acceptable next page".
//
// An error is returned when the request fails, the status code is not 200, the
// body cannot be parsed as HTML, or a write to writer fails.
func (c *Config) reqeust(url string, writer io.Writer) (next string, err error) {
	res, _, errs := ReqAgent.Get(url).End()
	if len(errs) > 0 {
		return "", fmt.Errorf("request error:%v", errs)
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		return "", fmt.Errorf("status code:%d", res.StatusCode)
	}
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return "", err
	}
	if writer != nil {
		// The title is optional; the content selector is always applied.
		if c.Selector.Title != "" {
			if err = writeSelectionText(doc, c.Selector.Title, writer); err != nil {
				return "", err
			}
		}
		if err = writeSelectionText(doc, c.Selector.Content, writer); err != nil {
			return "", err
		}
	}
	// A missing href yields "", which the filters below leave as "".
	next, _ = doc.Find(c.Selector.Next).Attr("href")
	logrus.Debug(c.Selector.Next, ": ", next)
	if c.ValidNext.NotContains != "" &&
		strings.Contains(next, c.ValidNext.NotContains) {
		next = ""
	}
	if c.ValidNext.EndWith != "" &&
		!strings.HasSuffix(next, c.ValidNext.EndWith) {
		next = ""
	}
	return next, nil
}

// writeSelectionText writes the text of every element matching selector to w,
// in document order, stopping at and returning the first write error.
func writeSelectionText(doc *goquery.Document, selector string, w io.Writer) (err error) {
	doc.Find(selector).EachWithBreak(func(_ int, s *goquery.Selection) bool {
		_, err = io.WriteString(w, s.Text())
		// Returning false stops the iteration so a later successful write
		// cannot overwrite (and hide) this error.
		return err == nil
	})
	return err
}
// Process runs the job: starting at c.Start it writes each page's selected
// text to c.Output (opened in append mode) and follows the "next" link until
// none is found or a request fails.
//
// When c.IsNext is true, c.Start has already been saved by an earlier run, so
// it is fetched only to find its next link and writing begins with that page.
//
// Progress is recorded in c.Start / c.IsNext so the caller can persist it and
// resume later: after every page that was written successfully, c.Start is set
// to that page and c.IsNext to true. A page whose request failed is therefore
// never marked as done and will be retried on the next run.
//
// It returns nil when the chain of pages ends, or the first error met.
func (c *Config) Process() error {
	url := c.Start
	if c.IsNext {
		next, err := c.reqeust(c.Base+url, nil)
		if err != nil {
			return err
		}
		if next == "" {
			logrus.Infof("there is no next page for %s\n", c.Start)
			return nil
		}
		url = next
	}
	// 0644: the output is a plain text file and does not need the executable bit.
	output, err := os.OpenFile(c.Output, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return fmt.Errorf("open output file %s:%v", c.Output, err)
	}
	defer output.Close()
	count := 0
	for {
		next, err := c.reqeust(c.Base+url, output)
		if err != nil {
			return err
		}
		// url has now been written; only at this point is it safe to record
		// it as the last completed page.
		c.Start = url
		c.IsNext = true
		if next == "" {
			return nil
		}
		count++
		logrus.Infof("next: %d %s\n", count, next)
		url = next
	}
}
// ReOrg applies the package-level defaults to every job in cp.Config, in
// place, by calling InjectDefault on each entry.
func (cp *ConfigPackage) ReOrg() {
	for idx := range cp.Config {
		// Take the address of the slice element so the job itself is updated.
		job := &cp.Config[idx]
		job.InjectDefault(cp)
	}
}
// createConfigFile writes a template configuration to "config.toml" in the
// current working directory, replacing any existing file of that name.
//
// The table names in the template match the ConfigPackage field names
// (Selector, ValidNext, Config) so that the generated file decodes into it.
// It returns any error from opening, writing or closing the file.
func createConfigFile() (err error) {
	// O_TRUNC: without it, overwriting a longer existing file would leave the
	// old tail behind and produce an invalid config.
	f, err := os.OpenFile("config.toml", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Report a close failure unless an earlier error is already pending.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()
	_, err = f.WriteString(`
base=""
[Selector]
Title=''
Content='div.box_box'
Next='#keyright'
[ValidNext]
EndWith=''
NotContains=''
[[Config]]
start=''
isnext = false
output="output.txt"`)
	return err
}
// main parses the command line and then either writes a template config file
// (--init) or loads the config, runs every job concurrently, and writes the
// jobs' updated progress (Start / IsNext) back to the config file.
func main() {
	kingpin.Parse()
	if *IsInit {
		if err := createConfigFile(); err != nil {
			logrus.Fatalln(err)
		}
		fmt.Println("config.toml creation completed")
		return
	}
	CONFIG := ConfigPackage{}
	// The decoder must receive a pointer so it can populate the struct.
	if _, err := toml.DecodeFile(*ConfigPath, &CONFIG); err != nil {
		logrus.Fatalln(err)
	}
	CONFIG.ReOrg()
	logrus.SetFormatter(&logrus.TextFormatter{
		TimestampFormat: time.RFC3339,
		FullTimestamp:   true,
	})
	if *IsDebug {
		logrus.SetLevel(logrus.DebugLevel)
	}
	logrus.Debugf("config: %+v", CONFIG)
	ReqAgent.SetDoNotClearSuperAgent(true)

	// Each job works on its own copy and stores the result at its own index:
	// no two goroutines touch the same element, and the jobs keep their
	// original order when written back.
	// NOTE(review): every job shares ReqAgent — confirm the gorequest agent
	// tolerates concurrent use.
	results := make([]Config, len(CONFIG.Config))
	var wait sync.WaitGroup
	for i, c := range CONFIG.Config {
		wait.Add(1)
		go func(i int, con Config) {
			defer func() {
				// Store progress before signalling completion so the
				// result is visible once Wait returns.
				results[i] = con
				wait.Done()
			}()
			if err := con.Process(); err != nil {
				logrus.Errorln(err)
			} else {
				logrus.Info("complete: ", con.Output)
			}
		}(i, c)
	}
	wait.Wait()
	CONFIG.Config = results

	confFile, err := os.OpenFile(*ConfigPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		logrus.Fatal("update config file: ", err)
	}
	if err := toml.NewEncoder(confFile).Encode(CONFIG); err != nil {
		confFile.Close()
		logrus.Fatal("update config file: ", err)
	}
	// A failed close can mean the config was not fully written.
	if err := confFile.Close(); err != nil {
		logrus.Fatal("update config file: ", err)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment