A golang program to scrape [Douban Movie Top250](https://movie.douban.com/top250).
package main | |
import ( | |
"fmt" | |
"io" | |
"net/http" | |
"os" | |
"strconv" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/WeiZhang555/tabwriter" | |
) | |
// config holds the package-wide settings used by the scraper. It is
// initialised at declaration, so the value is already set before any init
// function or main runs.
var config = struct {
	// UserAgent is the value sent in the User-Agent header of outgoing
	// HTTP requests.
	UserAgent string
}{
	// A desktop Firefox identity.
	// NOTE(review): presumably chosen because the site rejects Go's default
	// client User-Agent — confirm.
	UserAgent: `Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:75.0) Gecko/20100101 Firefox/75.0`,
}
// Item is one movie entry extracted from a listing page by scrape.
type Item struct {
	// Titles holds the pieces of the entry's link text, split on "/" and
	// trimmed of surrounding whitespace. It always has at least one element
	// when produced by scrape; main prints Titles[0].
	Titles []string
	// Score is the rating parsed from the entry's `.bd .rating_num` text.
	// It stays 0 when that text is not a valid number.
	Score float64
	// Link is the href attribute of the entry's `.hd > a` anchor, or ""
	// when the attribute is absent.
	Link string
	// Type is the trimmed third line of the entry's `.bd > p` text.
	// NOTE(review): on the live page this appears to be the
	// year / country / genre line — confirm.
	Type string
	// Quote is the text of the entry's `.bd .quote span` element; it is
	// empty when the entry has none.
	Quote string
}
// page starts from 1 | |
func scrape(page int) []*Item { | |
offset := (page - 1) * 25 | |
u := fmt.Sprintf(`https://movie.douban.com/top250?start=%d`, offset) | |
req, err := http.NewRequest(http.MethodGet, u, nil) | |
if err != nil { | |
panic(err) | |
} | |
req.Header.Set(`User-Agent`, config.UserAgent) | |
resp, err := http.DefaultClient.Do(req) | |
if err != nil { | |
panic(err) | |
} | |
defer resp.Body.Close() | |
if resp.StatusCode != 200 { | |
io.Copy(os.Stderr, resp.Body) | |
panic(resp.Status) | |
} | |
doc, err := goquery.NewDocumentFromReader(resp.Body) | |
if err != nil { | |
panic(err) | |
} | |
var items []*Item | |
doc.Find(`#content .article ol.grid_view > li`).Each(func(i int, s *goquery.Selection) { | |
var item Item | |
a := s.Find(`.hd > a`) | |
item.Link, _ = a.Attr(`href`) | |
for _, t := range strings.Split(a.Text(), `/`) { | |
item.Titles = append(item.Titles, strings.TrimSpace(t)) | |
} | |
item.Type = strings.TrimSpace(strings.Split(s.Find(`.bd > p`).Text(), "\n")[2]) | |
item.Score, _ = strconv.ParseFloat(s.Find(`.bd .rating_num`).Text(), 64) | |
item.Quote = s.Find(`.bd .quote span`).Text() | |
items = append(items, &item) | |
}) | |
return items | |
} | |
func main() { | |
var items []*Item | |
for i := 1; i <= 10; i++ { | |
items = append(items, scrape(i)...) | |
} | |
w := tabwriter.NewWriter(os.Stdout, 8, 8, 3, ' ', 0) | |
defer w.Flush() | |
for _, item := range items { | |
fmt.Fprintf(w, | |
"%s\t%.1f\t%s\t%s\t%s\n", | |
item.Titles[0], item.Score, item.Link, item.Type, item.Quote, | |
) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment