Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A Go program that scrapes the [Douban Movie Top 250](https://movie.douban.com/top250) list.
package main
import (
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/WeiZhang555/tabwriter"
)
// config holds the scraper's process-wide settings.
//
// UserAgent is the value scrape sends in the User-Agent request header; it is
// initialised to a desktop Firefox 75 identifier.
var config = struct {
	UserAgent string
}{
	UserAgent: `Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:75.0) Gecko/20100101 Firefox/75.0`,
}
// Item is one entry extracted by scrape from a Top 250 list page.
type Item struct {
	// Titles holds the "/"-separated parts of the entry's heading link text,
	// each trimmed of surrounding whitespace. main prints only Titles[0].
	Titles []string

	// Score is the number parsed from the entry's `.bd .rating_num` element.
	Score float64

	// Link is the href of the heading link.
	// Type is the trimmed third line of the entry's `.bd > p` text
	// (presumably the year / country / genre line; confirm against the page).
	// Quote is the text of the entry's `.bd .quote span` element.
	Link, Type, Quote string
}
// scrapeClient is the HTTP client used by scrape. Unlike http.DefaultClient it
// carries a timeout, so a stalled connection cannot block the program forever.
var scrapeClient = &http.Client{Timeout: 30 * time.Second}

// scrape downloads one page of the Douban Movie Top 250 list and returns the
// entries found on it, in document order.
//
// page is 1-based. Each page covers 25 entries, so page N is requested with
// start=(N-1)*25.
//
// scrape panics if the request cannot be built or sent, if the server answers
// with a status other than 200 OK (the response body is copied to stderr
// first), or if the response body cannot be read into a document. Fields whose
// markup is missing from an entry are left at their zero value.
func scrape(page int) []*Item {
	const pageSize = 25

	u := fmt.Sprintf(`https://movie.douban.com/top250?start=%d`, (page-1)*pageSize)
	req, err := http.NewRequest(http.MethodGet, u, nil)
	if err != nil {
		panic(fmt.Errorf("building request for page %d: %w", page, err))
	}
	req.Header.Set(`User-Agent`, config.UserAgent)

	resp, err := scrapeClient.Do(req)
	if err != nil {
		panic(fmt.Errorf("fetching page %d: %w", page, err))
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best-effort dump of the error page to aid debugging; a failed copy
		// must not mask the status reported below.
		_, _ = io.Copy(os.Stderr, resp.Body)
		panic(fmt.Sprintf("fetching page %d: unexpected status %s", page, resp.Status))
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		panic(fmt.Errorf("parsing page %d: %w", page, err))
	}

	var items []*Item
	doc.Find(`#content .article ol.grid_view > li`).Each(func(_ int, s *goquery.Selection) {
		item := new(Item)

		a := s.Find(`.hd > a`)
		// A missing href simply leaves Link empty.
		item.Link, _ = a.Attr(`href`)
		for _, t := range strings.Split(a.Text(), `/`) {
			item.Titles = append(item.Titles, strings.TrimSpace(t))
		}

		// Type is taken from the third line of the info paragraph. Guard the
		// index so unexpected markup yields an empty Type instead of an
		// index-out-of-range panic.
		if lines := strings.Split(s.Find(`.bd > p`).Text(), "\n"); len(lines) > 2 {
			item.Type = strings.TrimSpace(lines[2])
		}

		// Score stays 0 when the rating is absent or not numeric. Surrounding
		// whitespace is trimmed first because ParseFloat rejects it.
		rating := strings.TrimSpace(s.Find(`.bd .rating_num`).Text())
		item.Score, _ = strconv.ParseFloat(rating, 64)

		item.Quote = s.Find(`.bd .quote span`).Text()

		items = append(items, item)
	})
	return items
}
// main scrapes pages 1 through 10 of the list and then prints one
// tab-aligned row per entry to stdout: first title, score (one decimal),
// link, type and quote.
//
// Every page is fetched before any output is produced, so a panic in scrape
// leaves stdout empty.
func main() {
	const (
		lastPage  = 10
		rowFormat = "%s\t%.1f\t%s\t%s\t%s\n"
	)

	var all []*Item
	for page := 1; page <= lastPage; page++ {
		batch := scrape(page)
		all = append(all, batch...)
	}

	out := tabwriter.NewWriter(os.Stdout, 8, 8, 3, ' ', 0)
	// Rows are buffered by the writer; flush once everything is written.
	defer out.Flush()

	for idx := range all {
		entry := all[idx]
		fmt.Fprintf(out, rowFormat,
			entry.Titles[0], entry.Score, entry.Link, entry.Type, entry.Quote)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.