Crawl JobStreet job listings using channels, colly and goquery
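
// Usage (a sketch, assuming Go modules; the module path below is illustrative):
//
//	go mod init example.com/jobstreet-crawler
//	go get github.com/PuerkitoBio/goquery github.com/cenkalti/backoff github.com/gocolly/colly
//	go run main.go
//
// Results are written to jobstreet.json in the working directory.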
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/cenkalti/backoff"
	"github.com/gocolly/colly"
)
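
// webPage is the JobStreet search entry point that both URL producers parse.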
const webPage = "https://www.jobstreet.vn/t%C3%ACmvi%E1%BB%87c"

type Job struct {
	Title     string `json:"title"`
	Company   string `json:"company"`
	Location  string `json:"location"`
	Descript  string `json:"descript"`
	Url       string `json:"url"`
	Site      string `json:"site"`
	CreatedAt string `json:"created_at"`
}

type Jobs struct {
	List      []Job `json:"jobs"`
	TotalJobs int   `json:"total_jobs"`
}

const (
	maxRetry = 3 * time.Minute
)
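
// get performs a single HTTP GET request and returns the raw response.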
func get(url string) (*http.Response, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	return resp, nil
}
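
// Get retries get with exponential backoff until the request succeeds or
// maxRetry elapses.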
func Get(url string) (*http.Response, error) {
	var err error
	var resp *http.Response

	bo := backoff.NewExponentialBackOff()
	bo.MaxInterval = maxRetry
	bo.MaxElapsedTime = maxRetry

	for {
		resp, err = get(url)
		if err == nil {
			break
		}
		d := bo.NextBackOff()
		if d == backoff.Stop {
			break
		}
		time.Sleep(d)
	}
	if err != nil {
		return nil, err
	}
	return resp, nil
}
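
// crawlJobStreet fans two URL producers (by province and by category) into a
// single channel, collects every search URL, then scrapes them all.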
func crawlJobStreet() {
	var urls []string
	pipe := make(chan string)
	done := make(chan bool)

	// Consumer: collect every URL sent by the producers, then scrape them.
	go func() {
		for {
			url, more := <-pipe
			if more {
				fmt.Println("Received url", url)
				urls = append(urls, url)
				fmt.Println("Collected urls so far:", len(urls))
			} else {
				fmt.Println("Received all urls:", len(urls))
				extractInfoJob(urls)
				done <- true
				return
			}
		}
	}()

	// Producers: send search URLs by province and by category.
	var wg sync.WaitGroup
	wg.Add(2)
	go getUrlByProvince(pipe, &wg)
	go getUrlByCategory(pipe, &wg)

	// Close the pipe once both producers are finished.
	go func() {
		wg.Wait()
		close(pipe)
	}()

	<-done
}
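
// extractInfoJob visits every search URL, scrapes each job card and its
// detail page, and writes the accumulated results to jobstreet.json.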
func extractInfoJob(urls []string) error {
	var jobs Jobs
	var job Job

	c := colly.NewCollector(
		// colly.Async(true),
	)
	// c.Limit(&colly.LimitRule{
	// 	Parallelism: 2,
	// })
	c.SetRequestTimeout(120 * time.Second)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println(err)
	})

	// Detail-page handlers are registered once, up front, instead of inside
	// the job-card handler, so they are not re-registered for every card.
	c.OnHTML("div[class=heading-xsmall]", func(e *colly.HTMLElement) {
		job.Site = e.ChildText("span.site")
		job.CreatedAt = e.ChildText("span.listed-date")
	})
	c.OnHTML("div[class=-desktop-no-padding-top]", func(e *colly.HTMLElement) {
		job.Descript = e.Text
	})

	c.OnHTML(".jobresults .job-card", func(e *colly.HTMLElement) {
		job = Job{}
		job.Url = "https://www.jobstreet.vn" + e.ChildAttr("h3.job-title > a", "href")
		job.Title = e.ChildText("h3.job-title > a")
		job.Company = e.ChildText("span.job-company")
		job.Location = e.ChildText("span.job-location")

		// Visit the detail page; the collector is synchronous, so the handlers
		// above have filled Site, CreatedAt and Descript when Visit returns.
		c.Visit(e.Request.AbsoluteURL(job.Url))

		// Jobs sourced from TopCV keep no description.
		if job.Site == "TopCV" {
			job.Descript = ""
		}

		jobs.TotalJobs++
		jobs.List = append(jobs.List, job)

		dataBytes, errMarshal := json.Marshal(jobs)
		if errMarshal != nil {
			fmt.Println(errMarshal)
		}
		os.WriteFile("jobstreet.json", dataBytes, 0700)
	})

	for _, url := range urls {
		c.Visit(url)
	}
	// c.Wait()
	return nil
}

// getUrlByProvince collects all search URLs grouped by province and sends
// every result page to the pipe channel.
func getUrlByProvince(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()

	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}

	// Get all search URLs by province.
	doc.Find("div[id=browse-locations] a[href]").Each(func(index int, province *goquery.Selection) {
		href, _ := province.Attr("href")
		urlProvince := fmt.Sprintf("https://www.jobstreet.vn%s", href)

		// Get the total page count of each province URL.
		totalPage, err := getTotalPage(urlProvince)
		if err != nil {
			fmt.Println(err)
		}

		// Emit every result page of this province.
		for page := 1; page <= totalPage; page++ {
			urlProvinceByPage := fmt.Sprintf("%s?p=%d", urlProvince, page)
			pipe <- urlProvinceByPage
		}
	})
	return nil
}

// getUrlByCategory collects all search URLs grouped by category (and child
// keyword) and sends every result page to the pipe channel.
func getUrlByCategory(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()

	doc, err := getNewDocument(webPage)
	if err != nil {
		return err
	}

	// Get all search URLs by category.
	doc.Find("div[id=browse-categories] a[href]").Each(func(index int, category *goquery.Selection) {
		href, _ := category.Attr("href")
		urlCategory := fmt.Sprintf("https://www.jobstreet.vn%s", href)

		docChild, err := getNewDocument(urlCategory)
		if err != nil {
			fmt.Println(err)
			return
		}

		// Get all search URLs for each child keyword of the category.
		docChild.Find("div[id=browse-keywords] a[href]").Each(func(index int, key *goquery.Selection) {
			href, _ := key.Attr("href")
			urlCategoryChild := fmt.Sprintf("https://www.jobstreet.vn%s", href)

			// Get the total page count of each child keyword URL.
			totalPage, err := getTotalPage(urlCategoryChild)
			if err != nil {
				fmt.Println(err)
			}

			// Emit every result page of this child keyword.
			for page := 1; page <= totalPage; page++ {
				urlCategoryChildByPage := fmt.Sprintf("%s?p=%d", urlCategoryChild, page)
				pipe <- urlCategoryChildByPage
			}
		})
	})
	return nil
}

// getTotalPage returns the total page count for a search URL.
func getTotalPage(url string) (int, error) {
	var totalPage int

	doc, err := getNewDocument(url)
	if err != nil {
		return 0, err
	}

	pageStr := doc.Find("div.search-results-count strong:last-child").Text()
	if pageStr != "" {
		totalPage, err = strconv.Atoi(pageStr)
		if err != nil {
			return 0, err
		}
	}
	return totalPage, nil
}

// getNewDocument fetches a URL (with retries) and parses the HTML response
// into a goquery document.
func getNewDocument(url string) (*goquery.Document, error) {
	resp, err := Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", resp.StatusCode, resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
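
// schedule re-runs the crawler on a fixed interval; index selects which
// crawler to run.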
func schedule(timeSchedule time.Duration, index int) {
	ticker := time.NewTicker(timeSchedule)
	go func() {
		for {
			switch index {
			case 1:
				<-ticker.C
				crawlJobStreet()
			default:
				// Unknown index: stop instead of spinning in a busy loop.
				return
			}
		}
	}()
}

func main() {
	crawlJobStreet()

	// Schedule the crawler to run again every 24 hours, then block so the
	// scheduler goroutine keeps running.
	schedule(24*time.Hour, 1)
	select {}
}