Last active
March 6, 2020 05:15
-
-
Save HwDhyeon/f667e801468ae11a4fdb1a7cdda252be to your computer and use it in GitHub Desktop.
goroutine을 이용해 동시에 웹 페이지 스크랩하기
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"fmt" | |
"log" | |
"net/http" | |
"os" | |
"strconv" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
) | |
// extractedJob holds the fields scraped from a single Indeed job card.
type extractedJob struct {
	id       string // Indeed job key ("data-jk" attribute); used to build the job link in writeJobs
	title    string // job title text (".title > a")
	location string // company/location line (".sjcl")
	salary   string // salary text (".salaryText"); empty when the posting lists none
	summary  string // short description snippet (".summary")
}
// baseURL is the Indeed search URL for Python jobs; getPage appends a
// numeric start offset (page * 50) to request each results page.
var baseURL string = "https://kr.indeed.com/jobs?q=python&limit=50&start="
func main() { | |
var jobs []extractedJob | |
c := make(chan []extractedJob) | |
totalPages := getPages() | |
for i := 0; i < totalPages; i++ { | |
go getPage(i, c) | |
} | |
for i := 0; i < totalPages; i++ { | |
extractedJobs := <-c | |
jobs = append(jobs, extractedJobs...) | |
} | |
writeJobs(jobs) | |
fmt.Println("Done, extracted", len(jobs)) | |
} | |
func getPage(page int, mainC chan<- []extractedJob) { | |
var jobs []extractedJob | |
c := make(chan extractedJob) | |
pageURL := baseURL + strconv.Itoa(page*50) | |
fmt.Println("Requsting", pageURL) | |
res, err := http.Get(pageURL) | |
checkErr(err) | |
checkCode(res) | |
defer res.Body.Close() | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
checkErr(err) | |
searchCards := doc.Find(".jobsearch-SerpJobCard") | |
searchCards.Each(func(i int, card *goquery.Selection) { | |
go extractJob(card, c) | |
}) | |
for i := 0; i < searchCards.Length(); i++ { | |
job := <-c | |
jobs = append(jobs, job) | |
} | |
mainC <- jobs | |
} | |
func extractJob(card *goquery.Selection, c chan<- extractedJob) { | |
id, _ := card.Attr("data-jk") | |
title := cleanString(card.Find(".title > a").Text()) | |
location := cleanString(card.Find(".sjcl").Text()) | |
salary := cleanString(card.Find(".salaryText").Text()) | |
summary := cleanString(card.Find(".summary").Text()) | |
c <- extractedJob{ | |
id: id, | |
title: title, | |
location: location, | |
salary: salary, | |
summary: summary} | |
} | |
// cleanString collapses every run of whitespace in str into a single
// space and drops leading/trailing whitespace.
//
// strings.Fields already discards leading and trailing whitespace, so
// the TrimSpace call the original chained in front of it was redundant.
func cleanString(str string) string {
	return strings.Join(strings.Fields(str), " ")
}
func getPages() int { | |
pages := 0 | |
res, err := http.Get(baseURL) | |
checkErr(err) | |
checkCode(res) | |
defer res.Body.Close() | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
checkErr(err) | |
doc.Find(".pagination").Each(func(i int, s *goquery.Selection) { | |
pages = s.Find("a").Length() | |
}) | |
return pages | |
} | |
func writeJobs(jobs []extractedJob) { | |
file, err := os.Create("jobs.csv") | |
checkErr(err) | |
w := csv.NewWriter(file) | |
defer w.Flush() | |
headers := []string{"Link", "Title", "Location", "Salary", "Summary"} | |
wErr := w.Write(headers) | |
checkErr(wErr) | |
for _, job := range jobs { | |
jobSlice := []string{ | |
"https://kr.indeed.com/viewjob?jk=" + job.id, | |
job.title, | |
job.location, | |
job.salary, | |
job.summary} | |
jwErr := w.Write(jobSlice) | |
checkErr(jwErr) | |
} | |
} | |
// checkErr logs err and terminates the program when err is non-nil;
// a nil err is a no-op.
func checkErr(err error) {
	if err == nil {
		return
	}
	log.Fatalln(err)
}
func checkCode(res *http.Response) { | |
if res.StatusCode != 200 { | |
log.Fatalln("Request failed with Status:", res.StatusCode) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment