Skip to content

Instantly share code, notes, and snippets.

@toannd96
Last active January 8, 2022 08:48
Show Gist options
  • Save toannd96/e4d7ddcac5abaee5d7ab557900a43bc1 to your computer and use it in GitHub Desktop.
Save toannd96/e4d7ddcac5abaee5d7ab557900a43bc1 to your computer and use it in GitHub Desktop.
crawl masothue.com
package main
import (
"encoding/json"
"fmt"
"net/http"
"os"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/cenkalti/backoff"
)
// Crawler configuration.
const (
// basePath is the site root that every relative path and href is appended to.
basePath = "https://www.masothue.com"
// TypeCompanyPath is the relative path of the lookup-by-company-type index.
// NOTE(review): not referenced by any code visible in this file.
TypeCompanyPath = "/tra-cuu-ma-so-thue-theo-loai-hinh-doanh-nghiep"
// TypeBusinessPath is the relative path of the lookup-by-business-line index;
// getUrl starts crawling from basePath + TypeBusinessPath.
TypeBusinessPath = "/tra-cuu-ma-so-thue-theo-nganh-nghe"
// fileName is the output file that crawlMasothue writes the JSON result to.
fileName = "masothue.json"
// maxRetry is a duration, not a count: Get assigns it to both MaxInterval
// and MaxElapsedTime of its exponential backoff.
maxRetry = 3 * time.Minute
)
// CompanyInfo is the data scraped from a single company detail page.
type CompanyInfo struct {
// Name is the text of the "th span.copy" element of the tax-info table.
Name string `json:"name"`
// TaxInfo maps the first cell text of each tax-info table row to the
// second cell text of that row.
TaxInfo map[string]string `json:"tax_info"`
// Business holds one entry per row scraped from the "table.table" tables.
Business []BusinessInfo `json:"business_info"`
}
// BusinessInfo is one row of a company's business table.
type BusinessInfo struct {
// ID is the text of the row's first cell.
ID string `json:"id"`
// Carees is the text of the row's second cell.
// NOTE(review): the name looks like a typo (possibly "Careers"); it is kept
// because the JSON key "carees" is part of the output format.
Carees string `json:"carees"`
}
// Company is the top-level document serialized to the output file.
type Company struct {
// List contains every company collected by the crawl.
List []CompanyInfo `json:"company"`
// TotalCompany is set by crawlMasothue to len(List).
TotalCompany int `json:"total_company"`
}
// NewCompany returns a pointer to an empty CompanyInfo whose TaxInfo map is
// already allocated, so entries can be written to it immediately.
func NewCompany() *CompanyInfo {
	info := new(CompanyInfo)
	info.TaxInfo = map[string]string{}
	return info
}
// httpClient is shared by every request so that connections are reused, and it
// carries a timeout so that a stalled server cannot block the crawler forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// get performs a single HTTP GET request for url, without any retry.
//
// It returns the response on success; the caller is responsible for closing
// resp.Body. It returns a nil response and the error when the request cannot
// be built (for example a malformed URL) or cannot be completed.
func get(url string) (*http.Response, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	return httpClient.Do(req)
}
// Get fetches url with get and retries on error using an exponential backoff
// whose maximum interval and maximum elapsed time are both maxRetry.
//
// It returns the first successful response. Once the backoff policy reports
// backoff.Stop, it gives up and returns the error of the last attempt.
func Get(url string) (*http.Response, error) {
	policy := backoff.NewExponentialBackOff()
	policy.MaxInterval = maxRetry
	policy.MaxElapsedTime = maxRetry

	for {
		resp, err := get(url)
		if err == nil {
			return resp, nil
		}

		fmt.Println("BackOff retry")
		wait := policy.NextBackOff()
		if wait == backoff.Stop {
			fmt.Println("Retry time out")
			return nil, err
		}

		fmt.Println("Retry in ", wait)
		time.Sleep(wait)
	}
}
// getNewDocument fetches url through Get and parses the response body into a
// goquery document.
//
// It returns a nil document and a wrapped error when the request ultimately
// fails or when the body cannot be parsed. A non-200 status is only logged and
// the body is still parsed, so callers that tolerate such pages keep receiving
// a document for them.
func getNewDocument(url string) (*goquery.Document, error) {
	resp, err := Get(url)
	if err != nil {
		// resp is nil here; returning early avoids dereferencing it below.
		return nil, fmt.Errorf("fetching %q: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// resp.Status already contains the numeric code, e.g. "404 Not Found".
		fmt.Printf("status code error: %s (%s)\n", resp.Status, url)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %q: %w", url, err)
	}
	return doc, nil
}
func crawlMasothue() {
var wg sync.WaitGroup
var allCompany []CompanyInfo
pipe := make(chan string)
done := make(chan bool)
go func() {
for {
url, more := <-pipe
if more {
fmt.Println("Extract url", url)
oneCompany, _ := extractCompanyInfo(url)
allCompany = append(allCompany, oneCompany...)
} else {
fmt.Println("Extract all url")
company := Company{
List: allCompany,
TotalCompany: len(allCompany),
}
dataBytes, errMarshal := json.Marshal(company)
if errMarshal != nil {
fmt.Println(errMarshal)
}
os.WriteFile(fileName, dataBytes, 0700)
done <- true
return
}
}
}()
wg.Add(1)
go getUrl(pipe, &wg)
go func() {
wg.Wait()
close(pipe)
}()
<-done
}
// getUrl walks the business-line index page and sends the URL of every company
// detail page it finds on pipe.
//
// For each link in the last cell of each index table row, it requests listing
// pages 1 through maxPages and sends basePath + href for every
// "div.tax-listing h3 a[href]" link on them. A listing page that cannot be
// loaded is logged and skipped.
//
// wg.Done is called when the function returns. The returned error is non-nil
// only when the index page itself cannot be loaded.
func getUrl(pipe chan<- string, wg *sync.WaitGroup) error {
	defer wg.Done()

	// maxPages is the number of listing pages requested per business line.
	const maxPages = 10

	doc, err := getNewDocument(basePath + TypeBusinessPath)
	if err != nil {
		return err
	}
	if doc == nil {
		return fmt.Errorf("no document returned for %q", basePath+TypeBusinessPath)
	}

	doc.Find("table tbody").Each(func(_ int, tableHtml *goquery.Selection) {
		tableHtml.Find("tr").Each(func(_ int, rowHtml *goquery.Selection) {
			rowHtml.Find("td:last-child a[href]").Each(func(_ int, link *goquery.Selection) {
				// The selector only matches anchors that carry an href.
				// NOTE(review): assumes href is a site-relative path — confirm.
				href, _ := link.Attr("href")
				for page := 1; page <= maxPages; page++ {
					urlTypeCompany := fmt.Sprintf("%s%s?page=%d", basePath, href, page)
					docChild, err := getNewDocument(urlTypeCompany)
					if err != nil {
						fmt.Println("skip listing page:", err)
						continue
					}
					if docChild == nil {
						fmt.Println("skip listing page: no document for", urlTypeCompany)
						continue
					}
					docChild.Find("div.tax-listing h3 a[href]").Each(func(_ int, info *goquery.Selection) {
						companyHref, _ := info.Attr("href")
						pipe <- basePath + companyHref
					})
				}
			})
		})
	})
	return nil
}
// extractCompanyInfo loads the company detail page at url and scrapes it into
// a single CompanyInfo, returned as a one-element slice.
//
// The name and the key/value pairs come from "table.table-taxinfo"; the
// business entries come from the rows of every "table.table". Rows with fewer
// than two cells are skipped instead of being indexed out of range.
//
// It returns a nil slice and an error when the page cannot be loaded.
func extractCompanyInfo(url string) ([]CompanyInfo, error) {
	doc, err := getNewDocument(url)
	if err != nil {
		return nil, err
	}
	if doc == nil {
		return nil, fmt.Errorf("no document returned for %q", url)
	}

	companyInfo := NewCompany()

	// extract tax info
	doc.Find("table.table-taxinfo").Each(func(_ int, table *goquery.Selection) {
		table.Find("th span.copy").Each(func(_ int, span *goquery.Selection) {
			companyInfo.Name = span.Text()
		})
		table.Find("tbody tr").Each(func(_ int, tr *goquery.Selection) {
			cells := rowCellTexts(tr)
			if len(cells) < 2 {
				return
			}
			companyInfo.TaxInfo[cells[0]] = cells[1]
		})
	})

	// extract type business
	doc.Find("table.table").Each(func(_ int, table *goquery.Selection) {
		table.Find("tbody tr").Each(func(_ int, tr *goquery.Selection) {
			cells := rowCellTexts(tr)
			if len(cells) < 2 {
				return
			}
			companyInfo.Business = append(companyInfo.Business, BusinessInfo{
				ID:     cells[0],
				Carees: cells[1],
			})
		})
	})

	return []CompanyInfo{*companyInfo}, nil
}

// rowCellTexts returns the text of every td element under tr, in document order.
func rowCellTexts(tr *goquery.Selection) []string {
	var cells []string
	tr.Find("td").Each(func(_ int, td *goquery.Selection) {
		cells = append(cells, td.Text())
	})
	return cells
}
// main is the program entry point; it calls crawlMasothue and exits when that
// call returns.
func main() {
crawlMasothue()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment