Skip to content

Instantly share code, notes, and snippets.

@kanzitelli
Created September 7, 2019 22:07
Show Gist options
  • Save kanzitelli/a94759362798175aa69d551f8c091c4a to your computer and use it in GitHub Desktop.
Save kanzitelli/a94759362798175aa69d551f8c091c4a to your computer and use it in GitHub Desktop.
SecretMag crawler. #1
package crawler
import (
"fmt"
"time"
"github.com/gocolly/colly"
)
// SecretMag represents the Secret Magazine (secretmag.ru) crawler.
// The type has no fields; its methods rely on the package-level URL constants.
type SecretMag struct{}
const (
// baseURL_SM is the site root; it is prepended to the links scraped from
// the listing page to build full article URLs.
baseURL_SM = "https://secretmag.ru"
// crawlURL_SM is the news listing page the crawler visits.
crawlURL_SM = "https://secretmag.ru/news"
)
// Run starts the crawling process: it invokes every registered news routine
// in order and returns their results concatenated into a single slice.
// The result is nil when no routine produced any news.
func (sm SecretMag) Run() []models.News {
	// Routines to execute; add further section crawlers here.
	routines := []NewsFunc{sm.runNews}

	var collected []models.News
	for idx := range routines {
		collected = append(collected, routines[idx]()...)
	}
	return collected
}
// runNews visits the news listing page (crawlURL_SM) and builds one
// models.News for every ".item" element nested under ".wrapper" and
// ".container".
//
// Items that contain no link are skipped. The returned slice is nil when
// nothing was collected. Fetch errors are not reported, because the signature
// has no error result; whatever was gathered before a failure is returned.
func (sm SecretMag) runNews() []models.News {
	// Plain colly collector with default options.
	newsCollector := colly.NewCollector()

	// News gathered by the callback below.
	var news []models.News

	newsCollector.OnHTML(".wrapper", func(divWrapper *colly.HTMLElement) {
		divWrapper.ForEach(".container", func(_ int, divContainer *colly.HTMLElement) {
			divContainer.ForEach(".item", func(_ int, divItem *colly.HTMLElement) {
				link := divItem.ChildAttr("a[href]", "href")
				if link == "" {
					// Without an href the full link would be just the site
					// root, so every such item would share one hash ID and
					// point at no article. Skip it.
					return
				}
				fullLink := fmt.Sprintf("%s%s", baseURL_SM, link)
				title := divItem.ChildText(".headline")

				// The ID is a hash of the full link, so the same article
				// always maps to the same ID and the storage layer can
				// reject duplicates.
				id := utils.MakeHash(fullLink)

				news = append(news, models.News{
					ID:         id,
					Title:      title,
					Link:       fullLink,
					Preamble:   "",
					TimeAdded:  time.Now().Unix(),
					NewsType:   models.TypeNews,
					NewsSource: models.SecretMagNewsSource,
				})
			})
		})
	})

	// Best effort: this method cannot return an error, so a failed visit is
	// deliberately ignored and the caller receives what was collected so far.
	_ = newsCollector.Visit(crawlURL_SM)
	newsCollector.Wait()

	return news
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment