Skip to content

Instantly share code, notes, and snippets.

@renanregis
Created November 6, 2020 11:30
Show Gist options
  • Save renanregis/a64b0a7b8342c2f016b89d8bf738442e to your computer and use it in GitHub Desktop.
Save renanregis/a64b0a7b8342c2f016b89d8bf738442e to your computer and use it in GitHub Desktop.
scraping-goquery.go
package main
import (
"encoding/json"
"io/ioutil"
"log"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
)
// Item is the record that parseHTML fills from the scraped page and
// serializes to JSON.
type Item struct {
// Name is the text of the page's <h1> heading.
Name string `json:"name"`
// Description is the text taken from a <p> inside a <div>.
Description string `json:"description"`
// Created is the value of the "Created by" list item; note that it is
// serialized under the key "created_by".
Created string `json:"created_by"`
// Released is the value of the "Released" list item.
Released string `json:"released_at"`
// Repositories is the integer parsed from an "h2 span" element after
// thousands separators (",") are removed.
Repositories int64 `json:"repositories"`
// RelatedTopics holds the text of each topic-tag link that carries a
// data-ga-click attribute.
RelatedTopics []string `json:"related_topics"`
}
// main scrapes the saved page "golang.html" and terminates with a non-zero
// exit status if any step of the scrape fails.
func main() {
	if err := parseHTML("golang.html"); err != nil {
		log.Fatal(err)
	}
}

// parseHTML reads the HTML file at page, extracts the topic's name,
// description, creator, release date, repository count and related topics,
// and writes them as a JSON-encoded Item to "golang.json" in the current
// working directory.
//
// It returns the first error encountered while reading the input, parsing
// the HTML, encoding the JSON or writing the output file. A nil result means
// the output file was written.
func parseHTML(page string) error {
	dat, err := ioutil.ReadFile(page)
	if err != nil {
		return err
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(dat)))
	if err != nil {
		return err
	}

	var item Item

	// When several headings match, the last one wins.
	doc.Find("body h1").Each(func(_ int, h1 *goquery.Selection) {
		item.Name = h1.Text()
	})

	// The description is the last <p> under a <div> that is not the small
	// grey helper text.
	// NOTE(review): HasClass receives a multi-class string here and below;
	// presumably it only matches when those names appear adjacent and in this
	// order in the class attribute - confirm against the goquery docs.
	doc.Find("div p").Each(func(_ int, p *goquery.Selection) {
		if p.HasClass("text-gray alt-text-small") {
			return
		}
		// Collapse newlines and indentation into single spaces so the words
		// stay separated instead of being glued together.
		item.Description = strings.Join(strings.Fields(p.Text()), " ")
	})

	// Metadata rows look like <li><span>Label</span> value</li>; the label
	// decides which field the value belongs to.
	doc.Find("ul li").Each(func(_ int, li *goquery.Selection) {
		if !li.HasClass("d-md-inline-block no-wrap f6 mr-md-3 mb-2 mb-0") {
			return
		}
		switch li.Find("span").Text() {
		case "Released":
			item.Released = ownText(li)
		case "Created by":
			item.Created = ownText(li)
		}
	})

	// The repository count is rendered with thousands separators. Spans that
	// are not numeric are skipped so they cannot reset the count to zero.
	doc.Find("h2 span").Each(func(_ int, span *goquery.Selection) {
		raw := strings.TrimSpace(strings.ReplaceAll(span.Text(), ",", ""))
		n, convErr := strconv.ParseInt(raw, 10, 64)
		if convErr != nil {
			log.Printf("skipping non-numeric repository count %q", raw)
			return
		}
		item.Repositories = n
	})

	// Related topics are the topic-tag links that carry a data-ga-click
	// attribute.
	doc.Find("a").Each(func(_ int, a *goquery.Selection) {
		if !a.HasClass("topic-tag topic-tag-link f6 my-1") {
			return
		}
		if _, tracked := a.Attr("data-ga-click"); !tracked {
			return
		}
		item.RelatedTopics = append(item.RelatedTopics, flattenText(a.Text()))
	})

	out, err := json.Marshal(item)
	if err != nil {
		return err
	}
	// Report a failed write to the caller instead of returning nil.
	return ioutil.WriteFile("golang.json", out, 0644)
}

// flattenText removes newlines from s and trims surrounding whitespace.
func flattenText(s string) string {
	return strings.TrimSpace(strings.ReplaceAll(s, "\n", ""))
}

// ownText removes li's child elements from the document and returns the text
// left directly inside li, flattened to a single trimmed line.
func ownText(li *goquery.Selection) string {
	return flattenText(li.Children().Remove().End().Text())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment