Skip to content

Instantly share code, notes, and snippets.

@Miqueas
Last active February 11, 2021 05:27
Show Gist options
  • Save Miqueas/b92073803b658c207fbdfaebc10a1512 to your computer and use it in GitHub Desktop.
Save Miqueas/b92073803b658c207fbdfaebc10a1512 to your computer and use it in GitHub Desktop.
[Go] Basic scraper example
package main
import (
// For the exit status
OS "os"
// For printing
Fmt "fmt"
// For commandline arguments
Flag "flag"
// For requests
HTTP "net/http"
// For some regular expressions
Regex "regexp"
// For searching the required HTML elements
// Install with 'go get github.com/PuerkitoBio/goquery' if needed
GoQuery "github.com/PuerkitoBio/goquery"
)
// check is the program's error guard. When e is nil it does nothing.
// Otherwise it prints msg, formatted Printf-style with args, to standard
// output and then panics with e.
func check(e error, msg string, args ...interface{}) {
	if e == nil {
		return
	}

	Fmt.Printf(msg, args...)
	panic(e)
}
// main prints the gists of every GitHub user name given on the command line.
//
// With no arguments it reports the problem on standard error and exits with
// status 1. Otherwise each argument is treated as a user name, its gists are
// fetched with GetGists, and they are printed as a numbered list (or a
// "has no gists" line when the list is empty).
func main() {
	// Format of one gist line, using ANSI escape codes: a dimmed two-digit
	// index, then bold green "File:" and "Description:" labels.
	const gistLine = "\x1b[2m%02d. \x1b[0;1;32mFile: \x1b[0m%s. \x1b[1;32mDescription: \x1b[0m%s.\n"

	Flag.Parse()
	users := Flag.Args()

	if len(users) == 0 {
		// Diagnostics go to standard error so they never mix with the listing.
		Fmt.Fprintln(OS.Stderr, "No arguments, exiting.")
		OS.Exit(1)
	}

	// One user or many: the handling is the same, so a single loop covers both.
	for _, user := range users {
		gists := GetGists(user)
		if len(gists) == 0 {
			Fmt.Printf("User '%s' has no gists.\n", user)
			continue
		}

		Fmt.Printf("User '%s' has the following gists:\n", user)
		for i, gist := range gists {
			Fmt.Printf(gistLine, i+1, gist["File"], gist["Desc"])
		}
	}
}
// edgeSpace matches a run of whitespace at the very start or the very end of
// a string. It is compiled once at package scope rather than on every call.
var edgeSpace = Regex.MustCompile(`^\s+|\s+$`)

// GetGists fetches the page "https://gist.github.com/<user>" and returns one
// map per ".gist-snippet" element found in it. Each map has two keys:
// "File", the text of the gist's file-name link, and "Desc", the gist's
// description with leading and trailing whitespace removed.
//
// The result is nil when the page contains no such elements. A failed
// request, a response status other than 200 OK, or a failure to parse the
// response is reported through check, which panics.
func GetGists(user string) []map[string]string {
	url := "https://gist.github.com/" + user

	res, err := HTTP.Get(url)
	check(err, "Error fetching url: %s.\n", url)
	// Register the close immediately so the body is released even when one of
	// the checks below panics.
	defer res.Body.Close()

	// HTTP.Get reports only transport-level failures; a 404 or similar status
	// still arrives with a nil error and must be rejected explicitly, otherwise
	// an error page would be reported as "no gists".
	if res.StatusCode != HTTP.StatusOK {
		check(Fmt.Errorf("unexpected HTTP status %q", res.Status), "Error fetching url: %s.\n", url)
	}

	doc, err := GoQuery.NewDocumentFromReader(res.Body)
	check(err, "Error reading the response content.\n")

	var gists []map[string]string
	// Every element with the CSS class "gist-snippet" describes one gist.
	doc.Find(".gist-snippet").Each(func(_ int, sel *GoQuery.Selection) {
		// Header div holding "user / file name", the creation date and the
		// description.
		header := sel.Find(".d-inline-block.px-lg-2.px-0")
		file := header.Find("span a + a").Text()
		// The description text is surrounded by whitespace in the page markup.
		desc := edgeSpace.ReplaceAllString(header.Find("span.f6.text-gray").Text(), "")

		gists = append(gists, map[string]string{"File": file, "Desc": desc})
	})
	return gists
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment