@GDLMadushanka
Created October 10, 2019 14:51
A Go program that fetches the blog posts written in a given month from WordPress and Medium.
package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/mmcdole/gofeed"
)
// Structure of the JSON input file
type inputData struct {
	Year  string   `json:"Year"`
	Month string   `json:"Month"`
	Blogs []string `json:"Blogs"`
}
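
// A minimal example of the expected input.json (all values here are
// illustrative, not from the original gist). Using the abbreviated
// month ("Oct") lets the strings.Contains checks below match both a
// full month name on a WordPress page ("October 10, 2019") and the
// abbreviated month in RSS feed dates; the Medium entry assumes the
// feed URL form https://medium.com/feed/@username:
//
//   {
//     "Year":  "2019",
//     "Month": "Oct",
//     "Blogs": [
//       "https://someblog.wordpress.com/",
//       "https://medium.com/feed/@someuser"
//     ]
//   }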
func main() {
	// read input.json
	data := readInputs()
	fmt.Println("Start fetching blogs written in", data.Year, data.Month)
	for _, blog := range data.Blogs {
		if strings.Contains(blog, "wordpress.com") {
			links := crawlWordpress(blog, data.Year, data.Month)
			for _, link := range links {
				fmt.Println(link)
			}
		} else if strings.Contains(blog, "medium.com") {
			links := crawlMedium(blog, data.Year, data.Month)
			for _, link := range links {
				fmt.Println(link)
			}
		}
	}
}
// read data from the input.json file
func readInputs() inputData {
	jsonFile, err := os.Open("input.json")
	if err != nil {
		log.Fatal(err)
	}
	defer jsonFile.Close()
	byteValue, err := ioutil.ReadAll(jsonFile)
	if err != nil {
		log.Fatal(err)
	}
	var data inputData
	if err := json.Unmarshal(byteValue, &data); err != nil {
		log.Fatal(err)
	}
	return data
}
// crawl a WordPress blog's home page and collect links to the posts
// from the given year and month
func crawlWordpress(url string, year string, month string) []string {
	urls := make([]string, 0)
	res, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
	}
	// Load the HTML document
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		log.Fatal(err)
	}
	// Collect post links whose date text mentions the requested year
	// and month. The .post-meta-date-link selector is theme-specific.
	doc.Find(".post-meta-date-link").Each(func(i int, s *goquery.Selection) {
		if strings.Contains(s.Text(), year) && strings.Contains(s.Text(), month) {
			link, ok := s.Attr("href")
			if ok {
				urls = append(urls, link)
			}
		}
	})
	return urls
}
// parse the Medium RSS feed and collect links to the posts
// from the given year and month
func crawlMedium(url string, year string, month string) []string {
	urls := make([]string, 0)
	fp := gofeed.NewParser()
	feed, err := fp.ParseURL(url)
	if err != nil {
		log.Fatal(err)
	}
	for _, item := range feed.Items {
		// Published is the feed's raw date string (e.g. "Thu, 10 Oct 2019 ..."),
		// so the configured month must match that formatting.
		date := item.Published
		if strings.Contains(date, year) && strings.Contains(date, month) {
			urls = append(urls, item.Link)
		}
	}
	return urls
}
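
To try it out, place an input.json like the one sketched in the comments above next to the program and run it (assuming the file is saved as main.go). A hypothetical session, with an illustrative post URL in the output:

go run main.go
Start fetching blogs written in 2019 Oct
https://someblog.wordpress.com/2019/10/10/some-post/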