Created
October 10, 2019 14:51
-
-
Save GDLMadushanka/192504d41852d552c5253ad1c81d1183 to your computer and use it in GitHub Desktop.
A Go program that fetches the blog posts of a given month from WordPress and Medium.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/json" | |
"fmt" | |
"io/ioutil" | |
"log" | |
"net/http" | |
"os" | |
"strings" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/mmcdole/gofeed" | |
) | |
// inputData mirrors the JSON document that configures a run: the period to
// look for and the blogs to look in.
type inputData struct {
	// Year is compared as a substring against each post's date text.
	Year string `json:"Year"`
	// Month is compared as a substring against each post's date text.
	Month string `json:"Month"`
	// Blogs holds the URLs of the blogs (or feeds) to crawl.
	Blogs []string `json:"Blogs"`
}
func main() { | |
// read inputs.json | |
data := readInputs() | |
fmt.Println("Start fetching blogs written in", data.Year, data.Month) | |
for _, element := range data.Blogs { | |
if strings.Contains(element, "wordpress.com") { | |
wordpress := crawlWordpress(element, data.Year, data.Month) | |
for _, element := range wordpress { | |
fmt.Println(element) | |
} | |
} else if strings.Contains(element, "medium.com") { | |
medium := crawlMedium(element, data.Year, data.Month) | |
for _, element := range medium { | |
fmt.Println(element) | |
} | |
} | |
} | |
} | |
// read data from input.json file | |
func readInputs() inputData { | |
jsonFile, err := os.Open("input.json") | |
if err != nil { | |
fmt.Println(err) | |
} | |
defer jsonFile.Close() | |
var data inputData | |
byteValue, _ := ioutil.ReadAll(jsonFile) | |
json.Unmarshal(byteValue, &data) | |
return data | |
} | |
// crawl wordpress and fetch new blogs | |
func crawlWordpress(url string, year string, month string) []string { | |
urls := make([]string, 0) | |
res, err := http.Get(url) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer res.Body.Close() | |
if res.StatusCode != 200 { | |
log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) | |
} | |
// Load the HTML document | |
doc, err := goquery.NewDocumentFromReader(res.Body) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// process | |
doc.Find(".post-meta-date-link").Each(func(i int, s *goquery.Selection) { | |
if strings.Contains(s.Text(), year) && strings.Contains(s.Text(), month) { | |
link, ok := s.Attr("href") | |
if ok { | |
urls = append(urls, link) | |
} | |
} | |
}) | |
return urls | |
} | |
// take the rss feed from medium and filter blogs | |
func crawlMedium(url string, year string, month string) []string { | |
urls := make([]string, 0) | |
fp := gofeed.NewParser() | |
feed, _ := fp.ParseURL(url) | |
for _, element := range feed.Items { | |
date := element.Published | |
if strings.Contains(date, year) && strings.Contains(date, month) { | |
urls = append(urls, element.Link) | |
} | |
} | |
return urls | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment