Skip to content

Instantly share code, notes, and snippets.

@takumin
Last active July 18, 2017 16:41
Show Gist options
  • Save takumin/596b2307d57721259e83 to your computer and use it in GitHub Desktop.
Save takumin/596b2307d57721259e83 to your computer and use it in GitHub Desktop.
package main
import (
"log"
"net/http"
"regexp"
"strconv"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/microcosm-cc/bluemonday"
"github.com/minio/sha256-simd"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/transform"
)
// Page is the record main fills in for one scraped blog article.
// Its fields mirror, name for name, the columns declared in the table
// schema constant of this file.
type Page struct {
	// Id is the numeric article ID taken from the "eid=" part of the article URL.
	Id int

	// Url is the article permalink, Title the headline text and Category
	// the text of the category link.
	Url, Title, Category string

	// Date is the posting date and time.
	Date time.Time
	// Sha256 is the SHA-256 digest of Body.
	Sha256 [sha256.Size]byte
	// Body is the sanitized HTML of the article text.
	Body string
}
// table is the SQL statement that creates the "nemuisan" table, with one
// column per field of Page. The sha256 column is UNIQUE, so two rows cannot
// carry the same body digest.
//
// The last column definition must not be followed by a comma: a trailing
// comma before the closing parenthesis is a syntax error in CREATE TABLE.
const table = `
CREATE TABLE "nemuisan" (
"id" INTEGER PRIMARY KEY AUTOINCREMENT,
"url" TEXT NOT NULL,
"title" TEXT NOT NULL,
"category" TEXT NOT NULL,
"date" TIMESTAMP NOT NULL,
"sha256" BLOB NOT NULL UNIQUE,
"body" TEXT NOT NULL
)
`
// checkError terminates the program via log.Fatalln when err is non-nil
// and does nothing otherwise.
func checkError(err error) {
	if err == nil {
		return
	}
	log.Fatalln(err)
}
// getPage downloads the page at url and parses it into a goquery document.
//
// The response body is decoded from EUC-JP before it is handed to the HTML
// parser. Any failure (transport error, timeout, non-200 status or parse
// error) terminates the program through checkError or log.Fatalf.
func getPage(url string) *goquery.Document {
	// Bound the whole exchange, including reading the body; the default
	// client has no timeout and would wait forever on a stalled server.
	client := &http.Client{Timeout: 30 * time.Second}

	// GET
	res, err := client.Get(url)
	checkError(err)
	defer res.Body.Close()

	// Without this check an error page would be parsed as blog content.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("GET %s: unexpected status %s", url, res.Status)
	}

	// Parse the HTML, converting it from EUC-JP while reading.
	decoded := transform.NewReader(res.Body, japanese.EUCJP.NewDecoder())
	doc, err := goquery.NewDocumentFromReader(decoded)
	checkError(err)
	return doc
}
// main crawls the blog from its top page: it follows the monthly archive
// links in the sidebar, then the permalink of each entry on an archive page,
// and fills a Page from the article page. The scraped fields are only logged
// so far.
//
// Every EachWithBreak callback currently returns false on its normal path
// (marked "Debug" below), so only the first archive block, first archive
// link, first entry and first article box are processed.
func main() {
	baseURL := "http://nemuisan.blog.bai.ne.jp/"

	// Post timestamps are parsed in the Asia/Tokyo (JST) location.
	loc, err := time.LoadLocation("Asia/Tokyo")
	checkError(err)

	// Article record, overwritten for each article that is scraped.
	page := Page{}

	// Captures the numeric article ID from a URL ending in "eid=<digits>".
	regID := regexp.MustCompile(`.*eid=([0-9]+)$`)

	// Build the sanitizer policy once rather than once per article.
	sanitizer := bluemonday.UGCPolicy()

	// Start from the monthly archive list on the top page.
	top := getPage(baseURL)
	top.Find("#rightbox > ul > li:nth-child(7) > div.list_dot").EachWithBreak(func(_ int, archives *goquery.Selection) bool {
		// Extract the links.
		archives.Find("a").EachWithBreak(func(_ int, link *goquery.Selection) bool {
			href, ok := link.Attr("href")
			if !ok {
				// An anchor without href cannot be followed; try the next one.
				log.Println("skipping archive link without href attribute")
				return true
			}

			// Fetch the archive page; href is appended to baseURL as-is.
			// NOTE(review): this assumes the archive hrefs are relative — confirm.
			month := getPage(baseURL + href)
			month.Find("#leftbox > div.blogbox").EachWithBreak(func(_ int, entry *goquery.Selection) bool {
				permalink, ok := entry.Find("h2 > a").Attr("href")
				if !ok {
					// Without a permalink there is nothing to fetch; try the next entry.
					log.Println("skipping entry without permalink")
					return true
				}

				// Fetch the article itself.
				article := getPage(permalink)
				article.Find("#leftbox > div.blogbox").EachWithBreak(func(_ int, box *goquery.Selection) bool {
					// Article ID. FindStringSubmatch reports a non-matching
					// URL explicitly instead of handing the whole URL to Atoi.
					m := regID.FindStringSubmatch(permalink)
					if m == nil {
						log.Fatalf("no article ID in URL %q", permalink)
					}
					id, err := strconv.Atoi(m[1])
					checkError(err)
					page.Id = id

					// URL.
					page.Url = permalink

					// Title.
					page.Title = box.Find("h2 > a").Text()

					// Article text, sanitized.
					// NOTE(review): Html() yields the inner HTML of the first
					// matched <p> only — confirm that is intended.
					body, err := box.Find("p").Html()
					checkError(err)
					page.Body = sanitizer.Sanitize(body)

					// SHA-256 digest of the sanitized text.
					page.Sha256 = sha256.Sum256([]byte(page.Body))

					// Posting day and posting time, combined and parsed as one timestamp.
					d := box.Find("#leftbox > div > ul > li.listmenu.day").Text()
					t := box.Find("#leftbox > div > ul > li:nth-child(2) > a").Text()
					page.Date, err = time.ParseInLocation("2006.01.02 Monday 15:04", d+" "+t, loc)
					checkError(err)

					// Category.
					page.Category = box.Find("#leftbox > div.blogbox > ul > li:nth-child(4) > a").Text()

					// Not written yet: for now the record is only logged.
					log.Println(page.Id)
					log.Println(page.Url)
					log.Println(page.Title)
					log.Println(page.Category)
					log.Println(page.Date)
					log.Println(page.Body)
					log.Println(page.Sha256)

					// Debug: stop after the first article box.
					return false
				})

				// Debug: stop after the first entry.
				return false
			})

			// Debug: stop after the first archive link.
			return false
		})

		// Debug: stop after the first archive block.
		return false
	})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment