Last active
July 18, 2017 16:41
-
-
Save takumin/596b2307d57721259e83 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"log" | |
"net/http" | |
"regexp" | |
"strconv" | |
"time" | |
"github.com/PuerkitoBio/goquery" | |
"github.com/microcosm-cc/bluemonday" | |
"github.com/minio/sha256-simd" | |
"golang.org/x/text/encoding/japanese" | |
"golang.org/x/text/transform" | |
) | |
// Page holds the fields scraped from a single blog article.
type Page struct {
	// Id is the numeric article ID taken from the "eid=" part of the URL.
	Id int
	// Url is the address the article was fetched from.
	Url string
	// Title is the text of the article's heading link.
	Title string
	// Category is the text of the article's category link.
	Category string
	// Date is the posting date and time.
	Date time.Time
	// Sha256 is the SHA-256 digest of Body.
	Sha256 [sha256.Size]byte
	// Body is the sanitized HTML of the article text.
	Body string
}
// table is the CREATE TABLE statement for storing scraped articles; its
// columns mirror the fields of Page. The AUTOINCREMENT keyword suggests it
// targets SQLite (TODO confirm). It is not referenced by the code in this
// file yet.
//
// The column list must not end with a comma: a trailing comma before the
// closing parenthesis is a SQL syntax error.
const table = `
CREATE TABLE "nemuisan" (
	"id" INTEGER PRIMARY KEY AUTOINCREMENT,
	"url" TEXT NOT NULL,
	"title" TEXT NOT NULL,
	"category" TEXT NOT NULL,
	"date" TIMESTAMP NOT NULL,
	"sha256" BLOB NOT NULL UNIQUE,
	"body" TEXT NOT NULL
)
`
// checkError logs err and terminates the program when err is non-nil.
// It returns normally for a nil error.
func checkError(err error) {
	if err == nil {
		return
	}
	log.Fatalln(err)
}
func getPage(url string) *goquery.Document { | |
// GET | |
res, err := http.Get(url) | |
checkError(err) | |
defer res.Body.Close() | |
// HTMLパース | |
doc, err := goquery.NewDocumentFromReader(transform.NewReader(res.Body, japanese.EUCJP.NewDecoder())) | |
checkError(err) | |
return doc | |
} | |
func main() { | |
base_url := "http://nemuisan.blog.bai.ne.jp/" | |
// JSTロケーション | |
loc, err := time.LoadLocation("Asia/Tokyo") | |
checkError(err) | |
// 記事データ構造体 | |
page := Page{} | |
// 記事ID正規表現 | |
regId := regexp.MustCompile(`.*eid=([0-9]+)$`) | |
// 月別アーカイブ一覧から辿る | |
doc := getPage(base_url) | |
doc.Find("#rightbox > ul > li:nth-child(7) > div.list_dot").EachWithBreak(func(_ int, s *goquery.Selection) bool { | |
// リンク抽出 | |
s.Find("a").EachWithBreak(func(_ int, s *goquery.Selection) bool { | |
url, _ := s.Attr("href") | |
// 各記事取得 | |
doc := getPage(base_url + url) | |
doc.Find("#leftbox > div.blogbox").EachWithBreak(func(_ int, s *goquery.Selection) bool { | |
url, _ := s.Find("h2 > a").Attr("href") | |
// 記事内容取得 | |
doc := getPage(url) | |
doc.Find("#leftbox > div.blogbox").EachWithBreak(func(_ int, s *goquery.Selection) bool { | |
// ID取得 | |
page.Id, err = strconv.Atoi(regId.ReplaceAllString(url, "$1")) | |
checkError(err) | |
// URL取得 | |
page.Url = url | |
// タイトル取得 | |
page.Title = s.Find("h2 > a").Text() | |
// 記事本文取得 | |
body, err := s.Find("p").Html() | |
checkError(err) | |
page.Body = bluemonday.UGCPolicy().Sanitize(body) | |
// SHA256ハッシュ取得 | |
page.Sha256 = sha256.Sum256([]byte(page.Body)) | |
// 投稿日取得 | |
d := s.Find("#leftbox > div > ul > li.listmenu.day").Text() | |
// 投稿時間取得 | |
t := s.Find("#leftbox > div > ul > li:nth-child(2) > a").Text() | |
// 投稿日時変換 | |
page.Date, err = time.ParseInLocation("2006.01.02 Monday 15:04", d+" "+t, loc) | |
checkError(err) | |
// カテゴリ取得 | |
page.Category = s.Find("#leftbox > div.blogbox > ul > li:nth-child(4) > a").Text() | |
// まだ書いてない・・・ | |
log.Println(page.Id) | |
log.Println(page.Url) | |
log.Println(page.Title) | |
log.Println(page.Category) | |
log.Println(page.Date) | |
log.Println(page.Body) | |
log.Println(page.Sha256) | |
// Debug | |
return false | |
}) | |
// Debug | |
return false | |
}) | |
// Debug | |
return false | |
}) | |
// Debug | |
return false | |
}) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment