Skip to content

Instantly share code, notes, and snippets.

@watanabeyu
Created August 28, 2017 10:47
Show Gist options
  • Save watanabeyu/b847e0988f63ec58c33e0fc80e0617d8 to your computer and use it in GitHub Desktop.
Save watanabeyu/b847e0988f63ec58c33e0fc80e0617d8 to your computer and use it in GitHub Desktop.
scraping and spreadsheet
package main
import (
"io/ioutil"
"os"
"regexp"
"strconv"
"strings"
"time"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/transform"
"github.com/PuerkitoBio/goquery"
)
type Rows []Row
type Row []string
func main() {
/* sheet init */
var sheet sheet
err = sheet.init("xxxxxsheetIdxxxxx")
if err != nil {
panic(err)
}
/* fetch spread sheet data */
values := sheet.getBySheet("list!A:O")
// get data insert sqlite.db
/* fetch exclude list */
values = sheet.getBySheet("exclude!A:O")
// get data insert sqlite.db
/* start data scraping */
var rows [][]interface{}
doc, err := goquery.NewDocument("https://scrapingurl")
if err != nil {
fmt.Println("url scraping failed")
}
for {
doc.Find(".repeated-items").Each(func(_ int, s *goquery.Selection) {
title := s.Find(".title").Text()
pageurl,_ := s.Find(".detail-link").Attr("href")
/* if pageurl is not registered, get detail page */
if flg == false{
doc2, err := goquery.NewDocument(pageurl)
if err != nil {
fmt.Println("Error:", err)
}
description := s.Find(".the-description").Text()
created = time.Now().Format("2006/01/02 15:04")
var row []interface{}
row = append(row, title)
row = append(row, pageUrl)
row = append(row, description)
row = append(row, created)
rows = append(rows, row)
// insert sqlite.db
}
})
/* next check */
next, next_exist = doc.Find(".pagenation-next a").Attr("href")
if next_exist == false {
break
}
}
/* finish */
// commit sqlite.db
if len(rows) > 0 {
sheet.append(rows)
}
}
package main
import (
"io/ioutil"
"log"
"golang.org/x/net/context"
"golang.org/x/oauth2/google"
"google.golang.org/api/sheets/v4"
)
type sheet struct {
spreadSheetId string
srv *sheets.Service
}
func (s *sheet) init(spreadSheetId string) error {
ctx := context.Background()
b, err := ioutil.ReadFile("client_secret.json")
if err != nil {
log.Fatalf("Unable to read client secret file: %v", err)
}
// If modifying these scopes, delete your previously saved credentials
// at ~/.credentials/sheets.googleapis.com-go-quickstart.json
config, err := google.ConfigFromJSON(b, "https://www.googleapis.com/auth/spreadsheets")
if err != nil {
log.Fatalf("Unable to parse client secret file to config: %v", err)
}
client := getClient(ctx, config)
s.srv, err = sheets.New(client)
if err != nil {
log.Fatalf("Unable to retrieve Sheets Client %v", err)
}
s.spreadSheetId = spreadSheetId
return err
}
func (s *sheet) get(r string) [][]interface{} {
ctx := context.Background()
res, err := s.srv.Spreadsheets.Values.Get(s.spreadSheetId, r).
Context(ctx).
Do()
if err != nil {
panic(err)
}
return res.Values
}
func (s *sheet) find(str string) bool {
req := sheets.Request{
FindReplace: &sheets.FindReplaceRequest{
Find: str,
Replacement: str,
MatchCase: false,
MatchEntireCell: true,
SearchByRegex: false,
IncludeFormulas: false,
SheetId: 1234567890,
AllSheets: false,
},
}
findReq := sheets.BatchUpdateSpreadsheetRequest{
Requests: []*sheets.Request{&req},
}
res, err := s.srv.Spreadsheets.BatchUpdate(s.spreadSheetId, &findReq).Do()
if err != nil {
log.Fatalf("find error: %v", err)
}
if res.Replies[0].FindReplace.OccurrencesChanged > 0 {
return true
} else {
return false
}
}
func (s *sheet) append(rows [][]interface{}) {
ctx := context.Background()
var vr sheets.ValueRange
for _, row := range rows {
vr.Values = append(vr.Values, row)
}
_, err := s.srv.Spreadsheets.Values.Append(s.spreadSheetId, "A1", &vr).
ValueInputOption("RAW").
InsertDataOption("INSERT_ROWS").
Context(ctx).
Do()
if err != nil {
panic(err)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment