Created
August 28, 2017 10:47
-
-
Save watanabeyu/b847e0988f63ec58c33e0fc80e0617d8 to your computer and use it in GitHub Desktop.
scraping and spreadsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"io/ioutil" | |
"os" | |
"regexp" | |
"strconv" | |
"strings" | |
"time" | |
"golang.org/x/text/encoding/japanese" | |
"golang.org/x/text/transform" | |
"github.com/PuerkitoBio/goquery" | |
) | |
type Rows []Row | |
type Row []string | |
func main() { | |
/* sheet init */ | |
var sheet sheet | |
err = sheet.init("xxxxxsheetIdxxxxx") | |
if err != nil { | |
panic(err) | |
} | |
/* fetch spread sheet data */ | |
values := sheet.getBySheet("list!A:O") | |
// get data insert sqlite.db | |
/* fetch exclude list */ | |
values = sheet.getBySheet("exclude!A:O") | |
// get data insert sqlite.db | |
/* start data scraping */ | |
var rows [][]interface{} | |
doc, err := goquery.NewDocument("https://scrapingurl") | |
if err != nil { | |
fmt.Println("url scraping failed") | |
} | |
for { | |
doc.Find(".repeated-items").Each(func(_ int, s *goquery.Selection) { | |
title := s.Find(".title").Text() | |
pageurl,_ := s.Find(".detail-link").Attr("href") | |
/* if pageurl is not registered, get detail page */ | |
if flg == false{ | |
doc2, err := goquery.NewDocument(pageurl) | |
if err != nil { | |
fmt.Println("Error:", err) | |
} | |
description := s.Find(".the-description").Text() | |
created = time.Now().Format("2006/01/02 15:04") | |
var row []interface{} | |
row = append(row, title) | |
row = append(row, pageUrl) | |
row = append(row, description) | |
row = append(row, created) | |
rows = append(rows, row) | |
// insert sqlite.db | |
} | |
}) | |
/* next check */ | |
next, next_exist = doc.Find(".pagenation-next a").Attr("href") | |
if next_exist == false { | |
break | |
} | |
} | |
/* finish */ | |
// commit sqlite.db | |
if len(rows) > 0 { | |
sheet.append(rows) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"io/ioutil" | |
"log" | |
"golang.org/x/net/context" | |
"golang.org/x/oauth2/google" | |
"google.golang.org/api/sheets/v4" | |
) | |
type sheet struct { | |
spreadSheetId string | |
srv *sheets.Service | |
} | |
func (s *sheet) init(spreadSheetId string) error { | |
ctx := context.Background() | |
b, err := ioutil.ReadFile("client_secret.json") | |
if err != nil { | |
log.Fatalf("Unable to read client secret file: %v", err) | |
} | |
// If modifying these scopes, delete your previously saved credentials | |
// at ~/.credentials/sheets.googleapis.com-go-quickstart.json | |
config, err := google.ConfigFromJSON(b, "https://www.googleapis.com/auth/spreadsheets") | |
if err != nil { | |
log.Fatalf("Unable to parse client secret file to config: %v", err) | |
} | |
client := getClient(ctx, config) | |
s.srv, err = sheets.New(client) | |
if err != nil { | |
log.Fatalf("Unable to retrieve Sheets Client %v", err) | |
} | |
s.spreadSheetId = spreadSheetId | |
return err | |
} | |
func (s *sheet) get(r string) [][]interface{} { | |
ctx := context.Background() | |
res, err := s.srv.Spreadsheets.Values.Get(s.spreadSheetId, r). | |
Context(ctx). | |
Do() | |
if err != nil { | |
panic(err) | |
} | |
return res.Values | |
} | |
func (s *sheet) find(str string) bool { | |
req := sheets.Request{ | |
FindReplace: &sheets.FindReplaceRequest{ | |
Find: str, | |
Replacement: str, | |
MatchCase: false, | |
MatchEntireCell: true, | |
SearchByRegex: false, | |
IncludeFormulas: false, | |
SheetId: 1234567890, | |
AllSheets: false, | |
}, | |
} | |
findReq := sheets.BatchUpdateSpreadsheetRequest{ | |
Requests: []*sheets.Request{&req}, | |
} | |
res, err := s.srv.Spreadsheets.BatchUpdate(s.spreadSheetId, &findReq).Do() | |
if err != nil { | |
log.Fatalf("find error: %v", err) | |
} | |
if res.Replies[0].FindReplace.OccurrencesChanged > 0 { | |
return true | |
} else { | |
return false | |
} | |
} | |
func (s *sheet) append(rows [][]interface{}) { | |
ctx := context.Background() | |
var vr sheets.ValueRange | |
for _, row := range rows { | |
vr.Values = append(vr.Values, row) | |
} | |
_, err := s.srv.Spreadsheets.Values.Append(s.spreadSheetId, "A1", &vr). | |
ValueInputOption("RAW"). | |
InsertDataOption("INSERT_ROWS"). | |
Context(ctx). | |
Do() | |
if err != nil { | |
panic(err) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment