Skip to content

Instantly share code, notes, and snippets.

@tikidunpon
Created July 10, 2018 14:20
Show Gist options
  • Save tikidunpon/e8bfe8822d4b6b9056da7f302f797f01 to your computer and use it in GitHub Desktop.
Save tikidunpon/e8bfe8822d4b6b9056da7f302f797f01 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"errors"
"fmt"
"log"
"net/http"
"net/url"
"os"
"sort"
"sync"
"github.com/PuerkitoBio/goquery"
)
// result is the outcome of fetching one input line.
type result struct {
	// index is the zero-based position of the source line in the input,
	// used to restore input order after concurrent fetching.
	index int
	// title is the rendered Markdown list entry text for the page.
	title string
}
// main reads newline-separated URLs from the file named by the first
// command-line argument, fetches every page concurrently (at most 20 requests
// in flight at once), and prints the collected titles in input order.
func main() {
	// The first element of os.Args is always the program name, so at least
	// two elements are needed for a file name argument to be present.
	if len(os.Args) < 2 {
		fmt.Println("Missing parameter, provide file name!")
		return
	}

	filePath := os.Args[1]
	f, err := os.Open(filePath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "File %s could not read: %v\n", filePath, err)
		os.Exit(1)
	}
	defer f.Close()

	var lines []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	// Report the scanner's own error; the outer err is nil at this point.
	if serr := scanner.Err(); serr != nil {
		fmt.Fprintf(os.Stderr, "File %s scan error: %v\n", filePath, serr)
	}

	var (
		wg sync.WaitGroup
		mu sync.Mutex // guards results, which every worker goroutine appends to
	)
	// sem is a counting semaphore: a send blocks once 20 workers are running.
	sem := make(chan int, 20)
	results := make([]*result, 0, len(lines))
	for i, line := range lines {
		sem <- 1
		wg.Add(1)
		// Shadow the loop variables so each goroutine captures its own copy
		// (required before Go 1.22).
		line := line
		i := i
		go func() {
			defer wg.Done()
			defer func() { <-sem }()
			r, err := requestLine(line, i)
			if err != nil {
				log.Printf("error: %s", err.Error())
				return
			}
			if r != nil {
				mu.Lock()
				results = append(results, r)
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	// Workers finish in arbitrary order; restore the input order.
	sort.Slice(results, func(a, b int) bool { return results[a].index < results[b].index })
	fmt.Printf("link count = %d\n", len(results))
	for _, r := range results {
		fmt.Printf("- %s", r.title)
	}
}
// requestLine fetches the page at the URL in line and returns a result whose
// title is a Markdown link "[<page title>](<url>)\n" and whose index is i.
//
// It returns an error (and a nil result) when line is empty, is not a valid
// http/https URL, the request fails, the response status is not 200, or the
// body cannot be parsed as HTML.
func requestLine(line string, i int) (*result, error) {
	if line == "" {
		return nil, errors.New("line is empty")
	}
	if err := validateURL(line); err != nil {
		return nil, err
	}

	// NOTE(review): http.Get uses http.DefaultClient, which has no timeout;
	// an unresponsive server can block this call indefinitely.
	res, err := http.Get(line)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		// res.Status already contains the numeric code (e.g. "404 Not Found").
		return nil, fmt.Errorf("unexpected status %s for %s", res.Status, line)
	}

	// Load the HTML document. A parse failure is returned to the caller
	// instead of terminating the process, so one bad page cannot abort the
	// other in-flight requests or skip the deferred body close.
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s: %v", line, err)
	}

	// Only the first <title> is used; Text() on the full selection would
	// concatenate every <title> element in the document (e.g. inline SVG).
	title := doc.Find("title").First().Text()
	return &result{
		title: fmt.Sprintf("[%s](%s)\n", title, line),
		index: i,
	}, nil
}
// validateURL reports whether line is an absolute http or https URL with a
// non-empty host name. It returns nil when the URL is acceptable and a
// descriptive error otherwise.
func validateURL(line string) error {
	u, err := url.ParseRequestURI(line)
	if err != nil {
		return err
	}
	// Hostname() strips any port, so a port-only authority such as
	// "http://:80/" is rejected as well as a missing host.
	if u.Hostname() == "" || u.Scheme == "" {
		return fmt.Errorf("host and scheme should not be empty, inputted url = %s", u.String())
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		return fmt.Errorf("scheme is not http/https, inputted url = %s", u.String())
	}
	return nil
}
https://golang.org/doc/effective_go.html#concurrency
https://golang.org/ref/spec#Channel_types
https://go-tour-jp.appspot.com/concurrency/1
https://grpc.io/docs/tutorials/basic/go.html#bidirectional-streaming-rpc-1
https://mattn.kaoriya.net/software/lang/go/20131112132831.html
https://mattn.kaoriya.net/software/lang/go/20180531104907.html
https://mattn.kaoriya.net/software/lang/go/20160706165757.html
https://mattn.kaoriya.net/software/lang/go/20180124171404.html
https://hori-ryota.com/blog/golang-channel-pattern/
https://qiita.com/i_yudai/items/3336a503079ac5749c35
http://sairoutine.hatenablog.com/entry/2017/12/02/182827
https://blog.nagisa-inc.jp/archives/1134
https://qiita.com/awakia/items/f8afa070c96d1c9a04c9
https://blog.nindalf.com/posts/how-goroutines-work/
https://blog.golang.org/pipelines
https://blog.golang.org/context
https://www.youtube.com/watch?v=f6kdp27TYZs
https://www.youtube.com/watch?v=QDDwwePbDtw
https://www.amazon.co.jp/%E3%82%B9%E3%82%BF%E3%83%BC%E3%83%86%E3%82%A3%E3%83%B3%E3%82%B0Go%E8%A8%80%E8%AA%9E-%E6%9D%BE%E5%B0%BE%E6%84%9B%E8%B3%80-ebook/dp/B01FH3KRTI
http://niconegoto.hatenadiary.jp/entry/2017/04/11/092810
https://qiita.com/Vermee81/items/30ad42a7265375b1b7b1
https://github.com/golang/go/blob/master/src/runtime/chan.go
https://about.sourcegraph.com/go/understanding-channels-kavya-joshi/
@tikidunpon
Copy link
Author

tikidunpon commented Jul 10, 2018

改行区切りのURLリストから各ページのタイトルを取得し、マークダウン形式のリストで出力するGoのコードです。
input.txtはサンプルの入力データ。
sem := make(chan int, 20) で同時実行タスク数を20までに制限してます。
※遊びがてら書いたので動作保証はしてませんが、URLからタイトルゲット君とかよりは高速だと思います。

@tikidunpon
Copy link
Author

tikidunpon commented Dec 1, 2021

モジュールに対応する場合は go mod init get_title.go した後で go mod tidy してから使ってください。

参考

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment