Last active
May 30, 2021 19:54
-
-
Save ksinica/a3bc17fa7e2995dc9a3f227113c52baa to your computer and use it in GitHub Desktop.
Cassiopaean session transcripts scraper.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"context" | |
"errors" | |
"fmt" | |
"io" | |
"net/http" | |
"net/url" | |
"os" | |
"path" | |
"path/filepath" | |
"sort" | |
"strings" | |
"time" | |
"github.com/go-shiori/go-readability" | |
"github.com/gocolly/colly" | |
"golang.org/x/net/html" | |
) | |
// style is the CSS written into the <style> element of the generated page.
// It covers the page typography, the fixed navigation column (.sidenav) and
// the reading column (.main) that toHtml emits, plus a few forum-specific
// classes (.bbCodeBlock*) that may appear in the scraped markup.
//
// The link rule is written as the descendant selector ".sidenav a": the
// navigation links are <a> elements nested inside the .sidenav container, so
// a compound selector requiring both classes on one element would never match.
const style = `
html {
	font-size: 14px
}
body {
	color: black;
	font-family: Georgia, Times, "Times New Roman", serif;
	font-size: 1.1rem;
	line-height: 1.5;
	text-align: justify;
}
h2 {
	color: black;
	font-family: Georgia, Times, "Times New Roman", serif;
	font-size: 2.2rem;
	font-weight: bold;
	line-height: 1.5;
	text-align: justify;
}
.bbCodeBlock {
	box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
}
.bbCodeBlock-content {
	padding: 2px 16px;
}
.bbCodeBlock-expandLink {
	display: none;
}
.sidenav {
	background-color: white;
	height: 100%;
	width: 12.5%;
	position: fixed;
	z-index: 1;
	top: 0;
	box-shadow: 8px 0px 8px rgba(0,0,0,0.2);
	overflow-y: scroll;
}
.sidenav a {
	padding-left: 16px;
}
.main {
	margin-left: 19.5%;
	margin-right: 19.5%;
	overflow-x: hidden;
	z-index: -1;
}
.span {
	box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
}
`
// isSearchPath reports whether path is a forum search path, i.e. whether the
// element at index 2 of path split on "/" equals "search" (as in
// "/forum/search/145535/", which splits into "", "forum", "search", ...).
//
// Paths with fewer than three components yield false.
func isSearchPath(path string) bool {
	p := strings.Split(path, "/")
	// p[2] is read below, so at least three components must be present;
	// strings.Split never returns an empty slice, so a "< 1" check would
	// not protect the index expression.
	if len(p) < 3 {
		return false
	}
	return p[2] == "search"
}
// session is a single scraped transcript together with the metadata needed
// to order it and to link back to its source.
type session struct {
	// ts is the session date, parsed from the slug of the thread URL path.
	ts time.Time

	// url is the address of the forum page the content was taken from.
	url url.URL

	// html is the article markup produced by the readability extraction.
	html string
}
// timestampFromPagePath extracts the date slug from a thread URL path.
//
// The path is split on "/" and the second-to-last component is taken as the
// thread slug; everything before its first "." is returned. For example
// "/forum/threads/session-2-january-2021.12345/" yields
// "session-2-january-2021".
//
// ok is false (and ret is empty) when:
//   - the path has fewer than two components,
//   - the last component starts with "page" (a follow-up page of a thread),
//   - the slug contains no ".".
func timestampFromPagePath(path string) (ret string, ok bool) {
	p := strings.Split(path, "/")
	// Two components are needed because the slug is read from p[len(p)-2].
	if len(p) < 2 {
		return "", false
	}
	if strings.HasPrefix(p[len(p)-1], "page") {
		return "", false
	}
	slug := p[len(p)-2]
	dot := strings.Index(slug, ".")
	// Without this check a slug lacking "." would be sliced with -1.
	if dot < 0 {
		return "", false
	}
	return slug[:dot], true
}
// parseTimestamp converts a thread slug such as "session-2-January-2006"
// into a time.Time. The known slug layouts are tried in order and the first
// one that parses wins; if none does, the error of the last attempt is
// returned.
func parseTimestamp(ts string) (ret time.Time, err error) {
	layouts := [...]string{
		"session-2-January-2006",
		"session-2-Jan-2006",
		"sesssion-2-January-2006", // case for 31 Oct 2001
	}
	for _, layout := range layouts {
		if ret, err = time.Parse(layout, ts); err == nil {
			break
		}
	}
	return ret, err
}
func mapImgSrc(node *html.Node, f func(html.Attribute) *html.Attribute) { | |
if node.Type == html.ElementNode && strings.EqualFold(node.Data, "img") { | |
for i, attr := range node.Attr { | |
if strings.EqualFold(attr.Key, "src") { | |
if attr := f(attr); attr != nil { | |
node.Attr[i] = *attr | |
} | |
} | |
} | |
} | |
for n := node.FirstChild; n != nil; n = n.NextSibling { | |
mapImgSrc(n, f) | |
} | |
} | |
// fetchFile downloads rawurl into the directory root and returns the path of
// the local file, which is root joined with the last element of the URL path.
//
// The URL is validated before anything else happens: it must parse and its
// path must carry a file extension, otherwise an error is returned without
// touching the network or the file system.
//
// If the target file already exists it is treated as previously downloaded:
// its path is returned and no request is made. Otherwise the file is created
// exclusively, the resource is fetched with http.DefaultClient under ctx, and
// only a 200 OK response body is stored. On any failure after the file was
// created, the incomplete file is removed again so that a later call retries
// the download instead of mistaking it for a finished one.
func fetchFile(ctx context.Context, root, rawurl string) (string, error) {
	u, err := url.Parse(rawurl)
	if err != nil {
		return "", err
	}
	if len(path.Ext(u.Path)) == 0 {
		return "", errors.New("invalid path")
	}
	p := filepath.Join(root, path.Base(u.Path))

	// O_EXCL makes the "already exists" case observable; the error comes
	// wrapped in *os.PathError, hence errors.Is rather than ==.
	f, err := os.OpenFile(p, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644)
	if err != nil {
		if errors.Is(err, os.ErrExist) {
			return p, nil
		}
		return "", err
	}
	// discard drops the partially written file on failure.
	discard := func() {
		f.Close()
		os.Remove(p)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawurl, nil)
	if err != nil {
		discard()
		return "", err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		discard()
		return "", err
	}
	defer resp.Body.Close()

	// Do not store error pages under an image file name.
	if resp.StatusCode != http.StatusOK {
		discard()
		return "", fmt.Errorf("fetching %s: unexpected status %s", rawurl, resp.Status)
	}
	if _, err := io.Copy(f, resp.Body); err != nil {
		discard()
		return "", err
	}
	if err := f.Close(); err != nil {
		os.Remove(p)
		return "", err
	}
	return p, nil
}
// makeAbsoluteUrl turns an image reference taken from a forum page into an
// absolute URL string.
//
//   - References without a host are placed on https://cassiopaea.org.
//   - Protocol-relative references ("//host/path") keep their host and get
//     the "https" scheme, so the result can be passed to an HTTP client.
//   - References that already have both scheme and host are returned as
//     parsed.
//
// An empty string is returned when rawurl cannot be parsed.
func makeAbsoluteUrl(rawurl string) string {
	// Named u so the net/url package identifier is not shadowed.
	u, err := url.Parse(rawurl)
	if err != nil {
		return ""
	}
	switch {
	case len(u.Host) == 0:
		u.Scheme = "https"
		u.Host = "cassiopaea.org"
	case len(u.Scheme) == 0:
		u.Scheme = "https"
	}
	return u.String()
}
func downloadImages(ctx context.Context, path string, e *colly.HTMLElement) (err error) { | |
if len(e.DOM.Nodes) > 0 { | |
for c := e.DOM.Nodes[0].FirstChild; c != nil; c = c.NextSibling { | |
mapImgSrc(c, func(attr html.Attribute) *html.Attribute { | |
if err != nil { | |
return nil | |
} | |
ctx, cf := context.WithTimeout(ctx, time.Second*5) | |
defer cf() | |
var src string | |
src, err = fetchFile(ctx, path, makeAbsoluteUrl(attr.Val)) | |
if err != nil { | |
return nil | |
} | |
attr.Val = src | |
return &attr | |
}) | |
} | |
} | |
return | |
} | |
func collect() (ret []session, err error) { | |
c := colly.NewCollector() | |
c.OnHTML( | |
`.pageNav-jump`, | |
func(e *colly.HTMLElement) { | |
if isSearchPath(e.Request.URL.Path) { | |
e.Request.Visit(e.Attr("href")) | |
} | |
}) | |
c.OnHTML( | |
`li.block-row > | |
div:nth-child(1) > | |
div:nth-child(2) > | |
h3:nth-child(1) > | |
a:nth-child(1)`, | |
func(e *colly.HTMLElement) { | |
e.Request.Visit(e.Attr("href")) | |
}) | |
c.OnHTML( | |
`.button--link`, | |
func(e *colly.HTMLElement) { | |
e.Request.Visit(e.Attr("href")) | |
}) | |
c.OnHTML( | |
`div:nth-child(2) > | |
div:nth-child(2) > | |
div:nth-child(1) > | |
div:nth-child(2) > | |
div:nth-child(1) > | |
article:nth-child(1) > | |
div:nth-child(1)`, | |
func(e *colly.HTMLElement) { | |
if e.Index == 0 { | |
if tspp, ok := timestampFromPagePath(e.Request.URL.Path); ok { | |
var ts time.Time | |
ts, err = parseTimestamp(tspp) | |
if err != nil { | |
return | |
} | |
if err := downloadImages(context.TODO(), ".", e); err != nil { | |
panic(err) | |
} | |
html, err := e.DOM.Html() | |
if err != nil { | |
return | |
} | |
var art readability.Article | |
art, err = readability.FromReader( | |
bytes.NewBufferString(html), | |
e.Request.URL, | |
) | |
ret = append(ret, session{ | |
ts: ts, | |
html: art.Content, | |
url: *e.Request.URL, | |
}) | |
fmt.Fprintf(os.Stderr, "%s\n", e.Request.URL.String()) | |
} | |
} | |
}) | |
c.Visit("https://cassiopaea.org/forum/search/145535/?q=Session&c[title_only]=1&c[users]=Laura&o=date") | |
c.Visit("https://cassiopaea.org/forum/search/145535/?q=Sesssion&c[title_only]=1&c[users]=Laura&o=date") | |
c.Visit("https://cassiopaea.org/forum/search/155324/?q=Session&c[title_only]=1&c[users]=Chu%2C+Andromeda&o=date") | |
c.Wait() | |
return | |
} | |
// line formats its arguments according to format, as fmt.Sprintf does, and
// writes the result followed by a newline to standard output.
func line(format string, a ...interface{}) {
	text := fmt.Sprintf(format, a...)
	fmt.Fprintln(os.Stdout, text)
}
func toHtml(sessions []session) { | |
line("<html lang=\"en\">") | |
line("<head>") | |
line("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>") | |
line("<style>") | |
line("%s", style) | |
line("</style>") | |
line("</head>") | |
line("<body>") | |
line("<div class=\"sidenav\">") | |
var year int = 0 | |
for _, x := range sessions { | |
if x.ts.Year() != year { | |
year = x.ts.Year() | |
line("<h3>%d</h3>\n", year) | |
} | |
line( | |
"<a href=\"#%s\">%s</a><br/>", | |
x.ts.Format("session-2-January-2006"), | |
x.ts.Format("02 January"), | |
) | |
} | |
line("</div>") | |
for _, x := range sessions { | |
line("<section id=%s>", x.ts.Format("session-2-January-2006")) | |
line("<div class=\"main\">") | |
line("<h2>Session %s</h2>", x.ts.Format("2 January 2006")) | |
line("<a href=\"%s\" target=\"_blank\">Forum discussion</a></br></br>", x.url.String()) | |
io.WriteString(os.Stdout, x.html) | |
line("</br>") | |
line("</div>") | |
line("</section>") | |
} | |
line("</body>") | |
} | |
func main() { | |
sessions, err := collect() | |
if err != nil { | |
panic(err) | |
} | |
fmt.Fprintf(os.Stderr, "Collected %d links\n", len(sessions)) | |
sort.Slice(sessions, func(i, j int) bool { | |
return sessions[i].ts.Before(sessions[j].ts) | |
}) | |
toHtml(sessions) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
go run main.go > sessions-$(date +%s).html