Skip to content

Instantly share code, notes, and snippets.

@ksinica
Last active May 30, 2021 19:54
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ksinica/a3bc17fa7e2995dc9a3f227113c52baa to your computer and use it in GitHub Desktop.
Save ksinica/a3bc17fa7e2995dc9a3f227113c52baa to your computer and use it in GitHub Desktop.
Cassiopaean session transcripts scraper.
package main
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"sort"
"strings"
"time"
"github.com/go-shiori/go-readability"
"github.com/gocolly/colly"
"golang.org/x/net/html"
)
// style is the stylesheet that toHtml writes verbatim inside the <style>
// element of the generated page. It covers the page typography, the forum's
// quote blocks (.bbCodeBlock*), the fixed navigation column (.sidenav) and the
// centred content column (.main).
//
// NOTE(review): ".sidenav.a" only matches an element carrying both classes and
// ".span" only matches class="span"; toHtml itself emits neither, so these two
// rules look like typos for ".sidenav a" and "span" — confirm before changing.
const style = `
html {
font-size: 14px
}
body {
color: black;
font-family: Georgia, Times, "Times New Roman", serif;
font-size: 1.1rem;
line-height: 1.5;
text-align: justify;
}
h2 {
color: black;
font-family: Georgia, Times, "Times New Roman", serif;
font-size: 2.2rem;
font-weight: bold;
line-height: 1.5;
text-align: justify;
}
.bbCodeBlock {
box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
}
.bbCodeBlock-content {
padding: 2px 16px;
}
.bbCodeBlock-expandLink {
display: none;
}
.sidenav {
background-color: white;
height: 100%;
width: 12.5%;
position: fixed;
z-index: 1;
top: 0;
box-shadow: 8px 0px 8px rgba(0,0,0,0.2);
overflow-y: scroll;
}
.sidenav.a {
padding-left: 16px;
}
.main {
margin-left: 19.5%;
margin-right: 19.5%;
overflow-x: hidden;
z-index: -1;
}
.span {
box-shadow: 0 4px 8px 0 rgba(0,0,0,0.2);
}
`
// isSearchPath reports whether path points into the forum's search pages,
// i.e. whether its second segment is "search" as in "/forum/search/145535/".
//
// Paths too short to have a second segment yield false.
func isSearchPath(path string) bool {
	// The leading "/" produces an empty element 0, so the segment of interest
	// sits at index 2; make sure it exists before reading it.
	p := strings.Split(path, "/")
	if len(p) < 3 {
		return false
	}
	return p[2] == "search"
}
// session is one scraped transcript as assembled by collect and rendered by
// toHtml.
type session struct {
// ts is the session date parsed from the thread URL; main sorts by it and
// toHtml derives the anchors and headings from it.
ts time.Time
// url is the URL of the page the transcript was taken from; toHtml links
// back to it.
url url.URL
// html is the article markup returned by readability for the page; toHtml
// writes it out unmodified.
html string
}
// timestampFromPagePath extracts the thread slug that carries the session date
// from a thread URL path. For "/forum/threads/session-2-january-2020.12345/" it
// returns ("session-2-january-2020", true).
//
// ok is false, and ret empty, when
//   - the path has fewer than two "/"-separated elements,
//   - the last element starts with "page" (a follow-up page of a thread), or
//   - the second-to-last element contains no "." separating slug and thread id.
func timestampFromPagePath(path string) (ret string, ok bool) {
	p := strings.Split(path, "/")
	// Two elements are needed because the slug is read from the one before
	// the last.
	if len(p) < 2 {
		return "", false
	}
	if strings.HasPrefix(p[len(p)-1], "page") {
		return "", false
	}
	slug := p[len(p)-2]
	dot := strings.Index(slug, ".")
	if dot < 0 {
		// No ".<id>" suffix: not a thread path, and slicing with -1 would panic.
		return "", false
	}
	return slug[:dot], true
}
// parseTimestamp converts a thread slug such as "session-2-january-2020" into
// a date. The known slug spellings are tried in order and the first one that
// parses wins; if none does, the error of the last attempt is returned.
func parseTimestamp(ts string) (ret time.Time, err error) {
	layouts := [...]string{
		"session-2-January-2006",
		"session-2-Jan-2006",
		"sesssion-2-January-2006", // case for 31 Oct 2001
	}
	for _, layout := range layouts {
		if ret, err = time.Parse(layout, ts); err == nil {
			break
		}
	}
	return ret, err
}
// mapImgSrc walks the tree rooted at node and offers the src attribute of
// every <img> element to f. When f returns a non-nil attribute it replaces the
// original in place; a nil result leaves the attribute untouched. Element and
// attribute names are matched case-insensitively.
func mapImgSrc(node *html.Node, f func(html.Attribute) *html.Attribute) {
	isImg := node.Type == html.ElementNode && strings.EqualFold(node.Data, "img")
	if isImg {
		for idx, current := range node.Attr {
			if !strings.EqualFold(current.Key, "src") {
				continue
			}
			replacement := f(current)
			if replacement == nil {
				continue
			}
			node.Attr[idx] = *replacement
		}
	}

	// Descend into the children after the node itself has been handled.
	child := node.FirstChild
	for child != nil {
		mapImgSrc(child, f)
		child = child.NextSibling
	}
}
// fetchFile downloads rawurl into the directory root and returns the path of
// the local copy. The local file is named after the last element of the URL
// path, which must carry a file extension; otherwise "invalid path" is
// returned without any request being made.
//
// A file that already exists under that name is reused as is and nothing is
// downloaded. Responses with a status other than 200 are reported as errors
// instead of being stored, and a file that could not be written completely is
// removed again so that a later run does not mistake it for a finished
// download.
func fetchFile(ctx context.Context, root, rawurl string) (string, error) {
	// Validate the URL first: there is no point in hitting the network for a
	// target that cannot be stored.
	u, err := url.Parse(rawurl)
	if err != nil {
		return "", err
	}
	if path.Ext(u.Path) == "" {
		return "", errors.New("invalid path")
	}
	p := filepath.Join(root, path.Base(u.Path))
	if _, err := os.Stat(p); err == nil {
		return p, nil
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawurl, nil)
	if err != nil {
		return "", err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetching %s: unexpected status %s", rawurl, resp.Status)
	}

	// O_TRUNC guards against stale trailing bytes should the file have
	// appeared between the Stat above and this call.
	f, err := os.OpenFile(p, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return "", err
	}
	if _, err := io.Copy(f, resp.Body); err != nil {
		f.Close()
		os.Remove(p)
		return "", err
	}
	if err := f.Close(); err != nil {
		os.Remove(p)
		return "", err
	}
	return p, nil
}
// makeAbsoluteUrl turns a host-less reference into an absolute
// https://cassiopaea.org URL. References that already name a host are returned
// in their normalised form; an unparsable reference yields the empty string.
func makeAbsoluteUrl(rawurl string) string {
	parsed, err := url.Parse(rawurl)
	if err != nil {
		return ""
	}
	if parsed.Host != "" {
		return parsed.String()
	}
	parsed.Scheme, parsed.Host = "https", "cassiopaea.org"
	return parsed.String()
}
// downloadImages stores every image referenced below the first DOM node of e
// in the directory path and rewrites the corresponding src attributes to the
// local file names. Each download gets its own five second timeout derived
// from ctx.
//
// The first failing download is returned as err; images visited after it are
// left untouched.
func downloadImages(ctx context.Context, path string, e *colly.HTMLElement) (err error) {
	if len(e.DOM.Nodes) == 0 {
		return
	}

	localise := func(attr html.Attribute) *html.Attribute {
		// A previous image already failed: leave the rest as they are.
		if err != nil {
			return nil
		}
		fetchCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()

		var local string
		local, err = fetchFile(fetchCtx, path, makeAbsoluteUrl(attr.Val))
		if err != nil {
			return nil
		}
		attr.Val = local
		return &attr
	}

	for child := e.DOM.Nodes[0].FirstChild; child != nil; child = child.NextSibling {
		mapImgSrc(child, localise)
	}
	return
}
// collect crawls the forum search results for session threads and returns one
// session per transcript page found, in crawl order.
//
// Starting from three fixed search URLs it follows the search pagination, the
// result links and any ".button--link" element. On pages whose path yields a
// timestamp (see timestampFromPagePath) the first element matching the post
// selector is processed: its images are downloaded into the current
// directory, and its markup is run through readability and stored together
// with the parsed date and the page URL. The URL of every stored page is
// echoed to stderr.
//
// The first error encountered (seed request, timestamp parsing, image
// download, HTML serialisation or readability) is returned as err, annotated
// with the URL it belongs to. Pages reached after that error are no longer
// processed, and a page that failed is never added to the result.
func collect() (ret []session, err error) {
	c := colly.NewCollector()

	// fail records only the first error, so that a later success or a later
	// failure cannot mask it.
	fail := func(step, target string, cause error) {
		if err == nil {
			err = fmt.Errorf("%s %s: %w", step, target, cause)
		}
	}

	// follow visits the link carried by e. Link following is best effort: the
	// result of Visit is deliberately ignored, as the original code did
	// (presumably because already-visited links are reported as errors —
	// confirm against the colly documentation).
	follow := func(e *colly.HTMLElement) {
		_ = e.Request.Visit(e.Attr("href"))
	}

	c.OnHTML(
		`.pageNav-jump`,
		func(e *colly.HTMLElement) {
			// Pagination is only followed on search pages, not inside threads.
			if isSearchPath(e.Request.URL.Path) {
				follow(e)
			}
		})
	c.OnHTML(
		`li.block-row >
div:nth-child(1) >
div:nth-child(2) >
h3:nth-child(1) >
a:nth-child(1)`,
		follow)
	c.OnHTML(
		`.button--link`,
		follow)
	c.OnHTML(
		`div:nth-child(2) >
div:nth-child(2) >
div:nth-child(1) >
div:nth-child(2) >
div:nth-child(1) >
article:nth-child(1) >
div:nth-child(1)`,
		func(e *colly.HTMLElement) {
			// Only the first match of a page is used, and nothing more is
			// processed once an error has been recorded.
			if e.Index != 0 || err != nil {
				return
			}
			tspp, ok := timestampFromPagePath(e.Request.URL.Path)
			if !ok {
				return
			}
			page := e.Request.URL.String()

			ts, perr := parseTimestamp(tspp)
			if perr != nil {
				fail("parsing timestamp of", page, perr)
				return
			}
			if derr := downloadImages(context.TODO(), ".", e); derr != nil {
				fail("downloading images of", page, derr)
				return
			}
			content, herr := e.DOM.Html()
			if herr != nil {
				fail("serialising", page, herr)
				return
			}
			art, rerr := readability.FromReader(
				bytes.NewBufferString(content),
				e.Request.URL,
			)
			if rerr != nil {
				fail("extracting article from", page, rerr)
				return
			}
			ret = append(ret, session{
				ts:   ts,
				html: art.Content,
				url:  *e.Request.URL,
			})
			fmt.Fprintf(os.Stderr, "%s\n", page)
		})

	seeds := []string{
		"https://cassiopaea.org/forum/search/145535/?q=Session&c[title_only]=1&c[users]=Laura&o=date",
		"https://cassiopaea.org/forum/search/145535/?q=Sesssion&c[title_only]=1&c[users]=Laura&o=date",
		"https://cassiopaea.org/forum/search/155324/?q=Session&c[title_only]=1&c[users]=Chu%2C+Andromeda&o=date",
	}
	for _, seed := range seeds {
		if verr := c.Visit(seed); verr != nil {
			fail("visiting", seed, verr)
		}
	}
	c.Wait()
	return ret, err
}
// line formats its arguments like fmt.Printf and writes the result, followed
// by a newline, to standard output. Write errors are not reported.
func line(format string, a ...interface{}) {
	rendered := fmt.Sprintf(format+"\n", a...)
	os.Stdout.WriteString(rendered)
}
// toHtml writes a single self-contained HTML document for sessions to
// standard output, in the order given.
//
// The document consists of the embedded stylesheet, a fixed navigation column
// with one link per session grouped under year headings, and one <section>
// per session containing a heading, a link back to the forum page and the
// stored article markup. Navigation links and sections are tied together by
// an id derived from the session date.
//
// Write errors on standard output are not reported.
func toHtml(sessions []session) {
	// Layout shared by the navigation links and the section ids they target.
	const anchorLayout = "session-2-January-2006"

	line("<html lang=\"en\">")
	line("<head>")
	line("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>")
	line("<style>")
	line("%s", style)
	line("</style>")
	line("</head>")
	line("<body>")

	line("<div class=\"sidenav\">")
	year := 0
	for _, x := range sessions {
		// A new heading is emitted whenever the year changes, so the input is
		// expected to be sorted by date.
		if y := x.ts.Year(); y != year {
			year = y
			line("<h3>%d</h3>\n", year)
		}
		line(
			"<a href=\"#%s\">%s</a><br/>",
			x.ts.Format(anchorLayout),
			x.ts.Format("02 January"),
		)
	}
	line("</div>")

	for _, x := range sessions {
		line("<section id=\"%s\">", x.ts.Format(anchorLayout))
		line("<div class=\"main\">")
		line("<h2>Session %s</h2>", x.ts.Format("2 January 2006"))
		// "<br/>" replaces the invalid "</br>" end tag of the original output.
		line("<a href=\"%s\" target=\"_blank\">Forum discussion</a><br/><br/>", x.url.String())
		// The article markup is already HTML and is emitted unescaped.
		io.WriteString(os.Stdout, x.html)
		line("<br/>")
		line("</div>")
		line("</section>")
	}
	line("</body>")
	line("</html>")
}
// main scrapes all session transcripts, orders them by date (oldest first) and
// prints the resulting HTML document to standard output. Progress goes to
// standard error; a failed collection aborts the program with a panic.
func main() {
	collected, err := collect()
	if err != nil {
		panic(err)
	}
	fmt.Fprintf(os.Stderr, "Collected %d links\n", len(collected))

	chronological := func(a, b int) bool {
		return collected[a].ts.Before(collected[b].ts)
	}
	sort.Slice(collected, chronological)

	toHtml(collected)
}
@ksinica
Copy link
Author

ksinica commented Jun 19, 2020

go run main.go > sessions-$(date +%s).html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment