Meduni Wien Campus Crawler
package crawler

import (
    "crypto/tls"
    "encoding/json"
    "errors"
    "io/ioutil"
    "log"
    "math/big"
    "net/http"
    "net/url"
    "os"
    "regexp"
    "strconv"
    "strings"
    "time"

    "github.com/moovweb/gokogiri"
    "github.com/moovweb/gokogiri/html"
    "github.com/moovweb/gokogiri/xml"

    "theduke.at/docduke/server"
)
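
// Result describes the outcome of crawling a single URL: whether the
// request succeeded, where it was redirected to, how long it took, and
// the import item the page yielded (if any).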
type Result struct {
    Url           string
    RedirectedUrl string
    Success       bool
    TimeStarted   time.Time
    TimeFinished  time.Time
    Result        *server.ImportItem
    Error         error
}
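
// Crawler coordinates a set of concurrent fetch workers. The exported
// hook fields (ShouldQueueUrl, Handle, OnFinish) let callers plug in
// site-specific link filtering and page parsing.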
type Crawler struct {
    Client *http.Client

    workerCount      int
    handledUrls      map[string]bool
    totalWorkerTime  *big.Int
    totalWorkerTries int64
    crawlQueueCount  int
    urlQueue         chan string
    Urls             map[string]bool
    resultQueue      chan Result

    // ShouldQueueUrl decides whether a discovered link should be
    // crawled: it returns the url to queue, or "" to skip the link.
    ShouldQueueUrl func(string) string
    // Handle parses a fetched page into an ImportItem.
    Handle func(string, *html.HtmlDocument, []byte, *Crawler) (*server.ImportItem, error)
    // OnFinish is called once crawling is complete.
    OnFinish func(*Crawler)

    ResultCounter map[string]int
    UrlFile       string
    DataFile      string
}
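
// NewCrawler builds a Crawler for the given number of concurrent
// workers, with all internal queues and maps initialized.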
func NewCrawler(workerCount int) *Crawler {
    c := Crawler{
        workerCount: workerCount,
    }
    c.totalWorkerTime = big.NewInt(0)
    c.handledUrls = make(map[string]bool)
    c.ResultCounter = make(map[string]int)
    c.Urls = make(map[string]bool)
    // Generously buffered so workers queueing discovered urls do not block.
    c.urlQueue = make(chan string, 9999999)
    c.resultQueue = make(chan Result, workerCount*10)

    // Skip TLS certificate verification for the campus server.
    c.Client = &http.Client{
        Transport: &http.Transport{
            TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
        },
    }
    return &c
}
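
// Finish runs the OnFinish hook and logs a per-type summary of the
// collected results.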
func (c *Crawler) Finish() {
    if c.OnFinish != nil {
        c.OnFinish(c)
    }
    log.Println("All done!")
    log.Println("Results:")
    for key := range c.ResultCounter {
        log.Printf("%v: %v\n", key, c.ResultCounter[key])
    }
}
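
// Crawl seeds the queue with the given urls and runs the coordinator
// loop: it drains newly discovered urls and finished results, starts a
// new worker goroutine whenever a slot is free, writes crawled urls and
// result JSON to the configured files, and logs progress roughly every
// five seconds. It returns once no urls are pending and no worker is
// active.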
func (c *Crawler) Crawl(urls []string) {
    // Queue initial urls.
    for _, url := range urls {
        c.urlQueue <- url
    }

    lastReport := time.Now()
    var lastTotalWorkerTries int64 = 0

    var urlFile *os.File
    if c.UrlFile != "" {
        f, err := os.Create(c.UrlFile)
        if err == nil {
            urlFile = f
            defer urlFile.Close()
        } else {
            panic("Could not create url file at " + c.UrlFile)
        }
    }

    var dataFile *os.File
    if c.DataFile != "" {
        f, err := os.Create(c.DataFile)
        if err == nil {
            dataFile = f
            defer dataFile.Close()
        } else {
            panic("Could not create data file at " + c.DataFile)
        }
    }

    for {
        select {
        case url := <-c.urlQueue:
            _, alreadyHandled := c.handledUrls[url]
            _, alreadyQueued := c.Urls[url]
            if !(alreadyHandled || alreadyQueued) {
                c.Urls[url] = true
            }
        case result := <-c.resultQueue:
            if result.Success {
                // Write item to file if one was found.
                if result.Result != nil {
                    // Note: if key does not exist yet, val is 0.
                    val := c.ResultCounter[result.Result.Type]
                    c.ResultCounter[result.Result.Type] = val + 1
                    if dataFile != nil {
                        dataFile.Write(result.Result.JSON)
                        dataFile.WriteString("\n")
                    }
                }
                if result.RedirectedUrl != "" {
                    // Request was redirected.
                    // Prevent re-crawling the same page with the redirected URL.
                    c.handledUrls[result.RedirectedUrl] = true
                }
            } else {
                log.Printf("Error at %v: %v\n", result.Url, result.Error)
            }
            c.crawlQueueCount -= 1

            // Add time stats.
            c.totalWorkerTries += 1
            milliseconds := big.NewInt(result.TimeFinished.Sub(result.TimeStarted).Nanoseconds() / 1000000)
            c.totalWorkerTime.Add(c.totalWorkerTime, milliseconds)
        default:
            // Nothing pending right now; fall through so a new crawl
            // can be started below.
        }

        if len(c.Urls) < 1 && c.crawlQueueCount < 1 {
            // Done!
            break
        }

        // Start a new crawl if a worker slot is free.
        if c.crawlQueueCount < c.workerCount && len(c.Urls) > 0 {
            // Pick an arbitrary url from the pending set.
            url := ""
            for key := range c.Urls {
                url = key
                break
            }
            // Delete from queue.
            delete(c.Urls, url)

            // Only process the url if it was not already handled.
            if _, ok := c.handledUrls[url]; !ok {
                c.crawlQueueCount += 1
                c.handledUrls[url] = true
                go crawlUrl(url, c)

                if urlFile != nil {
                    urlFile.WriteString(url + "\n")
                }
            }
        }

        if c.totalWorkerTries > 0 && time.Since(lastReport).Seconds() >= 5 {
            queuedUrls := len(c.Urls)
            percent := 0.0
            if c.totalWorkerTries+int64(queuedUrls) > 0 {
                percent = float64(c.totalWorkerTries) / float64(c.totalWorkerTries+int64(queuedUrls)) * 100
            }
            // Average duration per request in milliseconds.
            msPerRequest := big.NewInt(0).Div(c.totalWorkerTime, big.NewInt(c.totalWorkerTries))

            newRequests := c.totalWorkerTries - lastTotalWorkerTries
            requestsPerSec := float64(newRequests) / time.Since(lastReport).Seconds()

            log.Printf("Crawled %v of %v (%.2f %%) / %v active | %.0f req/sec | %v ms/req\n",
                c.totalWorkerTries,
                int64(queuedUrls)+c.totalWorkerTries,
                percent,
                c.crawlQueueCount,
                requestsPerSec,
                msPerRequest)
            lastReport = time.Now()
            lastTotalWorkerTries = c.totalWorkerTries
        }
    }

    c.Finish()
}
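
// SanitizeUrl is a normalization hook; the default implementation
// returns the url unchanged.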
func (c *Crawler) SanitizeUrl(url string) string {
    return url
}
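
// FetchUrl performs a GET request and parses the body as HTML. It
// returns the final URL after redirects, the raw body, and the parsed
// document. The caller is responsible for freeing the document.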
func (c *Crawler) FetchUrl(url string) (string, []byte, *html.HtmlDocument, error) {
    response, err := c.Client.Get(url)
    if err != nil {
        return "", nil, nil, err
    }
    defer response.Body.Close()

    if response.StatusCode != http.StatusOK {
        return "", nil, nil, errors.New("Request failed: " + strconv.Itoa(response.StatusCode))
    }

    page, err := ioutil.ReadAll(response.Body)
    if err != nil {
        return "", nil, nil, err
    }

    doc, err := gokogiri.ParseHtml(page)
    if err != nil {
        return "", nil, nil, err
    }

    return response.Request.URL.String(), page, doc, nil
}
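
// FetchUrlWithResult wraps FetchUrl and records the outcome and timing
// in a Result suitable for the coordinator's result queue.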
func (c *Crawler) FetchUrlWithResult(url string) (Result, []byte, *html.HtmlDocument) {
    result := Result{
        Url:         url,
        TimeStarted: time.Now(),
    }
    finalUrl, body, doc, err := c.FetchUrl(url)
    result.TimeFinished = time.Now()
    if err != nil {
        result.Success = false
        result.Error = err
    } else {
        result.Success = true
        if finalUrl != url {
            result.RedirectedUrl = finalUrl
        }
    }
    return result, body, doc
}
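
// crawlUrl is the worker body: it fetches one page, queues every link
// that ShouldQueueUrl accepts, runs the Handle hook, and reports the
// Result back to the coordinator.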
func crawlUrl(url string, crawler *Crawler) {
    result, body, doc := crawler.FetchUrlWithResult(url)
    if result.Error == nil {
        if result.RedirectedUrl != "" {
            url = result.RedirectedUrl
        }

        // Extract and queue all linked urls.
        urls, _ := doc.Search("//a/@href")
        for _, urlNode := range urls {
            cleanUrl := crawler.ShouldQueueUrl(urlNode.String())
            if cleanUrl != "" {
                crawler.urlQueue <- cleanUrl
            }
        }

        // Handle.
        res, err := crawler.Handle(url, doc, body, crawler)
        if err != nil {
            result.Error = err
            result.Success = false
        } else {
            result.Result = res
        }
        doc.Free()
    }
    // Include the handler's processing time in the timing stats.
    result.TimeFinished = time.Now()
    crawler.resultQueue <- result
}
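
// XpathGetFirst returns the first node matching the xpath expression,
// or nil if there is no match.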
func XpathGetFirst(doc xml.Node, xpath string) xml.Node {
    res, _ := doc.Search(xpath)
    if len(res) < 1 {
        return nil
    }
    return res[0]
}
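
// ExtractUrlParam returns the value of the named query parameter, or
// "" if the url cannot be parsed.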
func ExtractUrlParam(theUrl string, param string) string {
    u, err := url.Parse(theUrl)
    if err != nil {
        return ""
    }
    return u.Query().Get(param)
}
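
// ExtractUrlParamInt is like ExtractUrlParam but converts the value to
// an int, returning 0 on failure.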
func ExtractUrlParamInt(url string, param string) int {
    v := ExtractUrlParam(strings.ToLower(url), param)
    i, err := strconv.Atoi(v)
    if err != nil {
        return 0
    }
    return i
}
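
// HandleLocation parses a room detail page (wbraum.editraum) into an
// ImportLocation by walking the rows of the data table.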
func HandleLocation(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportLocation {
    location := server.ImportLocation{}
    location.Type = "location"
    location.ImportID = ExtractUrlParam(strings.ToLower(url), "praumnr")

    res, _ := doc.Search("//table[contains(@class, \"MaskSpacing\")]//tr")
    if len(res) < 1 {
        log.Printf("Error at %v: Could not find data table\n", url)
        return nil
    }

    // The address is split over two rows, so accumulate it across iterations.
    address := ""

    for _, row := range res {
        tds, _ := row.Search("./td")
        if len(tds) != 2 {
            continue
        }
        title := strings.TrimSpace(tds[0].Content())
        content := strings.TrimSpace(tds[1].Content())

        switch title {
        case "Gebäude":
            location.Builing = content // field name as defined in the server package
        case "Stockwerk":
            location.Floor = content
        case "Raumnummer":
            location.Room = content
        case "Straße/Hausnummer":
            address = content
        case "PLZ/Ort":
            address += " " + content
            location.Address = address
        case "Verwendung":
            location.Usage = content
        case "Raumtyp":
            location.RoomType = content
        case "Nutzungstyp":
            location.Usage = content
        case "Zwischengeschoß", "Fläche [m2]", "Höhe [cm]", "Sitzplätze",
            "Ausrichtung", "Boden", "Reinigung", "Verwaltung", "Zusatzbezeichnung":
            // TODO: implement
        default:
            if title != "" {
                log.Printf("Error at %v: Unknown data row '%v'\n", url, title)
            }
        }
    }
    return &location
}
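
// HandleLecture parses a lecture detail page (wblv.wbShowLvDetail) into
// an ImportLecture. Besides the main data table it also fetches the
// separate lecturer list and the dates overview page, and then one page
// per individual date.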
func HandleLecture(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportLecture {
    lecture := server.ImportLecture{}
    lecture.Type = "lecture"
    lecture.ImportID = ExtractUrlParam(strings.ToLower(url), "pstpspnr")
    lecture.ImportUrl = url
    lecture.LecturerIDs = make([]string, 0)

    res, _ := doc.Search("//form/div[contains(@class, \"MaskBackground\")]//tr")
    if len(res) < 1 {
        log.Printf("Error at %v: Could not find data table\n", url)
        return nil
    }

    datesUrl := ""
    lecturersUrl := ""

    for _, row := range res {
        tds, _ := row.Search("./td")
        if len(tds) != 2 {
            continue
        }
        title := strings.TrimSpace(tds[0].Content())
        contentNode := tds[1]
        content := strings.TrimSpace(contentNode.Content())
        contentHtml := contentNode.InnerHtml()

        // Normalize row titles that sometimes carry extra trailing text.
        if strings.HasPrefix(title, "Lehr- und Lernmethode") {
            title = "Lehr- und Lernmethode"
        } else if strings.HasPrefix(title, "Inhaltliche Voraussetzungen") {
            title = "Inhaltliche Voraussetzungen"
        } else if strings.HasPrefix(title, "Anzahl der Prüfungstermine") {
            title = "Anzahl der Prüfungstermine"
        } else if strings.HasPrefix(title, "Ziel") {
            title = "Ziel"
        } else if strings.HasPrefix(title, "Beurteilungsschema") {
            title = "Beurteilungsschema"
        } else if strings.HasPrefix(title, "Stellung im Studienplan") {
            title = "Stellung im Studienplan"
        }
        switch title {
        case "Titel":
            lecture.Title = content
        case "Nummer":
            lecture.Number = content
        case "Art":
            lecture.Kind = content
        case "Semesterstunden":
            hours, err := strconv.ParseFloat(content, 32)
            if err == nil {
                lecture.Semesterhours = float32(hours)
            }
        case "Angeboten im Semester":
            parts := strings.Split(content, " ")
            if len(parts) == 2 {
                if parts[0] == "Wintersemester" {
                    lecture.Semester = "winter"
                } else if parts[0] == "Sommersemester" {
                    lecture.Semester = "summer"
                } else {
                    log.Printf("Error at %v: Unknown semester %v\n", url, parts[0])
                }
                year, err := strconv.Atoi(parts[1])
                if err == nil {
                    lecture.Year = year
                }
            }
        case "Vortragende/r (Mitwirkende/r)":
            links, _ := contentNode.Search(".//a/@href")
            if len(links) > 0 {
                // Check if the last link leads to a separate page.
                lastLink := links[len(links)-1].String()
                if strings.HasPrefix(lastLink, "wbLvMgmt.wbLvPersListe") {
                    // Details will be fetched later.
                    lecturersUrl = lastLink
                } else {
                    // No detail page, just add all linked persons.
                    for _, linkNode := range links {
                        link := linkNode.String()
                        id := ExtractUrlParam(link, "pPersonenId")
                        if id != "" {
                            lecture.LecturerIDs = append(lecture.LecturerIDs, id)
                        }
                    }
                }
            }
        case "Organisation":
            link := XpathGetFirst(contentNode, ".//a/@href")
            if link != nil {
                lecture.OrganisationID = ExtractUrlParam(link.String(), "corg")
            }
        case "Stellung im Studienplan":
            re := regexp.MustCompile("(\\d+,?\\d?) ECTS")
            match := re.FindStringSubmatch(contentHtml)
            if match != nil {
                // The page uses a German decimal comma, which ParseFloat rejects.
                ectsStr := strings.Replace(match[1], ",", ".", 1)
                ects, err := strconv.ParseFloat(ectsStr, 32)
                if err == nil {
                    lecture.Ects = float32(ects)
                }
            }
            rows, _ := contentNode.Search(".//tr[contains(@class, \"coTableR\")]")
            for _, row := range rows {
                lecDegree := server.LectureDegree{}
                tds, _ := row.Search("./td")
                if len(tds) != 9 {
                    log.Printf("Error at %v: Could not parse lecture degrees\n", url)
                    continue
                }
                txt := strings.TrimSpace(tds[0].Content())
                parts := strings.Split(txt, " ")
                if len(parts) < 1 {
                    log.Printf("Error at %v: Could not parse lecture degrees\n", url)
                    continue
                }
                degree, _ := strconv.Atoi(parts[0])
                lecDegree.Degree = degree
                lecDegree.Module = strings.TrimSpace(tds[2].Content())
                lecDegree.Kind = strings.TrimSpace(tds[3].Content())

                rawSemester := strings.TrimSpace(tds[4].Content())
                rawSemester = strings.Replace(rawSemester, ".", "", 1)
                semester, _ := strconv.Atoi(rawSemester)
                lecDegree.Semester = semester

                if lecDegree.Degree > 0 && lecDegree.Semester > 0 {
                    lecture.Degrees = append(lecture.Degrees, lecDegree)
                }
            }
        case "Inhalt":
            lecture.Topics = content
        case "Inhaltliche Voraussetzungen":
            lecture.KnowledgeRequirements = contentHtml
        case "Ziel":
            lecture.Outcomes = content
        case "Unterrichts-/Lehrsprachen":
            lecture.Language = contentHtml
        case "Lehr- und Lernmethode":
            lecture.TeachingMethods = contentHtml
        case "Abhaltungstermine":
            datesLink := XpathGetFirst(contentNode, ".//a/@href")
            if datesLink != nil {
                datesUrl = datesLink.String()
            }
        case "Teilnahmekriterien & Anmeldung":
            lecture.Registration = contentHtml
        case "Voraussetzungen laut Studienplan":
            lecture.UniversityRequirements = contentHtml
        case "Beurteilungsschema":
            lecture.Grading = contentHtml
        case "Prüfungstermine & Anmeldung":
            // TODO: implement.
        case "Anzahl der Prüfungstermine":
            // Ignore.
        case "Empfohlene Fachliteratur":
            lecture.ReadingList = contentHtml
        case "Online Informationen":
            lecture.OnlineInfo = contentHtml
        case "Anmerkung":
            lecture.Comments = contentHtml
        case "Bezeichnung":
            // TODO: implement.
        default:
            if title != "" {
                log.Printf("Error at %v: Unknown data row '%v'\n", url, title)
            }
        }
    }

    // Get data for lecture dates.
    if datesUrl == "" {
        log.Printf("Error at %v: Could not extract dates URL!\n", url)
        return &lecture
    }

    // Fetch the lecturer list from its detail page if there is one.
    if lecturersUrl != "" {
        result, _, lecDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + lecturersUrl)
        crawler.resultQueue <- result
        if result.Error != nil {
            log.Printf("Error at %v: Could not fetch lecturer detail page %v: %v\n", url, lecturersUrl, result.Error)
        } else {
            links, _ := lecDoc.Search("//form/table//a/@href")
            for _, linkNode := range links {
                link := linkNode.String()
                if !strings.HasPrefix(link, "visitenkarte.show_vcard") {
                    continue
                }
                id := ExtractUrlParam(link, "pPersonenId")
                if id != "" {
                    lecture.LecturerIDs = append(lecture.LecturerIDs, id)
                }
            }
            lecDoc.Free()
        }
    }
    // Fetch the dates overview page and then each individual date page.
    result, _, datesDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + datesUrl)
    crawler.resultQueue <- result
    if result.Error != nil {
        log.Printf("Error at %v: Could not fetch dates page!\n", url)
        return &lecture
    }
    defer datesDoc.Free()

    dateLinks, _ := datesDoc.Search("//table[contains(@id, \"tabLvTermine\")]//a/@href")
    for _, linkNode := range dateLinks {
        link := linkNode.String()
        if !strings.HasPrefix(link, "!wbTermin.wbEdit") {
            continue
        }
        linkUrl := "https://campus.meduniwien.ac.at/med.campus/" + link
        result, _, dateDoc := crawler.FetchUrlWithResult(linkUrl)
        crawler.resultQueue <- result
        if result.Error != nil {
            log.Printf("Error at %v: Could not fetch date page\n", link)
            continue
        }

        date := server.ImportLectureDate{}
        date.ImportUrl = linkUrl
        dateString := ""

        rows, _ := dateDoc.Search("//form[contains(@name, \"terminEditMask\")]//tr")
        for _, row := range rows {
            tds, _ := row.Search("./td")
            if len(tds) != 2 {
                continue
            }
            title := strings.TrimSpace(tds[0].Content())
            contentNode := tds[1]
            content := strings.TrimSpace(contentNode.Content())

            switch title {
            case "Lehrveranstaltung":
                // Ignore.
            case "Organisation":
                // Ignore.
            case "Gruppe":
                date.Group = content
            case "Ereignis":
                date.EventType = content
            case "Termintyp":
                date.DateType = content
            case "Ressource":
                link := XpathGetFirst(contentNode, ".//a/@href")
                if link != nil {
                    date.LocationID = ExtractUrlParam(link.String(), "raumKey")
                    // Queue the link so locations get crawled too.
                    crawler.urlQueue <- "https://campus.meduniwien.ac.at/med.campus/" + link.String()
                }
            case "Datum":
                dateString = content
            case "Uhrzeit":
                re := regexp.MustCompile(".*?(\\d\\d\\:\\d\\d).*?(\\d\\d\\:\\d\\d).*")
                match := re.FindStringSubmatch(content)
                if match != nil {
                    // Start date.
                    start := dateString + " " + match[1]
                    if startDate, err := time.Parse("02.01.2006 15:04", start); err == nil {
                        date.Start = startDate
                    } else {
                        log.Printf("Error at %v: Could not parse start date %v: %v\n", link, start, err)
                    }
                    // End date.
                    end := dateString + " " + match[2]
                    if endDate, err := time.Parse("02.01.2006 15:04", end); err == nil {
                        date.End = endDate
                    } else {
                        log.Printf("Error at %v: Could not parse end date %v: %v\n", link, end, err)
                    }
                } else {
                    log.Printf("Error at %v: Could not extract start/end times\n", link)
                }
            case "Vortragende/r":
                urlNode := XpathGetFirst(contentNode, ".//a/@href")
                if urlNode != nil {
                    date.LecturerID = ExtractUrlParam(urlNode.String(), "pPersonenId")
                }
            case "Lerneinheit":
                date.Unit = content
            case "Anmerkung":
                date.Comments = content
            default:
                if title != "" {
                    log.Printf("Error at %v: Unknown data row: %v\n", link, title)
                }
            }
        }
        dateDoc.Free()
        lecture.Dates = append(lecture.Dates, date)
    }
    return &lecture
}
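
// HandlePerson parses a business-card page (visitenkarte.show_vcard)
// into an ImportPerson.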
func HandlePerson(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportPerson {
    person := server.ImportPerson{}
    person.Type = "person"
    person.ImportUrl = url
    person.ImportID = ExtractUrlParam(url, "pPersonenId")
    person.AdditionalOrganisationIDs = make([]string, 0)

    res, _ := doc.Search("//div[contains(@id, \"pageContent\")]/table[3]//table//table")
    if len(res) < 1 {
        log.Printf("Error at %v: Could not find data table\n", url)
        return nil
    }
    table := res[0]

    rows, _ := table.Search(".//tr")
    for _, row := range rows {
        th := XpathGetFirst(row, "./th")
        td := XpathGetFirst(row, "./td")
        if th == nil || td == nil {
            continue
        }
        name := strings.TrimSpace(th.Content())
        content := strings.TrimSpace(td.Content())

        switch name {
        case "Mobil":
            person.PhoneMobile = content
        case "Fax":
            person.Fax = content
        case "Telefon MedUni Wien":
            person.PhoneUniversity = content
        case "Telefon extern":
            person.PhoneExternal = content
        case "Sprechstunde":
            if content != "-" {
                person.Sprechstunde = content
            }
        case "Zusatzinfo":
            person.Info = content
        case "Postadresse":
            link := XpathGetFirst(td, ".//a/@href")
            if link != nil {
                rawId := ExtractUrlParam(link.String(), "corg")
                if rawId != "" {
                    person.OrganisationID = rawId
                }
            }
        case "Herr", "Frau":
            if name == "Herr" {
                person.Gender = "m"
            } else {
                person.Gender = "f"
            }
            if nameNode := XpathGetFirst(td, "./span/span"); nameNode != nil {
                person.LastName = nameNode.InnerHtml()
                if sibling := nameNode.NextSibling(); sibling != nil {
                    person.FirstName = sibling.String()
                }
            }
        case "E-Mail":
            link := XpathGetFirst(td, ".//a/@href")
            if link != nil {
                email := strings.TrimSpace(strings.Replace(link.String(), "mailto:", "", 1))
                email = strings.Replace(email, "%40", "@", 1)
                person.Email = email
            }
        case "Homepage":
            link := XpathGetFirst(td, ".//a/@href")
            if link != nil {
                person.Website = strings.TrimSpace(link.String())
            }
        case "weitere Org.":
            orgLinks, _ := td.Search(".//a/@href")
            for _, orgLink := range orgLinks {
                id := ExtractUrlParam(orgLink.String(), "corg")
                if id != "" {
                    person.AdditionalOrganisationIDs = append(person.AdditionalOrganisationIDs, id)
                }
            }
        case "Dienstort":
            // TODO: implement.
        default:
            if name != "" {
                log.Printf("Error at %v: unknown data row %v\n", url, name)
            }
        }
    }
    return &person
}
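
// HandleOrg parses an organisation page (wborg.display) into an
// ImportOrganisation, fetching the linked address and description pages
// as well.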
func HandleOrg(url string, doc *html.HtmlDocument, crawler *Crawler) *server.ImportOrganisation {
    org := server.ImportOrganisation{}
    org.Type = "organisation"
    org.ImportUrl = url
    org.ChildIDs = make([]string, 0)

    parts := strings.Split(url, "=")
    org.ImportID = parts[len(parts)-1]

    if nameNode := XpathGetFirst(doc, "//span[contains(@class, \"wborg_name\")]/text()"); nameNode != nil {
        org.Name = nameNode.String()
    }
    numNode := XpathGetFirst(doc, "//span[contains(@class, \"wborg_kennung\")]/text()")
    if numNode != nil {
        num, _ := strconv.Atoi(numNode.String())
        org.Number = num
    }

    res, _ := doc.Search("//div[contains(@id, \"wbOrg_site_listing\")]//a")
    for _, node := range res {
        linkUrl := node.Attr("href")
        name := node.InnerHtml()

        if name == "Homepage" {
            org.Website = linkUrl
        } else if strings.HasPrefix(linkUrl, "wborg.display") {
            // Child organization.
            if rawId := ExtractUrlParam(strings.ToLower(linkUrl), "porgnr"); rawId != "" {
                org.ChildIDs = append(org.ChildIDs, rawId)
            }
        } else if strings.HasPrefix(linkUrl, "wborg.adressen?") {
            // Fetch address data.
            result, _, contactDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + linkUrl)
            crawler.resultQueue <- result
            if result.Error == nil {
                res, _ := contactDoc.Search("//fieldset[contains(@class, \"MaskSCT\")]//tr")
                for _, node := range res {
                    tds, _ := node.Search("./td")
                    if len(tds) < 2 {
                        continue
                    }
                    td1, td2 := tds[0], tds[1]
                    name := td1.Content()
                    linkRes, _ := td2.Search(".//a/@href")

                    switch name {
                    case "Adresse":
                        org.Address = td2.Content()
                    case "Telefon":
                        org.Phone = td2.Content()
                    case "Fax":
                        org.Fax = td2.Content()
                    case "E-Mailadresse":
                        if len(linkRes) > 0 {
                            org.Email = strings.Replace(linkRes[0].String(), "mailto:", "", 1)
                        }
                    case "WWW-Homepage":
                        if len(linkRes) > 0 {
                            org.Website = linkRes[0].String()
                        }
                    case "Info":
                        org.ContactInfo = td2.Content()
                    case "Sekretariat":
                        org.Secretary = td2.Content()
                    case "Bezeichnung":
                        // TODO: Implement.
                    default:
                        log.Printf("Unhandled data row at %v: %v\n", linkUrl, name)
                    }
                }
                contactDoc.Free()
            }
        } else if strings.HasPrefix(linkUrl, "orgdesc.display?corg=") {
            // Fetch the organisation description page.
            result, _, descriptionDoc := crawler.FetchUrlWithResult("https://campus.meduniwien.ac.at/med.campus/" + linkUrl)
            crawler.resultQueue <- result
            if result.Error == nil {
                res, _ := descriptionDoc.Search("//td[contains(@class, \"detailStd\")]")
                if len(res) == 2 {
                    org.ShortDescription = res[0].InnerHtml()
                    org.LongDescription = res[1].InnerHtml()
                }
                descriptionDoc.Free()
            } else {
                log.Printf("Error at %v: Could not fetch organisation description page: %v\n", url, result.Error)
            }
        }
    }
    return &org
}
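
// ShouldQueueUrl filters discovered links: known non-content links and
// absolute urls are dropped, everything else is turned into an absolute
// campus url and queued.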
func ShouldQueueUrl(url string) string {
    ignorePrefixes := []string{
        "javascript:",
        "mailto:",
        "#",
        // Skip all absolute urls.
        "http",
        // organisation detail pages.
        "orgdesc.display?corg=",
        "wborg.adressen?",
        "wblv.wbShowLvDetail",
        "wbAbs.showMaskAbsBetreuer",
        "wborg.cbMonoListElement",
        "wbAbs.cbOverviewTheses",
        "lv.listEqualLectures",
        "wbTermin_List.cbLvTerminTable",
        "wblv.cbReloadLvDetail",
        "visitenkarte.vcardExport",
        "WBANMELDUNG.durchfuehren",
        "wbregisterexam.lv_termine",
        "!wbTermin.wbEdit",
        "wbAbs.showThesis",
        "sa.gruppen_einteilung",
        "sachgebiete.showOrgList",
        "sachgebiete.showList",
        "LV.listEqualLecturesForNode",
        "wbLv.wbShowStellungInStp",
        "studienplaene.lv_stellung",
        "LV_TX.wbLvInfoTypen",
    }
    for _, val := range ignorePrefixes {
        if strings.HasPrefix(url, val) {
            return ""
        }
    }
    // Everything left is a relative link into the campus system.
    return "https://campus.meduniwien.ac.at/med.campus/" + url
}
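
// Handle dispatches a fetched page to the matching parser based on its
// url prefix and returns the parsed item with its JSON representation,
// or nil if the page produced nothing.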
func Handle(url string, doc *html.HtmlDocument, body []byte, crawler *Crawler) (*server.ImportItem, error) {
    url = strings.Replace(url, "https://campus.meduniwien.ac.at/med.campus/", "", 1)

    var item server.ImportItem
    var js []byte

    lowerUrl := strings.ToLower(url)
    if strings.HasPrefix(lowerUrl, "wborg.display") {
        org := HandleOrg(url, doc, crawler)
        if org != nil {
            item = org.ImportItem
            js, _ = json.Marshal(org)
        }
    } else if strings.HasPrefix(lowerUrl, "visitenkarte.show_vcard") {
        if !strings.Contains(string(body), "Diese Visitenkarte kann nicht mehr eingesehen werden.") {
            person := HandlePerson(url, doc, crawler)
            if person != nil {
                item = person.ImportItem
                js, _ = json.Marshal(person)
            }
        }
    } else if strings.HasPrefix(lowerUrl, "wblv.wbshowlvdetail") {
        lecture := HandleLecture(url, doc, crawler)
        if lecture != nil {
            item = lecture.ImportItem
            js, _ = json.Marshal(lecture)
        }
    } else if strings.HasPrefix(lowerUrl, "wbraum.editraum") {
        location := HandleLocation(url, doc, crawler)
        if location != nil {
            item = location.ImportItem
            js, _ = json.Marshal(location)
        }
    }

    if js != nil {
        item.JSON = js
        return &item, nil
    }
    return nil, nil
}
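
// OnFinish is the crawler's finish hook; it currently does nothing.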
func OnFinish(crawler *Crawler) {
}
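
// Crawl wires up the Meduni Wien configuration and starts crawling from
// the root organisation and the lecture search page.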
func Crawl(workerCount int) {
    c := NewCrawler(workerCount)
    c.UrlFile = "urls.txt"
    c.DataFile = "data.json"
    c.ShouldQueueUrl = ShouldQueueUrl
    c.Handle = Handle
    c.OnFinish = OnFinish

    urls := []string{
        "https://campus.meduniwien.ac.at/med.campus/wborg.display?porgnr=1",
        "https://campus.meduniwien.ac.at/med.campus/wbsuche.lvsuchesimple",
    }
    c.Crawl(urls)
}